duckdb 0.7.2-dev1034.0 → 0.7.2-dev1138.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/package.json +1 -1
  2. package/src/duckdb/extension/icu/third_party/icu/stubdata/stubdata.cpp +1 -1
  3. package/src/duckdb/src/common/hive_partitioning.cpp +3 -1
  4. package/src/duckdb/src/common/progress_bar/progress_bar.cpp +7 -0
  5. package/src/duckdb/src/common/serializer/enum_serializer.cpp +6 -6
  6. package/src/duckdb/src/common/sort/comparators.cpp +14 -5
  7. package/src/duckdb/src/common/types/interval.cpp +0 -41
  8. package/src/duckdb/src/common/types/list_segment.cpp +658 -0
  9. package/src/duckdb/src/common/types/string_heap.cpp +1 -1
  10. package/src/duckdb/src/common/types/string_type.cpp +1 -1
  11. package/src/duckdb/src/common/types/vector.cpp +1 -1
  12. package/src/duckdb/src/common/value_operations/comparison_operations.cpp +14 -22
  13. package/src/duckdb/src/common/vector_operations/comparison_operators.cpp +10 -10
  14. package/src/duckdb/src/common/vector_operations/is_distinct_from.cpp +11 -10
  15. package/src/duckdb/src/execution/expression_executor/execute_comparison.cpp +2 -2
  16. package/src/duckdb/src/execution/index/art/art.cpp +13 -0
  17. package/src/duckdb/src/execution/operator/aggregate/physical_hash_aggregate.cpp +1 -1
  18. package/src/duckdb/src/execution/operator/join/physical_hash_join.cpp +2 -0
  19. package/src/duckdb/src/execution/operator/join/physical_index_join.cpp +1 -0
  20. package/src/duckdb/src/execution/operator/join/physical_join.cpp +0 -3
  21. package/src/duckdb/src/execution/operator/persistent/base_csv_reader.cpp +5 -1
  22. package/src/duckdb/src/execution/operator/persistent/buffered_csv_reader.cpp +18 -5
  23. package/src/duckdb/src/execution/operator/persistent/csv_reader_options.cpp +3 -0
  24. package/src/duckdb/src/execution/operator/persistent/physical_batch_insert.cpp +2 -1
  25. package/src/duckdb/src/execution/operator/persistent/physical_delete.cpp +1 -3
  26. package/src/duckdb/src/execution/operator/persistent/physical_insert.cpp +1 -0
  27. package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +0 -4
  28. package/src/duckdb/src/execution/physical_plan/plan_aggregate.cpp +1 -0
  29. package/src/duckdb/src/execution/physical_plan/plan_comparison_join.cpp +1 -1
  30. package/src/duckdb/src/execution/physical_plan/plan_create_index.cpp +2 -1
  31. package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +1 -0
  32. package/src/duckdb/src/function/aggregate/nested/list.cpp +6 -712
  33. package/src/duckdb/src/function/scalar/list/list_sort.cpp +25 -18
  34. package/src/duckdb/src/function/table/read_csv.cpp +5 -0
  35. package/src/duckdb/src/function/table/table_scan.cpp +8 -11
  36. package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
  37. package/src/duckdb/src/include/duckdb/common/helper.hpp +1 -1
  38. package/src/duckdb/src/include/duckdb/common/operator/comparison_operators.hpp +45 -149
  39. package/src/duckdb/src/include/duckdb/common/progress_bar/progress_bar.hpp +2 -0
  40. package/src/duckdb/src/include/duckdb/common/types/interval.hpp +39 -3
  41. package/src/duckdb/src/include/duckdb/common/types/list_segment.hpp +70 -0
  42. package/src/duckdb/src/include/duckdb/common/types/string_type.hpp +73 -3
  43. package/src/duckdb/src/include/duckdb/execution/index/art/art.hpp +1 -12
  44. package/src/duckdb/src/include/duckdb/execution/operator/persistent/csv_reader_options.hpp +4 -0
  45. package/src/duckdb/src/include/duckdb/main/client_config.hpp +2 -0
  46. package/src/duckdb/src/include/duckdb/storage/compression/chimp/chimp_scan.hpp +1 -0
  47. package/src/duckdb/src/include/duckdb/storage/compression/patas/patas_scan.hpp +1 -0
  48. package/src/duckdb/src/include/duckdb/storage/data_pointer.hpp +0 -2
  49. package/src/duckdb/src/include/duckdb/storage/data_table.hpp +1 -0
  50. package/src/duckdb/src/include/duckdb/storage/index.hpp +1 -1
  51. package/src/duckdb/src/include/duckdb/storage/string_uncompressed.hpp +1 -1
  52. package/src/duckdb/src/include/duckdb/storage/table/column_data.hpp +18 -7
  53. package/src/duckdb/src/include/duckdb/storage/table/column_segment.hpp +0 -3
  54. package/src/duckdb/src/include/duckdb/storage/table/column_segment_tree.hpp +18 -0
  55. package/src/duckdb/src/include/duckdb/storage/table/persistent_table_data.hpp +0 -1
  56. package/src/duckdb/src/include/duckdb/storage/table/row_group.hpp +35 -43
  57. package/src/duckdb/src/include/duckdb/storage/table/row_group_collection.hpp +18 -5
  58. package/src/duckdb/src/include/duckdb/storage/table/row_group_segment_tree.hpp +2 -4
  59. package/src/duckdb/src/include/duckdb/storage/table/scan_state.hpp +12 -29
  60. package/src/duckdb/src/include/duckdb/storage/table/segment_base.hpp +2 -3
  61. package/src/duckdb/src/include/duckdb/storage/table/segment_tree.hpp +11 -1
  62. package/src/duckdb/src/include/duckdb/storage/table/standard_column_data.hpp +0 -4
  63. package/src/duckdb/src/include/duckdb/transaction/local_storage.hpp +4 -1
  64. package/src/duckdb/src/include/duckdb.h +21 -0
  65. package/src/duckdb/src/main/capi/table_function-c.cpp +23 -0
  66. package/src/duckdb/src/main/settings/settings.cpp +20 -8
  67. package/src/duckdb/src/optimizer/filter_combiner.cpp +2 -5
  68. package/src/duckdb/src/optimizer/join_order/cardinality_estimator.cpp +2 -0
  69. package/src/duckdb/src/optimizer/join_order/join_order_optimizer.cpp +1 -0
  70. package/src/duckdb/src/parallel/meta_pipeline.cpp +0 -3
  71. package/src/duckdb/src/parser/transform/expression/transform_function.cpp +22 -0
  72. package/src/duckdb/src/planner/binder/statement/bind_create_table.cpp +1 -0
  73. package/src/duckdb/src/storage/compression/bitpacking.cpp +1 -1
  74. package/src/duckdb/src/storage/compression/fixed_size_uncompressed.cpp +2 -1
  75. package/src/duckdb/src/storage/compression/numeric_constant.cpp +1 -1
  76. package/src/duckdb/src/storage/compression/rle.cpp +1 -0
  77. package/src/duckdb/src/storage/compression/validity_uncompressed.cpp +1 -1
  78. package/src/duckdb/src/storage/data_table.cpp +3 -3
  79. package/src/duckdb/src/storage/local_storage.cpp +7 -0
  80. package/src/duckdb/src/storage/storage_info.cpp +1 -1
  81. package/src/duckdb/src/storage/table/column_data.cpp +75 -18
  82. package/src/duckdb/src/storage/table/column_data_checkpointer.cpp +3 -1
  83. package/src/duckdb/src/storage/table/column_segment.cpp +17 -31
  84. package/src/duckdb/src/storage/table/list_column_data.cpp +9 -12
  85. package/src/duckdb/src/storage/table/row_group.cpp +200 -136
  86. package/src/duckdb/src/storage/table/row_group_collection.cpp +75 -45
  87. package/src/duckdb/src/storage/table/scan_state.cpp +31 -38
  88. package/src/duckdb/src/storage/table/standard_column_data.cpp +4 -6
  89. package/src/duckdb/src/storage/table/struct_column_data.cpp +11 -18
  90. package/src/duckdb/src/storage/table/update_segment.cpp +3 -0
  91. package/src/duckdb/ub_src_common_types.cpp +2 -0
@@ -10,6 +10,7 @@
10
10
 
11
11
  #include "duckdb/common/assert.hpp"
12
12
  #include "duckdb/common/constants.hpp"
13
+ #include "duckdb/common/helper.hpp"
13
14
 
14
15
  #include <cstring>
15
16
 
@@ -117,10 +118,79 @@ public:
117
118
 
118
119
  void Verify() const;
119
120
  void VerifyNull() const;
121
+
122
+ struct StringComparisonOperators {
123
+ static inline bool Equals(const string_t &a, const string_t &b) {
124
+ #ifdef DUCKDB_DEBUG_NO_INLINE
125
+ if (a.GetSize() != b.GetSize())
126
+ return false;
127
+ return (memcmp(a.GetDataUnsafe(), b.GetDataUnsafe(), a.GetSize()) == 0);
128
+ #endif
129
+ uint64_t A = Load<uint64_t>((const_data_ptr_t)&a);
130
+ uint64_t B = Load<uint64_t>((const_data_ptr_t)&b);
131
+ if (A != B) {
132
+ // Either the length or the prefix is different -> not equal
133
+ return false;
134
+ }
135
+ // they have the same length and same prefix!
136
+ A = Load<uint64_t>((const_data_ptr_t)&a + 8u);
137
+ B = Load<uint64_t>((const_data_ptr_t)&b + 8u);
138
+ if (A == B) {
139
+ // either they are both inlined (so compare equal) or point to the same string (so compare equal)
140
+ return true;
141
+ }
142
+ if (!a.IsInlined()) {
143
+ // 'long' strings of the same length -> compare pointed value
144
+ if (memcmp(a.value.pointer.ptr, b.value.pointer.ptr, a.GetSize()) == 0) {
145
+ return true;
146
+ }
147
+ }
148
+ // either they are short strings of the same length but different content
149
+ // or they point to strings with different content
150
+ // either way, they can't represent the same underlying string
151
+ return false;
152
+ }
153
+ // compare up to shared length. if still the same, compare lengths
154
+ static bool GreaterThan(const string_t &left, const string_t &right) {
155
+ const uint32_t left_length = left.GetSize();
156
+ const uint32_t right_length = right.GetSize();
157
+ const uint32_t min_length = std::min<uint32_t>(left_length, right_length);
158
+
159
+ #ifndef DUCKDB_DEBUG_NO_INLINE
160
+ uint32_t A = Load<uint32_t>((const_data_ptr_t)left.GetPrefix());
161
+ uint32_t B = Load<uint32_t>((const_data_ptr_t)right.GetPrefix());
162
+
163
+ // Utility to move 0xa1b2c3d4 into 0xd4c3b2a1, basically inverting the order byte by byte
164
+ auto bswap = [](uint32_t v) -> uint32_t {
165
+ uint32_t t1 = (v >> 16u) | (v << 16u);
166
+ uint32_t t2 = t1 & 0x00ff00ff;
167
+ uint32_t t3 = t1 & 0xff00ff00;
168
+ return (t2 << 8u) | (t3 >> 8u);
169
+ };
170
+
171
+ // Check on prefix -----
172
+ // We don't need to mask since:
173
+ // if the prefix is greater(after bswap), it will stay greater regardless of the extra bytes
174
+ // if the prefix is smaller(after bswap), it will stay smaller regardless of the extra bytes
175
+ // if the prefix is equal, the extra bytes are guaranteed to be \0 for the shorter one
176
+
177
+ if (A != B)
178
+ return bswap(A) > bswap(B);
179
+ #endif
180
+ auto memcmp_res = memcmp(left.GetDataUnsafe(), right.GetDataUnsafe(), min_length);
181
+ return memcmp_res > 0 || (memcmp_res == 0 && left_length > right_length);
182
+ }
183
+ };
184
+
185
+ bool operator==(const string_t &r) const {
186
+ return StringComparisonOperators::Equals(*this, r);
187
+ }
188
+
189
+ bool operator>(const string_t &r) const {
190
+ return StringComparisonOperators::GreaterThan(*this, r);
191
+ }
120
192
  bool operator<(const string_t &r) const {
121
- auto this_str = this->GetString();
122
- auto r_str = r.GetString();
123
- return this_str < r_str;
193
+ return r > *this;
124
194
  }
125
195
 
126
196
  private:
@@ -27,18 +27,7 @@
27
27
  namespace duckdb {
28
28
 
29
29
  class ConflictManager;
30
-
31
- struct ARTIndexScanState : public IndexScanState {
32
-
33
- //! Scan predicates (single predicate scan or range scan)
34
- Value values[2];
35
- //! Expressions of the scan predicates
36
- ExpressionType expressions[2];
37
- bool checked = false;
38
- //! All scanned row IDs
39
- vector<row_t> result_ids;
40
- Iterator iterator;
41
- };
30
+ struct ARTIndexScanState;
42
31
 
43
32
  enum class VerifyExistenceType : uint8_t {
44
33
  APPEND = 0, // appends to a table
@@ -89,6 +89,8 @@ struct BufferedCSVReaderOptions {
89
89
 
90
90
  //! How many leading rows to skip
91
91
  idx_t skip_rows = 0;
92
+ //! Whether or not the skip_rows is set by the user
93
+ bool skip_rows_set = false;
92
94
  //! Maximum CSV line size: specified because if we reach this amount, we likely have wrong delimiters (default: 2MB)
93
95
  //! note that this is the guaranteed line length that will succeed, longer lines may be accepted if slightly above
94
96
  idx_t maximum_line_size = 2097152;
@@ -116,6 +118,8 @@ struct BufferedCSVReaderOptions {
116
118
  idx_t buffer_size = CSVBuffer::INITIAL_BUFFER_SIZE_COLOSSAL;
117
119
  //! Decimal separator when reading as numeric
118
120
  string decimal_separator = ".";
121
+ //! Whether or not to pad rows that do not have enough columns with NULL values
122
+ bool null_padding = true;
119
123
 
120
124
  //===--------------------------------------------------------------------===//
121
125
  // WriteCSVOptions
@@ -40,6 +40,8 @@ struct ClientConfig {
40
40
  //! to output anything
41
41
  bool emit_profiler_output = true;
42
42
 
43
+ //! system-wide progress bar disable.
44
+ const char *system_progress_bar_disable_reason = nullptr;
43
45
  //! If the progress bar is enabled or not.
44
46
  bool enable_progress_bar = false;
45
47
  //! If the print of the progress bar is enabled
@@ -24,6 +24,7 @@
24
24
 
25
25
  #include "duckdb/storage/compression/chimp/algorithm/flag_buffer.hpp"
26
26
  #include "duckdb/storage/compression/chimp/algorithm/leading_zero_buffer.hpp"
27
+ #include "duckdb/storage/table/scan_state.hpp"
27
28
 
28
29
  namespace duckdb {
29
30
 
@@ -21,6 +21,7 @@
21
21
  #include "duckdb/storage/table/column_data_checkpointer.hpp"
22
22
  #include "duckdb/storage/table/column_segment.hpp"
23
23
  #include "duckdb/common/operator/subtract.hpp"
24
+ #include "duckdb/storage/table/scan_state.hpp"
24
25
 
25
26
  namespace duckdb {
26
27
 
@@ -34,8 +34,6 @@ struct RowGroupPointer {
34
34
  uint64_t tuple_count;
35
35
  //! The data pointers of the column segments stored in the row group
36
36
  vector<BlockPointer> data_pointers;
37
- //! The per-column statistics of the row group
38
- vector<BaseStatistics> statistics;
39
37
  //! The versions information of the row group (if any)
40
38
  shared_ptr<VersionNode> versions;
41
39
  };
@@ -39,6 +39,7 @@ class Transaction;
39
39
  class WriteAheadLog;
40
40
  class TableDataWriter;
41
41
  class ConflictManager;
42
+ class TableScanState;
42
43
  enum class VerifyExistenceType : uint8_t;
43
44
 
44
45
  //! DataTable represents a physical table on disk
@@ -14,7 +14,6 @@
14
14
  #include "duckdb/common/sort/sort.hpp"
15
15
  #include "duckdb/parser/parsed_expression.hpp"
16
16
  #include "duckdb/planner/expression.hpp"
17
- #include "duckdb/storage/table/scan_state.hpp"
18
17
  #include "duckdb/storage/meta_block_writer.hpp"
19
18
  #include "duckdb/execution/expression_executor.hpp"
20
19
  #include "duckdb/common/types/constraint_conflict_info.hpp"
@@ -27,6 +26,7 @@ class Transaction;
27
26
  class ConflictManager;
28
27
 
29
28
  struct IndexLock;
29
+ struct IndexScanState;
30
30
 
31
31
  //! The index is an abstract base class that serves as the basis for indexes
32
32
  class Index {
@@ -9,7 +9,7 @@
9
9
  #include "duckdb/storage/buffer_manager.hpp"
10
10
  #include "duckdb/storage/checkpoint/string_checkpoint_state.hpp"
11
11
  #include "duckdb/storage/segment/uncompressed.hpp"
12
-
12
+ #include "duckdb/storage/table/scan_state.hpp"
13
13
  #include "duckdb/storage/string_uncompressed.hpp"
14
14
  #include "duckdb/storage/table/append_state.hpp"
15
15
  #include "duckdb/storage/table/column_segment.hpp"
@@ -9,14 +9,12 @@
9
9
  #pragma once
10
10
 
11
11
  #include "duckdb/common/types/data_chunk.hpp"
12
- #include "duckdb/storage/table/append_state.hpp"
13
- #include "duckdb/storage/table/scan_state.hpp"
14
12
  #include "duckdb/storage/statistics/base_statistics.hpp"
15
13
  #include "duckdb/storage/data_pointer.hpp"
16
14
  #include "duckdb/storage/table/persistent_table_data.hpp"
17
15
  #include "duckdb/storage/statistics/segment_statistics.hpp"
18
16
  #include "duckdb/storage/table/segment_tree.hpp"
19
- #include "duckdb/storage/table/column_segment.hpp"
17
+ #include "duckdb/storage/table/column_segment_tree.hpp"
20
18
  #include "duckdb/common/mutex.hpp"
21
19
 
22
20
  namespace duckdb {
@@ -36,8 +34,6 @@ struct ColumnCheckpointInfo {
36
34
  CompressionType compression_type;
37
35
  };
38
36
 
39
- class ColumnSegmentTree : public SegmentTree<ColumnSegment> {};
40
-
41
37
  class ColumnData {
42
38
  friend class ColumnDataCheckpointer;
43
39
 
@@ -47,14 +43,16 @@ public:
47
43
  ColumnData(ColumnData &other, idx_t start, ColumnData *parent);
48
44
  virtual ~ColumnData();
49
45
 
46
+ //! The start row
47
+ const idx_t start;
48
+ //! The count of the column data
49
+ idx_t count;
50
50
  //! The block manager
51
51
  BlockManager &block_manager;
52
52
  //! Table info for the column
53
53
  DataTableInfo &info;
54
54
  //! The column index of the column, either within the parent table or within the parent
55
55
  idx_t column_index;
56
- //! The start row
57
- idx_t start;
58
56
  //! The type of the column
59
57
  LogicalType type;
60
58
  //! The parent column (if any)
@@ -63,6 +61,9 @@ public:
63
61
  public:
64
62
  virtual bool CheckZonemap(ColumnScanState &state, TableFilter &filter) = 0;
65
63
 
64
+ BlockManager &GetBlockManager() {
65
+ return block_manager;
66
+ }
66
67
  DatabaseInstance &GetDatabase() const;
67
68
  DataTableInfo &GetTableInfo() const;
68
69
  virtual idx_t GetMaxEntry();
@@ -96,6 +97,8 @@ public:
96
97
  virtual void InitializeAppend(ColumnAppendState &state);
97
98
  //! Append a vector of type [type] to the end of the column
98
99
  virtual void Append(BaseStatistics &stats, ColumnAppendState &state, Vector &vector, idx_t count);
100
+ //! Append a vector of type [type] to the end of the column
101
+ void Append(ColumnAppendState &state, Vector &vector, idx_t count);
99
102
  virtual void AppendData(BaseStatistics &stats, ColumnAppendState &state, UnifiedVectorFormat &vdata, idx_t count);
100
103
  //! Revert a set of appends to the ColumnData
101
104
  virtual void RevertAppend(row_t start_row);
@@ -130,6 +133,8 @@ public:
130
133
  virtual void GetStorageInfo(idx_t row_group_index, vector<idx_t> col_path, TableStorageInfo &result);
131
134
  virtual void Verify(RowGroup &parent);
132
135
 
136
+ bool CheckZonemap(TableFilter &filter);
137
+
133
138
  static shared_ptr<ColumnData> CreateColumn(BlockManager &block_manager, DataTableInfo &info, idx_t column_index,
134
139
  idx_t start_row, const LogicalType &type, ColumnData *parent = nullptr);
135
140
  static shared_ptr<ColumnData> CreateColumn(ColumnData &other, idx_t start_row, ColumnData *parent = nullptr);
@@ -138,6 +143,10 @@ public:
138
143
  ColumnData *parent = nullptr);
139
144
  static unique_ptr<ColumnData> CreateColumnUnique(ColumnData &other, idx_t start_row, ColumnData *parent = nullptr);
140
145
 
146
+ void MergeStatistics(const BaseStatistics &other);
147
+ void MergeIntoStatistics(BaseStatistics &other);
148
+ unique_ptr<BaseStatistics> GetStatistics();
149
+
141
150
  protected:
142
151
  //! Append a transient segment
143
152
  void AppendTransientSegment(SegmentLock &l, idx_t start_row);
@@ -158,6 +167,8 @@ protected:
158
167
  unique_ptr<UpdateSegment> updates;
159
168
  //! The internal version of the column data
160
169
  idx_t version;
170
+ //! The stats of the root segment
171
+ unique_ptr<SegmentStatistics> stats;
161
172
  };
162
173
 
163
174
  } // namespace duckdb
@@ -14,7 +14,6 @@
14
14
  #include "duckdb/storage/buffer_manager.hpp"
15
15
  #include "duckdb/storage/statistics/segment_statistics.hpp"
16
16
  #include "duckdb/storage/storage_lock.hpp"
17
- #include "duckdb/storage/table/scan_state.hpp"
18
17
  #include "duckdb/function/compression_function.hpp"
19
18
  #include "duckdb/storage/table/segment_base.hpp"
20
19
 
@@ -39,8 +38,6 @@ class ColumnSegment : public SegmentBase<ColumnSegment> {
39
38
  public:
40
39
  ~ColumnSegment();
41
40
 
42
- //! The index within the segment tree
43
- idx_t index;
44
41
  //! The database instance
45
42
  DatabaseInstance &db;
46
43
  //! The type stored in the column
@@ -0,0 +1,18 @@
1
+ //===----------------------------------------------------------------------===//
2
+ // DuckDB
3
+ //
4
+ // duckdb/storage/table/column_segment_tree.hpp
5
+ //
6
+ //
7
+ //===----------------------------------------------------------------------===//
8
+
9
+ #pragma once
10
+
11
+ #include "duckdb/storage/table/segment_tree.hpp"
12
+ #include "duckdb/storage/table/column_segment.hpp"
13
+
14
+ namespace duckdb {
15
+
16
+ class ColumnSegmentTree : public SegmentTree<ColumnSegment> {};
17
+
18
+ } // namespace duckdb
@@ -10,7 +10,6 @@
10
10
 
11
11
  #include "duckdb/common/constants.hpp"
12
12
  #include "duckdb/common/vector.hpp"
13
- #include "duckdb/storage/table/segment_tree.hpp"
14
13
  #include "duckdb/storage/data_pointer.hpp"
15
14
  #include "duckdb/storage/table/table_statistics.hpp"
16
15
 
@@ -10,13 +10,13 @@
10
10
 
11
11
  #include "duckdb/common/vector_size.hpp"
12
12
  #include "duckdb/storage/table/chunk_info.hpp"
13
- #include "duckdb/storage/table/append_state.hpp"
14
- #include "duckdb/storage/table/scan_state.hpp"
15
13
  #include "duckdb/storage/statistics/segment_statistics.hpp"
14
+ #include "duckdb/common/types/data_chunk.hpp"
16
15
  #include "duckdb/common/enums/scan_options.hpp"
17
16
  #include "duckdb/common/mutex.hpp"
18
17
  #include "duckdb/parser/column_list.hpp"
19
18
  #include "duckdb/storage/table/segment_base.hpp"
19
+ #include "duckdb/storage/block.hpp"
20
20
 
21
21
  namespace duckdb {
22
22
  class AttachedDatabase;
@@ -27,6 +27,7 @@ class DataTable;
27
27
  class PartialBlockManager;
28
28
  struct DataTableInfo;
29
29
  class ExpressionExecutor;
30
+ class RowGroupCollection;
30
31
  class RowGroupWriter;
31
32
  class UpdateSegment;
32
33
  class TableStatistics;
@@ -36,6 +37,10 @@ struct ColumnCheckpointState;
36
37
  struct RowGroupPointer;
37
38
  struct TransactionData;
38
39
  struct VersionNode;
40
+ class CollectionScanState;
41
+ class TableFilterSet;
42
+ struct ColumnFetchState;
43
+ struct RowGroupAppendState;
39
44
 
40
45
  struct RowGroupWriteData {
41
46
  vector<unique_ptr<ColumnCheckpointState>> states;
@@ -52,51 +57,33 @@ public:
52
57
  static constexpr const idx_t ROW_GROUP_VECTOR_COUNT = ROW_GROUP_SIZE / STANDARD_VECTOR_SIZE;
53
58
 
54
59
  public:
55
- RowGroup(AttachedDatabase &db, BlockManager &block_manager, DataTableInfo &table_info, idx_t start, idx_t count);
56
- RowGroup(AttachedDatabase &db, BlockManager &block_manager, DataTableInfo &table_info,
57
- const vector<LogicalType> &types, RowGroupPointer &&pointer);
58
- RowGroup(RowGroup &row_group, idx_t start);
60
+ RowGroup(RowGroupCollection &collection, idx_t start, idx_t count);
61
+ RowGroup(RowGroupCollection &collection, RowGroupPointer &&pointer);
62
+ RowGroup(RowGroup &row_group, RowGroupCollection &collection, idx_t start);
59
63
  ~RowGroup();
60
64
 
61
- //! The index within the segment tree
62
- idx_t index;
63
-
64
65
  private:
65
- //! The database instance
66
- AttachedDatabase &db;
67
- //! The block manager
68
- BlockManager &block_manager;
69
- //! The table info of this row_group
70
- DataTableInfo &table_info;
66
+ //! The RowGroupCollection this row-group is a part of
67
+ RowGroupCollection &collection;
71
68
  //! The version info of the row_group (inserted and deleted tuple info)
72
69
  shared_ptr<VersionNode> version_info;
73
70
  //! The column data of the row_group
74
71
  vector<shared_ptr<ColumnData>> columns;
75
- //! The segment statistics for each of the columns
76
- vector<SegmentStatistics> stats;
77
72
 
78
73
  public:
79
- DatabaseInstance &GetDatabase();
80
- BlockManager &GetBlockManager() {
81
- return block_manager;
82
- }
83
- DataTableInfo &GetTableInfo() {
84
- return table_info;
85
- }
86
- idx_t GetColumnIndex(ColumnData *data) {
87
- for (idx_t i = 0; i < columns.size(); i++) {
88
- if (columns[i].get() == data) {
89
- return i;
90
- }
91
- }
92
- return 0;
74
+ RowGroupCollection &GetCollection() {
75
+ return collection;
93
76
  }
77
+ DatabaseInstance &GetDatabase();
78
+ BlockManager &GetBlockManager();
79
+ DataTableInfo &GetTableInfo();
94
80
 
95
- unique_ptr<RowGroup> AlterType(const LogicalType &target_type, idx_t changed_idx, ExpressionExecutor &executor,
96
- RowGroupScanState &scan_state, DataChunk &scan_chunk);
97
- unique_ptr<RowGroup> AddColumn(ColumnDefinition &new_column, ExpressionExecutor &executor,
98
- Expression *default_value, Vector &intermediate);
99
- unique_ptr<RowGroup> RemoveColumn(idx_t removed_column);
81
+ unique_ptr<RowGroup> AlterType(RowGroupCollection &collection, const LogicalType &target_type, idx_t changed_idx,
82
+ ExpressionExecutor &executor, CollectionScanState &scan_state,
83
+ DataChunk &scan_chunk);
84
+ unique_ptr<RowGroup> AddColumn(RowGroupCollection &collection, ColumnDefinition &new_column,
85
+ ExpressionExecutor &executor, Expression *default_value, Vector &intermediate);
86
+ unique_ptr<RowGroup> RemoveColumn(RowGroupCollection &collection, idx_t removed_column);
100
87
 
101
88
  void CommitDrop();
102
89
  void CommitDropColumn(idx_t index);
@@ -104,16 +91,16 @@ public:
104
91
  void InitializeEmpty(const vector<LogicalType> &types);
105
92
 
106
93
  //! Initialize a scan over this row_group
107
- bool InitializeScan(RowGroupScanState &state);
108
- bool InitializeScanWithOffset(RowGroupScanState &state, idx_t vector_offset);
94
+ bool InitializeScan(CollectionScanState &state);
95
+ bool InitializeScanWithOffset(CollectionScanState &state, idx_t vector_offset);
109
96
  //! Checks the given set of table filters against the row-group statistics. Returns false if the entire row group
110
97
  //! can be skipped.
111
98
  bool CheckZonemap(TableFilterSet &filters, const vector<column_t> &column_ids);
112
99
  //! Checks the given set of table filters against the per-segment statistics. Returns false if any segments were
113
100
  //! skipped.
114
- bool CheckZonemapSegments(RowGroupScanState &state);
115
- void Scan(TransactionData transaction, RowGroupScanState &state, DataChunk &result);
116
- void ScanCommitted(RowGroupScanState &state, DataChunk &result, TableScanType type);
101
+ bool CheckZonemapSegments(CollectionScanState &state);
102
+ void Scan(TransactionData transaction, CollectionScanState &state, DataChunk &result);
103
+ void ScanCommitted(CollectionScanState &state, DataChunk &result, TableScanType type);
117
104
 
118
105
  idx_t GetSelVector(TransactionData transaction, idx_t vector_idx, SelectionVector &sel_vector, idx_t max_count);
119
106
  idx_t GetCommittedSelVector(transaction_t start_time, transaction_t transaction_id, idx_t vector_idx,
@@ -158,13 +145,16 @@ public:
158
145
 
159
146
  void Verify();
160
147
 
161
- void NextVector(RowGroupScanState &state);
148
+ void NextVector(CollectionScanState &state);
162
149
 
163
150
  private:
164
151
  ChunkInfo *GetChunkInfo(idx_t vector_idx);
152
+ ColumnData &GetColumn(idx_t c);
153
+ idx_t GetColumnCount() const;
154
+ vector<shared_ptr<ColumnData>> &GetColumns();
165
155
 
166
156
  template <TableScanType TYPE>
167
- void TemplatedScan(TransactionData transaction, RowGroupScanState &state, DataChunk &result);
157
+ void TemplatedScan(TransactionData transaction, CollectionScanState &state, DataChunk &result);
168
158
 
169
159
  static void CheckpointDeletes(VersionNode *versions, Serializer &serializer);
170
160
  static shared_ptr<VersionNode> DeserializeDeletes(Deserializer &source);
@@ -172,6 +162,8 @@ private:
172
162
  private:
173
163
  mutex row_group_lock;
174
164
  mutex stats_lock;
165
+ vector<BlockPointer> column_pointers;
166
+ unique_ptr<atomic<bool>[]> is_loaded;
175
167
  };
176
168
 
177
169
  struct VersionNode {
@@ -15,14 +15,16 @@
15
15
 
16
16
  namespace duckdb {
17
17
  struct ParallelTableScanState;
18
-
18
+ struct ParallelCollectionScanState;
19
+ class CreateIndexScanState;
20
+ class CollectionScanState;
19
21
  class PersistentTableData;
20
22
  class TableDataWriter;
21
23
  class TableIndexList;
22
24
  class TableStatistics;
23
-
25
+ struct TableAppendState;
26
+ class DuckTransaction;
24
27
  class BoundConstraint;
25
-
26
28
  class RowGroupSegmentTree;
27
29
 
28
30
  class RowGroupCollection {
@@ -48,8 +50,8 @@ public:
48
50
  void InitializeCreateIndexScan(CreateIndexScanState &state);
49
51
  void InitializeScanWithOffset(CollectionScanState &state, const vector<column_t> &column_ids, idx_t start_row,
50
52
  idx_t end_row);
51
- static bool InitializeScanInRowGroup(CollectionScanState &state, ParallelCollectionScanState &parallel_state,
52
- idx_t vector_index, idx_t max_row);
53
+ static bool InitializeScanInRowGroup(CollectionScanState &state, RowGroupCollection &collection,
54
+ RowGroup &row_group, idx_t vector_index, idx_t max_row);
53
55
  void InitializeParallelScan(ParallelCollectionScanState &state);
54
56
  bool NextParallelScan(ClientContext &context, ParallelCollectionScanState &state, CollectionScanState &scan_state);
55
57
 
@@ -99,6 +101,15 @@ public:
99
101
  unique_ptr<BaseStatistics> CopyStats(column_t column_id);
100
102
  void SetDistinct(column_t column_id, unique_ptr<DistinctStatistics> distinct_stats);
101
103
 
104
+ AttachedDatabase &GetAttached();
105
+ DatabaseInstance &GetDatabase();
106
+ BlockManager &GetBlockManager() {
107
+ return block_manager;
108
+ }
109
+ DataTableInfo &GetTableInfo() {
110
+ return *info;
111
+ }
112
+
102
113
  private:
103
114
  bool IsEmpty(SegmentLock &) const;
104
115
 
@@ -107,7 +118,9 @@ private:
107
118
  BlockManager &block_manager;
108
119
  //! The number of rows in the table
109
120
  atomic<idx_t> total_rows;
121
+ //! The data table info
110
122
  shared_ptr<DataTableInfo> info;
123
+ //! The column types of the row group collection
111
124
  vector<LogicalType> types;
112
125
  idx_t row_start;
113
126
  //! The segment trees holding the various row_groups of the table
@@ -18,7 +18,7 @@ class MetaBlockReader;
18
18
 
19
19
  class RowGroupSegmentTree : public SegmentTree<RowGroup, true> {
20
20
  public:
21
- RowGroupSegmentTree(DataTableInfo &table_info_p, BlockManager &block_manager_p, vector<LogicalType> column_types_p);
21
+ RowGroupSegmentTree(RowGroupCollection &collection);
22
22
  ~RowGroupSegmentTree() override;
23
23
 
24
24
  void Initialize(PersistentTableData &data);
@@ -26,9 +26,7 @@ public:
26
26
  protected:
27
27
  unique_ptr<RowGroup> LoadSegment() override;
28
28
 
29
- DataTableInfo &info;
30
- BlockManager &block_manager;
31
- vector<LogicalType> column_types;
29
+ RowGroupCollection &collection;
32
30
  idx_t current_row_group;
33
31
  idx_t max_row_group;
34
32
  unique_ptr<MetaBlockReader> reader;
@@ -72,12 +72,11 @@ struct ColumnScanState {
72
72
  idx_t last_offset = 0;
73
73
 
74
74
  public:
75
+ void Initialize(const LogicalType &type);
75
76
  //! Move the scan state forward by "count" rows (including all child states)
76
77
  void Next(idx_t count);
77
78
  //! Move ONLY this state forward by "count" rows (i.e. not the child states)
78
79
  void NextInternal(idx_t count);
79
- //! Move the scan state forward by STANDARD_VECTOR_SIZE rows
80
- void NextVector();
81
80
  };
82
81
 
83
82
  struct ColumnFetchState {
@@ -89,39 +88,18 @@ struct ColumnFetchState {
89
88
  BufferHandle &GetOrInsertHandle(ColumnSegment &segment);
90
89
  };
91
90
 
92
- class RowGroupScanState {
91
+ class CollectionScanState {
93
92
  public:
94
- RowGroupScanState(CollectionScanState &parent_p)
95
- : row_group(nullptr), vector_index(0), max_row(0), parent(parent_p) {
96
- }
93
+ CollectionScanState(TableScanState &parent_p);
97
94
 
98
95
  //! The current row_group we are scanning
99
- RowGroup *row_group = nullptr;
96
+ RowGroup *row_group;
100
97
  //! The vector index within the row_group
101
- idx_t vector_index = 0;
102
- //! The maximum row index of this row_group scan
103
- idx_t max_row = 0;
98
+ idx_t vector_index;
99
+ //! The maximum row within the row group
100
+ idx_t max_row_group_row;
104
101
  //! Child column scans
105
102
  unique_ptr<ColumnScanState[]> column_scans;
106
-
107
- public:
108
- const vector<column_t> &GetColumnIds();
109
- TableFilterSet *GetFilters();
110
- AdaptiveFilter *GetAdaptiveFilter();
111
- idx_t GetParentMaxRow();
112
-
113
- private:
114
- //! The parent scan state
115
- CollectionScanState &parent;
116
- };
117
-
118
- class CollectionScanState {
119
- public:
120
- CollectionScanState(TableScanState &parent_p)
121
- : row_group_state(*this), row_groups(nullptr), max_row(0), batch_index(0), parent(parent_p) {};
122
-
123
- //! The row_group scan state
124
- RowGroupScanState row_group_state;
125
103
  //! Row group segment tree
126
104
  RowGroupSegmentTree *row_groups;
127
105
  //! The total maximum row index
@@ -130,6 +108,7 @@ public:
130
108
  idx_t batch_index;
131
109
 
132
110
  public:
111
+ void Initialize(const vector<LogicalType> &types);
133
112
  const vector<column_t> &GetColumnIds();
134
113
  TableFilterSet *GetFilters();
135
114
  AdaptiveFilter *GetAdaptiveFilter();
@@ -167,12 +146,16 @@ private:
167
146
  };
168
147
 
169
148
  struct ParallelCollectionScanState {
149
+ ParallelCollectionScanState();
150
+
170
151
  //! The row group collection we are scanning
171
152
  RowGroupCollection *collection;
172
153
  RowGroup *current_row_group;
173
154
  idx_t vector_index;
174
155
  idx_t max_row;
175
156
  idx_t batch_index;
157
+ atomic<idx_t> processed_rows;
158
+ mutex lock;
176
159
  };
177
160
 
178
161
  struct ParallelTableScanState {