duckdb 0.7.2-dev1188.0 → 0.7.2-dev1238.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/extension/parquet/column_reader.cpp +2 -2
- package/src/duckdb/extension/parquet/include/parquet_timestamp.hpp +1 -1
- package/src/duckdb/extension/parquet/parquet_reader.cpp +5 -0
- package/src/duckdb/extension/parquet/parquet_statistics.cpp +24 -5
- package/src/duckdb/extension/parquet/parquet_timestamp.cpp +1 -1
- package/src/duckdb/src/common/string_util.cpp +14 -0
- package/src/duckdb/src/execution/aggregate_hashtable.cpp +88 -69
- package/src/duckdb/src/execution/join_hashtable.cpp +3 -1
- package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp +2 -1
- package/src/duckdb/src/execution/partitionable_hashtable.cpp +7 -4
- package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp +6 -6
- package/src/duckdb/src/function/aggregate/sorted_aggregate_function.cpp +84 -33
- package/src/duckdb/src/function/pragma/pragma_queries.cpp +24 -1
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/string_util.hpp +2 -0
- package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp +34 -20
- package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp +2 -1
package/package.json
CHANGED
package/src/duckdb/extension/parquet/column_reader.cpp
CHANGED
@@ -1401,7 +1401,7 @@ unique_ptr<ColumnReader> ColumnReader::CreateReader(ParquetReader &reader, const
 	case LogicalTypeId::TIME_TZ:
 		if (schema_p.__isset.logicalType && schema_p.logicalType.__isset.TIME) {
 			if (schema_p.logicalType.TIME.unit.__isset.MILLIS) {
-				return make_unique<CallbackColumnReader<int64_t, dtime_t, ParquetIntToTimeMs>>(
+				return make_unique<CallbackColumnReader<int32_t, dtime_t, ParquetIntToTimeMs>>(
 				    reader, type_p, schema_p, file_idx_p, max_define, max_repeat);
 			} else if (schema_p.logicalType.TIME.unit.__isset.MICROS) {
 				return make_unique<CallbackColumnReader<int64_t, dtime_t, ParquetIntToTime>>(
@@ -1416,7 +1416,7 @@ unique_ptr<ColumnReader> ColumnReader::CreateReader(ParquetReader &reader, const
 		return make_unique<CallbackColumnReader<int64_t, dtime_t, ParquetIntToTime>>(
 		    reader, type_p, schema_p, file_idx_p, max_define, max_repeat);
 	case ConvertedType::TIME_MILLIS:
-		return make_unique<CallbackColumnReader<int64_t, dtime_t, ParquetIntToTimeMs>>(
+		return make_unique<CallbackColumnReader<int32_t, dtime_t, ParquetIntToTimeMs>>(
 		    reader, type_p, schema_p, file_idx_p, max_define, max_repeat);
 	default:
 		break;
package/src/duckdb/extension/parquet/include/parquet_timestamp.hpp
CHANGED
@@ -22,7 +22,7 @@ timestamp_t ParquetTimestampMicrosToTimestamp(const int64_t &raw_ts);
 timestamp_t ParquetTimestampMsToTimestamp(const int64_t &raw_ts);
 timestamp_t ParquetTimestampNsToTimestamp(const int64_t &raw_ts);
 date_t ParquetIntToDate(const int32_t &raw_date);
-dtime_t ParquetIntToTimeMs(const int64_t &raw_time);
+dtime_t ParquetIntToTimeMs(const int32_t &raw_time);
 dtime_t ParquetIntToTime(const int64_t &raw_time);
 dtime_t ParquetIntToTimeNs(const int64_t &raw_time);
 
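This signature narrowing is the thread running through all the Parquet changes in this release: TIME_MILLIS values are physically 32-bit integers in Parquet, so the converter now takes an int32_t. The function body is not part of this diff, so the sketch below is an assumption built only on DuckDB's convention that dtime_t stores microseconds since midnight:

#include <cstdint>
#include <iostream>

// Minimal stand-in for duckdb::dtime_t, which stores microseconds since midnight.
struct dtime_t {
	int64_t micros;
};

// Assumed conversion body (only the declaration appears in this diff): widen the
// 32-bit millisecond count before scaling so the multiply cannot overflow.
static dtime_t ParquetIntToTimeMs(const int32_t &raw_time) {
	return dtime_t {static_cast<int64_t>(raw_time) * 1000};
}

int main() {
	// 12:30:45.678 is 45,045,678 ms after midnight -> prints 45045678000.
	std::cout << ParquetIntToTimeMs(45045678).micros << "\n";
	return 0;
}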
package/src/duckdb/extension/parquet/parquet_reader.cpp
CHANGED
@@ -200,6 +200,11 @@ LogicalType ParquetReader::DeriveLogicalType(const SchemaElement &s_ele, bool bi
 			throw IOException("UTF8 converted type can only be set for Type::(FIXED_LEN_)BYTE_ARRAY");
 		}
 	case ConvertedType::TIME_MILLIS:
+		if (s_ele.type == Type::INT32) {
+			return LogicalType::TIME;
+		} else {
+			throw IOException("TIME_MILLIS converted type can only be set for value of Type::INT32");
+		}
 	case ConvertedType::TIME_MICROS:
 		if (s_ele.type == Type::INT64) {
 			return LogicalType::TIME;
package/src/duckdb/extension/parquet/parquet_statistics.cpp
CHANGED
@@ -1,12 +1,11 @@
 #include "parquet_statistics.hpp"
 #include "parquet_decimal_utils.hpp"
 #include "parquet_timestamp.hpp"
-
 #include "duckdb.hpp"
 #ifndef DUCKDB_AMALGAMATION
 #include "duckdb/common/types/blob.hpp"
 #include "duckdb/common/types/value.hpp"
-
+#include "duckdb/common/types/time.hpp"
 #endif
 
 namespace duckdb {
@@ -155,11 +154,31 @@ Value ParquetStatisticsUtils::ConvertValue(const LogicalType &type,
 		return Value::DATE(date_t(Load<int32_t>((data_ptr_t)stats.c_str())));
 	case LogicalTypeId::TIME:
 	case LogicalTypeId::TIME_TZ: {
-		if (stats.size() != sizeof(int64_t)) {
+		int64_t val;
+		if (stats.size() == sizeof(int32_t)) {
+			val = Load<int32_t>((data_ptr_t)stats.c_str());
+		} else if (stats.size() == sizeof(int64_t)) {
+			val = Load<int64_t>((data_ptr_t)stats.c_str());
+		} else {
 			throw InternalException("Incorrect stats size for type TIME");
 		}
-		return Value::TIME(dtime_t(Load<int64_t>((data_ptr_t)stats.c_str())));
-
+		if (schema_ele.__isset.logicalType && schema_ele.logicalType.__isset.TIME) {
+			// logical type
+			if (schema_ele.logicalType.TIME.unit.__isset.MILLIS) {
+				return Value::TIME(Time::FromTimeMs(val));
+			} else if (schema_ele.logicalType.TIME.unit.__isset.NANOS) {
+				return Value::TIME(Time::FromTimeNs(val));
+			} else if (schema_ele.logicalType.TIME.unit.__isset.MICROS) {
+				return Value::TIME(dtime_t(val));
+			} else {
+				throw InternalException("Time logicalType is set but unit is not defined");
+			}
+		}
+		if (schema_ele.converted_type == duckdb_parquet::format::ConvertedType::TIME_MILLIS) {
+			return Value::TIME(Time::FromTimeMs(val));
+		} else {
+			return Value::TIME(dtime_t(val));
+		}
 	}
 	case LogicalTypeId::TIMESTAMP:
 	case LogicalTypeId::TIMESTAMP_TZ: {
package/src/duckdb/src/common/string_util.cpp
CHANGED
@@ -11,9 +11,23 @@
 #include <sstream>
 #include <stdarg.h>
 #include <string.h>
+#include <random>
 
 namespace duckdb {
 
+string StringUtil::GenerateRandomName(idx_t length) {
+	std::random_device rd;
+	std::mt19937 gen(rd());
+	std::uniform_int_distribution<> dis(0, 15);
+
+	std::stringstream ss;
+	ss << std::hex;
+	for (idx_t i = 0; i < length; i++) {
+		ss << dis(gen);
+	}
+	return ss.str();
+}
+
 bool StringUtil::Contains(const string &haystack, const string &needle) {
 	return (haystack.find(needle) != string::npos);
 }
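The new helper emits one hex digit per iteration, so callers get a lowercase hexadecimal string of exactly `length` characters, reseeded from std::random_device on every call. A minimal usage sketch against the declaration this release adds to string_util.hpp:

#include <iostream>
#include "duckdb/common/string_util.hpp"

int main() {
	// Two calls will almost certainly differ,
	// e.g. "3fa9c04d17b2e86a" then "b01c77d2a94e03f5" (illustrative values).
	auto a = duckdb::StringUtil::GenerateRandomName(16);
	auto b = duckdb::StringUtil::GenerateRandomName(16);
	std::cout << a << "\n" << b << "\n";
	return 0;
}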
package/src/duckdb/src/execution/aggregate_hashtable.cpp
CHANGED
@@ -21,9 +21,9 @@ using ValidityBytes = RowLayout::ValidityBytes;
 GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, Allocator &allocator,
                                                      vector<LogicalType> group_types, vector<LogicalType> payload_types,
                                                      const vector<BoundAggregateExpression *> &bindings,
-                                                     HtEntryType entry_type)
+                                                     HtEntryType entry_type, idx_t initial_capacity)
     : GroupedAggregateHashTable(context, allocator, std::move(group_types), std::move(payload_types),
-                                AggregateObject::CreateAggregateObjects(bindings), entry_type) {
+                                AggregateObject::CreateAggregateObjects(bindings), entry_type, initial_capacity) {
 }
 
 GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, Allocator &allocator,
@@ -31,17 +31,19 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
     : GroupedAggregateHashTable(context, allocator, std::move(group_types), {}, vector<AggregateObject>()) {
 }
 
+AggregateHTAppendState::AggregateHTAppendState()
+    : ht_offsets(LogicalTypeId::BIGINT), hash_salts(LogicalTypeId::SMALLINT),
+      group_compare_vector(STANDARD_VECTOR_SIZE), no_match_vector(STANDARD_VECTOR_SIZE),
+      empty_vector(STANDARD_VECTOR_SIZE), new_groups(STANDARD_VECTOR_SIZE), addresses(LogicalType::POINTER) {
+}
+
 GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, Allocator &allocator,
                                                      vector<LogicalType> group_types_p,
                                                      vector<LogicalType> payload_types_p,
                                                      vector<AggregateObject> aggregate_objects_p,
-                                                     HtEntryType entry_type)
+                                                     HtEntryType entry_type, idx_t initial_capacity)
     : BaseAggregateHashTable(context, allocator, aggregate_objects_p, std::move(payload_types_p)),
-      entry_type(entry_type), capacity(0), entries(0), payload_page_offset(0), is_finalized(false),
-      ht_offsets(LogicalTypeId::BIGINT), hash_salts(LogicalTypeId::SMALLINT),
-      group_compare_vector(STANDARD_VECTOR_SIZE), no_match_vector(STANDARD_VECTOR_SIZE),
-      empty_vector(STANDARD_VECTOR_SIZE) {
-
+      entry_type(entry_type), capacity(0), entries(0), payload_page_offset(0), is_finalized(false) {
 	// Append hash column to the end and initialise the row layout
 	group_types_p.emplace_back(LogicalType::HASH);
 	layout.Initialize(std::move(group_types_p), std::move(aggregate_objects_p));
@@ -59,12 +61,12 @@ GroupedAggregateHashTable::GroupedAggregateHashTable(ClientContext &context, All
 	switch (entry_type) {
 	case HtEntryType::HT_WIDTH_64: {
 		hash_prefix_shift = (HASH_WIDTH - sizeof(aggr_ht_entry_64::salt)) * 8;
-		Resize<aggr_ht_entry_64>(STANDARD_VECTOR_SIZE * 2ULL);
+		Resize<aggr_ht_entry_64>(initial_capacity);
 		break;
 	}
 	case HtEntryType::HT_WIDTH_32: {
 		hash_prefix_shift = (HASH_WIDTH - sizeof(aggr_ht_entry_32::salt)) * 8;
-		Resize<aggr_ht_entry_32>(STANDARD_VECTOR_SIZE * 2ULL);
+		Resize<aggr_ht_entry_32>(initial_capacity);
 		break;
 	}
 	default:
@@ -155,6 +157,10 @@ void GroupedAggregateHashTable::VerifyInternal() {
 	D_ASSERT(count == entries);
 }
 
+idx_t GroupedAggregateHashTable::InitialCapacity() {
+	return STANDARD_VECTOR_SIZE * 2ULL;
+}
+
 idx_t GroupedAggregateHashTable::GetMaxCapacity(HtEntryType entry_type, idx_t tuple_size) {
 	idx_t max_pages;
 	idx_t max_tuples;
@@ -213,7 +219,6 @@ void GroupedAggregateHashTable::Resize(idx_t size) {
 		hashes_hdl_ptr = hashes_hdl.Ptr();
 	}
 	memset(hashes_hdl_ptr, 0, byte_size);
-	hashes_end_ptr = hashes_hdl_ptr + byte_size;
 	capacity = size;
 
 	auto hashes_arr = (ENTRY *)hashes_hdl_ptr;
@@ -240,7 +245,8 @@ void GroupedAggregateHashTable::Resize(idx_t size) {
 	Verify();
 }
 
-idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, DataChunk &payload, AggregateType filter) {
+idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChunk &groups, DataChunk &payload,
+                                          AggregateType filter) {
 	vector<idx_t> aggregate_filter;
 
 	auto &aggregates = layout.GetAggregates();
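With InitialCapacity() above and ResizeThreshold() (added a few hunks below), the growth policy reads: start at twice the vector size, and double whenever the incoming group chunk no longer fits or the entry count crosses capacity divided by the load factor. A self-contained trace of that rule, with STANDARD_VECTOR_SIZE and LOAD_FACTOR values assumed purely for illustration (the real constants live in the DuckDB headers):

#include <cstdint>
#include <iostream>

int main() {
	const uint64_t kVectorSize = 2048;   // assumed STANDARD_VECTOR_SIZE
	const uint64_t kLoadFactor = 2;      // assumed LOAD_FACTOR
	uint64_t capacity = kVectorSize * 2; // InitialCapacity()
	uint64_t entries = 0;
	for (uint64_t added = 0; added < 10000; ++added) {
		// mirrors: capacity - entries <= groups.size() || entries > ResizeThreshold()
		if (capacity - entries <= 1 || entries > capacity / kLoadFactor) {
			capacity *= 2; // Resize<ENTRY>(capacity * 2)
		}
		++entries;
	}
	std::cout << "entries=" << entries << " capacity=" << capacity << "\n";
	// prints: entries=10000 capacity=32768
	return 0;
}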
@@ -250,34 +256,32 @@ idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, DataChunk &payload,
 			aggregate_filter.push_back(i);
 		}
 	}
-	return AddChunk(groups, payload, aggregate_filter);
+	return AddChunk(state, groups, payload, aggregate_filter);
 }
 
-idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, DataChunk &payload, const vector<idx_t> &filter) {
+idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChunk &groups, DataChunk &payload,
+                                          const vector<idx_t> &filter) {
 	Vector hashes(LogicalType::HASH);
 	groups.Hash(hashes);
 
-	return AddChunk(groups, hashes, payload, filter);
+	return AddChunk(state, groups, hashes, payload, filter);
 }
 
-idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, Vector &group_hashes,
-                                          DataChunk &payload, const vector<idx_t> &filter) {
+idx_t GroupedAggregateHashTable::AddChunk(AggregateHTAppendState &state, DataChunk &groups, Vector &group_hashes,
+                                          DataChunk &payload, const vector<idx_t> &filter) {
 	D_ASSERT(!is_finalized);
 
 	if (groups.size() == 0) {
 		return 0;
 	}
-	// dummy
-	SelectionVector new_groups(STANDARD_VECTOR_SIZE);
 
 	D_ASSERT(groups.ColumnCount() + 1 == layout.ColumnCount());
 	for (idx_t i = 0; i < groups.ColumnCount(); i++) {
 		D_ASSERT(groups.GetTypes()[i] == layout.GetTypes()[i]);
 	}
 
-	Vector addresses(LogicalType::POINTER);
-	auto new_group_count = FindOrCreateGroups(groups, group_hashes, addresses, new_groups);
-	VectorOperations::AddInPlace(addresses, layout.GetAggrOffset(), payload.size());
+	auto new_group_count = FindOrCreateGroups(state, groups, group_hashes, state.addresses, state.new_groups);
+	VectorOperations::AddInPlace(state.addresses, layout.GetAggrOffset(), payload.size());
 
 	// now every cell has an entry
 	// update the aggregates
@@ -290,20 +294,21 @@ idx_t GroupedAggregateHashTable::AddChunk(DataChunk &groups, Vector &group_hashe
 		if (filter_idx >= filter.size() || i < filter[filter_idx]) {
 			// Skip all the aggregates that are not in the filter
 			payload_idx += aggr.child_count;
-			VectorOperations::AddInPlace(addresses, aggr.payload_size, payload.size());
+			VectorOperations::AddInPlace(state.addresses, aggr.payload_size, payload.size());
 			continue;
 		}
 		D_ASSERT(i == filter[filter_idx]);
 
 		if (aggr.aggr_type != AggregateType::DISTINCT && aggr.filter) {
-			RowOperations::UpdateFilteredStates(filter_set.GetFilterData(i), aggr, addresses, payload, payload_idx);
+			RowOperations::UpdateFilteredStates(filter_set.GetFilterData(i), aggr, state.addresses, payload,
+			                                    payload_idx);
 		} else {
-			RowOperations::UpdateStates(aggr, addresses, payload, payload_idx, payload.size());
+			RowOperations::UpdateStates(aggr, state.addresses, payload, payload_idx, payload.size());
 		}
 
 		// move to the next aggregate
 		payload_idx += aggr.child_count;
-		VectorOperations::AddInPlace(addresses, aggr.payload_size, payload.size());
+		VectorOperations::AddInPlace(state.addresses, aggr.payload_size, payload.size());
 		filter_idx++;
 	}
 
@@ -321,16 +326,23 @@ void GroupedAggregateHashTable::FetchAggregates(DataChunk &groups, DataChunk &re
 	if (groups.size() == 0) {
 		return;
 	}
+
 	// find the groups associated with the addresses
 	// FIXME: this should not use the FindOrCreateGroups, creating them is unnecessary
+	AggregateHTAppendState append_state;
 	Vector addresses(LogicalType::POINTER);
-	FindOrCreateGroups(groups, addresses);
+	FindOrCreateGroups(append_state, groups, addresses);
 	// now fetch the aggregates
 	RowOperations::FinalizeStates(layout, addresses, result, 0);
 }
 
+idx_t GroupedAggregateHashTable::ResizeThreshold() {
+	return capacity / LOAD_FACTOR;
+}
+
 template <class ENTRY>
-idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, Vector &group_hashes, Vector &addresses,
+idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(AggregateHTAppendState &state, DataChunk &groups,
+                                                            Vector &group_hashes, Vector &addresses,
                                                             SelectionVector &new_groups_out) {
 	D_ASSERT(!is_finalized);
 
@@ -339,7 +351,7 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V
 	}
 
 	// resize at 50% capacity, also need to fit the entire vector
-	if (capacity - entries <= groups.size() || entries > capacity / LOAD_FACTOR) {
+	if (capacity - entries <= groups.size() || entries > ResizeThreshold()) {
 		Resize<ENTRY>(capacity * 2);
 	}
 
@@ -352,42 +364,47 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V
 	group_hashes.Flatten(groups.size());
 	auto group_hashes_ptr = FlatVector::GetData<hash_t>(group_hashes);
 
-	D_ASSERT(ht_offsets.GetVectorType() == VectorType::FLAT_VECTOR);
-	D_ASSERT(ht_offsets.GetType() == LogicalType::BIGINT);
+	D_ASSERT(state.ht_offsets.GetVectorType() == VectorType::FLAT_VECTOR);
+	D_ASSERT(state.ht_offsets.GetType() == LogicalType::BIGINT);
 
 	D_ASSERT(addresses.GetType() == LogicalType::POINTER);
 	addresses.Flatten(groups.size());
 	auto addresses_ptr = FlatVector::GetData<data_ptr_t>(addresses);
 
-	// compute the entry in the table based on the hash using a modulo
-	UnaryExecutor::Execute<hash_t, uint64_t>(group_hashes, ht_offsets, groups.size(), [&](hash_t element) {
+	// compute the entry in the table based on the hash using a modulo
+	// and precompute the hash salts for faster comparison below
+	D_ASSERT(state.hash_salts.GetType() == LogicalType::SMALLINT);
+	auto ht_offsets_ptr = FlatVector::GetData<uint64_t>(state.ht_offsets);
+	auto hash_salts_ptr = FlatVector::GetData<uint16_t>(state.hash_salts);
+	for (idx_t r = 0; r < groups.size(); r++) {
+		auto element = group_hashes_ptr[r];
 		D_ASSERT((element & bitmask) == (element % capacity));
-		return (element & bitmask);
-	});
-	auto ht_offsets_ptr = FlatVector::GetData<uint64_t>(ht_offsets);
-
-	// precompute the hash salts for faster comparison below
-	D_ASSERT(hash_salts.GetType() == LogicalType::SMALLINT);
-	UnaryExecutor::Execute<hash_t, uint16_t>(group_hashes, hash_salts, groups.size(),
-	                                         [&](hash_t element) { return (element >> hash_prefix_shift); });
-	auto hash_salts_ptr = FlatVector::GetData<uint16_t>(hash_salts);
-
+		ht_offsets_ptr[r] = element & bitmask;
+		hash_salts_ptr[r] = element >> hash_prefix_shift;
+	}
 	// we start out with all entries [0, 1, 2, ..., groups.size()]
 	const SelectionVector *sel_vector = FlatVector::IncrementalSelectionVector();
 
 	idx_t remaining_entries = groups.size();
 
 	// make a chunk that references the groups and the hashes
-	DataChunk group_chunk;
-	group_chunk.InitializeEmpty(layout.GetTypes());
+	if (state.group_chunk.ColumnCount() == 0) {
+		state.group_chunk.InitializeEmpty(layout.GetTypes());
+	}
+	D_ASSERT(state.group_chunk.ColumnCount() == layout.GetTypes().size());
 	for (idx_t grp_idx = 0; grp_idx < groups.ColumnCount(); grp_idx++) {
-		group_chunk.data[grp_idx].Reference(groups.data[grp_idx]);
+		state.group_chunk.data[grp_idx].Reference(groups.data[grp_idx]);
 	}
-	group_chunk.data[groups.ColumnCount()].Reference(group_hashes);
-	group_chunk.SetCardinality(groups);
+	state.group_chunk.data[groups.ColumnCount()].Reference(group_hashes);
+	state.group_chunk.SetCardinality(groups);
 
 	// convert all vectors to unified format
-	auto group_data = group_chunk.ToUnifiedFormat();
+	if (!state.group_data) {
+		state.group_data = unique_ptr<UnifiedVectorFormat[]>(new UnifiedVectorFormat[state.group_chunk.ColumnCount()]);
+	}
+	for (idx_t col_idx = 0; col_idx < state.group_chunk.ColumnCount(); col_idx++) {
+		state.group_chunk.data[col_idx].ToUnifiedFormat(state.group_chunk.size(), state.group_data[col_idx]);
+	}
 
 	idx_t new_group_count = 0;
 	while (remaining_entries > 0) {
@@ -420,7 +437,7 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V
 				ht_entry_ptr->page_offset = payload_page_offset++;
 
 				// update selection lists for outer loops
-				empty_vector.set_index(new_entry_count++, index);
+				state.empty_vector.set_index(new_entry_count++, index);
 				new_groups_out.set_index(new_group_count++, index);
 				entries++;
 
@@ -430,37 +447,37 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V
 			// cell is occupied: add to check list
 			// only need to check if hash salt in ptr == prefix of hash in payload
 			if (ht_entry_ptr->salt == hash_salts_ptr[index]) {
-				group_compare_vector.set_index(need_compare_count++, index);
+				state.group_compare_vector.set_index(need_compare_count++, index);
 
 				auto page_ptr = payload_hds_ptrs[ht_entry_ptr->page_nr - 1];
 				auto page_offset = ht_entry_ptr->page_offset * tuple_size;
 				addresses_ptr[index] = page_ptr + page_offset;
 
 			} else {
-				no_match_vector.set_index(no_match_count++, index);
+				state.no_match_vector.set_index(no_match_count++, index);
 			}
 		}
 	}
 
 	// for each of the locations that are empty, serialize the group columns to the locations
-	RowOperations::Scatter(group_chunk, group_data.get(), layout, addresses, *string_heap, empty_vector,
-	                       new_entry_count);
-	RowOperations::InitializeStates(layout, addresses, empty_vector, new_entry_count);
+	RowOperations::Scatter(state.group_chunk, state.group_data.get(), layout, addresses, *string_heap,
+	                       state.empty_vector, new_entry_count);
+	RowOperations::InitializeStates(layout, addresses, state.empty_vector, new_entry_count);
 
 	// now we have only the tuples remaining that might match to an existing group
 	// start performing comparisons with each of the groups
-	RowOperations::Match(group_chunk, group_data.get(), layout, addresses, predicates, group_compare_vector,
-	                     need_compare_count, &no_match_vector, no_match_count);
+	RowOperations::Match(state.group_chunk, state.group_data.get(), layout, addresses, predicates,
+	                     state.group_compare_vector, need_compare_count, &state.no_match_vector, no_match_count);
 
 	// each of the entries that do not match we move them to the next entry in the HT
 	for (idx_t i = 0; i < no_match_count; i++) {
-		idx_t index = no_match_vector.get_index(i);
+		idx_t index = state.no_match_vector.get_index(i);
 		ht_offsets_ptr[index]++;
 		if (ht_offsets_ptr[index] >= capacity) {
 			ht_offsets_ptr[index] = 0;
 		}
 	}
-	sel_vector = &no_match_vector;
+	sel_vector = &state.no_match_vector;
 	remaining_entries = no_match_count;
 }
@@ -469,29 +486,30 @@ idx_t GroupedAggregateHashTable::FindOrCreateGroupsInternal(DataChunk &groups, V
 
 // this is to support distinct aggregations where we need to record whether we
 // have already seen a value for a group
-idx_t GroupedAggregateHashTable::FindOrCreateGroups(DataChunk &groups, Vector &group_hashes, Vector &addresses_out,
+idx_t GroupedAggregateHashTable::FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups,
+                                                    Vector &group_hashes, Vector &addresses_out,
                                                     SelectionVector &new_groups_out) {
 	switch (entry_type) {
 	case HtEntryType::HT_WIDTH_64:
-		return FindOrCreateGroupsInternal<aggr_ht_entry_64>(groups, group_hashes, addresses_out, new_groups_out);
+		return FindOrCreateGroupsInternal<aggr_ht_entry_64>(state, groups, group_hashes, addresses_out, new_groups_out);
 	case HtEntryType::HT_WIDTH_32:
-		return FindOrCreateGroupsInternal<aggr_ht_entry_32>(groups, group_hashes, addresses_out, new_groups_out);
+		return FindOrCreateGroupsInternal<aggr_ht_entry_32>(state, groups, group_hashes, addresses_out, new_groups_out);
 	default:
 		throw InternalException("Unknown HT entry width");
 	}
 }
 
-void GroupedAggregateHashTable::FindOrCreateGroups(DataChunk &groups, Vector &addresses) {
+void GroupedAggregateHashTable::FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups,
+                                                   Vector &addresses) {
 	// create a dummy new_groups sel vector
-	SelectionVector new_groups(STANDARD_VECTOR_SIZE);
-	FindOrCreateGroups(groups, addresses, new_groups);
+	FindOrCreateGroups(state, groups, addresses, state.new_groups);
 }
 
-idx_t GroupedAggregateHashTable::FindOrCreateGroups(DataChunk &groups, Vector &addresses_out,
-                                                    SelectionVector &new_groups_out) {
+idx_t GroupedAggregateHashTable::FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups,
+                                                    Vector &addresses_out, SelectionVector &new_groups_out) {
 	Vector hashes(LogicalType::HASH);
 	groups.Hash(hashes);
-	return FindOrCreateGroups(groups, hashes, addresses_out, new_groups_out);
+	return FindOrCreateGroups(state, groups, hashes, addresses_out, new_groups_out);
 }
 
 struct FlushMoveState {
@@ -521,7 +539,8 @@ void GroupedAggregateHashTable::FlushMove(FlushMoveState &state, Vector &source_
 		                       *FlatVector::IncrementalSelectionVector(), count, layout, col_no);
 	}
 
-	FindOrCreateGroups(state.groups, source_hashes, state.group_addresses, state.new_groups_sel);
+	AggregateHTAppendState append_state;
+	FindOrCreateGroups(append_state, state.groups, source_hashes, state.group_addresses, state.new_groups_sel);
 
 	RowOperations::CombineStates(layout, source_addresses, state.group_addresses, count);
 }
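The common thread of the hashtable hunks above: every scratch buffer the append path previously allocated per call, or parked on the hash table itself, now lives in a caller-owned AggregateHTAppendState, so one state object can be reused across many AddChunk calls. A sketch of the resulting calling pattern using the signatures from this diff; NextChunk, ht, and aggregate_filter are hypothetical stand-ins for the caller's own plumbing:

// One append state per producer, reused across every AddChunk call, so the
// selection vectors, offset buffers, and group_chunk inside it are allocated once.
duckdb::AggregateHTAppendState append_state;
duckdb::DataChunk groups, payload;
while (NextChunk(groups, payload)) { // hypothetical chunk source
	ht.AddChunk(append_state, groups, payload, aggregate_filter);
}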
package/src/duckdb/src/execution/join_hashtable.cpp
CHANGED
@@ -219,7 +219,9 @@ void JoinHashTable::Build(DataChunk &keys, DataChunk &payload) {
 	}
 	info.correlated_payload.SetCardinality(keys);
 	info.correlated_payload.data[0].Reference(keys.data[info.correlated_types.size()]);
-	info.correlated_counts->AddChunk(info.group_chunk, info.correlated_payload, AggregateType::NON_DISTINCT);
+	AggregateHTAppendState append_state;
+	info.correlated_counts->AddChunk(append_state, info.group_chunk, info.correlated_payload,
+	                                 AggregateType::NON_DISTINCT);
 }
 
 // prepare the keys for processing
package/src/duckdb/src/execution/operator/set/physical_recursive_cte.cpp
CHANGED
@@ -42,6 +42,7 @@ public:
 	bool initialized = false;
 	bool finished_scan = false;
 	SelectionVector new_groups;
+	AggregateHTAppendState append_state;
 };
 
 unique_ptr<GlobalSinkState> PhysicalRecursiveCTE::GetGlobalSinkState(ClientContext &context) const {
@@ -52,7 +53,7 @@ idx_t PhysicalRecursiveCTE::ProbeHT(DataChunk &chunk, RecursiveCTEState &state)
 	Vector dummy_addresses(LogicalType::POINTER);
 
 	// Use the HT to eliminate duplicate rows
-	idx_t new_group_count = state.ht->FindOrCreateGroups(chunk, dummy_addresses, state.new_groups);
+	idx_t new_group_count = state.ht->FindOrCreateGroups(state.append_state, chunk, dummy_addresses, state.new_groups);
 
 	// we only return entries we have not seen before (i.e. new groups)
 	chunk.Slice(state.new_groups, new_group_count);
package/src/duckdb/src/execution/partitionable_hashtable.cpp
CHANGED
@@ -80,15 +80,17 @@ idx_t PartitionableHashTable::ListAddChunk(HashTableList &list, DataChunk &group
                                            DataChunk &payload, const vector<idx_t> &filter) {
 	// If this is false, a single AddChunk would overflow the max capacity
 	D_ASSERT(list.empty() || groups.size() <= list.back()->MaxCapacity());
-	if (list.empty() || list.back()->Size() + groups.size() > list.back()->MaxCapacity()) {
+	if (list.empty() || list.back()->Size() + groups.size() >= list.back()->MaxCapacity()) {
+		idx_t new_capacity = GroupedAggregateHashTable::InitialCapacity();
 		if (!list.empty()) {
+			new_capacity = list.back()->Capacity();
 			// early release first part of ht and prevent adding of more data
 			list.back()->Finalize();
 		}
 		list.push_back(make_unique<GroupedAggregateHashTable>(context, allocator, group_types, payload_types, bindings,
-		                                                      GetHTEntrySize()));
+		                                                      GetHTEntrySize(), new_capacity));
 	}
-	return list.back()->AddChunk(groups, group_hashes, payload, filter);
+	return list.back()->AddChunk(append_state, groups, group_hashes, payload, filter);
 }
 
 idx_t PartitionableHashTable::AddChunk(DataChunk &groups, DataChunk &payload, bool do_partition,
@@ -150,6 +152,7 @@ void PartitionableHashTable::Partition() {
 	D_ASSERT(partition_info.n_partitions > 1);
 
 	vector<GroupedAggregateHashTable *> partition_hts(partition_info.n_partitions);
+	radix_partitioned_hts.resize(partition_info.n_partitions);
 	for (auto &unpartitioned_ht : unpartitioned_hts) {
 		for (idx_t r = 0; r < partition_info.n_partitions; r++) {
 			radix_partitioned_hts[r].push_back(make_unique<GroupedAggregateHashTable>(
@@ -181,7 +184,7 @@ HashTableList PartitionableHashTable::GetUnpartitioned() {
 void PartitionableHashTable::Finalize() {
 	if (IsPartitioned()) {
 		for (auto &ht_list : radix_partitioned_hts) {
-			for (auto &ht : ht_list.second) {
+			for (auto &ht : ht_list) {
 				D_ASSERT(ht);
 				ht->Finalize();
 			}
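Seeding each successor table with list.back()->Capacity() matters because these hash tables only grow by doubling; without the carried-over value, every overflow table would restart at InitialCapacity() and replay the same resize sequence. A small standalone illustration of the cost avoided (the concrete sizes are assumed for the example):

#include <cstdint>
#include <iostream>

// Number of doublings needed to grow from `start` to at least `target`.
static int DoublingsToReach(uint64_t start, uint64_t target) {
	int n = 0;
	while (start < target) {
		start *= 2;
		++n;
	}
	return n;
}

int main() {
	const uint64_t initial = 4096;          // assumed InitialCapacity()
	const uint64_t reached = 4096ULL << 10; // capacity the previous HT grew to
	std::cout << DoublingsToReach(initial, reached) << "\n"; // 10 resizes replayed
	std::cout << DoublingsToReach(reached, reached) << "\n"; // 0 with the new seeding
	return 0;
}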
package/src/duckdb/src/execution/radix_partitioned_hashtable.cpp
CHANGED
@@ -78,6 +78,7 @@ public:
 	bool is_partitioned = false;
 
 	RadixPartitionInfo partition_info;
+	AggregateHTAppendState append_state;
 };
 
 class RadixHTLocalState : public LocalSinkState {
@@ -151,7 +152,8 @@ void RadixPartitionedHashTable::Sink(ExecutionContext &context, GlobalSinkState
 		}
 		D_ASSERT(gstate.finalized_hts.size() == 1);
 		D_ASSERT(gstate.finalized_hts[0]);
-		llstate.total_groups += gstate.finalized_hts[0]->AddChunk(group_chunk, payload_input, filter);
+		llstate.total_groups +=
+		    gstate.finalized_hts[0]->AddChunk(gstate.append_state, group_chunk, payload_input, filter);
 		return;
 	}
 
@@ -194,15 +196,13 @@ void RadixPartitionedHashTable::Combine(ExecutionContext &context, GlobalSinkSta
 		llstate.ht->Partition();
 	}
 
-	lock_guard<mutex> glock(gstate.lock);
+	// we will never add new values to these HTs so we can drop the first part of the HT
+	llstate.ht->Finalize();
 
+	lock_guard<mutex> glock(gstate.lock);
 	if (!llstate.is_empty) {
 		gstate.is_empty = false;
 	}
-
-	// we will never add new values to these HTs so we can drop the first part of the HT
-	llstate.ht->Finalize();
-
 	// at this point we just collect them the PhysicalHashAggregateFinalizeTask (below) will merge them in parallel
 	gstate.intermediate_hts.push_back(std::move(llstate.ht));
 }
package/src/duckdb/src/function/aggregate/sorted_aggregate_function.cpp
CHANGED
@@ -10,25 +10,29 @@
 namespace duckdb {
 
 struct SortedAggregateBindData : public FunctionData {
-	SortedAggregateBindData(ClientContext &context, const AggregateFunction &function_p,
-	                        const vector<unique_ptr<Expression>> &children, unique_ptr<FunctionData> bind_info_p,
-	                        const BoundOrderModifier &order_bys)
-	    : buffer_manager(BufferManager::GetBufferManager(context)), function(function_p),
-	      bind_info(std::move(bind_info_p)) {
+	SortedAggregateBindData(ClientContext &context, BoundAggregateExpression &expr)
+	    : buffer_manager(BufferManager::GetBufferManager(context)), function(expr.function),
+	      bind_info(std::move(expr.bind_info)) {
+		auto &children = expr.children;
 		arg_types.reserve(children.size());
 		for (const auto &child : children) {
 			arg_types.emplace_back(child->return_type);
 		}
+		auto &order_bys = *expr.order_bys;
 		sort_types.reserve(order_bys.orders.size());
 		for (auto &order : order_bys.orders) {
 			orders.emplace_back(order.Copy());
 			sort_types.emplace_back(order.expression->return_type);
 		}
+		sorted_on_args = (children.size() == order_bys.orders.size());
+		for (size_t i = 0; sorted_on_args && i < children.size(); ++i) {
+			sorted_on_args = children[i]->Equals(order_bys.orders[i].expression.get());
+		}
 	}
 
 	SortedAggregateBindData(const SortedAggregateBindData &other)
 	    : buffer_manager(other.buffer_manager), function(other.function), arg_types(other.arg_types),
-	      sort_types(other.sort_types) {
+	      sort_types(other.sort_types), sorted_on_args(other.sorted_on_args) {
 		if (other.bind_info) {
 			bind_info = other.bind_info->Copy();
 		}
@@ -71,13 +75,14 @@ struct SortedAggregateBindData : public FunctionData {
 
 	vector<BoundOrderByNode> orders;
 	vector<LogicalType> sort_types;
+	bool sorted_on_args;
 };
 
 struct SortedAggregateState {
 	//! Default buffer size, optimised for small group to avoid blowing out memory.
 	static const idx_t BUFFER_CAPACITY = 16;
 
-	SortedAggregateState() : nsel(0) {
+	SortedAggregateState() : nsel(0), offset(0) {
 	}
 
 	static inline void InitializeBuffer(DataChunk &chunk, const vector<LogicalType> &types) {
@@ -103,23 +108,31 @@ struct SortedAggregateState {
 		ordering->Append(sort_buffer);
 		ResetBuffer(sort_buffer, order_bind.sort_types);
 
-		arguments = make_unique<ColumnDataCollection>(order_bind.buffer_manager, order_bind.arg_types);
-		InitializeBuffer(arg_buffer, order_bind.arg_types);
-		arguments->Append(arg_buffer);
-		ResetBuffer(arg_buffer, order_bind.arg_types);
+		if (!order_bind.sorted_on_args) {
+			arguments = make_unique<ColumnDataCollection>(order_bind.buffer_manager, order_bind.arg_types);
+			InitializeBuffer(arg_buffer, order_bind.arg_types);
+			arguments->Append(arg_buffer);
+			ResetBuffer(arg_buffer, order_bind.arg_types);
+		}
 	}
 
 	void Update(SortedAggregateBindData &order_bind, DataChunk &sort_chunk, DataChunk &arg_chunk) {
 		// Lazy instantiation of the buffer chunks
 		InitializeBuffer(sort_buffer, order_bind.sort_types);
-		InitializeBuffer(arg_buffer, order_bind.arg_types);
+		if (!order_bind.sorted_on_args) {
+			InitializeBuffer(arg_buffer, order_bind.arg_types);
+		}
 
 		if (sort_chunk.size() + sort_buffer.size() > STANDARD_VECTOR_SIZE) {
 			Flush(order_bind);
 		}
-		if (ordering) {
+		if (arguments) {
 			ordering->Append(sort_chunk);
 			arguments->Append(arg_chunk);
+		} else if (ordering) {
+			ordering->Append(sort_chunk);
+		} else if (order_bind.sorted_on_args) {
+			sort_buffer.Append(sort_chunk, true);
 		} else {
 			sort_buffer.Append(sort_chunk, true);
 			arg_buffer.Append(arg_chunk, true);
@@ -129,12 +142,14 @@ struct SortedAggregateState {
 	void UpdateSlice(SortedAggregateBindData &order_bind, DataChunk &sort_inputs, DataChunk &arg_inputs) {
 		// Lazy instantiation of the buffer chunks
 		InitializeBuffer(sort_buffer, order_bind.sort_types);
-		InitializeBuffer(arg_buffer, order_bind.arg_types);
+		if (!order_bind.sorted_on_args) {
+			InitializeBuffer(arg_buffer, order_bind.arg_types);
+		}
 
 		if (nsel + sort_buffer.size() > STANDARD_VECTOR_SIZE) {
 			Flush(order_bind);
 		}
-		if (ordering) {
+		if (arguments) {
 			sort_buffer.Reset();
 			sort_buffer.Slice(sort_inputs, sel, nsel);
 			ordering->Append(sort_buffer);
@@ -142,27 +157,38 @@ struct SortedAggregateState {
 			arg_buffer.Reset();
 			arg_buffer.Slice(arg_inputs, sel, nsel);
 			arguments->Append(arg_buffer);
+		} else if (ordering) {
+			sort_buffer.Reset();
+			sort_buffer.Slice(sort_inputs, sel, nsel);
+			ordering->Append(sort_buffer);
+		} else if (order_bind.sorted_on_args) {
+			sort_buffer.Append(sort_inputs, true, &sel, nsel);
 		} else {
 			sort_buffer.Append(sort_inputs, true, &sel, nsel);
 			arg_buffer.Append(arg_inputs, true, &sel, nsel);
 		}
 
 		nsel = 0;
+		offset = 0;
 	}
 
 	void Combine(SortedAggregateBindData &order_bind, SortedAggregateState &other) {
-		if (other.ordering) {
-			// Force CDC if the other has it
+		if (other.arguments) {
+			// Force CDC if the other has it
 			Flush(order_bind);
 			ordering->Combine(*other.ordering);
 			arguments->Combine(*other.arguments);
+		} else if (other.ordering) {
+			// Force CDC if the other has it
+			Flush(order_bind);
+			ordering->Combine(*other.ordering);
 		} else if (other.sort_buffer.size()) {
 			Update(order_bind, other.sort_buffer, other.arg_buffer);
 		}
 	}
 
-	void Finalize(LocalSortState &local_sort) {
-		if (ordering) {
+	void Finalize(SortedAggregateBindData &order_bind, LocalSortState &local_sort) {
+		if (arguments) {
 			ColumnDataScanState sort_state;
 			ordering->InitializeScan(sort_state);
 			ColumnDataScanState arg_state;
@@ -174,6 +200,15 @@ struct SortedAggregateState {
 			}
 			ordering->Reset();
 			arguments->Reset();
+		} else if (ordering) {
+			ColumnDataScanState sort_state;
+			ordering->InitializeScan(sort_state);
+			for (sort_buffer.Reset(); ordering->Scan(sort_state, sort_buffer); sort_buffer.Reset()) {
+				local_sort.SinkChunk(sort_buffer, sort_buffer);
+			}
+			ordering->Reset();
+		} else if (order_bind.sorted_on_args) {
+			local_sort.SinkChunk(sort_buffer, sort_buffer);
 		} else {
 			local_sort.SinkChunk(sort_buffer, arg_buffer);
 		}
@@ -188,6 +223,7 @@ struct SortedAggregateState {
 	// Selection for scattering
 	SelectionVector sel;
 	idx_t nsel;
+	idx_t offset;
 };
 
 struct SortedAggregateFunction {
@@ -205,11 +241,13 @@ struct SortedAggregateFunction {
 	                        DataChunk &arg_chunk, DataChunk &sort_chunk) {
 		idx_t col = 0;
 
-		arg_chunk.InitializeEmpty(order_bind->arg_types);
-		for (auto &dst : arg_chunk.data) {
-			dst.Reference(inputs[col++]);
+		if (!order_bind->sorted_on_args) {
+			arg_chunk.InitializeEmpty(order_bind->arg_types);
+			for (auto &dst : arg_chunk.data) {
+				dst.Reference(inputs[col++]);
+			}
+			arg_chunk.SetCardinality(count);
 		}
-		arg_chunk.SetCardinality(count);
 
 		sort_chunk.InitializeEmpty(order_bind->sort_types);
 		for (auto &dst : sort_chunk.data) {
@@ -246,15 +284,27 @@ struct SortedAggregateFunction {
 		UnifiedVectorFormat svdata;
 		states.ToUnifiedFormat(count, svdata);
 
-		// Build the selection vector for each state.
+		// Size the selection vector for each state.
 		auto sdata = (SortedAggregateState **)svdata.data;
 		for (idx_t i = 0; i < count; ++i) {
 			auto sidx = svdata.sel->get_index(i);
 			auto order_state = sdata[sidx];
-			if (!order_state->sel.data()) {
-				order_state->sel.Initialize();
+			order_state->nsel++;
+		}
+
+		// Build the selection vector for each state.
+		vector<sel_t> sel_data(count);
+		idx_t start = 0;
+		for (idx_t i = 0; i < count; ++i) {
+			auto sidx = svdata.sel->get_index(i);
+			auto order_state = sdata[sidx];
+			if (!order_state->offset) {
+				// First one
+				order_state->offset = start;
+				order_state->sel.Initialize(sel_data.data() + order_state->offset);
+				start += order_state->nsel;
 			}
-			order_state->sel.set_index(order_state->nsel++, sidx);
+			sel_data[order_state->offset++] = sidx;
 		}
 
 		// Append nonempty slices to the arguments
@@ -317,7 +367,7 @@ struct SortedAggregateFunction {
 		auto global_sort = make_unique<GlobalSortState>(buffer_manager, orders, payload_layout);
 		LocalSortState local_sort;
 		local_sort.Initialize(*global_sort, global_sort->buffer_manager);
-		state->Finalize(local_sort);
+		state->Finalize(*order_bind, local_sort);
 		global_sort->AddLocalState(local_sort);
 
 		if (!global_sort->sorted_blocks.empty()) {
@@ -399,12 +449,13 @@ void FunctionBinder::BindSortedAggregate(ClientContext &context, BoundAggregateE
 	auto &bound_function = expr.function;
 	auto &children = expr.children;
 	auto &order_bys = *expr.order_bys;
-	auto sorted_bind = make_unique<SortedAggregateBindData>(context, bound_function, children,
-	                                                        std::move(expr.bind_info), order_bys);
+	auto sorted_bind = make_unique<SortedAggregateBindData>(context, expr);
 
-	// The arguments are the children plus the sort columns.
-	for (auto &order : order_bys.orders) {
-		children.emplace_back(std::move(order.expression));
+	if (!sorted_bind->sorted_on_args) {
+		// The arguments are the children plus the sort columns.
+		for (auto &order : order_bys.orders) {
+			children.emplace_back(std::move(order.expression));
+		}
 	}
 
 	vector<LogicalType> arguments;
package/src/duckdb/src/function/pragma/pragma_queries.cpp
CHANGED
@@ -1,8 +1,11 @@
+#include "duckdb/catalog/catalog_search_path.hpp"
 #include "duckdb/common/constants.hpp"
 #include "duckdb/common/file_system.hpp"
 #include "duckdb/common/string_util.hpp"
 #include "duckdb/function/pragma/pragma_functions.hpp"
 #include "duckdb/main/config.hpp"
+#include "duckdb/main/database_manager.hpp"
+#include "duckdb/main/client_data.hpp"
 #include "duckdb/parser/parser.hpp"
 #include "duckdb/parser/qualified_name.hpp"
 #include "duckdb/parser/statement/copy_statement.hpp"
@@ -15,7 +18,27 @@ string PragmaTableInfo(ClientContext &context, const FunctionParameters &paramet
 }
 
 string PragmaShowTables(ClientContext &context, const FunctionParameters &parameters) {
-	return "SELECT name FROM sqlite_master ORDER BY name";
+	auto catalog = DatabaseManager::GetDefaultDatabase(context);
+	auto schema = ClientData::Get(context).catalog_search_path->GetDefault().schema;
+	schema = (schema == INVALID_SCHEMA) ? DEFAULT_SCHEMA : schema; // NOLINT
+
+	auto where_clause =
+	    StringUtil::Join({"where database_name = '", catalog, "' and schema_name = '", schema, "'"}, "");
+	// clang-format off
+	auto pragma_query = StringUtil::Join(
+	    {"with tables as (",
+	     "   SELECT table_name as name FROM duckdb_tables ", where_clause,
+	     "), views as (",
+	     "   SELECT view_name as name FROM duckdb_views ", where_clause,
+	     "), indexes as (",
+	     "   SELECT index_name as name FROM duckdb_indexes ", where_clause,
+	     "), db_objects as (",
+	     "   SELECT name FROM tables UNION ALL SELECT name FROM views UNION ALL SELECT name FROM indexes",
+	     ") SELECT name FROM db_objects ORDER BY name;"
+	    }, "");
+	// clang-format on
+
+	return pragma_query;
 }
 
 string PragmaShowTablesExpanded(ClientContext &context, const FunctionParameters &parameters) {
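Concatenating the fragments above yields one query over the duckdb_tables, duckdb_views, and duckdb_indexes system views, scoped to the default catalog and schema. For reference, the assembled string, reflowed across lines for readability and with illustrative memory/main values substituted for the runtime lookups:

// Assembled form of the pragma query built above ("memory" and "main" stand in
// for DatabaseManager::GetDefaultDatabase and the search-path default schema).
static const char *kShowTablesQuery = R"SQL(
with tables as (
   SELECT table_name as name FROM duckdb_tables where database_name = 'memory' and schema_name = 'main'
), views as (
   SELECT view_name as name FROM duckdb_views where database_name = 'memory' and schema_name = 'main'
), indexes as (
   SELECT index_name as name FROM duckdb_indexes where database_name = 'memory' and schema_name = 'main'
), db_objects as (
   SELECT name FROM tables UNION ALL SELECT name FROM views UNION ALL SELECT name FROM indexes
) SELECT name FROM db_objects ORDER BY name;
)SQL";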
package/src/duckdb/src/function/table/version/pragma_version.cpp
CHANGED
@@ -1,8 +1,8 @@
 #ifndef DUCKDB_VERSION
-#define DUCKDB_VERSION "0.7.2-dev1188"
+#define DUCKDB_VERSION "0.7.2-dev1238"
 #endif
 #ifndef DUCKDB_SOURCE_ID
-#define DUCKDB_SOURCE_ID "
+#define DUCKDB_SOURCE_ID "4be6bdb565"
 #endif
 #include "duckdb/function/table/system_functions.hpp"
 #include "duckdb/main/database.hpp"
package/src/duckdb/src/include/duckdb/execution/aggregate_hashtable.hpp
CHANGED
@@ -62,6 +62,20 @@ struct AggregateHTScanState {
 	idx_t scan_position = 0;
 };
 
+struct AggregateHTAppendState {
+	AggregateHTAppendState();
+
+	Vector ht_offsets;
+	Vector hash_salts;
+	SelectionVector group_compare_vector;
+	SelectionVector no_match_vector;
+	SelectionVector empty_vector;
+	SelectionVector new_groups;
+	Vector addresses;
+	unique_ptr<UnifiedVectorFormat[]> group_data;
+	DataChunk group_chunk;
+};
+
 class GroupedAggregateHashTable : public BaseAggregateHashTable {
 public:
 	//! The hash table load factor, when a resize is triggered
@@ -71,10 +85,12 @@ public:
 public:
 	GroupedAggregateHashTable(ClientContext &context, Allocator &allocator, vector<LogicalType> group_types,
 	                          vector<LogicalType> payload_types, const vector<BoundAggregateExpression *> &aggregates,
-	                          HtEntryType entry_type = HtEntryType::HT_WIDTH_64);
+	                          HtEntryType entry_type = HtEntryType::HT_WIDTH_64,
+	                          idx_t initial_capacity = InitialCapacity());
 	GroupedAggregateHashTable(ClientContext &context, Allocator &allocator, vector<LogicalType> group_types,
 	                          vector<LogicalType> payload_types, vector<AggregateObject> aggregates,
-	                          HtEntryType entry_type = HtEntryType::HT_WIDTH_64);
+	                          HtEntryType entry_type = HtEntryType::HT_WIDTH_64,
+	                          idx_t initial_capacity = InitialCapacity());
 	GroupedAggregateHashTable(ClientContext &context, Allocator &allocator, vector<LogicalType> group_types);
 	~GroupedAggregateHashTable() override;
 
@@ -85,9 +101,10 @@ public:
 	//! Add the given data to the HT, computing the aggregates grouped by the
 	//! data in the group chunk. When resize = true, aggregates will not be
 	//! computed but instead just assigned.
-	idx_t AddChunk(DataChunk &groups, DataChunk &payload, const vector<idx_t> &filter);
-	idx_t AddChunk(DataChunk &groups, Vector &group_hashes, DataChunk &payload,
-	               const vector<idx_t> &filter);
+	idx_t AddChunk(AggregateHTAppendState &state, DataChunk &groups, DataChunk &payload, const vector<idx_t> &filter);
+	idx_t AddChunk(AggregateHTAppendState &state, DataChunk &groups, Vector &group_hashes, DataChunk &payload,
+	               const vector<idx_t> &filter);
+	idx_t AddChunk(AggregateHTAppendState &state, DataChunk &groups, DataChunk &payload, AggregateType filter);
 
 	//! Scan the HT starting from the scan_position until the result and group
 	//! chunks are filled. scan_position will be updated by this function.
@@ -100,18 +117,24 @@ public:
 	//! Finds or creates groups in the hashtable using the specified group keys. The addresses vector will be filled
 	//! with pointers to the groups in the hash table, and the new_groups selection vector will point to the newly
 	//! created groups. The return value is the amount of newly created groups.
-	idx_t FindOrCreateGroups(DataChunk &groups, Vector &group_hashes, Vector &addresses_out,
+	idx_t FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups, Vector &group_hashes,
+	                         Vector &addresses_out, SelectionVector &new_groups_out);
+	idx_t FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups, Vector &addresses_out,
 	                         SelectionVector &new_groups_out);
-	idx_t FindOrCreateGroups(DataChunk &groups, Vector &addresses_out, SelectionVector &new_groups_out);
-	void FindOrCreateGroups(DataChunk &groups, Vector &addresses_out);
+	void FindOrCreateGroups(AggregateHTAppendState &state, DataChunk &groups, Vector &addresses_out);
 
 	//! Executes the filter(if any) and update the aggregates
 	void Combine(GroupedAggregateHashTable &other);
 
+	static idx_t InitialCapacity();
 	idx_t Size() {
 		return entries;
 	}
+	idx_t Capacity() {
+		return capacity;
+	}
 
+	idx_t ResizeThreshold();
 	idx_t MaxCapacity();
 	static idx_t GetMaxCapacity(HtEntryType entry_type, idx_t tuple_size);
 
@@ -138,8 +161,7 @@ private:
 	//! The hashes of the HT
	BufferHandle hashes_hdl;
 	data_ptr_t hashes_hdl_ptr;
-	data_ptr_t hashes_end_ptr;
-	idx_t hash_offset; // Offset into the layout of the hash column
+	idx_t hash_offset; // Offset into the layout of the hash column
 
 	hash_t hash_prefix_shift;
 	idx_t payload_page_offset;
@@ -147,16 +169,8 @@ private:
 	//! Bitmask for getting relevant bits from the hashes to determine the position
 	hash_t bitmask;
 
-	vector<unique_ptr<GroupedAggregateHashTable>> distinct_hashes;
-
 	bool is_finalized;
 
-	// some stuff from FindOrCreateGroupsInternal() to avoid allocation there
-	Vector ht_offsets;
-	Vector hash_salts;
-	SelectionVector group_compare_vector;
-	SelectionVector no_match_vector;
-	SelectionVector empty_vector;
 	vector<ExpressionType> predicates;
 
 private:
@@ -176,8 +190,8 @@ private:
 	template <class ENTRY>
 	void Resize(idx_t size);
 	template <class ENTRY>
-	idx_t FindOrCreateGroupsInternal(DataChunk &groups, Vector &group_hashes, Vector &addresses,
-	                                 SelectionVector &new_groups);
+	idx_t FindOrCreateGroupsInternal(AggregateHTAppendState &state, DataChunk &groups, Vector &group_hashes,
+	                                 Vector &addresses, SelectionVector &new_groups);
 
 	template <class FUNC = std::function<void(idx_t, idx_t, data_ptr_t)>>
 	void PayloadApply(FUNC fun);
package/src/duckdb/src/include/duckdb/execution/partitionable_hashtable.hpp
CHANGED
@@ -54,9 +54,10 @@ private:
 	vector<idx_t> sel_vector_sizes;
 	DataChunk group_subset, payload_subset;
 	Vector hashes, hashes_subset;
+	AggregateHTAppendState append_state;
 
 	HashTableList unpartitioned_hts;
-	unordered_map<hash_t, HashTableList> radix_partitioned_hts;
+	vector<HashTableList> radix_partitioned_hts;
 	idx_t tuple_size;
 
 private:
|