duckdb 0.3.5-dev411.0 → 0.3.5-dev449.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb.cpp +986 -519
- package/src/duckdb.hpp +30 -11
- package/src/parquet-amalgamation.cpp +36745 -36745
- package/test/prepare.test.js +2 -2
package/src/duckdb.cpp
CHANGED
|
@@ -1094,6 +1094,7 @@ private:
|
|
|
1094
1094
|
Serializer &serializer;
|
|
1095
1095
|
unique_ptr<BufferedSerializer> buffer;
|
|
1096
1096
|
idx_t field_count;
|
|
1097
|
+
bool finalized;
|
|
1097
1098
|
};
|
|
1098
1099
|
|
|
1099
1100
|
template <>
|
|
@@ -1179,6 +1180,17 @@ public:
|
|
|
1179
1180
|
return T::Deserialize(source);
|
|
1180
1181
|
}
|
|
1181
1182
|
|
|
1183
|
+
template <class T, class RETURN_TYPE = unique_ptr<T>, typename... ARGS>
|
|
1184
|
+
RETURN_TYPE ReadSerializable(RETURN_TYPE default_value, ARGS &&...args) {
|
|
1185
|
+
if (field_count >= max_field_count) {
|
|
1186
|
+
// field is not there, read the default value
|
|
1187
|
+
return default_value;
|
|
1188
|
+
}
|
|
1189
|
+
// field is there, read the actual value
|
|
1190
|
+
AddField();
|
|
1191
|
+
return T::Deserialize(source, std::forward<ARGS>(args)...);
|
|
1192
|
+
}
|
|
1193
|
+
|
|
1182
1194
|
template <class T, class RETURN_TYPE = unique_ptr<T>>
|
|
1183
1195
|
RETURN_TYPE ReadRequiredSerializable() {
|
|
1184
1196
|
if (field_count >= max_field_count) {
|
|
@@ -1190,6 +1202,17 @@ public:
|
|
|
1190
1202
|
return T::Deserialize(source);
|
|
1191
1203
|
}
|
|
1192
1204
|
|
|
1205
|
+
template <class T, class RETURN_TYPE = unique_ptr<T>, typename... ARGS>
|
|
1206
|
+
RETURN_TYPE ReadRequiredSerializable(ARGS &&...args) {
|
|
1207
|
+
if (field_count >= max_field_count) {
|
|
1208
|
+
// field is not there, but it is mandatory: throw instead of falling back to a default
|
|
1209
|
+
throw SerializationException("Attempting to read mandatory field, but field is missing");
|
|
1210
|
+
}
|
|
1211
|
+
// field is there, read the actual value
|
|
1212
|
+
AddField();
|
|
1213
|
+
return T::Deserialize(source, std::forward<ARGS>(args)...);
|
|
1214
|
+
}
|
|
1215
|
+
|
|
1193
1216
|
template <class T, class RETURN_TYPE = unique_ptr<T>>
|
|
1194
1217
|
vector<RETURN_TYPE> ReadRequiredSerializableList() {
|
|
1195
1218
|
if (field_count >= max_field_count) {
|
|
@@ -1233,6 +1256,7 @@ private:
|
|
|
1233
1256
|
idx_t field_count;
|
|
1234
1257
|
idx_t max_field_count;
|
|
1235
1258
|
idx_t total_size;
|
|
1259
|
+
bool finalized;
|
|
1236
1260
|
};
|
|
1237
1261
|
|
|
1238
1262
|
} // namespace duckdb
|
|
@@ -2473,6 +2497,11 @@ private:
|
|
|
2473
2497
|
|
|
2474
2498
|
|
|
2475
2499
|
|
|
2500
|
+
|
|
2501
|
+
#include <cfloat>
|
|
2502
|
+
#include <cstring> // strlen() on Solaris
|
|
2503
|
+
#include <limits.h>
|
|
2504
|
+
|
|
2476
2505
|
namespace duckdb {
|
|
2477
2506
|
|
|
2478
2507
|
#define BSWAP16(x) ((uint16_t)((((uint16_t)(x)&0xff00) >> 8) | (((uint16_t)(x)&0x00ff) << 8)))
|
|
@@ -2487,44 +2516,170 @@ namespace duckdb {
|
|
|
2487
2516
|
(((uint64_t)(x)&0x00000000ff000000ull) << 8) | (((uint64_t)(x)&0x0000000000ff0000ull) << 24) | \
|
|
2488
2517
|
(((uint64_t)(x)&0x000000000000ff00ull) << 40) | (((uint64_t)(x)&0x00000000000000ffull) << 56)))
|
|
2489
2518
|
|
|
2490
|
-
|
|
2491
|
-
|
|
2492
|
-
|
|
2493
|
-
|
|
2519
|
+
struct Radix {
|
|
2520
|
+
public:
|
|
2521
|
+
static inline bool IsLittleEndian() {
|
|
2522
|
+
int n = 1;
|
|
2523
|
+
if (*(char *)&n == 1) {
|
|
2524
|
+
return true;
|
|
2525
|
+
} else {
|
|
2526
|
+
return false;
|
|
2527
|
+
}
|
|
2528
|
+
}
|
|
2494
2529
|
|
|
2495
|
-
template <class T>
|
|
2496
|
-
void EncodeData(data_ptr_t dataptr, T value, bool is_little_endian) {
|
|
2497
|
-
|
|
2498
|
-
}
|
|
2530
|
+
template <class T>
|
|
2531
|
+
static inline void EncodeData(data_ptr_t dataptr, T value, bool is_little_endian) {
|
|
2532
|
+
throw NotImplementedException("Cannot create data from this type");
|
|
2533
|
+
}
|
|
2534
|
+
|
|
2535
|
+
static inline void EncodeStringDataPrefix(data_ptr_t dataptr, string_t value, idx_t prefix_len) {
|
|
2536
|
+
auto len = value.GetSize();
|
|
2537
|
+
memcpy(dataptr, value.GetDataUnsafe(), MinValue(len, prefix_len));
|
|
2538
|
+
if (len < prefix_len) {
|
|
2539
|
+
memset(dataptr + len, '\0', prefix_len - len);
|
|
2540
|
+
}
|
|
2541
|
+
}
|
|
2542
|
+
|
|
2543
|
+
static inline uint8_t FlipSign(uint8_t key_byte) {
|
|
2544
|
+
return key_byte ^ 128;
|
|
2545
|
+
}
|
|
2546
|
+
|
|
2547
|
+
static inline uint32_t EncodeFloat(float x) {
|
|
2548
|
+
uint64_t buff;
|
|
2549
|
+
|
|
2550
|
+
//! zero
|
|
2551
|
+
if (x == 0) {
|
|
2552
|
+
buff = 0;
|
|
2553
|
+
buff |= (1u << 31);
|
|
2554
|
+
return buff;
|
|
2555
|
+
}
|
|
2556
|
+
// nan
|
|
2557
|
+
if (Value::IsNan(x)) {
|
|
2558
|
+
return UINT_MAX;
|
|
2559
|
+
}
|
|
2560
|
+
//! infinity
|
|
2561
|
+
if (x > FLT_MAX) {
|
|
2562
|
+
return UINT_MAX - 1;
|
|
2563
|
+
}
|
|
2564
|
+
//! -infinity
|
|
2565
|
+
if (x < -FLT_MAX) {
|
|
2566
|
+
return 0;
|
|
2567
|
+
}
|
|
2568
|
+
buff = Load<uint32_t>((const_data_ptr_t)&x);
|
|
2569
|
+
if ((buff & (1u << 31)) == 0) { //! +0 and positive numbers
|
|
2570
|
+
buff |= (1u << 31);
|
|
2571
|
+
} else { //! negative numbers
|
|
2572
|
+
buff = ~buff; //! complement 1
|
|
2573
|
+
}
|
|
2574
|
+
|
|
2575
|
+
return buff;
|
|
2576
|
+
}
|
|
2577
|
+
|
|
2578
|
+
static inline uint64_t EncodeDouble(double x) {
|
|
2579
|
+
uint64_t buff;
|
|
2580
|
+
//! zero
|
|
2581
|
+
if (x == 0) {
|
|
2582
|
+
buff = 0;
|
|
2583
|
+
buff += (1ull << 63);
|
|
2584
|
+
return buff;
|
|
2585
|
+
}
|
|
2586
|
+
// nan
|
|
2587
|
+
if (Value::IsNan(x)) {
|
|
2588
|
+
return ULLONG_MAX;
|
|
2589
|
+
}
|
|
2590
|
+
//! infinity
|
|
2591
|
+
if (x > DBL_MAX) {
|
|
2592
|
+
return ULLONG_MAX - 1;
|
|
2593
|
+
}
|
|
2594
|
+
//! -infinity
|
|
2595
|
+
if (x < -DBL_MAX) {
|
|
2596
|
+
return 0;
|
|
2597
|
+
}
|
|
2598
|
+
buff = Load<uint64_t>((const_data_ptr_t)&x);
|
|
2599
|
+
if (buff < (1ull << 63)) { //! +0 and positive numbers
|
|
2600
|
+
buff += (1ull << 63);
|
|
2601
|
+
} else { //! negative numbers
|
|
2602
|
+
buff = ~buff; //! complement 1
|
|
2603
|
+
}
|
|
2604
|
+
return buff;
|
|
2605
|
+
}
|
|
2606
|
+
};
|
|
2499
2607
|
|
|
2500
2608
|
template <>
|
|
2501
|
-
void EncodeData(data_ptr_t dataptr, bool value, bool is_little_endian)
|
|
2502
|
-
|
|
2503
|
-
|
|
2609
|
+
inline void Radix::EncodeData(data_ptr_t dataptr, bool value, bool is_little_endian) {
|
|
2610
|
+
Store<uint8_t>(value ? 1 : 0, dataptr);
|
|
2611
|
+
}
|
|
2612
|
+
|
|
2504
2613
|
template <>
|
|
2505
|
-
void EncodeData(data_ptr_t dataptr,
|
|
2614
|
+
inline void Radix::EncodeData(data_ptr_t dataptr, int8_t value, bool is_little_endian) {
|
|
2615
|
+
Store<uint8_t>(value, dataptr);
|
|
2616
|
+
dataptr[0] = FlipSign(dataptr[0]);
|
|
2617
|
+
}
|
|
2618
|
+
|
|
2506
2619
|
template <>
|
|
2507
|
-
void EncodeData(data_ptr_t dataptr,
|
|
2620
|
+
inline void Radix::EncodeData(data_ptr_t dataptr, int16_t value, bool is_little_endian) {
|
|
2621
|
+
Store<uint16_t>(is_little_endian ? BSWAP16(value) : value, dataptr);
|
|
2622
|
+
dataptr[0] = FlipSign(dataptr[0]);
|
|
2623
|
+
}
|
|
2624
|
+
|
|
2508
2625
|
template <>
|
|
2509
|
-
void EncodeData(data_ptr_t dataptr,
|
|
2626
|
+
inline void Radix::EncodeData(data_ptr_t dataptr, int32_t value, bool is_little_endian) {
|
|
2627
|
+
Store<uint32_t>(is_little_endian ? BSWAP32(value) : value, dataptr);
|
|
2628
|
+
dataptr[0] = FlipSign(dataptr[0]);
|
|
2629
|
+
}
|
|
2630
|
+
|
|
2510
2631
|
template <>
|
|
2511
|
-
void EncodeData(data_ptr_t dataptr,
|
|
2632
|
+
inline void Radix::EncodeData(data_ptr_t dataptr, int64_t value, bool is_little_endian) {
|
|
2633
|
+
Store<uint64_t>(is_little_endian ? BSWAP64(value) : value, dataptr);
|
|
2634
|
+
dataptr[0] = FlipSign(dataptr[0]);
|
|
2635
|
+
}
|
|
2636
|
+
|
|
2512
2637
|
template <>
|
|
2513
|
-
void EncodeData(data_ptr_t dataptr,
|
|
2638
|
+
inline void Radix::EncodeData(data_ptr_t dataptr, uint8_t value, bool is_little_endian) {
|
|
2639
|
+
Store<uint8_t>(value, dataptr);
|
|
2640
|
+
}
|
|
2641
|
+
|
|
2514
2642
|
template <>
|
|
2515
|
-
void EncodeData(data_ptr_t dataptr,
|
|
2643
|
+
inline void Radix::EncodeData(data_ptr_t dataptr, uint16_t value, bool is_little_endian) {
|
|
2644
|
+
Store<uint16_t>(is_little_endian ? BSWAP16(value) : value, dataptr);
|
|
2645
|
+
}
|
|
2646
|
+
|
|
2516
2647
|
template <>
|
|
2517
|
-
void EncodeData(data_ptr_t dataptr,
|
|
2648
|
+
inline void Radix::EncodeData(data_ptr_t dataptr, uint32_t value, bool is_little_endian) {
|
|
2649
|
+
Store<uint32_t>(is_little_endian ? BSWAP32(value) : value, dataptr);
|
|
2650
|
+
}
|
|
2651
|
+
|
|
2518
2652
|
template <>
|
|
2519
|
-
void EncodeData(data_ptr_t dataptr,
|
|
2653
|
+
inline void Radix::EncodeData(data_ptr_t dataptr, uint64_t value, bool is_little_endian) {
|
|
2654
|
+
Store<uint64_t>(is_little_endian ? BSWAP64(value) : value, dataptr);
|
|
2655
|
+
}
|
|
2656
|
+
|
|
2520
2657
|
template <>
|
|
2521
|
-
void EncodeData(data_ptr_t dataptr,
|
|
2658
|
+
inline void Radix::EncodeData(data_ptr_t dataptr, hugeint_t value, bool is_little_endian) {
|
|
2659
|
+
EncodeData<int64_t>(dataptr, value.upper, is_little_endian);
|
|
2660
|
+
EncodeData<uint64_t>(dataptr + sizeof(value.upper), value.lower, is_little_endian);
|
|
2661
|
+
}
|
|
2662
|
+
|
|
2522
2663
|
template <>
|
|
2523
|
-
void EncodeData(data_ptr_t dataptr, float value, bool is_little_endian)
|
|
2664
|
+
inline void Radix::EncodeData(data_ptr_t dataptr, float value, bool is_little_endian) {
|
|
2665
|
+
uint32_t converted_value = EncodeFloat(value);
|
|
2666
|
+
Store<uint32_t>(is_little_endian ? BSWAP32(converted_value) : converted_value, dataptr);
|
|
2667
|
+
}
|
|
2668
|
+
|
|
2524
2669
|
template <>
|
|
2525
|
-
void EncodeData(data_ptr_t dataptr,
|
|
2670
|
+
inline void Radix::EncodeData(data_ptr_t dataptr, double value, bool is_little_endian) {
|
|
2671
|
+
uint64_t converted_value = EncodeDouble(value);
|
|
2672
|
+
Store<uint64_t>(is_little_endian ? BSWAP64(converted_value) : converted_value, dataptr);
|
|
2673
|
+
}
|
|
2526
2674
|
|
|
2527
|
-
|
|
2675
|
+
template <>
|
|
2676
|
+
inline void Radix::EncodeData(data_ptr_t dataptr, interval_t value, bool is_little_endian) {
|
|
2677
|
+
EncodeData<int32_t>(dataptr, value.months, is_little_endian);
|
|
2678
|
+
dataptr += sizeof(value.months);
|
|
2679
|
+
EncodeData<int32_t>(dataptr, value.days, is_little_endian);
|
|
2680
|
+
dataptr += sizeof(value.days);
|
|
2681
|
+
EncodeData<int64_t>(dataptr, value.micros, is_little_endian);
|
|
2682
|
+
}
|
|
2528
2683
|
|
|
2529
2684
|
} // namespace duckdb
|
|
2530
2685
|
|
|
@@ -2542,13 +2697,13 @@ public:
|
|
|
2542
2697
|
|
|
2543
2698
|
public:
|
|
2544
2699
|
template <class T>
|
|
2545
|
-
static unique_ptr<Key> CreateKey(T element, bool is_little_endian) {
|
|
2700
|
+
static inline unique_ptr<Key> CreateKey(T element, bool is_little_endian) {
|
|
2546
2701
|
auto data = Key::CreateData<T>(element, is_little_endian);
|
|
2547
2702
|
return make_unique<Key>(move(data), sizeof(element));
|
|
2548
2703
|
}
|
|
2549
2704
|
|
|
2550
2705
|
template <class T>
|
|
2551
|
-
static unique_ptr<Key> CreateKey(const Value &element, bool is_little_endian) {
|
|
2706
|
+
static inline unique_ptr<Key> CreateKey(const Value &element, bool is_little_endian) {
|
|
2552
2707
|
return CreateKey(element.GetValueUnsafe<T>(), is_little_endian);
|
|
2553
2708
|
}
|
|
2554
2709
|
|
|
@@ -2566,9 +2721,9 @@ public:
|
|
|
2566
2721
|
|
|
2567
2722
|
private:
|
|
2568
2723
|
template <class T>
|
|
2569
|
-
static unique_ptr<data_t[]> CreateData(T value, bool is_little_endian) {
|
|
2724
|
+
static inline unique_ptr<data_t[]> CreateData(T value, bool is_little_endian) {
|
|
2570
2725
|
auto data = unique_ptr<data_t[]>(new data_t[sizeof(value)]);
|
|
2571
|
-
EncodeData<T>(data.get(), value, is_little_endian);
|
|
2726
|
+
Radix::EncodeData<T>(data.get(), value, is_little_endian);
|
|
2572
2727
|
return data;
|
|
2573
2728
|
}
|
|
2574
2729
|
};
|
|
@@ -12934,13 +13089,14 @@ namespace duckdb {
|
|
|
12934
13089
|
// Field Writer
|
|
12935
13090
|
//===--------------------------------------------------------------------===//
|
|
12936
13091
|
FieldWriter::FieldWriter(Serializer &serializer_p)
|
|
12937
|
-
: serializer(serializer_p), buffer(make_unique<BufferedSerializer>()), field_count(0) {
|
|
13092
|
+
: serializer(serializer_p), buffer(make_unique<BufferedSerializer>()), field_count(0), finalized(false) {
|
|
12938
13093
|
}
|
|
12939
13094
|
|
|
12940
13095
|
FieldWriter::~FieldWriter() {
|
|
12941
13096
|
if (Exception::UncaughtException()) {
|
|
12942
13097
|
return;
|
|
12943
13098
|
}
|
|
13099
|
+
D_ASSERT(finalized);
|
|
12944
13100
|
// finalize should always have been called, unless this is destroyed as part of stack unwinding
|
|
12945
13101
|
D_ASSERT(!buffer);
|
|
12946
13102
|
}
|
|
@@ -12960,7 +13116,8 @@ void FieldWriter::Write(const string &val) {
|
|
|
12960
13116
|
|
|
12961
13117
|
void FieldWriter::Finalize() {
|
|
12962
13118
|
D_ASSERT(buffer);
|
|
12963
|
-
|
|
13119
|
+
D_ASSERT(!finalized);
|
|
13120
|
+
finalized = true;
|
|
12964
13121
|
serializer.Write<uint32_t>(field_count);
|
|
12965
13122
|
serializer.Write<uint64_t>(buffer->blob.size);
|
|
12966
13123
|
serializer.WriteData(buffer->blob.data.get(), buffer->blob.size);
|
|
@@ -12992,7 +13149,7 @@ void FieldDeserializer::SetRemainingData(idx_t remaining_data) {
|
|
|
12992
13149
|
//===--------------------------------------------------------------------===//
|
|
12993
13150
|
// Field Reader
|
|
12994
13151
|
//===--------------------------------------------------------------------===//
|
|
12995
|
-
FieldReader::FieldReader(Deserializer &source_p) : source(source_p), field_count(0) {
|
|
13152
|
+
FieldReader::FieldReader(Deserializer &source_p) : source(source_p), field_count(0), finalized(false) {
|
|
12996
13153
|
max_field_count = source_p.Read<uint32_t>();
|
|
12997
13154
|
total_size = source_p.Read<uint64_t>();
|
|
12998
13155
|
D_ASSERT(max_field_count > 0);
|
|
@@ -13000,9 +13157,15 @@ FieldReader::FieldReader(Deserializer &source_p) : source(source_p), field_count
|
|
|
13000
13157
|
}
|
|
13001
13158
|
|
|
13002
13159
|
FieldReader::~FieldReader() {
|
|
13160
|
+
if (Exception::UncaughtException()) {
|
|
13161
|
+
return;
|
|
13162
|
+
}
|
|
13163
|
+
D_ASSERT(finalized);
|
|
13003
13164
|
}
|
|
13004
13165
|
|
|
13005
13166
|
void FieldReader::Finalize() {
|
|
13167
|
+
D_ASSERT(!finalized);
|
|
13168
|
+
finalized = true;
|
|
13006
13169
|
if (field_count < max_field_count) {
|
|
13007
13170
|
// we can handle this case by calling source.ReadData(buffer, source.RemainingData())
|
|
13008
13171
|
throw SerializationException("Not all fields were read. This file might have been written with a newer version "
|
|
@@ -24874,174 +25037,6 @@ void ProgressBar::Update(bool final) {
|
|
|
24874
25037
|
}
|
|
24875
25038
|
|
|
24876
25039
|
} // namespace duckdb
|
|
24877
|
-
|
|
24878
|
-
|
|
24879
|
-
|
|
24880
|
-
|
|
24881
|
-
#include <cfloat>
|
|
24882
|
-
#include <cstring> // strlen() on Solaris
|
|
24883
|
-
#include <limits.h>
|
|
24884
|
-
|
|
24885
|
-
namespace duckdb {
|
|
24886
|
-
|
|
24887
|
-
bool IsLittleEndian() {
|
|
24888
|
-
int n = 1;
|
|
24889
|
-
if (*(char *)&n == 1) {
|
|
24890
|
-
return true;
|
|
24891
|
-
} else {
|
|
24892
|
-
return false;
|
|
24893
|
-
}
|
|
24894
|
-
}
|
|
24895
|
-
|
|
24896
|
-
uint8_t FlipSign(uint8_t key_byte) {
|
|
24897
|
-
return key_byte ^ 128;
|
|
24898
|
-
}
|
|
24899
|
-
|
|
24900
|
-
uint32_t EncodeFloat(float x) {
|
|
24901
|
-
uint64_t buff;
|
|
24902
|
-
|
|
24903
|
-
//! zero
|
|
24904
|
-
if (x == 0) {
|
|
24905
|
-
buff = 0;
|
|
24906
|
-
buff |= (1u << 31);
|
|
24907
|
-
return buff;
|
|
24908
|
-
}
|
|
24909
|
-
// nan
|
|
24910
|
-
if (Value::IsNan(x)) {
|
|
24911
|
-
return UINT_MAX;
|
|
24912
|
-
}
|
|
24913
|
-
//! infinity
|
|
24914
|
-
if (x > FLT_MAX) {
|
|
24915
|
-
return UINT_MAX - 1;
|
|
24916
|
-
}
|
|
24917
|
-
//! -infinity
|
|
24918
|
-
if (x < -FLT_MAX) {
|
|
24919
|
-
return 0;
|
|
24920
|
-
}
|
|
24921
|
-
buff = Load<uint32_t>((const_data_ptr_t)&x);
|
|
24922
|
-
if ((buff & (1u << 31)) == 0) { //! +0 and positive numbers
|
|
24923
|
-
buff |= (1u << 31);
|
|
24924
|
-
} else { //! negative numbers
|
|
24925
|
-
buff = ~buff; //! complement 1
|
|
24926
|
-
}
|
|
24927
|
-
|
|
24928
|
-
return buff;
|
|
24929
|
-
}
|
|
24930
|
-
|
|
24931
|
-
uint64_t EncodeDouble(double x) {
|
|
24932
|
-
uint64_t buff;
|
|
24933
|
-
//! zero
|
|
24934
|
-
if (x == 0) {
|
|
24935
|
-
buff = 0;
|
|
24936
|
-
buff += (1ull << 63);
|
|
24937
|
-
return buff;
|
|
24938
|
-
}
|
|
24939
|
-
// nan
|
|
24940
|
-
if (Value::IsNan(x)) {
|
|
24941
|
-
return ULLONG_MAX;
|
|
24942
|
-
}
|
|
24943
|
-
//! infinity
|
|
24944
|
-
if (x > DBL_MAX) {
|
|
24945
|
-
return ULLONG_MAX - 1;
|
|
24946
|
-
}
|
|
24947
|
-
//! -infinity
|
|
24948
|
-
if (x < -DBL_MAX) {
|
|
24949
|
-
return 0;
|
|
24950
|
-
}
|
|
24951
|
-
buff = Load<uint64_t>((const_data_ptr_t)&x);
|
|
24952
|
-
if (buff < (1ull << 63)) { //! +0 and positive numbers
|
|
24953
|
-
buff += (1ull << 63);
|
|
24954
|
-
} else { //! negative numbers
|
|
24955
|
-
buff = ~buff; //! complement 1
|
|
24956
|
-
}
|
|
24957
|
-
return buff;
|
|
24958
|
-
}
|
|
24959
|
-
|
|
24960
|
-
template <>
|
|
24961
|
-
void EncodeData(data_ptr_t dataptr, bool value, bool is_little_endian) {
|
|
24962
|
-
Store<uint8_t>(value ? 1 : 0, dataptr);
|
|
24963
|
-
}
|
|
24964
|
-
|
|
24965
|
-
template <>
|
|
24966
|
-
void EncodeData(data_ptr_t dataptr, int8_t value, bool is_little_endian) {
|
|
24967
|
-
Store<uint8_t>(value, dataptr);
|
|
24968
|
-
dataptr[0] = FlipSign(dataptr[0]);
|
|
24969
|
-
}
|
|
24970
|
-
|
|
24971
|
-
template <>
|
|
24972
|
-
void EncodeData(data_ptr_t dataptr, int16_t value, bool is_little_endian) {
|
|
24973
|
-
Store<uint16_t>(is_little_endian ? BSWAP16(value) : value, dataptr);
|
|
24974
|
-
dataptr[0] = FlipSign(dataptr[0]);
|
|
24975
|
-
}
|
|
24976
|
-
|
|
24977
|
-
template <>
|
|
24978
|
-
void EncodeData(data_ptr_t dataptr, int32_t value, bool is_little_endian) {
|
|
24979
|
-
Store<uint32_t>(is_little_endian ? BSWAP32(value) : value, dataptr);
|
|
24980
|
-
dataptr[0] = FlipSign(dataptr[0]);
|
|
24981
|
-
}
|
|
24982
|
-
|
|
24983
|
-
template <>
|
|
24984
|
-
void EncodeData(data_ptr_t dataptr, int64_t value, bool is_little_endian) {
|
|
24985
|
-
Store<uint64_t>(is_little_endian ? BSWAP64(value) : value, dataptr);
|
|
24986
|
-
dataptr[0] = FlipSign(dataptr[0]);
|
|
24987
|
-
}
|
|
24988
|
-
|
|
24989
|
-
template <>
|
|
24990
|
-
void EncodeData(data_ptr_t dataptr, uint8_t value, bool is_little_endian) {
|
|
24991
|
-
Store<uint8_t>(value, dataptr);
|
|
24992
|
-
}
|
|
24993
|
-
|
|
24994
|
-
template <>
|
|
24995
|
-
void EncodeData(data_ptr_t dataptr, uint16_t value, bool is_little_endian) {
|
|
24996
|
-
Store<uint16_t>(is_little_endian ? BSWAP16(value) : value, dataptr);
|
|
24997
|
-
}
|
|
24998
|
-
|
|
24999
|
-
template <>
|
|
25000
|
-
void EncodeData(data_ptr_t dataptr, uint32_t value, bool is_little_endian) {
|
|
25001
|
-
Store<uint32_t>(is_little_endian ? BSWAP32(value) : value, dataptr);
|
|
25002
|
-
}
|
|
25003
|
-
|
|
25004
|
-
template <>
|
|
25005
|
-
void EncodeData(data_ptr_t dataptr, uint64_t value, bool is_little_endian) {
|
|
25006
|
-
Store<uint64_t>(is_little_endian ? BSWAP64(value) : value, dataptr);
|
|
25007
|
-
}
|
|
25008
|
-
|
|
25009
|
-
template <>
|
|
25010
|
-
void EncodeData(data_ptr_t dataptr, hugeint_t value, bool is_little_endian) {
|
|
25011
|
-
EncodeData<int64_t>(dataptr, value.upper, is_little_endian);
|
|
25012
|
-
EncodeData<uint64_t>(dataptr + sizeof(value.upper), value.lower, is_little_endian);
|
|
25013
|
-
}
|
|
25014
|
-
|
|
25015
|
-
template <>
|
|
25016
|
-
void EncodeData(data_ptr_t dataptr, float value, bool is_little_endian) {
|
|
25017
|
-
uint32_t converted_value = EncodeFloat(value);
|
|
25018
|
-
Store<uint32_t>(is_little_endian ? BSWAP32(converted_value) : converted_value, dataptr);
|
|
25019
|
-
}
|
|
25020
|
-
|
|
25021
|
-
template <>
|
|
25022
|
-
void EncodeData(data_ptr_t dataptr, double value, bool is_little_endian) {
|
|
25023
|
-
uint64_t converted_value = EncodeDouble(value);
|
|
25024
|
-
Store<uint64_t>(is_little_endian ? BSWAP64(converted_value) : converted_value, dataptr);
|
|
25025
|
-
}
|
|
25026
|
-
|
|
25027
|
-
template <>
|
|
25028
|
-
void EncodeData(data_ptr_t dataptr, interval_t value, bool is_little_endian) {
|
|
25029
|
-
EncodeData<int32_t>(dataptr, value.months, is_little_endian);
|
|
25030
|
-
dataptr += sizeof(value.months);
|
|
25031
|
-
EncodeData<int32_t>(dataptr, value.days, is_little_endian);
|
|
25032
|
-
dataptr += sizeof(value.days);
|
|
25033
|
-
EncodeData<int64_t>(dataptr, value.micros, is_little_endian);
|
|
25034
|
-
}
|
|
25035
|
-
|
|
25036
|
-
void EncodeStringDataPrefix(data_ptr_t dataptr, string_t value, idx_t prefix_len) {
|
|
25037
|
-
auto len = value.GetSize();
|
|
25038
|
-
memcpy(dataptr, value.GetDataUnsafe(), MinValue(len, prefix_len));
|
|
25039
|
-
if (len < prefix_len) {
|
|
25040
|
-
memset(dataptr + len, '\0', prefix_len - len);
|
|
25041
|
-
}
|
|
25042
|
-
}
|
|
25043
|
-
|
|
25044
|
-
} // namespace duckdb
|
|
25045
25040
|
//===----------------------------------------------------------------------===//
|
|
25046
25041
|
// DuckDB
|
|
25047
25042
|
//
|
|
@@ -31664,7 +31659,7 @@ void TemplatedRadixScatter(VectorData &vdata, const SelectionVector &sel, idx_t
|
|
|
31664
31659
|
// write validity and according value
|
|
31665
31660
|
if (validity.RowIsValid(source_idx)) {
|
|
31666
31661
|
key_locations[i][0] = valid;
|
|
31667
|
-
EncodeData<T>(key_locations[i] + 1, source[source_idx], is_little_endian);
|
|
31662
|
+
Radix::EncodeData<T>(key_locations[i] + 1, source[source_idx], is_little_endian);
|
|
31668
31663
|
// invert bits if desc
|
|
31669
31664
|
if (desc) {
|
|
31670
31665
|
for (idx_t s = 1; s < sizeof(T) + 1; s++) {
|
|
@@ -31682,7 +31677,7 @@ void TemplatedRadixScatter(VectorData &vdata, const SelectionVector &sel, idx_t
|
|
|
31682
31677
|
auto idx = sel.get_index(i);
|
|
31683
31678
|
auto source_idx = vdata.sel->get_index(idx) + offset;
|
|
31684
31679
|
// write value
|
|
31685
|
-
EncodeData<T>(key_locations[i], source[source_idx], is_little_endian);
|
|
31680
|
+
Radix::EncodeData<T>(key_locations[i], source[source_idx], is_little_endian);
|
|
31686
31681
|
// invert bits if desc
|
|
31687
31682
|
if (desc) {
|
|
31688
31683
|
for (idx_t s = 0; s < sizeof(T); s++) {
|
|
@@ -31709,7 +31704,7 @@ void RadixScatterStringVector(VectorData &vdata, const SelectionVector &sel, idx
|
|
|
31709
31704
|
// write validity and according value
|
|
31710
31705
|
if (validity.RowIsValid(source_idx)) {
|
|
31711
31706
|
key_locations[i][0] = valid;
|
|
31712
|
-
EncodeStringDataPrefix(key_locations[i] + 1, source[source_idx], prefix_len);
|
|
31707
|
+
Radix::EncodeStringDataPrefix(key_locations[i] + 1, source[source_idx], prefix_len);
|
|
31713
31708
|
// invert bits if desc
|
|
31714
31709
|
if (desc) {
|
|
31715
31710
|
for (idx_t s = 1; s < prefix_len + 1; s++) {
|
|
@@ -31727,7 +31722,7 @@ void RadixScatterStringVector(VectorData &vdata, const SelectionVector &sel, idx
|
|
|
31727
31722
|
auto idx = sel.get_index(i);
|
|
31728
31723
|
auto source_idx = vdata.sel->get_index(idx) + offset;
|
|
31729
31724
|
// write value
|
|
31730
|
-
EncodeStringDataPrefix(key_locations[i], source[source_idx], prefix_len);
|
|
31725
|
+
Radix::EncodeStringDataPrefix(key_locations[i], source[source_idx], prefix_len);
|
|
31731
31726
|
// invert bits if desc
|
|
31732
31727
|
if (desc) {
|
|
31733
31728
|
for (idx_t s = 0; s < prefix_len; s++) {
|
|
@@ -31854,7 +31849,7 @@ void RadixScatterStructVector(Vector &v, VectorData &vdata, idx_t vcount, const
|
|
|
31854
31849
|
void RowOperations::RadixScatter(Vector &v, idx_t vcount, const SelectionVector &sel, idx_t ser_count,
|
|
31855
31850
|
data_ptr_t *key_locations, bool desc, bool has_null, bool nulls_first,
|
|
31856
31851
|
idx_t prefix_len, idx_t width, idx_t offset) {
|
|
31857
|
-
auto is_little_endian = IsLittleEndian();
|
|
31852
|
+
auto is_little_endian = Radix::IsLittleEndian();
|
|
31858
31853
|
|
|
31859
31854
|
VectorData vdata;
|
|
31860
31855
|
v.Orrify(vcount, vdata);
|
|
@@ -40595,34 +40590,6 @@ string hugeint_t::ToString() const {
|
|
|
40595
40590
|
|
|
40596
40591
|
|
|
40597
40592
|
|
|
40598
|
-
namespace duckdb {
|
|
40599
|
-
|
|
40600
|
-
//! The HyperLogLog class holds a HyperLogLog counter for approximate cardinality counting
|
|
40601
|
-
class HyperLogLog {
|
|
40602
|
-
public:
|
|
40603
|
-
HyperLogLog();
|
|
40604
|
-
~HyperLogLog();
|
|
40605
|
-
// implicit copying of HyperLogLog is not allowed
|
|
40606
|
-
HyperLogLog(const HyperLogLog &) = delete;
|
|
40607
|
-
|
|
40608
|
-
//! Adds an element of the specified size to the HyperLogLog counter
|
|
40609
|
-
void Add(data_ptr_t element, idx_t size);
|
|
40610
|
-
//! Return the count of this HyperLogLog counter
|
|
40611
|
-
idx_t Count();
|
|
40612
|
-
//! Merge this HyperLogLog counter with another counter to create a new one
|
|
40613
|
-
unique_ptr<HyperLogLog> Merge(HyperLogLog &other);
|
|
40614
|
-
HyperLogLog *MergePointer(HyperLogLog &other);
|
|
40615
|
-
//! Merge a set of HyperLogLogs to create one big one
|
|
40616
|
-
static unique_ptr<HyperLogLog> Merge(HyperLogLog logs[], idx_t count);
|
|
40617
|
-
|
|
40618
|
-
private:
|
|
40619
|
-
HyperLogLog(void *hll);
|
|
40620
|
-
|
|
40621
|
-
void *hll;
|
|
40622
|
-
};
|
|
40623
|
-
} // namespace duckdb
|
|
40624
|
-
|
|
40625
|
-
|
|
40626
40593
|
|
|
40627
40594
|
|
|
40628
40595
|
// LICENSE_CHANGE_BEGIN
|
|
@@ -40639,41 +40606,116 @@ private:
|
|
|
40639
40606
|
|
|
40640
40607
|
|
|
40641
40608
|
|
|
40642
|
-
#include <string.h>
|
|
40643
40609
|
#include <stdint.h>
|
|
40610
|
+
#include <string.h>
|
|
40644
40611
|
|
|
40645
40612
|
namespace duckdb_hll {
|
|
40646
40613
|
|
|
40647
40614
|
/* Error codes */
|
|
40648
|
-
#define HLL_C_OK
|
|
40649
|
-
#define HLL_C_ERR
|
|
40615
|
+
#define HLL_C_OK 0
|
|
40616
|
+
#define HLL_C_ERR -1
|
|
40650
40617
|
|
|
40651
40618
|
typedef struct {
|
|
40652
|
-
|
|
40619
|
+
void *ptr;
|
|
40653
40620
|
} robj;
|
|
40654
40621
|
|
|
40655
40622
|
//! Create a new empty HyperLogLog object
|
|
40656
40623
|
robj *hll_create(void);
|
|
40624
|
+
//! Convert hll from sparse to dense
|
|
40625
|
+
int hllSparseToDense(robj *o);
|
|
40657
40626
|
//! Destroy the specified HyperLogLog object
|
|
40658
40627
|
void hll_destroy(robj *obj);
|
|
40659
|
-
//! Add an element with the specified amount of bytes to the HyperLogLog. Returns C_ERR on failure, otherwise returns 0
|
|
40628
|
+
//! Add an element with the specified amount of bytes to the HyperLogLog. Returns C_ERR on failure, otherwise returns 0
|
|
40629
|
+
//! if the cardinality did not change, and 1 otherwise.
|
|
40660
40630
|
int hll_add(robj *o, unsigned char *ele, size_t elesize);
|
|
40661
|
-
//! Returns the estimated amount of unique elements seen by the HyperLogLog. Returns C_OK on success, or C_ERR on
|
|
40631
|
+
//! Returns the estimated amount of unique elements seen by the HyperLogLog. Returns C_OK on success, or C_ERR on
|
|
40632
|
+
//! failure.
|
|
40662
40633
|
int hll_count(robj *o, size_t *result);
|
|
40663
40634
|
//! Merge hll_count HyperLogLog objects into a single one. Returns NULL on failure, or the new HLL object on success.
|
|
40664
40635
|
robj *hll_merge(robj **hlls, size_t hll_count);
|
|
40636
|
+
//! Get size (in bytes) of the HLL
|
|
40637
|
+
uint64_t get_size();
|
|
40665
40638
|
|
|
40666
|
-
uint64_t MurmurHash64A
|
|
40639
|
+
uint64_t MurmurHash64A(const void *key, int len, unsigned int seed);
|
|
40640
|
+
|
|
40641
|
+
} // namespace duckdb_hll
|
|
40642
|
+
|
|
40643
|
+
namespace duckdb {
|
|
40644
|
+
|
|
40645
|
+
void AddToLogsInternal(VectorData &vdata, idx_t count, uint64_t indices[], uint8_t counts[], void ***logs[],
|
|
40646
|
+
const SelectionVector *log_sel);
|
|
40647
|
+
|
|
40648
|
+
void AddToSingleLogInternal(VectorData &vdata, idx_t count, uint64_t indices[], uint8_t counts[], void *log);
|
|
40649
|
+
|
|
40650
|
+
} // namespace duckdb
|
|
40667
40651
|
|
|
40668
|
-
}
|
|
40669
40652
|
|
|
40670
40653
|
// LICENSE_CHANGE_END
|
|
40671
40654
|
|
|
40672
40655
|
|
|
40656
|
+
namespace duckdb {
|
|
40657
|
+
|
|
40658
|
+
enum class HLLStorageType { UNCOMPRESSED = 1 };
|
|
40659
|
+
|
|
40660
|
+
class FieldWriter;
|
|
40661
|
+
class FieldReader;
|
|
40662
|
+
|
|
40663
|
+
//! The HyperLogLog class holds a HyperLogLog counter for approximate cardinality counting
|
|
40664
|
+
class HyperLogLog {
|
|
40665
|
+
public:
|
|
40666
|
+
HyperLogLog();
|
|
40667
|
+
~HyperLogLog();
|
|
40668
|
+
// implicit copying of HyperLogLog is not allowed
|
|
40669
|
+
HyperLogLog(const HyperLogLog &) = delete;
|
|
40670
|
+
|
|
40671
|
+
//! Adds an element of the specified size to the HyperLogLog counter
|
|
40672
|
+
void Add(data_ptr_t element, idx_t size);
|
|
40673
|
+
//! Return the count of this HyperLogLog counter
|
|
40674
|
+
idx_t Count() const;
|
|
40675
|
+
//! Merge this HyperLogLog counter with another counter to create a new one
|
|
40676
|
+
unique_ptr<HyperLogLog> Merge(HyperLogLog &other);
|
|
40677
|
+
HyperLogLog *MergePointer(HyperLogLog &other);
|
|
40678
|
+
//! Merge a set of HyperLogLogs to create one big one
|
|
40679
|
+
static unique_ptr<HyperLogLog> Merge(HyperLogLog logs[], idx_t count);
|
|
40680
|
+
//! Get the size (in bytes) of a HLL
|
|
40681
|
+
static idx_t GetSize();
|
|
40682
|
+
//! Get pointer to the HLL
|
|
40683
|
+
data_ptr_t GetPtr() const;
|
|
40684
|
+
//! Get copy of the HLL
|
|
40685
|
+
unique_ptr<HyperLogLog> Copy();
|
|
40686
|
+
//! (De)Serialize the HLL
|
|
40687
|
+
void Serialize(FieldWriter &writer) const;
|
|
40688
|
+
static unique_ptr<HyperLogLog> Deserialize(FieldReader &reader);
|
|
40689
|
+
|
|
40690
|
+
public:
|
|
40691
|
+
//! Compute HLL hashes over vdata, and store them in 'hashes'
|
|
40692
|
+
//! Then, compute register indices and prefix lengths, and also store them in 'hashes' as a pair of uint32_t
|
|
40693
|
+
static void ProcessEntries(VectorData &vdata, const LogicalType &type, uint64_t hashes[], uint8_t counts[],
|
|
40694
|
+
idx_t count);
|
|
40695
|
+
//! Add the indices and counts to the logs
|
|
40696
|
+
static void AddToLogs(VectorData &vdata, idx_t count, uint64_t indices[], uint8_t counts[], HyperLogLog **logs[],
|
|
40697
|
+
const SelectionVector *log_sel);
|
|
40698
|
+
//! Add the indices and counts to THIS log
|
|
40699
|
+
void AddToLog(VectorData &vdata, idx_t count, uint64_t indices[], uint8_t counts[]);
|
|
40700
|
+
|
|
40701
|
+
private:
|
|
40702
|
+
explicit HyperLogLog(void *hll);
|
|
40703
|
+
|
|
40704
|
+
void *hll;
|
|
40705
|
+
mutex lock;
|
|
40706
|
+
};
|
|
40707
|
+
} // namespace duckdb
|
|
40708
|
+
|
|
40709
|
+
|
|
40710
|
+
|
|
40711
|
+
|
|
40712
|
+
|
|
40673
40713
|
namespace duckdb {
|
|
40674
40714
|
|
|
40675
40715
|
HyperLogLog::HyperLogLog() : hll(nullptr) {
|
|
40676
40716
|
hll = duckdb_hll::hll_create();
|
|
40717
|
+
// Insertion into a dense HLL can be vectorized, but insertion into a sparse one cannot, so we convert immediately
|
|
40718
|
+
duckdb_hll::hllSparseToDense((duckdb_hll::robj *)hll);
|
|
40677
40719
|
}
|
|
40678
40720
|
|
|
40679
40721
|
HyperLogLog::HyperLogLog(void *hll) : hll(hll) {
|
|
@@ -40689,7 +40731,7 @@ void HyperLogLog::Add(data_ptr_t element, idx_t size) {
|
|
|
40689
40731
|
}
|
|
40690
40732
|
}
|
|
40691
40733
|
|
|
40692
|
-
idx_t HyperLogLog::Count() {
|
|
40734
|
+
idx_t HyperLogLog::Count() const {
|
|
40693
40735
|
// exception from size_t ban
|
|
40694
40736
|
size_t result;
|
|
40695
40737
|
|
|
@@ -40736,9 +40778,206 @@ unique_ptr<HyperLogLog> HyperLogLog::Merge(HyperLogLog logs[], idx_t count) {
|
|
|
40736
40778
|
return unique_ptr<HyperLogLog>(new HyperLogLog((void *)new_hll));
|
|
40737
40779
|
}
|
|
40738
40780
|
|
|
40781
|
+
//! Returns duckdb_hll::get_size(); this file uses it as the byte length of the buffer behind GetPtr()
idx_t HyperLogLog::GetSize() {
	const idx_t hll_size = duckdb_hll::get_size();
	return hll_size;
}
|
|
40784
|
+
|
|
40785
|
+
//! Returns the 'ptr' member of the underlying duckdb_hll object as a byte pointer
data_ptr_t HyperLogLog::GetPtr() const {
	auto hll_object = (duckdb_hll::robj *)hll;
	return (data_ptr_t)hll_object->ptr;
}
|
|
40788
|
+
|
|
40789
|
+
//! Returns a new HyperLogLog whose buffer is a byte-for-byte copy of this one
unique_ptr<HyperLogLog> HyperLogLog::Copy() {
	auto copy = make_unique<HyperLogLog>();
	// AddToLog takes the same lock, so the buffer cannot be modified through it while it is being copied
	lock_guard<mutex> guard(lock);
	const data_ptr_t target_bytes = copy->GetPtr();
	const data_ptr_t source_bytes = GetPtr();
	memcpy(target_bytes, source_bytes, GetSize());
	D_ASSERT(copy->Count() == Count());
	return copy;
}
|
|
40796
|
+
|
|
40797
|
+
//! Writes the HLL as a storage-type tag followed by GetSize() bytes taken from GetPtr()
void HyperLogLog::Serialize(FieldWriter &writer) const {
	// the tag lets Deserialize reject storage layouts it does not know
	const HLLStorageType storage_type = HLLStorageType::UNCOMPRESSED;
	writer.WriteField<HLLStorageType>(storage_type);
	writer.WriteBlob(GetPtr(), GetSize());
}
|
|
40801
|
+
|
|
40802
|
+
//! Reads an HLL written by Serialize: a storage-type tag followed by the raw buffer.
//! Throws a SerializationException for any storage type other than UNCOMPRESSED.
unique_ptr<HyperLogLog> HyperLogLog::Deserialize(FieldReader &reader) {
	auto result = make_unique<HyperLogLog>();
	const auto storage_type = reader.ReadRequired<HLLStorageType>();
	if (storage_type != HLLStorageType::UNCOMPRESSED) {
		throw SerializationException("Unknown HyperLogLog storage type!");
	}
	// uncompressed storage: the blob is read straight into the new HLL's buffer
	reader.ReadBlob(result->GetPtr(), GetSize());
	return result;
}
|
|
40814
|
+
|
|
40815
|
+
//===--------------------------------------------------------------------===//
|
|
40816
|
+
// Vectorized HLL implementation
|
|
40817
|
+
//===--------------------------------------------------------------------===//
|
|
40818
|
+
//! 64-bit integer mixer, taken from https://nullprogram.com/blog/2018/07/31/
//! The element is first converted to uint64_t; an input of 0 hashes to 0.
template <class T>
inline uint64_t TemplatedHash(const T &elem) {
	uint64_t mixed = elem;
	mixed = (mixed ^ (mixed >> 30)) * UINT64_C(0xbf58476d1ce4e5b9);
	mixed = (mixed ^ (mixed >> 27)) * UINT64_C(0x94d049bb133111eb);
	return mixed ^ (mixed >> 31);
}
|
|
40829
|
+
|
|
40830
|
+
template <>
|
|
40831
|
+
inline uint64_t TemplatedHash(const hugeint_t &elem) {
|
|
40832
|
+
return TemplatedHash<uint64_t>(Load<uint64_t>((data_ptr_t)&elem.upper)) ^ TemplatedHash<uint64_t>(elem.lower);
|
|
40833
|
+
}
|
|
40834
|
+
|
|
40835
|
+
template <idx_t rest>
|
|
40836
|
+
inline void CreateIntegerRecursive(const data_ptr_t &data, uint64_t &x) {
|
|
40837
|
+
x ^= (uint64_t)data[rest - 1] << ((rest - 1) * 8);
|
|
40838
|
+
return CreateIntegerRecursive<rest - 1>(data, x);
|
|
40839
|
+
}
|
|
40840
|
+
|
|
40841
|
+
template <>
|
|
40842
|
+
inline void CreateIntegerRecursive<1>(const data_ptr_t &data, uint64_t &x) {
|
|
40843
|
+
x ^= (uint64_t)data[0];
|
|
40844
|
+
}
|
|
40845
|
+
|
|
40846
|
+
inline uint64_t HashOtherSize(const data_ptr_t &data, const idx_t &len) {
|
|
40847
|
+
uint64_t x = 0;
|
|
40848
|
+
switch (len & 7) {
|
|
40849
|
+
case 7:
|
|
40850
|
+
CreateIntegerRecursive<7>(data, x);
|
|
40851
|
+
break;
|
|
40852
|
+
case 6:
|
|
40853
|
+
CreateIntegerRecursive<6>(data, x);
|
|
40854
|
+
break;
|
|
40855
|
+
case 5:
|
|
40856
|
+
CreateIntegerRecursive<5>(data, x);
|
|
40857
|
+
break;
|
|
40858
|
+
case 4:
|
|
40859
|
+
CreateIntegerRecursive<4>(data, x);
|
|
40860
|
+
break;
|
|
40861
|
+
case 3:
|
|
40862
|
+
CreateIntegerRecursive<3>(data, x);
|
|
40863
|
+
break;
|
|
40864
|
+
case 2:
|
|
40865
|
+
CreateIntegerRecursive<2>(data, x);
|
|
40866
|
+
break;
|
|
40867
|
+
case 1:
|
|
40868
|
+
CreateIntegerRecursive<1>(data, x);
|
|
40869
|
+
break;
|
|
40870
|
+
case 0:
|
|
40871
|
+
break;
|
|
40872
|
+
}
|
|
40873
|
+
return TemplatedHash<uint64_t>(x);
|
|
40874
|
+
}
|
|
40875
|
+
|
|
40876
|
+
template <>
|
|
40877
|
+
inline uint64_t TemplatedHash(const string_t &elem) {
|
|
40878
|
+
data_ptr_t data = (data_ptr_t)elem.GetDataUnsafe();
|
|
40879
|
+
const auto &len = elem.GetSize();
|
|
40880
|
+
uint64_t h = 0;
|
|
40881
|
+
for (idx_t i = 0; i < len / 8; i += 8) {
|
|
40882
|
+
h ^= TemplatedHash<uint64_t>(Load<uint64_t>(data));
|
|
40883
|
+
data += 8;
|
|
40884
|
+
}
|
|
40885
|
+
switch (len & 7) {
|
|
40886
|
+
case 4:
|
|
40887
|
+
h ^= TemplatedHash<uint32_t>(Load<uint32_t>(data));
|
|
40888
|
+
break;
|
|
40889
|
+
case 2:
|
|
40890
|
+
h ^= TemplatedHash<uint16_t>(Load<uint16_t>(data));
|
|
40891
|
+
break;
|
|
40892
|
+
case 1:
|
|
40893
|
+
h ^= TemplatedHash<uint8_t>(Load<uint8_t>(data));
|
|
40894
|
+
break;
|
|
40895
|
+
default:
|
|
40896
|
+
h ^= HashOtherSize(data, len);
|
|
40897
|
+
}
|
|
40898
|
+
return h;
|
|
40899
|
+
}
|
|
40900
|
+
|
|
40901
|
+
template <class T>
|
|
40902
|
+
void TemplatedComputeHashes(VectorData &vdata, const idx_t &count, uint64_t hashes[]) {
|
|
40903
|
+
T *data = (T *)vdata.data;
|
|
40904
|
+
for (idx_t i = 0; i < count; i++) {
|
|
40905
|
+
auto idx = vdata.sel->get_index(i);
|
|
40906
|
+
if (vdata.validity.RowIsValid(idx)) {
|
|
40907
|
+
hashes[i] = TemplatedHash<T>(data[idx]);
|
|
40908
|
+
}
|
|
40909
|
+
}
|
|
40910
|
+
}
|
|
40911
|
+
|
|
40912
|
+
//! Dispatches on the physical type of 'type' and fills 'hashes' through TemplatedComputeHashes.
//! Types of equal width share one instantiation (e.g. INT32, UINT32 and FLOAT are all read as uint32_t).
//! Throws an InternalException for physical types that are not listed.
static void ComputeHashes(VectorData &vdata, const LogicalType &type, uint64_t hashes[], idx_t count) {
	// INTERVAL values go through the hugeint_t path below, which is only valid if the sizes match
	static_assert(sizeof(hugeint_t) == sizeof(interval_t), "ComputeHashes assumes these are the same size!");

	const auto physical_type = type.InternalType();
	switch (physical_type) {
	case PhysicalType::BOOL:
	case PhysicalType::INT8:
	case PhysicalType::UINT8:
		TemplatedComputeHashes<uint8_t>(vdata, count, hashes);
		break;
	case PhysicalType::INT16:
	case PhysicalType::UINT16:
		TemplatedComputeHashes<uint16_t>(vdata, count, hashes);
		break;
	case PhysicalType::INT32:
	case PhysicalType::UINT32:
	case PhysicalType::FLOAT:
		TemplatedComputeHashes<uint32_t>(vdata, count, hashes);
		break;
	case PhysicalType::INT64:
	case PhysicalType::UINT64:
	case PhysicalType::DOUBLE:
		TemplatedComputeHashes<uint64_t>(vdata, count, hashes);
		break;
	case PhysicalType::INT128:
	case PhysicalType::INTERVAL:
		TemplatedComputeHashes<hugeint_t>(vdata, count, hashes);
		break;
	case PhysicalType::VARCHAR:
		TemplatedComputeHashes<string_t>(vdata, count, hashes);
		break;
	default:
		throw InternalException("Unimplemented type for HyperLogLog::ComputeHashes");
	}
}
|
|
40939
|
+
|
|
40940
|
+
//! Returns the number of trailing zero bits of 'x' using a De Bruijn multiplication and table lookup.
//! Taken from https://stackoverflow.com/a/72088344
static inline uint8_t CountTrailingZeros(uint64_t &x) {
	static constexpr const uint64_t DEBRUIJN_MULTIPLIER = 0x03f79d71b4cb0a89;
	static constexpr const uint8_t POSITION_TABLE[] = {
	    0,  47, 1,  56, 48, 27, 2,  60, 57, 49, 41, 37, 28, 16, 3,  61, 54, 58, 35, 52, 50, 42,
	    21, 44, 38, 32, 29, 23, 17, 11, 4,  62, 46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43,
	    31, 22, 10, 45, 25, 39, 14, 33, 19, 30, 9,  24, 13, 18, 8,  12, 7,  6,  5,  63};
	// x ^ (x - 1) sets the lowest set bit of x and every bit below it, clearing everything above
	const uint64_t low_mask = x ^ (x - 1);
	// the top 6 bits of the product select the table slot
	const uint64_t slot = (DEBRUIJN_MULTIPLIER * low_mask) >> 58;
	return POSITION_TABLE[slot];
}
|
|
40949
|
+
|
|
40950
|
+
//! Splits a 64-bit hash into an HLL register index and a prefix length.
//! On return 'hash' holds the register index (its low 12 bits) and 'prefix' holds one plus the number of
//! trailing zero bits of the remaining hash bits.
static inline void ComputeIndexAndCount(uint64_t &hash, uint8_t &prefix) {
	/* Register index: the low 12 bits of the hash. */
	const uint64_t register_index = hash & ((1 << 12) - 1);
	/* Drop the bits used to address the register, then set bit 52 so the count will be <= Q+1. */
	uint64_t remaining_bits = (hash >> 12) | ((uint64_t)1 << (64 - 12));

	/* Add 1 since we count the "00000...1" pattern. */
	prefix = CountTrailingZeros(remaining_bits) + 1;
	hash = register_index;
}
|
|
40958
|
+
|
|
40959
|
+
//! Hashes the first 'count' rows of 'vdata', then converts every hash in place: hashes[i] becomes the
//! register index and counts[i] the prefix length for row i.
void HyperLogLog::ProcessEntries(VectorData &vdata, const LogicalType &type, uint64_t hashes[], uint8_t counts[],
                                 idx_t count) {
	ComputeHashes(vdata, type, hashes, count);
	for (idx_t row = 0; row < count; row++) {
		uint64_t &hash_or_index = hashes[row];
		uint8_t &prefix_length = counts[row];
		ComputeIndexAndCount(hash_or_index, prefix_length);
	}
}
|
|
40966
|
+
|
|
40967
|
+
//! Forwards the precomputed indices and counts to AddToLogsInternal, which receives the logs as void pointers
void HyperLogLog::AddToLogs(VectorData &vdata, idx_t count, uint64_t indices[], uint8_t counts[], HyperLogLog **logs[],
                            const SelectionVector *log_sel) {
	auto erased_logs = (void ****)logs;
	AddToLogsInternal(vdata, count, indices, counts, erased_logs, log_sel);
}
|
|
40971
|
+
|
|
40972
|
+
//! Adds the precomputed indices and counts to this log while holding its lock
void HyperLogLog::AddToLog(VectorData &vdata, idx_t count, uint64_t indices[], uint8_t counts[]) {
	lock_guard<mutex> hll_guard(lock);
	AddToSingleLogInternal(vdata, count, indices, counts, hll);
}
|
|
40976
|
+
|
|
40739
40977
|
} // namespace duckdb
|
|
40740
40978
|
|
|
40741
40979
|
|
|
40980
|
+
|
|
40742
40981
|
//===----------------------------------------------------------------------===//
|
|
40743
40982
|
// DuckDB
|
|
40744
40983
|
//
|
|
@@ -44318,6 +44557,7 @@ Value Value::Deserialize(Deserializer &main_source) {
|
|
|
44318
44557
|
auto is_null = reader.ReadRequired<bool>();
|
|
44319
44558
|
Value new_value = Value(type);
|
|
44320
44559
|
if (is_null) {
|
|
44560
|
+
reader.Finalize();
|
|
44321
44561
|
return new_value;
|
|
44322
44562
|
}
|
|
44323
44563
|
new_value.is_null = false;
|
|
@@ -54344,7 +54584,7 @@ ART::ART(const vector<column_t> &column_ids, const vector<unique_ptr<Expression>
|
|
|
54344
54584
|
: Index(IndexType::ART, column_ids, unbound_expressions, constraint_type) {
|
|
54345
54585
|
tree = nullptr;
|
|
54346
54586
|
expression_result.Initialize(logical_types);
|
|
54347
|
-
is_little_endian = IsLittleEndian();
|
|
54587
|
+
is_little_endian = Radix::IsLittleEndian();
|
|
54348
54588
|
for (idx_t i = 0; i < types.size(); i++) {
|
|
54349
54589
|
switch (types[i]) {
|
|
54350
54590
|
case PhysicalType::BOOL:
|
|
@@ -72978,11 +73218,11 @@ static bool CanUsePerfectHashAggregate(ClientContext &context, LogicalAggregate
|
|
|
72978
73218
|
switch (group_type.InternalType()) {
|
|
72979
73219
|
case PhysicalType::INT8:
|
|
72980
73220
|
stats = make_unique<NumericStatistics>(group_type, Value::MinimumValue(group_type),
|
|
72981
|
-
Value::MaximumValue(group_type));
|
|
73221
|
+
Value::MaximumValue(group_type), StatisticsType::LOCAL_STATS);
|
|
72982
73222
|
break;
|
|
72983
73223
|
case PhysicalType::INT16:
|
|
72984
73224
|
stats = make_unique<NumericStatistics>(group_type, Value::MinimumValue(group_type),
|
|
72985
|
-
Value::MaximumValue(group_type));
|
|
73225
|
+
Value::MaximumValue(group_type), StatisticsType::LOCAL_STATS);
|
|
72986
73226
|
break;
|
|
72987
73227
|
default:
|
|
72988
73228
|
// type is too large and there are no stats: skip perfect hashing
|
|
@@ -78255,7 +78495,7 @@ struct ApproxDistinctCountState {
|
|
|
78255
78495
|
HyperLogLog *log;
|
|
78256
78496
|
};
|
|
78257
78497
|
|
|
78258
|
-
struct
|
|
78498
|
+
struct ApproxCountDistinctFunction {
|
|
78259
78499
|
template <class STATE>
|
|
78260
78500
|
static void Initialize(STATE *state) {
|
|
78261
78501
|
state->log = nullptr;
|
|
@@ -78296,109 +78536,76 @@ struct ApproxCountDistinctFunctionBase {
|
|
|
78296
78536
|
}
|
|
78297
78537
|
};
|
|
78298
78538
|
|
|
78299
|
-
|
|
78300
|
-
|
|
78301
|
-
|
|
78302
|
-
if (!state->log) {
|
|
78303
|
-
state->log = new HyperLogLog();
|
|
78304
|
-
}
|
|
78305
|
-
INPUT_TYPE value = input[idx];
|
|
78306
|
-
state->log->Add((uint8_t *)&value, sizeof(value));
|
|
78307
|
-
}
|
|
78308
|
-
template <class INPUT_TYPE, class STATE, class OP>
|
|
78309
|
-
static void ConstantOperation(STATE *state, FunctionData *bind_data, INPUT_TYPE *input, ValidityMask &mask,
|
|
78310
|
-
idx_t count) {
|
|
78311
|
-
for (idx_t i = 0; i < count; i++) {
|
|
78312
|
-
Operation<INPUT_TYPE, STATE, OP>(state, bind_data, input, mask, 0);
|
|
78313
|
-
}
|
|
78314
|
-
}
|
|
78315
|
-
};
|
|
78539
|
+
//! Simple update: adds the first 'count' rows of the single input vector to the HLL of one aggregate state.
//! The HLL is allocated on first use.
static void ApproxCountDistinctSimpleUpdateFunction(Vector inputs[], FunctionData *bind_data, idx_t input_count,
                                                    data_ptr_t state, idx_t count) {
	D_ASSERT(input_count == 1);

	auto &log = ((ApproxDistinctCountState *)state)->log;
	if (log == nullptr) {
		log = new HyperLogLog();
	}

	Vector &input = inputs[0];
	VectorData input_data;
	input.Orrify(count, input_data);

	// scratch buffers: ProcessEntries fills them, AddToLog consumes them
	uint64_t indices[STANDARD_VECTOR_SIZE];
	uint8_t counts[STANDARD_VECTOR_SIZE];

	HyperLogLog::ProcessEntries(input_data, input.GetType(), indices, counts, count);
	log->AddToLog(input_data, count, indices, counts);
}
|
|
78342
78557
|
|
|
78343
|
-
|
|
78344
|
-
|
|
78345
|
-
|
|
78346
|
-
return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, uint16_t, int64_t,
|
|
78347
|
-
ApproxCountDistinctFunction>(LogicalType::UTINYINT,
|
|
78348
|
-
LogicalType::BIGINT);
|
|
78349
|
-
case PhysicalType::UINT32:
|
|
78350
|
-
return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, uint32_t, int64_t,
|
|
78351
|
-
ApproxCountDistinctFunction>(LogicalType::UINTEGER,
|
|
78352
|
-
LogicalType::BIGINT);
|
|
78353
|
-
case PhysicalType::UINT64:
|
|
78354
|
-
return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, uint64_t, int64_t,
|
|
78355
|
-
ApproxCountDistinctFunction>(LogicalType::UBIGINT,
|
|
78356
|
-
LogicalType::BIGINT);
|
|
78357
|
-
case PhysicalType::INT16:
|
|
78358
|
-
return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, int16_t, int64_t,
|
|
78359
|
-
ApproxCountDistinctFunction>(LogicalType::TINYINT,
|
|
78360
|
-
LogicalType::BIGINT);
|
|
78361
|
-
case PhysicalType::INT32:
|
|
78362
|
-
return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, int32_t, int64_t,
|
|
78363
|
-
ApproxCountDistinctFunction>(LogicalType::INTEGER,
|
|
78364
|
-
LogicalType::BIGINT);
|
|
78365
|
-
case PhysicalType::INT64:
|
|
78366
|
-
return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, int64_t, int64_t,
|
|
78367
|
-
ApproxCountDistinctFunction>(LogicalType::BIGINT,
|
|
78368
|
-
LogicalType::BIGINT);
|
|
78369
|
-
case PhysicalType::FLOAT:
|
|
78370
|
-
return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, float, int64_t,
|
|
78371
|
-
ApproxCountDistinctFunction>(LogicalType::FLOAT,
|
|
78372
|
-
LogicalType::BIGINT);
|
|
78373
|
-
case PhysicalType::DOUBLE:
|
|
78374
|
-
return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, double, int64_t,
|
|
78375
|
-
ApproxCountDistinctFunction>(LogicalType::DOUBLE,
|
|
78376
|
-
LogicalType::BIGINT);
|
|
78377
|
-
case PhysicalType::VARCHAR:
|
|
78378
|
-
return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, string_t, int64_t,
|
|
78379
|
-
ApproxCountDistinctFunctionString>(LogicalType::VARCHAR,
|
|
78380
|
-
LogicalType::BIGINT);
|
|
78558
|
+
//! Scatter update: row i of the single input vector is added to the HLL of the state selected for row i.
//! HLLs are allocated on first use.
static void ApproxCountDistinctUpdateFunction(Vector inputs[], FunctionData *bind_data, idx_t input_count,
                                              Vector &state_vector, idx_t count) {
	D_ASSERT(input_count == 1);

	VectorData state_data;
	state_vector.Orrify(count, state_data);
	auto states = (ApproxDistinctCountState **)state_data.data;

	// make sure every state touched by this batch owns an HLL before any values are added
	for (idx_t row = 0; row < count; row++) {
		auto &log = states[state_data.sel->get_index(row)]->log;
		if (log == nullptr) {
			log = new HyperLogLog();
		}
	}

	Vector &input = inputs[0];
	VectorData input_data;
	input.Orrify(count, input_data);

	// scratch buffers: ProcessEntries fills them, AddToLogs consumes them
	uint64_t indices[STANDARD_VECTOR_SIZE];
	uint8_t counts[STANDARD_VECTOR_SIZE];

	HyperLogLog::ProcessEntries(input_data, input.GetType(), indices, counts, count);
	// NOTE(review): the cast treats each state pointer as a pointer to a HyperLogLog pointer; presumably this
	// relies on 'log' being the first member of ApproxDistinctCountState - confirm
	HyperLogLog::AddToLogs(input_data, count, indices, counts, (HyperLogLog ***)states, state_data.sel);
}
|
|
78582
|
+
|
|
78583
|
+
//! Builds the approx_count_distinct aggregate for one input type; the result type is always BIGINT.
//! The update callbacks are shared by all input types.
AggregateFunction GetApproxCountDistinctFunction(const LogicalType &input_type) {
	using STATE = ApproxDistinctCountState;
	using OP = ApproxCountDistinctFunction;
	return AggregateFunction({input_type}, LogicalTypeId::BIGINT, AggregateFunction::StateSize<STATE>,
	                         AggregateFunction::StateInitialize<STATE, OP>, ApproxCountDistinctUpdateFunction,
	                         AggregateFunction::StateCombine<STATE, OP>,
	                         AggregateFunction::StateFinalize<STATE, int64_t, OP>,
	                         ApproxCountDistinctSimpleUpdateFunction, nullptr,
	                         AggregateFunction::StateDestroy<STATE, OP>);
}
|
|
78386
78593
|
|
|
78387
78594
|
//! Registers one approx_count_distinct overload per supported input type
void ApproxCountDistinctFun::RegisterFunction(BuiltinFunctions &set) {
	AggregateFunctionSet approx_count("approx_count_distinct");
	// Overloads are added in exactly this order.
	// NOTE(review): LogicalType::INTEGER has no overload of its own here; presumably INTEGER inputs are
	// resolved through an implicit cast to one of the listed types - confirm
	const LogicalType supported_types[] = {
	    LogicalType::UTINYINT, LogicalType::USMALLINT, LogicalType::UINTEGER, LogicalType::UBIGINT,
	    LogicalType::TINYINT,  LogicalType::SMALLINT,  LogicalType::BIGINT,   LogicalType::HUGEINT,
	    LogicalType::FLOAT,    LogicalType::DOUBLE,    LogicalType::VARCHAR,  LogicalType::TIMESTAMP,
	    LogicalType::TIMESTAMP_TZ};
	for (const auto &input_type : supported_types) {
		approx_count.AddFunction(GetApproxCountDistinctFunction(input_type));
	}
	set.AddFunction(approx_count);
}
|
|
78404
78611
|
|
|
@@ -87498,7 +87705,8 @@ static unique_ptr<BaseStatistics> PropagateDatePartStatistics(vector<unique_ptr<
|
|
|
87498
87705
|
}
|
|
87499
87706
|
auto min_part = OP::template Operation<T, int64_t>(min);
|
|
87500
87707
|
auto max_part = OP::template Operation<T, int64_t>(max);
|
|
87501
|
-
auto result = make_unique<NumericStatistics>(LogicalType::BIGINT, Value::BIGINT(min_part), Value::BIGINT(max_part)
|
|
87708
|
+
auto result = make_unique<NumericStatistics>(LogicalType::BIGINT, Value::BIGINT(min_part), Value::BIGINT(max_part),
|
|
87709
|
+
StatisticsType::LOCAL_STATS);
|
|
87502
87710
|
if (child_stats[0]->validity_stats) {
|
|
87503
87711
|
result->validity_stats = child_stats[0]->validity_stats->Copy();
|
|
87504
87712
|
}
|
|
@@ -87509,7 +87717,8 @@ template <int64_t MIN, int64_t MAX>
|
|
|
87509
87717
|
static unique_ptr<BaseStatistics> PropagateSimpleDatePartStatistics(vector<unique_ptr<BaseStatistics>> &child_stats) {
|
|
87510
87718
|
// we can always propagate simple date part statistics
|
|
87511
87719
|
// since the min and max can never exceed these bounds
|
|
87512
|
-
auto result = make_unique<NumericStatistics>(LogicalType::BIGINT, Value::BIGINT(MIN), Value::BIGINT(MAX)
|
|
87720
|
+
auto result = make_unique<NumericStatistics>(LogicalType::BIGINT, Value::BIGINT(MIN), Value::BIGINT(MAX),
|
|
87721
|
+
StatisticsType::LOCAL_STATS);
|
|
87513
87722
|
if (!child_stats[0]) {
|
|
87514
87723
|
// if there are no child stats, we don't know
|
|
87515
87724
|
result->validity_stats = make_unique<ValidityStatistics>(true);
|
|
@@ -96367,7 +96576,8 @@ static unique_ptr<BaseStatistics> PropagateNumericStats(ClientContext &context,
|
|
|
96367
96576
|
// no potential overflow: replace with non-overflowing operator
|
|
96368
96577
|
expr.function.function = GetScalarIntegerFunction<BASEOP>(expr.return_type.InternalType());
|
|
96369
96578
|
}
|
|
96370
|
-
auto stats =
|
|
96579
|
+
auto stats =
|
|
96580
|
+
make_unique<NumericStatistics>(expr.return_type, move(new_min), move(new_max), StatisticsType::LOCAL_STATS);
|
|
96371
96581
|
stats->validity_stats = ValidityStatistics::Combine(lstats.validity_stats, rstats.validity_stats);
|
|
96372
96582
|
return move(stats);
|
|
96373
96583
|
}
|
|
@@ -96672,7 +96882,8 @@ static unique_ptr<BaseStatistics> NegateBindStatistics(ClientContext &context, B
|
|
|
96672
96882
|
new_min = Value(expr.return_type);
|
|
96673
96883
|
new_max = Value(expr.return_type);
|
|
96674
96884
|
}
|
|
96675
|
-
auto stats =
|
|
96885
|
+
auto stats =
|
|
96886
|
+
make_unique<NumericStatistics>(expr.return_type, move(new_min), move(new_max), StatisticsType::LOCAL_STATS);
|
|
96676
96887
|
if (istats.validity_stats) {
|
|
96677
96888
|
stats->validity_stats = istats.validity_stats->Copy();
|
|
96678
96889
|
}
|
|
@@ -135403,10 +135614,8 @@ static unique_ptr<BaseStatistics> StatisticsOperationsNumericNumericCast(const B
|
|
|
135403
135614
|
// overflow in cast: bailout
|
|
135404
135615
|
return nullptr;
|
|
135405
135616
|
}
|
|
135406
|
-
auto stats = make_unique<NumericStatistics>(target, move(min), move(max));
|
|
135407
|
-
|
|
135408
|
-
stats->validity_stats = input.validity_stats->Copy();
|
|
135409
|
-
}
|
|
135617
|
+
auto stats = make_unique<NumericStatistics>(target, move(min), move(max), input.stats_type);
|
|
135618
|
+
stats->CopyBase(*input_p);
|
|
135410
135619
|
return move(stats);
|
|
135411
135620
|
}
|
|
135412
135621
|
|
|
@@ -135601,12 +135810,73 @@ unique_ptr<BaseStatistics> StatisticsPropagator::PropagateExpression(BoundCompar
|
|
|
135601
135810
|
|
|
135602
135811
|
|
|
135603
135812
|
|
|
135813
|
+
//===----------------------------------------------------------------------===//
|
|
135814
|
+
// DuckDB
|
|
135815
|
+
//
|
|
135816
|
+
// duckdb/storage/statistics/distinct_statistics.hpp
|
|
135817
|
+
//
|
|
135818
|
+
//
|
|
135819
|
+
//===----------------------------------------------------------------------===//
|
|
135820
|
+
|
|
135821
|
+
|
|
135604
135822
|
|
|
135605
135823
|
|
|
135606
135824
|
|
|
135607
135825
|
|
|
135608
135826
|
|
|
135609
135827
|
namespace duckdb {
|
|
135828
|
+
class Serializer;
|
|
135829
|
+
class Deserializer;
|
|
135830
|
+
class Vector;
|
|
135831
|
+
|
|
135832
|
+
class DistinctStatistics : public BaseStatistics {
|
|
135833
|
+
public:
|
|
135834
|
+
DistinctStatistics();
|
|
135835
|
+
explicit DistinctStatistics(unique_ptr<HyperLogLog> log, idx_t sample_count, idx_t total_count);
|
|
135836
|
+
|
|
135837
|
+
//! The HLL of the table
|
|
135838
|
+
unique_ptr<HyperLogLog> log;
|
|
135839
|
+
//! How many values have been sampled into the HLL
|
|
135840
|
+
atomic<idx_t> sample_count;
|
|
135841
|
+
//! How many values have been inserted (before sampling)
|
|
135842
|
+
atomic<idx_t> total_count;
|
|
135843
|
+
|
|
135844
|
+
public:
|
|
135845
|
+
void Merge(const BaseStatistics &other) override;
|
|
135846
|
+
|
|
135847
|
+
unique_ptr<BaseStatistics> Copy() const override;
|
|
135848
|
+
|
|
135849
|
+
void Serialize(Serializer &serializer) const override;
|
|
135850
|
+
void Serialize(FieldWriter &writer) const override;
|
|
135851
|
+
|
|
135852
|
+
static unique_ptr<DistinctStatistics> Deserialize(Deserializer &source);
|
|
135853
|
+
static unique_ptr<DistinctStatistics> Deserialize(FieldReader &reader);
|
|
135854
|
+
|
|
135855
|
+
void Update(Vector &update, idx_t count);
|
|
135856
|
+
void Update(VectorData &update_data, const LogicalType &ptype, idx_t count);
|
|
135857
|
+
|
|
135858
|
+
string ToString() const override;
|
|
135859
|
+
idx_t GetCount() const;
|
|
135860
|
+
|
|
135861
|
+
private:
|
|
135862
|
+
//! For distinct statistics we sample the input to speed up insertions
|
|
135863
|
+
static constexpr const double SAMPLE_RATE = 0.1;
|
|
135864
|
+
};
|
|
135865
|
+
|
|
135866
|
+
} // namespace duckdb
|
|
135867
|
+
|
|
135868
|
+
|
|
135869
|
+
|
|
135870
|
+
|
|
135871
|
+
|
|
135872
|
+
|
|
135873
|
+
namespace duckdb {
|
|
135874
|
+
|
|
135875
|
+
//! Feeds a single value into 'distinct_stats', which is treated as a DistinctStatistics object
void UpdateDistinctStats(BaseStatistics &distinct_stats, const Value &input) {
	// wrap the value in a vector so it can be passed to the vector-based Update with a count of 1
	Vector value_vector(input);
	((DistinctStatistics &)distinct_stats).Update(value_vector, 1);
}
|
|
135610
135880
|
|
|
135611
135881
|
unique_ptr<BaseStatistics> StatisticsPropagator::StatisticsFromValue(const Value &input) {
|
|
135612
135882
|
switch (input.type().InternalType()) {
|
|
@@ -135622,13 +135892,15 @@ unique_ptr<BaseStatistics> StatisticsPropagator::StatisticsFromValue(const Value
|
|
|
135622
135892
|
case PhysicalType::INT128:
|
|
135623
135893
|
case PhysicalType::FLOAT:
|
|
135624
135894
|
case PhysicalType::DOUBLE: {
|
|
135625
|
-
auto result = make_unique<NumericStatistics>(input.type(), input, input);
|
|
135895
|
+
auto result = make_unique<NumericStatistics>(input.type(), input, input, StatisticsType::GLOBAL_STATS);
|
|
135626
135896
|
result->validity_stats = make_unique<ValidityStatistics>(input.IsNull(), !input.IsNull());
|
|
135897
|
+
UpdateDistinctStats(*result->distinct_stats, input);
|
|
135627
135898
|
return move(result);
|
|
135628
135899
|
}
|
|
135629
135900
|
case PhysicalType::VARCHAR: {
|
|
135630
|
-
auto result = make_unique<StringStatistics>(input.type());
|
|
135901
|
+
auto result = make_unique<StringStatistics>(input.type(), StatisticsType::GLOBAL_STATS);
|
|
135631
135902
|
result->validity_stats = make_unique<ValidityStatistics>(input.IsNull(), !input.IsNull());
|
|
135903
|
+
UpdateDistinctStats(*result->distinct_stats, input);
|
|
135632
135904
|
if (!input.IsNull()) {
|
|
135633
135905
|
auto &string_value = StringValue::Get(input);
|
|
135634
135906
|
result->Update(string_t(string_value));
|
|
@@ -151469,6 +151741,7 @@ unique_ptr<QueryNode> QueryNode::Deserialize(Deserializer &main_source) {
|
|
|
151469
151741
|
}
|
|
151470
151742
|
result->modifiers = move(modifiers);
|
|
151471
151743
|
result->cte_map = move(cte_map);
|
|
151744
|
+
reader.Finalize();
|
|
151472
151745
|
return result;
|
|
151473
151746
|
}
|
|
151474
151747
|
|
|
@@ -151514,6 +151787,7 @@ unique_ptr<ResultModifier> ResultModifier::Deserialize(Deserializer &source) {
|
|
|
151514
151787
|
default:
|
|
151515
151788
|
throw InternalException("Unrecognized ResultModifierType for Deserialization");
|
|
151516
151789
|
}
|
|
151790
|
+
reader.Finalize();
|
|
151517
151791
|
return result;
|
|
151518
151792
|
}
|
|
151519
151793
|
|
|
@@ -151649,6 +151923,7 @@ OrderByNode OrderByNode::Deserialize(Deserializer &source) {
|
|
|
151649
151923
|
auto type = reader.ReadRequired<OrderType>();
|
|
151650
151924
|
auto null_order = reader.ReadRequired<OrderByNullType>();
|
|
151651
151925
|
auto expression = reader.ReadRequiredSerializable<ParsedExpression>();
|
|
151926
|
+
reader.Finalize();
|
|
151652
151927
|
return OrderByNode(type, null_order, move(expression));
|
|
151653
151928
|
}
|
|
151654
151929
|
|
|
@@ -176309,7 +176584,7 @@ DataTable::DataTable(DatabaseInstance &db, const string &schema, const string &t
|
|
|
176309
176584
|
|
|
176310
176585
|
AppendRowGroup(0);
|
|
176311
176586
|
for (auto &type : types) {
|
|
176312
|
-
column_stats.push_back(BaseStatistics::CreateEmpty(type));
|
|
176587
|
+
column_stats.push_back(BaseStatistics::CreateEmpty(type, StatisticsType::GLOBAL_STATS));
|
|
176313
176588
|
}
|
|
176314
176589
|
} else {
|
|
176315
176590
|
D_ASSERT(column_stats.size() == types.size());
|
|
@@ -176339,7 +176614,7 @@ DataTable::DataTable(ClientContext &context, DataTable &parent, ColumnDefinition
|
|
|
176339
176614
|
for (idx_t i = 0; i < parent.column_stats.size(); i++) {
|
|
176340
176615
|
column_stats.push_back(parent.column_stats[i]->Copy());
|
|
176341
176616
|
}
|
|
176342
|
-
column_stats.push_back(BaseStatistics::CreateEmpty(new_column_type));
|
|
176617
|
+
column_stats.push_back(BaseStatistics::CreateEmpty(new_column_type, StatisticsType::GLOBAL_STATS));
|
|
176343
176618
|
|
|
176344
176619
|
// add the column definitions from this DataTable
|
|
176345
176620
|
column_definitions.emplace_back(new_column.Copy());
|
|
@@ -176448,7 +176723,8 @@ DataTable::DataTable(ClientContext &context, DataTable &parent, idx_t changed_id
|
|
|
176448
176723
|
// the column that had its type changed will have the new statistics computed during conversion
|
|
176449
176724
|
for (idx_t i = 0; i < column_definitions.size(); i++) {
|
|
176450
176725
|
if (i == changed_idx) {
|
|
176451
|
-
column_stats.push_back(
|
|
176726
|
+
column_stats.push_back(
|
|
176727
|
+
BaseStatistics::CreateEmpty(column_definitions[i].type, StatisticsType::GLOBAL_STATS));
|
|
176452
176728
|
} else {
|
|
176453
176729
|
column_stats.push_back(parent.column_stats[i]->Copy());
|
|
176454
176730
|
}
|
|
@@ -176965,6 +177241,13 @@ void DataTable::Append(Transaction &transaction, DataChunk &chunk, TableAppendSt
|
|
|
176965
177241
|
}
|
|
176966
177242
|
}
|
|
176967
177243
|
state.current_row += append_count;
|
|
177244
|
+
for (idx_t col_idx = 0; col_idx < column_stats.size(); col_idx++) {
|
|
177245
|
+
auto type = chunk.data[col_idx].GetType().InternalType();
|
|
177246
|
+
if (type == PhysicalType::LIST || type == PhysicalType::STRUCT) {
|
|
177247
|
+
continue;
|
|
177248
|
+
}
|
|
177249
|
+
column_stats[col_idx]->UpdateDistinctStatistics(chunk.data[col_idx], chunk.size());
|
|
177250
|
+
}
|
|
176968
177251
|
}
|
|
176969
177252
|
|
|
176970
177253
|
void DataTable::ScanTableSegment(idx_t row_start, idx_t count, const std::function<void(DataChunk &chunk)> &function) {
|
|
@@ -177535,7 +177818,7 @@ BlockPointer DataTable::Checkpoint(TableDataWriter &writer) {
|
|
|
177535
177818
|
// FIXME: we might want to combine adjacent row groups in case they have had deletions...
|
|
177536
177819
|
vector<unique_ptr<BaseStatistics>> global_stats;
|
|
177537
177820
|
for (idx_t i = 0; i < column_definitions.size(); i++) {
|
|
177538
|
-
global_stats.push_back(
|
|
177821
|
+
global_stats.push_back(column_stats[i]->Copy());
|
|
177539
177822
|
}
|
|
177540
177823
|
|
|
177541
177824
|
auto row_group = (RowGroup *)row_groups->GetRootSegment();
|
|
@@ -178803,14 +179086,22 @@ void SingleFileBlockManager::WriteHeader(DatabaseHeader header) {
|
|
|
178803
179086
|
|
|
178804
179087
|
|
|
178805
179088
|
|
|
179089
|
+
|
|
178806
179090
|
namespace duckdb {
|
|
178807
179091
|
|
|
178808
|
-
BaseStatistics::BaseStatistics(LogicalType type) : type(move(type)) {
|
|
179092
|
+
BaseStatistics::BaseStatistics(LogicalType type, StatisticsType stats_type) : type(move(type)), stats_type(stats_type) {
|
|
178809
179093
|
}
|
|
178810
179094
|
|
|
178811
179095
|
BaseStatistics::~BaseStatistics() {
|
|
178812
179096
|
}
|
|
178813
179097
|
|
|
179098
|
+
void BaseStatistics::InitializeBase() {
|
|
179099
|
+
validity_stats = make_unique<ValidityStatistics>(false);
|
|
179100
|
+
if (stats_type == GLOBAL_STATS) {
|
|
179101
|
+
distinct_stats = make_unique<DistinctStatistics>();
|
|
179102
|
+
}
|
|
179103
|
+
}
|
|
179104
|
+
|
|
178814
179105
|
bool BaseStatistics::CanHaveNull() const {
|
|
178815
179106
|
if (!validity_stats) {
|
|
178816
179107
|
// we don't know
|
|
@@ -178829,18 +179120,34 @@ bool BaseStatistics::CanHaveNoNull() const {
|
|
|
178829
179120
|
return ((ValidityStatistics &)*validity_stats).has_no_null;
|
|
178830
179121
|
}
|
|
178831
179122
|
|
|
178832
|
-
void BaseStatistics::
|
|
178833
|
-
|
|
178834
|
-
|
|
178835
|
-
|
|
178836
|
-
|
|
179123
|
+
void BaseStatistics::UpdateDistinctStatistics(Vector &v, idx_t count) {
|
|
179124
|
+
if (!distinct_stats) {
|
|
179125
|
+
return;
|
|
179126
|
+
}
|
|
179127
|
+
auto &d_stats = (DistinctStatistics &)*distinct_stats;
|
|
179128
|
+
d_stats.Update(v, count);
|
|
179129
|
+
}
|
|
179130
|
+
|
|
179131
|
+
void MergeInternal(unique_ptr<BaseStatistics> &orig, const unique_ptr<BaseStatistics> &other) {
|
|
179132
|
+
if (other) {
|
|
179133
|
+
if (orig) {
|
|
179134
|
+
orig->Merge(*other);
|
|
178837
179135
|
} else {
|
|
178838
|
-
|
|
179136
|
+
orig = other->Copy();
|
|
178839
179137
|
}
|
|
178840
179138
|
}
|
|
178841
179139
|
}
|
|
178842
179140
|
|
|
178843
|
-
|
|
179141
|
+
void BaseStatistics::Merge(const BaseStatistics &other) {
|
|
179142
|
+
D_ASSERT(type == other.type);
|
|
179143
|
+
MergeInternal(validity_stats, other.validity_stats);
|
|
179144
|
+
if (stats_type == GLOBAL_STATS) {
|
|
179145
|
+
MergeInternal(distinct_stats, other.distinct_stats);
|
|
179146
|
+
}
|
|
179147
|
+
}
|
|
179148
|
+
|
|
179149
|
+
unique_ptr<BaseStatistics> BaseStatistics::CreateEmpty(LogicalType type, StatisticsType stats_type) {
|
|
179150
|
+
unique_ptr<BaseStatistics> result;
|
|
178844
179151
|
switch (type.InternalType()) {
|
|
178845
179152
|
case PhysicalType::BIT:
|
|
178846
179153
|
return make_unique<ValidityStatistics>(false, false);
|
|
@@ -178856,34 +179163,49 @@ unique_ptr<BaseStatistics> BaseStatistics::CreateEmpty(LogicalType type) {
|
|
|
178856
179163
|
case PhysicalType::INT128:
|
|
178857
179164
|
case PhysicalType::FLOAT:
|
|
178858
179165
|
case PhysicalType::DOUBLE:
|
|
178859
|
-
|
|
179166
|
+
result = make_unique<NumericStatistics>(move(type), stats_type);
|
|
179167
|
+
break;
|
|
178860
179168
|
case PhysicalType::VARCHAR:
|
|
178861
|
-
|
|
179169
|
+
result = make_unique<StringStatistics>(move(type), stats_type);
|
|
179170
|
+
break;
|
|
178862
179171
|
case PhysicalType::STRUCT:
|
|
178863
|
-
|
|
179172
|
+
result = make_unique<StructStatistics>(move(type));
|
|
179173
|
+
break;
|
|
178864
179174
|
case PhysicalType::LIST:
|
|
178865
|
-
|
|
179175
|
+
result = make_unique<ListStatistics>(move(type));
|
|
179176
|
+
break;
|
|
178866
179177
|
case PhysicalType::INTERVAL:
|
|
178867
179178
|
default:
|
|
178868
|
-
|
|
178869
|
-
base_stats->validity_stats = make_unique<ValidityStatistics>(false);
|
|
178870
|
-
return base_stats;
|
|
179179
|
+
result = make_unique<BaseStatistics>(move(type), stats_type);
|
|
178871
179180
|
}
|
|
179181
|
+
result->InitializeBase();
|
|
179182
|
+
return result;
|
|
178872
179183
|
}
|
|
178873
179184
|
|
|
178874
179185
|
unique_ptr<BaseStatistics> BaseStatistics::Copy() const {
|
|
178875
|
-
auto
|
|
178876
|
-
|
|
178877
|
-
|
|
179186
|
+
auto result = make_unique<BaseStatistics>(type, stats_type);
|
|
179187
|
+
result->CopyBase(*this);
|
|
179188
|
+
return result;
|
|
179189
|
+
}
|
|
179190
|
+
|
|
179191
|
+
void BaseStatistics::CopyBase(const BaseStatistics &orig) {
|
|
179192
|
+
if (orig.validity_stats) {
|
|
179193
|
+
validity_stats = orig.validity_stats->Copy();
|
|
179194
|
+
}
|
|
179195
|
+
if (orig.distinct_stats) {
|
|
179196
|
+
distinct_stats = orig.distinct_stats->Copy();
|
|
178878
179197
|
}
|
|
178879
|
-
return statistics;
|
|
178880
179198
|
}
|
|
178881
179199
|
|
|
178882
179200
|
void BaseStatistics::Serialize(Serializer &serializer) const {
|
|
178883
179201
|
FieldWriter writer(serializer);
|
|
178884
|
-
|
|
178885
|
-
writer.WriteField<bool>(CanHaveNoNull());
|
|
179202
|
+
ValidityStatistics(CanHaveNull(), CanHaveNoNull()).Serialize(writer);
|
|
178886
179203
|
Serialize(writer);
|
|
179204
|
+
auto ptype = type.InternalType();
|
|
179205
|
+
if (ptype != PhysicalType::BIT) {
|
|
179206
|
+
writer.WriteField<StatisticsType>(stats_type);
|
|
179207
|
+
writer.WriteOptional<BaseStatistics>(distinct_stats);
|
|
179208
|
+
}
|
|
178887
179209
|
writer.Finalize();
|
|
178888
179210
|
}
|
|
178889
179211
|
|
|
@@ -178892,12 +179214,13 @@ void BaseStatistics::Serialize(FieldWriter &writer) const {
|
|
|
178892
179214
|
|
|
178893
179215
|
unique_ptr<BaseStatistics> BaseStatistics::Deserialize(Deserializer &source, LogicalType type) {
|
|
178894
179216
|
FieldReader reader(source);
|
|
178895
|
-
|
|
178896
|
-
bool can_have_no_null = reader.ReadRequired<bool>();
|
|
179217
|
+
auto validity_stats = ValidityStatistics::Deserialize(reader);
|
|
178897
179218
|
unique_ptr<BaseStatistics> result;
|
|
178898
|
-
|
|
179219
|
+
auto ptype = type.InternalType();
|
|
179220
|
+
switch (ptype) {
|
|
178899
179221
|
case PhysicalType::BIT:
|
|
178900
|
-
|
|
179222
|
+
result = ValidityStatistics::Deserialize(reader);
|
|
179223
|
+
break;
|
|
178901
179224
|
case PhysicalType::BOOL:
|
|
178902
179225
|
case PhysicalType::INT8:
|
|
178903
179226
|
case PhysicalType::INT16:
|
|
@@ -178922,17 +179245,25 @@ unique_ptr<BaseStatistics> BaseStatistics::Deserialize(Deserializer &source, Log
|
|
|
178922
179245
|
result = ListStatistics::Deserialize(reader, move(type));
|
|
178923
179246
|
break;
|
|
178924
179247
|
case PhysicalType::INTERVAL:
|
|
178925
|
-
result = make_unique<BaseStatistics>(move(type));
|
|
179248
|
+
result = make_unique<BaseStatistics>(move(type), StatisticsType::LOCAL_STATS);
|
|
178926
179249
|
break;
|
|
178927
179250
|
default:
|
|
178928
179251
|
throw InternalException("Unimplemented type for statistics deserialization");
|
|
178929
179252
|
}
|
|
178930
|
-
|
|
179253
|
+
|
|
179254
|
+
if (ptype != PhysicalType::BIT) {
|
|
179255
|
+
result->validity_stats = move(validity_stats);
|
|
179256
|
+
result->stats_type = reader.ReadField<StatisticsType>(StatisticsType::LOCAL_STATS);
|
|
179257
|
+
result->distinct_stats = reader.ReadOptional<DistinctStatistics>(nullptr);
|
|
179258
|
+
}
|
|
179259
|
+
|
|
179260
|
+
reader.Finalize();
|
|
178931
179261
|
return result;
|
|
178932
179262
|
}
|
|
178933
179263
|
|
|
178934
179264
|
string BaseStatistics::ToString() const {
|
|
178935
|
-
return StringUtil::Format("
|
|
179265
|
+
return StringUtil::Format("%s%s", validity_stats ? validity_stats->ToString() : "",
|
|
179266
|
+
distinct_stats ? distinct_stats->ToString() : "");
|
|
178936
179267
|
}
|
|
178937
179268
|
|
|
178938
179269
|
void BaseStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_t count) const {
|
|
@@ -178953,14 +179284,104 @@ void BaseStatistics::Verify(Vector &vector, idx_t count) const {
|
|
|
178953
179284
|
|
|
178954
179285
|
|
|
178955
179286
|
|
|
179287
|
+
|
|
178956
179288
|
namespace duckdb {
|
|
178957
179289
|
|
|
178958
|
-
|
|
178959
|
-
|
|
179290
|
+
DistinctStatistics::DistinctStatistics()
|
|
179291
|
+
: BaseStatistics(LogicalType::INVALID, StatisticsType::LOCAL_STATS), log(make_unique<HyperLogLog>()),
|
|
179292
|
+
sample_count(0), total_count(0) {
|
|
179293
|
+
}
|
|
179294
|
+
|
|
179295
|
+
DistinctStatistics::DistinctStatistics(unique_ptr<HyperLogLog> log, idx_t sample_count, idx_t total_count)
|
|
179296
|
+
: BaseStatistics(LogicalType::INVALID, StatisticsType::LOCAL_STATS), log(move(log)), sample_count(sample_count),
|
|
179297
|
+
total_count(total_count) {
|
|
179298
|
+
}
|
|
179299
|
+
|
|
179300
|
+
unique_ptr<BaseStatistics> DistinctStatistics::Copy() const {
|
|
179301
|
+
return make_unique<DistinctStatistics>(log->Copy(), sample_count, total_count);
|
|
179302
|
+
}
|
|
179303
|
+
|
|
179304
|
+
void DistinctStatistics::Merge(const BaseStatistics &other_p) {
|
|
179305
|
+
BaseStatistics::Merge(other_p);
|
|
179306
|
+
auto &other = (const DistinctStatistics &)other_p;
|
|
179307
|
+
log->Merge(*other.log);
|
|
179308
|
+
sample_count += other.sample_count;
|
|
179309
|
+
total_count += other.total_count;
|
|
179310
|
+
}
|
|
179311
|
+
|
|
179312
|
+
void DistinctStatistics::Serialize(Serializer &serializer) const {
|
|
179313
|
+
FieldWriter writer(serializer);
|
|
179314
|
+
Serialize(writer);
|
|
179315
|
+
writer.Finalize();
|
|
179316
|
+
}
|
|
179317
|
+
|
|
179318
|
+
void DistinctStatistics::Serialize(FieldWriter &writer) const {
|
|
179319
|
+
writer.WriteField<idx_t>(sample_count);
|
|
179320
|
+
writer.WriteField<idx_t>(total_count);
|
|
179321
|
+
log->Serialize(writer);
|
|
179322
|
+
}
|
|
179323
|
+
|
|
179324
|
+
unique_ptr<DistinctStatistics> DistinctStatistics::Deserialize(Deserializer &source) {
|
|
179325
|
+
FieldReader reader(source);
|
|
179326
|
+
auto result = Deserialize(reader);
|
|
179327
|
+
reader.Finalize();
|
|
179328
|
+
return result;
|
|
179329
|
+
}
|
|
179330
|
+
|
|
179331
|
+
unique_ptr<DistinctStatistics> DistinctStatistics::Deserialize(FieldReader &reader) {
|
|
179332
|
+
auto sample_count = reader.ReadRequired<idx_t>();
|
|
179333
|
+
auto total_count = reader.ReadRequired<idx_t>();
|
|
179334
|
+
return make_unique<DistinctStatistics>(HyperLogLog::Deserialize(reader), sample_count, total_count);
|
|
179335
|
+
}
|
|
179336
|
+
|
|
179337
|
+
void DistinctStatistics::Update(Vector &v, idx_t count) {
|
|
179338
|
+
VectorData vdata;
|
|
179339
|
+
v.Orrify(count, vdata);
|
|
179340
|
+
Update(vdata, v.GetType(), count);
|
|
179341
|
+
}
|
|
179342
|
+
|
|
179343
|
+
void DistinctStatistics::Update(VectorData &vdata, const LogicalType &type, idx_t count) {
|
|
179344
|
+
if (count == 0) {
|
|
179345
|
+
return;
|
|
179346
|
+
}
|
|
179347
|
+
total_count += count;
|
|
179348
|
+
count = MaxValue<idx_t>(idx_t(SAMPLE_RATE * double(count)), 1);
|
|
179349
|
+
sample_count += count;
|
|
179350
|
+
|
|
179351
|
+
uint64_t indices[STANDARD_VECTOR_SIZE];
|
|
179352
|
+
uint8_t counts[STANDARD_VECTOR_SIZE];
|
|
179353
|
+
|
|
179354
|
+
HyperLogLog::ProcessEntries(vdata, type, indices, counts, count);
|
|
179355
|
+
log->AddToLog(vdata, count, indices, counts);
|
|
179356
|
+
}
|
|
179357
|
+
|
|
179358
|
+
string DistinctStatistics::ToString() const {
|
|
179359
|
+
return StringUtil::Format("[Approx Unique: %s]", to_string(GetCount()));
|
|
179360
|
+
}
|
|
179361
|
+
|
|
179362
|
+
idx_t DistinctStatistics::GetCount() const {
|
|
179363
|
+
// Estimate HLL count because we use sampling
|
|
179364
|
+
double hll_count = log->Count();
|
|
179365
|
+
double unique_proportion = hll_count / double(sample_count);
|
|
179366
|
+
double actual_sample_rate = double(sample_count) / double(total_count);
|
|
179367
|
+
double multiplier = double(1) + unique_proportion * (double(1) / actual_sample_rate - double(1));
|
|
179368
|
+
return idx_t(multiplier * hll_count);
|
|
179369
|
+
}
|
|
178960
179370
|
|
|
179371
|
+
} // namespace duckdb
|
|
179372
|
+
|
|
179373
|
+
|
|
179374
|
+
|
|
179375
|
+
|
|
179376
|
+
|
|
179377
|
+
|
|
179378
|
+
namespace duckdb {
|
|
179379
|
+
|
|
179380
|
+
ListStatistics::ListStatistics(LogicalType type_p) : BaseStatistics(move(type_p), StatisticsType::LOCAL_STATS) {
|
|
179381
|
+
D_ASSERT(type.InternalType() == PhysicalType::LIST);
|
|
179382
|
+
InitializeBase();
|
|
178961
179383
|
auto &child_type = ListType::GetChildType(type);
|
|
178962
|
-
child_stats = BaseStatistics::CreateEmpty(child_type);
|
|
178963
|
-
validity_stats = make_unique<ValidityStatistics>(false);
|
|
179384
|
+
child_stats = BaseStatistics::CreateEmpty(child_type, StatisticsType::LOCAL_STATS);
|
|
178964
179385
|
}
|
|
178965
179386
|
|
|
178966
179387
|
void ListStatistics::Merge(const BaseStatistics &other_p) {
|
|
@@ -178981,10 +179402,11 @@ FilterPropagateResult ListStatistics::CheckZonemap(ExpressionType comparison_typ
|
|
|
178981
179402
|
// LCOV_EXCL_STOP
|
|
178982
179403
|
|
|
178983
179404
|
unique_ptr<BaseStatistics> ListStatistics::Copy() const {
|
|
178984
|
-
auto
|
|
178985
|
-
|
|
178986
|
-
|
|
178987
|
-
|
|
179405
|
+
auto result = make_unique<ListStatistics>(type);
|
|
179406
|
+
result->CopyBase(*this);
|
|
179407
|
+
|
|
179408
|
+
result->child_stats = child_stats ? child_stats->Copy() : nullptr;
|
|
179409
|
+
return move(result);
|
|
178988
179410
|
}
|
|
178989
179411
|
|
|
178990
179412
|
void ListStatistics::Serialize(FieldWriter &writer) const {
|
|
@@ -178995,18 +179417,12 @@ unique_ptr<BaseStatistics> ListStatistics::Deserialize(FieldReader &reader, Logi
|
|
|
178995
179417
|
D_ASSERT(type.InternalType() == PhysicalType::LIST);
|
|
178996
179418
|
auto result = make_unique<ListStatistics>(move(type));
|
|
178997
179419
|
auto &child_type = ListType::GetChildType(result->type);
|
|
178998
|
-
|
|
178999
|
-
result->child_stats = BaseStatistics::Deserialize(source, child_type);
|
|
179420
|
+
result->child_stats = reader.ReadRequiredSerializable<BaseStatistics>(child_type);
|
|
179000
179421
|
return move(result);
|
|
179001
179422
|
}
|
|
179002
179423
|
|
|
179003
179424
|
string ListStatistics::ToString() const {
|
|
179004
|
-
|
|
179005
|
-
result += " [";
|
|
179006
|
-
result += child_stats ? child_stats->ToString() : "No Stats";
|
|
179007
|
-
result += "]";
|
|
179008
|
-
result += validity_stats ? validity_stats->ToString() : "";
|
|
179009
|
-
return result;
|
|
179425
|
+
return StringUtil::Format("[%s]%s", child_stats ? child_stats->ToString() : "No Stats", BaseStatistics::ToString());
|
|
179010
179426
|
}
|
|
179011
179427
|
|
|
179012
179428
|
void ListStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_t count) const {
|
|
@@ -179052,6 +179468,7 @@ void ListStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_t co
|
|
|
179052
179468
|
|
|
179053
179469
|
|
|
179054
179470
|
|
|
179471
|
+
|
|
179055
179472
|
namespace duckdb {
|
|
179056
179473
|
|
|
179057
179474
|
template <>
|
|
@@ -179062,14 +179479,16 @@ template <>
|
|
|
179062
179479
|
void NumericStatistics::Update<list_entry_t>(SegmentStatistics &stats, list_entry_t new_value) {
|
|
179063
179480
|
}
|
|
179064
179481
|
|
|
179065
|
-
NumericStatistics::NumericStatistics(LogicalType type_p
|
|
179482
|
+
NumericStatistics::NumericStatistics(LogicalType type_p, StatisticsType stats_type)
|
|
179483
|
+
: BaseStatistics(move(type_p), stats_type) {
|
|
179484
|
+
InitializeBase();
|
|
179066
179485
|
min = Value::MaximumValue(type);
|
|
179067
179486
|
max = Value::MinimumValue(type);
|
|
179068
|
-
validity_stats = make_unique<ValidityStatistics>(false);
|
|
179069
179487
|
}
|
|
179070
179488
|
|
|
179071
|
-
NumericStatistics::NumericStatistics(LogicalType type_p, Value min_p, Value max_p)
|
|
179072
|
-
: BaseStatistics(move(type_p)), min(move(min_p)), max(move(max_p)) {
|
|
179489
|
+
NumericStatistics::NumericStatistics(LogicalType type_p, Value min_p, Value max_p, StatisticsType stats_type)
|
|
179490
|
+
: BaseStatistics(move(type_p), stats_type), min(move(min_p)), max(move(max_p)) {
|
|
179491
|
+
InitializeBase();
|
|
179073
179492
|
}
|
|
179074
179493
|
|
|
179075
179494
|
void NumericStatistics::Merge(const BaseStatistics &other_p) {
|
|
@@ -179161,11 +179580,9 @@ FilterPropagateResult NumericStatistics::CheckZonemap(ExpressionType comparison_
|
|
|
179161
179580
|
}
|
|
179162
179581
|
|
|
179163
179582
|
unique_ptr<BaseStatistics> NumericStatistics::Copy() const {
|
|
179164
|
-
auto
|
|
179165
|
-
|
|
179166
|
-
|
|
179167
|
-
}
|
|
179168
|
-
return move(stats);
|
|
179583
|
+
auto result = make_unique<NumericStatistics>(type, min, max, stats_type);
|
|
179584
|
+
result->CopyBase(*this);
|
|
179585
|
+
return move(result);
|
|
179169
179586
|
}
|
|
179170
179587
|
|
|
179171
179588
|
bool NumericStatistics::IsConstant() const {
|
|
@@ -179180,12 +179597,11 @@ void NumericStatistics::Serialize(FieldWriter &writer) const {
|
|
|
179180
179597
|
unique_ptr<BaseStatistics> NumericStatistics::Deserialize(FieldReader &reader, LogicalType type) {
|
|
179181
179598
|
auto min = reader.ReadRequiredSerializable<Value, Value>();
|
|
179182
179599
|
auto max = reader.ReadRequiredSerializable<Value, Value>();
|
|
179183
|
-
return make_unique_base<BaseStatistics, NumericStatistics>(move(type), min, max);
|
|
179600
|
+
return make_unique_base<BaseStatistics, NumericStatistics>(move(type), min, max, StatisticsType::LOCAL_STATS);
|
|
179184
179601
|
}
|
|
179185
179602
|
|
|
179186
179603
|
string NumericStatistics::ToString() const {
|
|
179187
|
-
return StringUtil::Format("[Min: %s, Max: %s]%s", min.ToString(), max.ToString(),
|
|
179188
|
-
validity_stats ? validity_stats->ToString() : "");
|
|
179604
|
+
return StringUtil::Format("[Min: %s, Max: %s]%s", min.ToString(), max.ToString(), BaseStatistics::ToString());
|
|
179189
179605
|
}
|
|
179190
179606
|
|
|
179191
179607
|
template <class T>
|
|
@@ -179260,8 +179676,6 @@ void NumericStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_t
|
|
|
179260
179676
|
|
|
179261
179677
|
|
|
179262
179678
|
|
|
179263
|
-
|
|
179264
|
-
|
|
179265
179679
|
namespace duckdb {
|
|
179266
179680
|
|
|
179267
179681
|
SegmentStatistics::SegmentStatistics(LogicalType type) : type(move(type)) {
|
|
@@ -179276,8 +179690,7 @@ SegmentStatistics::SegmentStatistics(LogicalType type, unique_ptr<BaseStatistics
|
|
|
179276
179690
|
}
|
|
179277
179691
|
|
|
179278
179692
|
void SegmentStatistics::Reset() {
|
|
179279
|
-
statistics = BaseStatistics::CreateEmpty(type);
|
|
179280
|
-
statistics->validity_stats = make_unique<ValidityStatistics>(false);
|
|
179693
|
+
statistics = BaseStatistics::CreateEmpty(type, StatisticsType::LOCAL_STATS);
|
|
179281
179694
|
}
|
|
179282
179695
|
|
|
179283
179696
|
} // namespace duckdb
|
|
@@ -179289,7 +179702,9 @@ void SegmentStatistics::Reset() {
|
|
|
179289
179702
|
|
|
179290
179703
|
namespace duckdb {
|
|
179291
179704
|
|
|
179292
|
-
StringStatistics::StringStatistics(LogicalType type_p
|
|
179705
|
+
StringStatistics::StringStatistics(LogicalType type_p, StatisticsType stats_type)
|
|
179706
|
+
: BaseStatistics(move(type_p), stats_type) {
|
|
179707
|
+
InitializeBase();
|
|
179293
179708
|
for (idx_t i = 0; i < MAX_STRING_MINMAX_SIZE; i++) {
|
|
179294
179709
|
min[i] = 0xFF;
|
|
179295
179710
|
max[i] = 0;
|
|
@@ -179297,19 +179712,17 @@ StringStatistics::StringStatistics(LogicalType type_p) : BaseStatistics(move(typ
|
|
|
179297
179712
|
max_string_length = 0;
|
|
179298
179713
|
has_unicode = false;
|
|
179299
179714
|
has_overflow_strings = false;
|
|
179300
|
-
validity_stats = make_unique<ValidityStatistics>(false);
|
|
179301
179715
|
}
|
|
179302
179716
|
|
|
179303
179717
|
unique_ptr<BaseStatistics> StringStatistics::Copy() const {
|
|
179304
|
-
auto
|
|
179305
|
-
|
|
179306
|
-
|
|
179307
|
-
|
|
179308
|
-
|
|
179309
|
-
|
|
179310
|
-
|
|
179311
|
-
|
|
179312
|
-
return move(stats);
|
|
179718
|
+
auto result = make_unique<StringStatistics>(type, stats_type);
|
|
179719
|
+
result->CopyBase(*this);
|
|
179720
|
+
|
|
179721
|
+
memcpy(result->min, min, MAX_STRING_MINMAX_SIZE);
|
|
179722
|
+
memcpy(result->max, max, MAX_STRING_MINMAX_SIZE);
|
|
179723
|
+
result->has_unicode = has_unicode;
|
|
179724
|
+
result->max_string_length = max_string_length;
|
|
179725
|
+
return move(result);
|
|
179313
179726
|
}
|
|
179314
179727
|
|
|
179315
179728
|
void StringStatistics::Serialize(FieldWriter &writer) const {
|
|
@@ -179321,7 +179734,7 @@ void StringStatistics::Serialize(FieldWriter &writer) const {
|
|
|
179321
179734
|
}
|
|
179322
179735
|
|
|
179323
179736
|
unique_ptr<BaseStatistics> StringStatistics::Deserialize(FieldReader &reader, LogicalType type) {
|
|
179324
|
-
auto stats = make_unique<StringStatistics>(move(type));
|
|
179737
|
+
auto stats = make_unique<StringStatistics>(move(type), StatisticsType::LOCAL_STATS);
|
|
179325
179738
|
reader.ReadBlob(stats->min, MAX_STRING_MINMAX_SIZE);
|
|
179326
179739
|
reader.ReadBlob(stats->max, MAX_STRING_MINMAX_SIZE);
|
|
179327
179740
|
stats->has_unicode = reader.ReadRequired<bool>();
|
|
@@ -179449,8 +179862,7 @@ string StringStatistics::ToString() const {
|
|
|
179449
179862
|
idx_t max_len = GetValidMinMaxSubstring(max);
|
|
179450
179863
|
return StringUtil::Format("[Min: %s, Max: %s, Has Unicode: %s, Max String Length: %lld]%s",
|
|
179451
179864
|
string((const char *)min, min_len), string((const char *)max, max_len),
|
|
179452
|
-
has_unicode ? "true" : "false", max_string_length,
|
|
179453
|
-
validity_stats ? validity_stats->ToString() : "");
|
|
179865
|
+
has_unicode ? "true" : "false", max_string_length, BaseStatistics::ToString());
|
|
179454
179866
|
}
|
|
179455
179867
|
|
|
179456
179868
|
void StringStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_t count) const {
|
|
@@ -179504,17 +179916,18 @@ void StringStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_t
|
|
|
179504
179916
|
|
|
179505
179917
|
|
|
179506
179918
|
|
|
179919
|
+
|
|
179507
179920
|
namespace duckdb {
|
|
179508
179921
|
|
|
179509
|
-
StructStatistics::StructStatistics(LogicalType type_p) : BaseStatistics(move(type_p)) {
|
|
179922
|
+
StructStatistics::StructStatistics(LogicalType type_p) : BaseStatistics(move(type_p), StatisticsType::LOCAL_STATS) {
|
|
179510
179923
|
D_ASSERT(type.InternalType() == PhysicalType::STRUCT);
|
|
179924
|
+
InitializeBase();
|
|
179511
179925
|
|
|
179512
179926
|
auto &child_types = StructType::GetChildTypes(type);
|
|
179513
179927
|
child_stats.resize(child_types.size());
|
|
179514
179928
|
for (idx_t i = 0; i < child_types.size(); i++) {
|
|
179515
|
-
child_stats[i] = BaseStatistics::CreateEmpty(child_types[i].second);
|
|
179929
|
+
child_stats[i] = BaseStatistics::CreateEmpty(child_types[i].second, StatisticsType::LOCAL_STATS);
|
|
179516
179930
|
}
|
|
179517
|
-
validity_stats = make_unique<ValidityStatistics>(false);
|
|
179518
179931
|
}
|
|
179519
179932
|
|
|
179520
179933
|
void StructStatistics::Merge(const BaseStatistics &other_p) {
|
|
@@ -179538,14 +179951,13 @@ FilterPropagateResult StructStatistics::CheckZonemap(ExpressionType comparison_t
|
|
|
179538
179951
|
// LCOV_EXCL_STOP
|
|
179539
179952
|
|
|
179540
179953
|
unique_ptr<BaseStatistics> StructStatistics::Copy() const {
|
|
179541
|
-
auto
|
|
179542
|
-
|
|
179543
|
-
|
|
179544
|
-
}
|
|
179954
|
+
auto result = make_unique<StructStatistics>(type);
|
|
179955
|
+
result->CopyBase(*this);
|
|
179956
|
+
|
|
179545
179957
|
for (idx_t i = 0; i < child_stats.size(); i++) {
|
|
179546
|
-
|
|
179958
|
+
result->child_stats[i] = child_stats[i] ? child_stats[i]->Copy() : nullptr;
|
|
179547
179959
|
}
|
|
179548
|
-
return move(
|
|
179960
|
+
return move(result);
|
|
179549
179961
|
}
|
|
179550
179962
|
|
|
179551
179963
|
void StructStatistics::Serialize(FieldWriter &writer) const {
|
|
@@ -179591,7 +180003,7 @@ string StructStatistics::ToString() const {
|
|
|
179591
180003
|
result += child_types[i].first + ": " + (child_stats[i] ? child_stats[i]->ToString() : "No Stats");
|
|
179592
180004
|
}
|
|
179593
180005
|
result += "}";
|
|
179594
|
-
result +=
|
|
180006
|
+
result += BaseStatistics::ToString();
|
|
179595
180007
|
return result;
|
|
179596
180008
|
}
|
|
179597
180009
|
|
|
@@ -179612,10 +180024,13 @@ void StructStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_t
|
|
|
179612
180024
|
|
|
179613
180025
|
|
|
179614
180026
|
|
|
180027
|
+
|
|
180028
|
+
|
|
179615
180029
|
namespace duckdb {
|
|
179616
180030
|
|
|
179617
180031
|
ValidityStatistics::ValidityStatistics(bool has_null, bool has_no_null)
|
|
179618
|
-
: BaseStatistics(LogicalType(LogicalTypeId::VALIDITY)), has_null(has_null),
|
|
180032
|
+
: BaseStatistics(LogicalType(LogicalTypeId::VALIDITY), StatisticsType::LOCAL_STATS), has_null(has_null),
|
|
180033
|
+
has_no_null(has_no_null) {
|
|
179619
180034
|
}
|
|
179620
180035
|
|
|
179621
180036
|
unique_ptr<BaseStatistics> ValidityStatistics::Combine(const unique_ptr<BaseStatistics> &lstats,
|
|
@@ -179658,7 +180073,7 @@ void ValidityStatistics::Serialize(FieldWriter &writer) const {
|
|
|
179658
180073
|
writer.WriteField<bool>(has_no_null);
|
|
179659
180074
|
}
|
|
179660
180075
|
|
|
179661
|
-
unique_ptr<
|
|
180076
|
+
unique_ptr<ValidityStatistics> ValidityStatistics::Deserialize(FieldReader &reader) {
|
|
179662
180077
|
bool has_null = reader.ReadRequired<bool>();
|
|
179663
180078
|
bool has_no_null = reader.ReadRequired<bool>();
|
|
179664
180079
|
return make_unique<ValidityStatistics>(has_null, has_no_null);
|
|
@@ -179689,7 +180104,9 @@ void ValidityStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_
|
|
|
179689
180104
|
}
|
|
179690
180105
|
|
|
179691
180106
|
string ValidityStatistics::ToString() const {
|
|
179692
|
-
|
|
180107
|
+
auto has_n = has_null ? "true" : "false";
|
|
180108
|
+
auto has_n_n = has_no_null ? "true" : "false";
|
|
180109
|
+
return StringUtil::Format("[Has Null: %s, Has No Null: %s]", has_n, has_n_n);
|
|
179693
180110
|
}
|
|
179694
180111
|
|
|
179695
180112
|
} // namespace duckdb
|
|
@@ -180411,6 +180828,78 @@ void ColumnCheckpointState::FlushToDisk() {
|
|
|
180411
180828
|
|
|
180412
180829
|
|
|
180413
180830
|
|
|
180831
|
+
|
|
180832
|
+
|
|
180833
|
+
|
|
180834
|
+
|
|
180835
|
+
|
|
180836
|
+
|
|
180837
|
+
|
|
180838
|
+
|
|
180839
|
+
//===----------------------------------------------------------------------===//
|
|
180840
|
+
// DuckDB
|
|
180841
|
+
//
|
|
180842
|
+
// duckdb/storage/table/struct_column_data.hpp
|
|
180843
|
+
//
|
|
180844
|
+
//
|
|
180845
|
+
//===----------------------------------------------------------------------===//
|
|
180846
|
+
|
|
180847
|
+
|
|
180848
|
+
|
|
180849
|
+
|
|
180850
|
+
|
|
180851
|
+
|
|
180852
|
+
namespace duckdb {
|
|
180853
|
+
|
|
180854
|
+
//! Struct column data represents a struct
|
|
180855
|
+
class StructColumnData : public ColumnData {
|
|
180856
|
+
public:
|
|
180857
|
+
StructColumnData(DataTableInfo &info, idx_t column_index, idx_t start_row, LogicalType type,
|
|
180858
|
+
ColumnData *parent = nullptr);
|
|
180859
|
+
|
|
180860
|
+
//! The sub-columns of the struct
|
|
180861
|
+
vector<unique_ptr<ColumnData>> sub_columns;
|
|
180862
|
+
//! The validity column data of the struct
|
|
180863
|
+
ValidityColumnData validity;
|
|
180864
|
+
|
|
180865
|
+
public:
|
|
180866
|
+
bool CheckZonemap(ColumnScanState &state, TableFilter &filter) override;
|
|
180867
|
+
idx_t GetMaxEntry() override;
|
|
180868
|
+
|
|
180869
|
+
void InitializeScan(ColumnScanState &state) override;
|
|
180870
|
+
void InitializeScanWithOffset(ColumnScanState &state, idx_t row_idx) override;
|
|
180871
|
+
|
|
180872
|
+
idx_t Scan(Transaction &transaction, idx_t vector_index, ColumnScanState &state, Vector &result) override;
|
|
180873
|
+
idx_t ScanCommitted(idx_t vector_index, ColumnScanState &state, Vector &result, bool allow_updates) override;
|
|
180874
|
+
idx_t ScanCount(ColumnScanState &state, Vector &result, idx_t count) override;
|
|
180875
|
+
|
|
180876
|
+
void InitializeAppend(ColumnAppendState &state) override;
|
|
180877
|
+
void Append(BaseStatistics &stats, ColumnAppendState &state, Vector &vector, idx_t count) override;
|
|
180878
|
+
void RevertAppend(row_t start_row) override;
|
|
180879
|
+
idx_t Fetch(ColumnScanState &state, row_t row_id, Vector &result) override;
|
|
180880
|
+
void FetchRow(Transaction &transaction, ColumnFetchState &state, row_t row_id, Vector &result,
|
|
180881
|
+
idx_t result_idx) override;
|
|
180882
|
+
void Update(Transaction &transaction, idx_t column_index, Vector &update_vector, row_t *row_ids,
|
|
180883
|
+
idx_t update_count) override;
|
|
180884
|
+
void UpdateColumn(Transaction &transaction, const vector<column_t> &column_path, Vector &update_vector,
|
|
180885
|
+
row_t *row_ids, idx_t update_count, idx_t depth) override;
|
|
180886
|
+
unique_ptr<BaseStatistics> GetUpdateStatistics() override;
|
|
180887
|
+
|
|
180888
|
+
void CommitDropColumn() override;
|
|
180889
|
+
|
|
180890
|
+
unique_ptr<ColumnCheckpointState> CreateCheckpointState(RowGroup &row_group, TableDataWriter &writer) override;
|
|
180891
|
+
unique_ptr<ColumnCheckpointState> Checkpoint(RowGroup &row_group, TableDataWriter &writer,
|
|
180892
|
+
ColumnCheckpointInfo &checkpoint_info) override;
|
|
180893
|
+
|
|
180894
|
+
void DeserializeColumn(Deserializer &source) override;
|
|
180895
|
+
|
|
180896
|
+
void GetStorageInfo(idx_t row_group_index, vector<idx_t> col_path, vector<vector<Value>> &result) override;
|
|
180897
|
+
|
|
180898
|
+
void Verify(RowGroup &parent) override;
|
|
180899
|
+
};
|
|
180900
|
+
|
|
180901
|
+
} // namespace duckdb
|
|
180902
|
+
|
|
180414
180903
|
//===----------------------------------------------------------------------===//
|
|
180415
180904
|
// DuckDB
|
|
180416
180905
|
//
|
|
@@ -180521,77 +181010,6 @@ struct UpdateNode {
|
|
|
180521
181010
|
} // namespace duckdb
|
|
180522
181011
|
|
|
180523
181012
|
|
|
180524
|
-
|
|
180525
|
-
//===----------------------------------------------------------------------===//
|
|
180526
|
-
// DuckDB
|
|
180527
|
-
//
|
|
180528
|
-
// duckdb/storage/table/struct_column_data.hpp
|
|
180529
|
-
//
|
|
180530
|
-
//
|
|
180531
|
-
//===----------------------------------------------------------------------===//
|
|
180532
|
-
|
|
180533
|
-
|
|
180534
|
-
|
|
180535
|
-
|
|
180536
|
-
|
|
180537
|
-
|
|
180538
|
-
namespace duckdb {
|
|
180539
|
-
|
|
180540
|
-
//! Struct column data represents a struct
|
|
180541
|
-
class StructColumnData : public ColumnData {
|
|
180542
|
-
public:
|
|
180543
|
-
StructColumnData(DataTableInfo &info, idx_t column_index, idx_t start_row, LogicalType type,
|
|
180544
|
-
ColumnData *parent = nullptr);
|
|
180545
|
-
|
|
180546
|
-
//! The sub-columns of the struct
|
|
180547
|
-
vector<unique_ptr<ColumnData>> sub_columns;
|
|
180548
|
-
//! The validity column data of the struct
|
|
180549
|
-
ValidityColumnData validity;
|
|
180550
|
-
|
|
180551
|
-
public:
|
|
180552
|
-
bool CheckZonemap(ColumnScanState &state, TableFilter &filter) override;
|
|
180553
|
-
idx_t GetMaxEntry() override;
|
|
180554
|
-
|
|
180555
|
-
void InitializeScan(ColumnScanState &state) override;
|
|
180556
|
-
void InitializeScanWithOffset(ColumnScanState &state, idx_t row_idx) override;
|
|
180557
|
-
|
|
180558
|
-
idx_t Scan(Transaction &transaction, idx_t vector_index, ColumnScanState &state, Vector &result) override;
|
|
180559
|
-
idx_t ScanCommitted(idx_t vector_index, ColumnScanState &state, Vector &result, bool allow_updates) override;
|
|
180560
|
-
idx_t ScanCount(ColumnScanState &state, Vector &result, idx_t count) override;
|
|
180561
|
-
|
|
180562
|
-
void InitializeAppend(ColumnAppendState &state) override;
|
|
180563
|
-
void Append(BaseStatistics &stats, ColumnAppendState &state, Vector &vector, idx_t count) override;
|
|
180564
|
-
void RevertAppend(row_t start_row) override;
|
|
180565
|
-
idx_t Fetch(ColumnScanState &state, row_t row_id, Vector &result) override;
|
|
180566
|
-
void FetchRow(Transaction &transaction, ColumnFetchState &state, row_t row_id, Vector &result,
|
|
180567
|
-
idx_t result_idx) override;
|
|
180568
|
-
void Update(Transaction &transaction, idx_t column_index, Vector &update_vector, row_t *row_ids,
|
|
180569
|
-
idx_t update_count) override;
|
|
180570
|
-
void UpdateColumn(Transaction &transaction, const vector<column_t> &column_path, Vector &update_vector,
|
|
180571
|
-
row_t *row_ids, idx_t update_count, idx_t depth) override;
|
|
180572
|
-
unique_ptr<BaseStatistics> GetUpdateStatistics() override;
|
|
180573
|
-
|
|
180574
|
-
void CommitDropColumn() override;
|
|
180575
|
-
|
|
180576
|
-
unique_ptr<ColumnCheckpointState> CreateCheckpointState(RowGroup &row_group, TableDataWriter &writer) override;
|
|
180577
|
-
unique_ptr<ColumnCheckpointState> Checkpoint(RowGroup &row_group, TableDataWriter &writer,
|
|
180578
|
-
ColumnCheckpointInfo &checkpoint_info) override;
|
|
180579
|
-
|
|
180580
|
-
void DeserializeColumn(Deserializer &source) override;
|
|
180581
|
-
|
|
180582
|
-
void GetStorageInfo(idx_t row_group_index, vector<idx_t> col_path, vector<vector<Value>> &result) override;
|
|
180583
|
-
|
|
180584
|
-
void Verify(RowGroup &parent) override;
|
|
180585
|
-
};
|
|
180586
|
-
|
|
180587
|
-
} // namespace duckdb
|
|
180588
|
-
|
|
180589
|
-
|
|
180590
|
-
|
|
180591
|
-
|
|
180592
|
-
|
|
180593
|
-
|
|
180594
|
-
|
|
180595
181013
|
namespace duckdb {
|
|
180596
181014
|
|
|
180597
181015
|
ColumnData::ColumnData(DataTableInfo &info, idx_t column_index, idx_t start_row, LogicalType type, ColumnData *parent)
|
|
@@ -180940,7 +181358,7 @@ unique_ptr<ColumnCheckpointState> ColumnData::Checkpoint(RowGroup &row_group, Ta
|
|
|
180940
181358
|
// scan the segments of the column data
|
|
180941
181359
|
// set up the checkpoint state
|
|
180942
181360
|
auto checkpoint_state = CreateCheckpointState(row_group, writer);
|
|
180943
|
-
checkpoint_state->global_stats = BaseStatistics::CreateEmpty(type);
|
|
181361
|
+
checkpoint_state->global_stats = BaseStatistics::CreateEmpty(type, StatisticsType::LOCAL_STATS);
|
|
180944
181362
|
|
|
180945
181363
|
if (!data.root_node) {
|
|
180946
181364
|
// empty table: flush the empty list
|
|
@@ -182295,8 +182713,9 @@ unique_ptr<RowGroup> RowGroup::AddColumn(ClientContext &context, ColumnDefinitio
|
|
|
182295
182713
|
|
|
182296
182714
|
// construct a new column data for the new column
|
|
182297
182715
|
auto added_column = ColumnData::CreateColumn(GetTableInfo(), columns.size(), start, new_column.type);
|
|
182716
|
+
auto added_col_stats = make_shared<SegmentStatistics>(
|
|
182717
|
+
new_column.type, BaseStatistics::CreateEmpty(new_column.type, StatisticsType::LOCAL_STATS));
|
|
182298
182718
|
|
|
182299
|
-
auto added_col_stats = make_shared<SegmentStatistics>(new_column.type);
|
|
182300
182719
|
idx_t rows_to_write = this->count;
|
|
182301
182720
|
if (rows_to_write > 0) {
|
|
182302
182721
|
DataChunk dummy_chunk;
|
|
@@ -183204,7 +183623,7 @@ unique_ptr<BaseStatistics> StandardColumnData::GetUpdateStatistics() {
|
|
|
183204
183623
|
return nullptr;
|
|
183205
183624
|
}
|
|
183206
183625
|
if (!stats) {
|
|
183207
|
-
stats = BaseStatistics::CreateEmpty(type);
|
|
183626
|
+
stats = BaseStatistics::CreateEmpty(type, StatisticsType::GLOBAL_STATS);
|
|
183208
183627
|
}
|
|
183209
183628
|
stats->validity_stats = move(validity_stats);
|
|
183210
183629
|
return stats;
|
|
@@ -183470,7 +183889,7 @@ void StructColumnData::UpdateColumn(Transaction &transaction, const vector<colum
|
|
|
183470
183889
|
|
|
183471
183890
|
unique_ptr<BaseStatistics> StructColumnData::GetUpdateStatistics() {
|
|
183472
183891
|
// check if any child column has updates
|
|
183473
|
-
auto stats = BaseStatistics::CreateEmpty(type);
|
|
183892
|
+
auto stats = BaseStatistics::CreateEmpty(type, StatisticsType::GLOBAL_STATS);
|
|
183474
183893
|
auto &struct_stats = (StructStatistics &)*stats;
|
|
183475
183894
|
stats->validity_stats = validity.GetUpdateStatistics();
|
|
183476
183895
|
for (idx_t i = 0; i < sub_columns.size(); i++) {
|
|
@@ -183578,6 +183997,13 @@ void StructColumnData::Verify(RowGroup &parent) {
|
|
|
183578
183997
|
|
|
183579
183998
|
} // namespace duckdb
|
|
183580
183999
|
|
|
184000
|
+
|
|
184001
|
+
|
|
184002
|
+
|
|
184003
|
+
|
|
184004
|
+
|
|
184005
|
+
|
|
184006
|
+
|
|
183581
184007
|
//===----------------------------------------------------------------------===//
|
|
183582
184008
|
// DuckDB
|
|
183583
184009
|
//
|
|
@@ -183642,11 +184068,6 @@ struct UpdateInfo {
|
|
|
183642
184068
|
|
|
183643
184069
|
} // namespace duckdb
|
|
183644
184070
|
|
|
183645
|
-
|
|
183646
|
-
|
|
183647
|
-
|
|
183648
|
-
|
|
183649
|
-
|
|
183650
184071
|
namespace duckdb {
|
|
183651
184072
|
|
|
183652
184073
|
static UpdateSegment::initialize_update_function_t GetInitializeUpdateFunction(PhysicalType type);
|
|
@@ -219918,6 +220339,9 @@ void StringAppendF(std::string* dst, const char* format, ...) {
|
|
|
219918
220339
|
|
|
219919
220340
|
|
|
219920
220341
|
|
|
220342
|
+
|
|
220343
|
+
|
|
220344
|
+
|
|
219921
220345
|
// LICENSE_CHANGE_BEGIN
|
|
219922
220346
|
// The following code up to LICENSE_CHANGE_END is subject to THIRD PARTY LICENSE #7
|
|
219923
220347
|
// See the end of this file for a list
|
|
@@ -220380,7 +220804,7 @@ struct hllhdr {
|
|
|
220380
220804
|
#define HLL_INVALIDATE_CACHE(hdr) (hdr)->card[7] |= (1<<7)
|
|
220381
220805
|
#define HLL_VALID_CACHE(hdr) (((hdr)->card[7] & (1<<7)) == 0)
|
|
220382
220806
|
|
|
220383
|
-
#define HLL_P
|
|
220807
|
+
#define HLL_P 12 /* The greater is P, the smaller the error. */
|
|
220384
220808
|
#define HLL_Q (64-HLL_P) /* The number of bits of the hash value used for
|
|
220385
220809
|
determining the number of leading zeros. */
|
|
220386
220810
|
#define HLL_REGISTERS (1<<HLL_P) /* With P=12, 4096 registers. */
|
|
@@ -220677,7 +221101,7 @@ int hllPatLen(unsigned char *ele, size_t elesize, long *regp) {
|
|
|
220677
221101
|
* The function always succeed, however if as a result of the operation
|
|
220678
221102
|
* the approximated cardinality changed, 1 is returned. Otherwise 0
|
|
220679
221103
|
* is returned. */
|
|
220680
|
-
int hllDenseSet(uint8_t *registers, long index, uint8_t count) {
|
|
221104
|
+
static inline int hllDenseSet(uint8_t *registers, long index, uint8_t count) {
|
|
220681
221105
|
uint8_t oldcount;
|
|
220682
221106
|
|
|
220683
221107
|
HLL_DENSE_GET_REGISTER(oldcount,registers,index);
|
|
@@ -221417,8 +221841,51 @@ robj *hll_merge(robj **hlls, size_t hll_count) {
|
|
|
221417
221841
|
}
|
|
221418
221842
|
return result;
|
|
221419
221843
|
}
|
|
221844
|
+
|
|
221845
|
+
//! Returns the value of the HLL_DENSE_SIZE macro as a uint64_t.
//! NOTE(review): HLL_DENSE_SIZE is defined outside this excerpt; presumably it is the
//! byte size of a dense-encoded HyperLogLog (header plus registers) - confirm.
uint64_t get_size() {
|
|
221846
|
+
return HLL_DENSE_SIZE;
|
|
221847
|
+
}
|
|
221848
|
+
|
|
221849
|
+
}
|
|
221850
|
+
|
|
221851
|
+
namespace duckdb {
|
|
221852
|
+
|
|
221853
|
+
//! Sets a single register of one HyperLogLog via duckdb_hll::hllDenseSet.
//! \param log   Type-erased pointer that is cast to a duckdb_hll::robj; its 'ptr' member is
//!              then cast to the duckdb_hll::hllhdr header.
//! \param index Register index forwarded to hllDenseSet.
//! \param count Register value forwarded to hllDenseSet.
//! \return The result of hllDenseSet; per the comment on that function, 1 if the
//!         approximated cardinality changed as a result of the operation and 0 otherwise.
//! The header's encoding is checked with D_ASSERT to be HLL_DENSE before the registers are touched.
//! NOTE(review): the register pointer passed on is 'hdr->registers + 1'; the reason for the
//! one-element offset is not visible in this excerpt - confirm against the hllhdr layout.
static inline int AddToLog(void *log, const uint64_t &index, const uint8_t &count) {
|
|
221854
|
+
auto o = (duckdb_hll::robj *)log;
|
|
221855
|
+
duckdb_hll::hllhdr *hdr = (duckdb_hll::hllhdr *)o->ptr;
|
|
221856
|
+
D_ASSERT(hdr->encoding == HLL_DENSE);
|
|
221857
|
+
return duckdb_hll::hllDenseSet(hdr->registers + 1, index, count);
|
|
221858
|
+
}
|
|
221859
|
+
|
|
221860
|
+
//! Applies one precomputed (index, count) register update per input row, where every row may
//! target a different HyperLogLog.
//! \param vdata   Supplies the selection vector ('sel') and validity mask used to decide
//!                whether row i is valid; invalid rows are skipped.
//! \param count   Number of rows to process.
//! \param indices Register index for row i; read at position i directly (not through a selection vector).
//! \param counts  Register value for row i; read at position i directly.
//! \param logs    Array indexed through 'log_sel'; each entry is dereferenced twice to obtain the
//!                pointer handed to AddToLog. Null entries are skipped.
//! \param log_sel Selection vector mapping row i to its entry in 'logs'.
//! The int returned by AddToLog is discarded.
void AddToLogsInternal(VectorData &vdata, idx_t count, uint64_t indices[], uint8_t counts[], void ***logs[],
|
|
221861
|
+
const SelectionVector *log_sel) {
|
|
221862
|
+
// 'logs' is an array of pointers to AggregateStates
|
|
221863
|
+
// AggregateStates have a pointer to a HyperLogLog object
|
|
221864
|
+
// HyperLogLog objects have a pointer to a 'robj', which we need
|
|
221865
|
+
for (idx_t i = 0; i < count; i++) {
|
|
221866
|
+
auto log = logs[log_sel->get_index(i)];
|
|
221867
|
+
if (log && vdata.validity.RowIsValid(vdata.sel->get_index(i))) {
|
|
221868
|
+
AddToLog(**log, indices[i], counts[i]);
|
|
221869
|
+
}
|
|
221870
|
+
}
|
|
221871
|
+
}
|
|
221872
|
+
|
|
221873
|
+
//! Applies precomputed (index, count) register updates for all valid rows to one HyperLogLog.
//! \param vdata   Supplies the selection vector ('sel') and validity mask used to decide
//!                whether row i is valid; invalid rows are skipped.
//! \param count   Number of rows to process.
//! \param indices Register index for row i; read at position i directly.
//! \param counts  Register value for row i; read at position i directly.
//! \param log     Type-erased pointer cast to a duckdb_hll::robj whose 'ptr' is the hllhdr header.
//! The header is resolved and checked with D_ASSERT to be HLL_DENSE once, before the loop, and the
//! register pointer is computed once and reused for every hllDenseSet call; the return value of
//! hllDenseSet is discarded.
//! NOTE(review): the register pointer is 'hdr->registers + 1'; the reason for the one-element
//! offset is not visible in this excerpt - confirm against the hllhdr layout.
void AddToSingleLogInternal(VectorData &vdata, idx_t count, uint64_t indices[], uint8_t counts[], void *log) {
|
|
221874
|
+
const auto o = (duckdb_hll::robj *)log;
|
|
221875
|
+
duckdb_hll::hllhdr *hdr = (duckdb_hll::hllhdr *)o->ptr;
|
|
221876
|
+
D_ASSERT(hdr->encoding == HLL_DENSE);
|
|
221877
|
+
|
|
221878
|
+
const auto registers = hdr->registers + 1;
|
|
221879
|
+
for (idx_t i = 0; i < count; i++) {
|
|
221880
|
+
if (vdata.validity.RowIsValid(vdata.sel->get_index(i))) {
|
|
221881
|
+
duckdb_hll::hllDenseSet(registers, indices[i], counts[i]);
|
|
221882
|
+
}
|
|
221883
|
+
}
|
|
221420
221884
|
}
|
|
221421
221885
|
|
|
221886
|
+
} // namespace duckdb
|
|
221887
|
+
|
|
221888
|
+
|
|
221422
221889
|
// LICENSE_CHANGE_END
|
|
221423
221890
|
|
|
221424
221891
|
|