duckdb 0.3.5-dev411.0 → 0.3.5-dev449.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/duckdb.cpp CHANGED
@@ -1094,6 +1094,7 @@ private:
1094
1094
  Serializer &serializer;
1095
1095
  unique_ptr<BufferedSerializer> buffer;
1096
1096
  idx_t field_count;
1097
+ bool finalized;
1097
1098
  };
1098
1099
 
1099
1100
  template <>
@@ -1179,6 +1180,17 @@ public:
1179
1180
  return T::Deserialize(source);
1180
1181
  }
1181
1182
 
1183
+ template <class T, class RETURN_TYPE = unique_ptr<T>, typename... ARGS>
1184
+ RETURN_TYPE ReadSerializable(RETURN_TYPE default_value, ARGS &&...args) { //! Reads an optional field via T::Deserialize, or returns default_value when the field is absent
1185
+ if (field_count >= max_field_count) {
1186
+ // field_count has reached max_field_count: the field is not there, return the caller-supplied default
1187
+ return default_value;
1188
+ }
1189
+ // field is there: deserialize it, forwarding the extra arguments to T::Deserialize
1190
+ AddField(); // NOTE(review): presumably advances field_count; AddField is defined outside this hunk
1191
+ return T::Deserialize(source, std::forward<ARGS>(args)...);
1192
+ }
1193
+
1182
1194
  template <class T, class RETURN_TYPE = unique_ptr<T>>
1183
1195
  RETURN_TYPE ReadRequiredSerializable() {
1184
1196
  if (field_count >= max_field_count) {
@@ -1190,6 +1202,17 @@ public:
1190
1202
  return T::Deserialize(source);
1191
1203
  }
1192
1204
 
1205
+ template <class T, class RETURN_TYPE = unique_ptr<T>, typename... ARGS>
1206
+ RETURN_TYPE ReadRequiredSerializable(ARGS &&...args) { //! Reads a mandatory field via T::Deserialize; throws SerializationException when the field is absent
1207
+ if (field_count >= max_field_count) {
1208
+ // field is not there: it is mandatory, so there is no default to fall back on
1209
+ throw SerializationException("Attempting to read mandatory field, but field is missing");
1210
+ }
1211
+ // field is there: deserialize it, forwarding the extra arguments to T::Deserialize
1212
+ AddField(); // NOTE(review): presumably advances field_count; AddField is defined outside this hunk
1213
+ return T::Deserialize(source, std::forward<ARGS>(args)...);
1214
+ }
1215
+
1193
1216
  template <class T, class RETURN_TYPE = unique_ptr<T>>
1194
1217
  vector<RETURN_TYPE> ReadRequiredSerializableList() {
1195
1218
  if (field_count >= max_field_count) {
@@ -1233,6 +1256,7 @@ private:
1233
1256
  idx_t field_count;
1234
1257
  idx_t max_field_count;
1235
1258
  idx_t total_size;
1259
+ bool finalized;
1236
1260
  };
1237
1261
 
1238
1262
  } // namespace duckdb
@@ -2473,6 +2497,11 @@ private:
2473
2497
 
2474
2498
 
2475
2499
 
2500
+
2501
+ #include <cfloat>
2502
+ #include <cstring> // strlen() on Solaris
2503
+ #include <limits.h>
2504
+
2476
2505
  namespace duckdb {
2477
2506
 
2478
2507
  #define BSWAP16(x) ((uint16_t)((((uint16_t)(x)&0xff00) >> 8) | (((uint16_t)(x)&0x00ff) << 8)))
@@ -2487,44 +2516,170 @@ namespace duckdb {
2487
2516
  (((uint64_t)(x)&0x00000000ff000000ull) << 8) | (((uint64_t)(x)&0x0000000000ff0000ull) << 24) | \
2488
2517
  (((uint64_t)(x)&0x000000000000ff00ull) << 40) | (((uint64_t)(x)&0x00000000000000ffull) << 56)))
2489
2518
 
2490
- bool IsLittleEndian();
2491
- uint8_t FlipSign(uint8_t key_byte);
2492
- uint32_t EncodeFloat(float x);
2493
- uint64_t EncodeDouble(double x);
2519
+ struct Radix {
2520
+ public:
2521
+ static inline bool IsLittleEndian() { //! Returns true when the lowest-addressed byte of an int holds its least significant byte
2522
+ int n = 1;
2523
+ if (*(char *)&n == 1) { // inspect the first byte of n as it is laid out in memory
2524
+ return true;
2525
+ } else {
2526
+ return false;
2527
+ }
2528
+ }
2494
2529
 
2495
- template <class T>
2496
- void EncodeData(data_ptr_t dataptr, T value, bool is_little_endian) {
2497
- throw NotImplementedException("Cannot create data from this type");
2498
- }
2530
+ template <class T>
2531
+ static inline void EncodeData(data_ptr_t dataptr, T value, bool is_little_endian) {
2532
+ throw NotImplementedException("Cannot create data from this type");
2533
+ }
2534
+
2535
+ static inline void EncodeStringDataPrefix(data_ptr_t dataptr, string_t value, idx_t prefix_len) { //! Writes exactly prefix_len bytes to dataptr: the start of value, zero-padded when value is shorter
2536
+ auto len = value.GetSize();
2537
+ memcpy(dataptr, value.GetDataUnsafe(), MinValue(len, prefix_len)); // copy at most prefix_len bytes of the string
2538
+ if (len < prefix_len) {
2539
+ memset(dataptr + len, '\0', prefix_len - len); // fill the rest of the prefix with zero bytes
2540
+ }
2541
+ }
2542
+
2543
+ static inline uint8_t FlipSign(uint8_t key_byte) { //! Returns key_byte with its most significant bit toggled
2544
+ return key_byte ^ 128; // 128 == 0x80, the top bit of a byte
2545
+ }
2546
+
2547
+ static inline uint32_t EncodeFloat(float x) { //! Maps a float to a uint32_t key; zero, NaN and the infinities get the fixed keys noted below
2548
+ uint64_t buff; // wider than the result: the uint32_t return type keeps only the low 32 bits
2549
+
2550
+ //! zero: encoded as 0x80000000 (under IEEE 754 the comparison below also matches -0.0)
2551
+ if (x == 0) {
2552
+ buff = 0;
2553
+ buff |= (1u << 31);
2554
+ return buff;
2555
+ }
2556
+ // nan: encoded as the largest possible key
2557
+ if (Value::IsNan(x)) {
2558
+ return UINT_MAX;
2559
+ }
2560
+ //! +infinity: any value above FLT_MAX, encoded one below the NaN key
2561
+ if (x > FLT_MAX) {
2562
+ return UINT_MAX - 1;
2563
+ }
2564
+ //! -infinity: any value below -FLT_MAX, encoded as the smallest possible key
2565
+ if (x < -FLT_MAX) {
2566
+ return 0;
2567
+ }
2568
+ buff = Load<uint32_t>((const_data_ptr_t)&x); // reinterpret the float's 32 bits as an unsigned integer
2569
+ if ((buff & (1u << 31)) == 0) { //! sign bit clear: positive numbers (zero already returned above)
2570
+ buff |= (1u << 31); // set the top bit
2571
+ } else { //! sign bit set: negative numbers
2572
+ buff = ~buff; //! one's complement: flip every bit
2573
+ }
2574
+
2575
+ return buff; // implicitly truncated to 32 bits
2576
+ }
2577
+
2578
+ static inline uint64_t EncodeDouble(double x) { //! Maps a double to a uint64_t key; zero, NaN and the infinities get the fixed keys noted below
2579
+ uint64_t buff;
2580
+ //! zero: encoded as 1 << 63 (under IEEE 754 the comparison below also matches -0.0)
2581
+ if (x == 0) {
2582
+ buff = 0;
2583
+ buff += (1ull << 63);
2584
+ return buff;
2585
+ }
2586
+ // nan: encoded as the largest possible key
2587
+ if (Value::IsNan(x)) {
2588
+ return ULLONG_MAX;
2589
+ }
2590
+ //! +infinity: any value above DBL_MAX, encoded one below the NaN key
2591
+ if (x > DBL_MAX) {
2592
+ return ULLONG_MAX - 1;
2593
+ }
2594
+ //! -infinity: any value below -DBL_MAX, encoded as the smallest possible key
2595
+ if (x < -DBL_MAX) {
2596
+ return 0;
2597
+ }
2598
+ buff = Load<uint64_t>((const_data_ptr_t)&x); // reinterpret the double's 64 bits as an unsigned integer
2599
+ if (buff < (1ull << 63)) { //! top bit clear: positive numbers (zero already returned above)
2600
+ buff += (1ull << 63); // top bit is known to be clear, so this addition sets it
2601
+ } else { //! top bit set: negative numbers
2602
+ buff = ~buff; //! one's complement: flip every bit
2603
+ }
2604
+ return buff;
2605
+ }
2606
+ };
2499
2607
 
2500
2608
  template <>
2501
- void EncodeData(data_ptr_t dataptr, bool value, bool is_little_endian);
2502
- template <>
2503
- void EncodeData(data_ptr_t dataptr, int8_t value, bool is_little_endian);
2609
+ inline void Radix::EncodeData(data_ptr_t dataptr, bool value, bool is_little_endian) {
2610
+ Store<uint8_t>(value ? 1 : 0, dataptr);
2611
+ }
2612
+
2504
2613
  template <>
2505
- void EncodeData(data_ptr_t dataptr, int16_t value, bool is_little_endian);
2614
+ inline void Radix::EncodeData(data_ptr_t dataptr, int8_t value, bool is_little_endian) {
2615
+ Store<uint8_t>(value, dataptr);
2616
+ dataptr[0] = FlipSign(dataptr[0]);
2617
+ }
2618
+
2506
2619
  template <>
2507
- void EncodeData(data_ptr_t dataptr, int32_t value, bool is_little_endian);
2620
+ inline void Radix::EncodeData(data_ptr_t dataptr, int16_t value, bool is_little_endian) {
2621
+ Store<uint16_t>(is_little_endian ? BSWAP16(value) : value, dataptr);
2622
+ dataptr[0] = FlipSign(dataptr[0]);
2623
+ }
2624
+
2508
2625
  template <>
2509
- void EncodeData(data_ptr_t dataptr, int64_t value, bool is_little_endian);
2626
+ inline void Radix::EncodeData(data_ptr_t dataptr, int32_t value, bool is_little_endian) {
2627
+ Store<uint32_t>(is_little_endian ? BSWAP32(value) : value, dataptr);
2628
+ dataptr[0] = FlipSign(dataptr[0]);
2629
+ }
2630
+
2510
2631
  template <>
2511
- void EncodeData(data_ptr_t dataptr, uint8_t value, bool is_little_endian);
2632
+ inline void Radix::EncodeData(data_ptr_t dataptr, int64_t value, bool is_little_endian) {
2633
+ Store<uint64_t>(is_little_endian ? BSWAP64(value) : value, dataptr);
2634
+ dataptr[0] = FlipSign(dataptr[0]);
2635
+ }
2636
+
2512
2637
  template <>
2513
- void EncodeData(data_ptr_t dataptr, uint16_t value, bool is_little_endian);
2638
+ inline void Radix::EncodeData(data_ptr_t dataptr, uint8_t value, bool is_little_endian) {
2639
+ Store<uint8_t>(value, dataptr);
2640
+ }
2641
+
2514
2642
  template <>
2515
- void EncodeData(data_ptr_t dataptr, uint32_t value, bool is_little_endian);
2643
+ inline void Radix::EncodeData(data_ptr_t dataptr, uint16_t value, bool is_little_endian) {
2644
+ Store<uint16_t>(is_little_endian ? BSWAP16(value) : value, dataptr);
2645
+ }
2646
+
2516
2647
  template <>
2517
- void EncodeData(data_ptr_t dataptr, uint64_t value, bool is_little_endian);
2648
+ inline void Radix::EncodeData(data_ptr_t dataptr, uint32_t value, bool is_little_endian) {
2649
+ Store<uint32_t>(is_little_endian ? BSWAP32(value) : value, dataptr);
2650
+ }
2651
+
2518
2652
  template <>
2519
- void EncodeData(data_ptr_t dataptr, hugeint_t value, bool is_little_endian);
2653
+ inline void Radix::EncodeData(data_ptr_t dataptr, uint64_t value, bool is_little_endian) {
2654
+ Store<uint64_t>(is_little_endian ? BSWAP64(value) : value, dataptr);
2655
+ }
2656
+
2520
2657
  template <>
2521
- void EncodeData(data_ptr_t dataptr, double value, bool is_little_endian);
2658
+ inline void Radix::EncodeData(data_ptr_t dataptr, hugeint_t value, bool is_little_endian) {
2659
+ EncodeData<int64_t>(dataptr, value.upper, is_little_endian);
2660
+ EncodeData<uint64_t>(dataptr + sizeof(value.upper), value.lower, is_little_endian);
2661
+ }
2662
+
2522
2663
  template <>
2523
- void EncodeData(data_ptr_t dataptr, float value, bool is_little_endian);
2664
+ inline void Radix::EncodeData(data_ptr_t dataptr, float value, bool is_little_endian) {
2665
+ uint32_t converted_value = EncodeFloat(value);
2666
+ Store<uint32_t>(is_little_endian ? BSWAP32(converted_value) : converted_value, dataptr);
2667
+ }
2668
+
2524
2669
  template <>
2525
- void EncodeData(data_ptr_t dataptr, interval_t value, bool is_little_endian);
2670
+ inline void Radix::EncodeData(data_ptr_t dataptr, double value, bool is_little_endian) {
2671
+ uint64_t converted_value = EncodeDouble(value);
2672
+ Store<uint64_t>(is_little_endian ? BSWAP64(converted_value) : converted_value, dataptr);
2673
+ }
2526
2674
 
2527
- void EncodeStringDataPrefix(data_ptr_t dataptr, string_t value, idx_t prefix_len);
2675
+ template <>
2676
+ inline void Radix::EncodeData(data_ptr_t dataptr, interval_t value, bool is_little_endian) {
2677
+ EncodeData<int32_t>(dataptr, value.months, is_little_endian);
2678
+ dataptr += sizeof(value.months);
2679
+ EncodeData<int32_t>(dataptr, value.days, is_little_endian);
2680
+ dataptr += sizeof(value.days);
2681
+ EncodeData<int64_t>(dataptr, value.micros, is_little_endian);
2682
+ }
2528
2683
 
2529
2684
  } // namespace duckdb
2530
2685
 
@@ -2542,13 +2697,13 @@ public:
2542
2697
 
2543
2698
  public:
2544
2699
  template <class T>
2545
- static unique_ptr<Key> CreateKey(T element, bool is_little_endian) {
2700
+ static inline unique_ptr<Key> CreateKey(T element, bool is_little_endian) {
2546
2701
  auto data = Key::CreateData<T>(element, is_little_endian);
2547
2702
  return make_unique<Key>(move(data), sizeof(element));
2548
2703
  }
2549
2704
 
2550
2705
  template <class T>
2551
- static unique_ptr<Key> CreateKey(const Value &element, bool is_little_endian) {
2706
+ static inline unique_ptr<Key> CreateKey(const Value &element, bool is_little_endian) {
2552
2707
  return CreateKey(element.GetValueUnsafe<T>(), is_little_endian);
2553
2708
  }
2554
2709
 
@@ -2566,9 +2721,9 @@ public:
2566
2721
 
2567
2722
  private:
2568
2723
  template <class T>
2569
- static unique_ptr<data_t[]> CreateData(T value, bool is_little_endian) {
2724
+ static inline unique_ptr<data_t[]> CreateData(T value, bool is_little_endian) {
2570
2725
  auto data = unique_ptr<data_t[]>(new data_t[sizeof(value)]);
2571
- EncodeData<T>(data.get(), value, is_little_endian);
2726
+ Radix::EncodeData<T>(data.get(), value, is_little_endian);
2572
2727
  return data;
2573
2728
  }
2574
2729
  };
@@ -12934,13 +13089,14 @@ namespace duckdb {
12934
13089
  // Field Writer
12935
13090
  //===--------------------------------------------------------------------===//
12936
13091
  FieldWriter::FieldWriter(Serializer &serializer_p)
12937
- : serializer(serializer_p), buffer(make_unique<BufferedSerializer>()), field_count(0) {
13092
+ : serializer(serializer_p), buffer(make_unique<BufferedSerializer>()), field_count(0), finalized(false) {
12938
13093
  }
12939
13094
 
12940
13095
  FieldWriter::~FieldWriter() {
12941
13096
  if (Exception::UncaughtException()) { // an exception is in flight: skip the sanity checks below
12942
13097
  return;
12943
13098
  }
13099
+ D_ASSERT(finalized); // finalized is set to true by FieldWriter::Finalize()
12944
13100
  // Finalize() should always have been called, unless this is destroyed as part of stack unwinding
12945
13101
  D_ASSERT(!buffer); // NOTE(review): presumably Finalize() releases the buffer; that part lies outside this hunk
12946
13102
  }
@@ -12960,7 +13116,8 @@ void FieldWriter::Write(const string &val) {
12960
13116
 
12961
13117
  void FieldWriter::Finalize() {
12962
13118
  D_ASSERT(buffer);
12963
-
13119
+ D_ASSERT(!finalized);
13120
+ finalized = true;
12964
13121
  serializer.Write<uint32_t>(field_count);
12965
13122
  serializer.Write<uint64_t>(buffer->blob.size);
12966
13123
  serializer.WriteData(buffer->blob.data.get(), buffer->blob.size);
@@ -12992,7 +13149,7 @@ void FieldDeserializer::SetRemainingData(idx_t remaining_data) {
12992
13149
  //===--------------------------------------------------------------------===//
12993
13150
  // Field Reader
12994
13151
  //===--------------------------------------------------------------------===//
12995
- FieldReader::FieldReader(Deserializer &source_p) : source(source_p), field_count(0) {
13152
+ FieldReader::FieldReader(Deserializer &source_p) : source(source_p), field_count(0), finalized(false) {
12996
13153
  max_field_count = source_p.Read<uint32_t>();
12997
13154
  total_size = source_p.Read<uint64_t>();
12998
13155
  D_ASSERT(max_field_count > 0);
@@ -13000,9 +13157,15 @@ FieldReader::FieldReader(Deserializer &source_p) : source(source_p), field_count
13000
13157
  }
13001
13158
 
13002
13159
  FieldReader::~FieldReader() {
13160
+ if (Exception::UncaughtException()) { // an exception is in flight: skip the sanity check below
13161
+ return;
13162
+ }
13163
+ D_ASSERT(finalized); // finalized is set to true by FieldReader::Finalize(), so a reader must be finalized before it is destroyed
13003
13164
  }
13004
13165
 
13005
13166
  void FieldReader::Finalize() {
13167
+ D_ASSERT(!finalized);
13168
+ finalized = true;
13006
13169
  if (field_count < max_field_count) {
13007
13170
  // we can handle this case by calling source.ReadData(buffer, source.RemainingData())
13008
13171
  throw SerializationException("Not all fields were read. This file might have been written with a newer version "
@@ -24874,174 +25037,6 @@ void ProgressBar::Update(bool final) {
24874
25037
  }
24875
25038
 
24876
25039
  } // namespace duckdb
24877
-
24878
-
24879
-
24880
-
24881
- #include <cfloat>
24882
- #include <cstring> // strlen() on Solaris
24883
- #include <limits.h>
24884
-
24885
- namespace duckdb {
24886
-
24887
- bool IsLittleEndian() {
24888
- int n = 1;
24889
- if (*(char *)&n == 1) {
24890
- return true;
24891
- } else {
24892
- return false;
24893
- }
24894
- }
24895
-
24896
- uint8_t FlipSign(uint8_t key_byte) {
24897
- return key_byte ^ 128;
24898
- }
24899
-
24900
- uint32_t EncodeFloat(float x) {
24901
- uint64_t buff;
24902
-
24903
- //! zero
24904
- if (x == 0) {
24905
- buff = 0;
24906
- buff |= (1u << 31);
24907
- return buff;
24908
- }
24909
- // nan
24910
- if (Value::IsNan(x)) {
24911
- return UINT_MAX;
24912
- }
24913
- //! infinity
24914
- if (x > FLT_MAX) {
24915
- return UINT_MAX - 1;
24916
- }
24917
- //! -infinity
24918
- if (x < -FLT_MAX) {
24919
- return 0;
24920
- }
24921
- buff = Load<uint32_t>((const_data_ptr_t)&x);
24922
- if ((buff & (1u << 31)) == 0) { //! +0 and positive numbers
24923
- buff |= (1u << 31);
24924
- } else { //! negative numbers
24925
- buff = ~buff; //! complement 1
24926
- }
24927
-
24928
- return buff;
24929
- }
24930
-
24931
- uint64_t EncodeDouble(double x) {
24932
- uint64_t buff;
24933
- //! zero
24934
- if (x == 0) {
24935
- buff = 0;
24936
- buff += (1ull << 63);
24937
- return buff;
24938
- }
24939
- // nan
24940
- if (Value::IsNan(x)) {
24941
- return ULLONG_MAX;
24942
- }
24943
- //! infinity
24944
- if (x > DBL_MAX) {
24945
- return ULLONG_MAX - 1;
24946
- }
24947
- //! -infinity
24948
- if (x < -DBL_MAX) {
24949
- return 0;
24950
- }
24951
- buff = Load<uint64_t>((const_data_ptr_t)&x);
24952
- if (buff < (1ull << 63)) { //! +0 and positive numbers
24953
- buff += (1ull << 63);
24954
- } else { //! negative numbers
24955
- buff = ~buff; //! complement 1
24956
- }
24957
- return buff;
24958
- }
24959
-
24960
- template <>
24961
- void EncodeData(data_ptr_t dataptr, bool value, bool is_little_endian) {
24962
- Store<uint8_t>(value ? 1 : 0, dataptr);
24963
- }
24964
-
24965
- template <>
24966
- void EncodeData(data_ptr_t dataptr, int8_t value, bool is_little_endian) {
24967
- Store<uint8_t>(value, dataptr);
24968
- dataptr[0] = FlipSign(dataptr[0]);
24969
- }
24970
-
24971
- template <>
24972
- void EncodeData(data_ptr_t dataptr, int16_t value, bool is_little_endian) {
24973
- Store<uint16_t>(is_little_endian ? BSWAP16(value) : value, dataptr);
24974
- dataptr[0] = FlipSign(dataptr[0]);
24975
- }
24976
-
24977
- template <>
24978
- void EncodeData(data_ptr_t dataptr, int32_t value, bool is_little_endian) {
24979
- Store<uint32_t>(is_little_endian ? BSWAP32(value) : value, dataptr);
24980
- dataptr[0] = FlipSign(dataptr[0]);
24981
- }
24982
-
24983
- template <>
24984
- void EncodeData(data_ptr_t dataptr, int64_t value, bool is_little_endian) {
24985
- Store<uint64_t>(is_little_endian ? BSWAP64(value) : value, dataptr);
24986
- dataptr[0] = FlipSign(dataptr[0]);
24987
- }
24988
-
24989
- template <>
24990
- void EncodeData(data_ptr_t dataptr, uint8_t value, bool is_little_endian) {
24991
- Store<uint8_t>(value, dataptr);
24992
- }
24993
-
24994
- template <>
24995
- void EncodeData(data_ptr_t dataptr, uint16_t value, bool is_little_endian) {
24996
- Store<uint16_t>(is_little_endian ? BSWAP16(value) : value, dataptr);
24997
- }
24998
-
24999
- template <>
25000
- void EncodeData(data_ptr_t dataptr, uint32_t value, bool is_little_endian) {
25001
- Store<uint32_t>(is_little_endian ? BSWAP32(value) : value, dataptr);
25002
- }
25003
-
25004
- template <>
25005
- void EncodeData(data_ptr_t dataptr, uint64_t value, bool is_little_endian) {
25006
- Store<uint64_t>(is_little_endian ? BSWAP64(value) : value, dataptr);
25007
- }
25008
-
25009
- template <>
25010
- void EncodeData(data_ptr_t dataptr, hugeint_t value, bool is_little_endian) {
25011
- EncodeData<int64_t>(dataptr, value.upper, is_little_endian);
25012
- EncodeData<uint64_t>(dataptr + sizeof(value.upper), value.lower, is_little_endian);
25013
- }
25014
-
25015
- template <>
25016
- void EncodeData(data_ptr_t dataptr, float value, bool is_little_endian) {
25017
- uint32_t converted_value = EncodeFloat(value);
25018
- Store<uint32_t>(is_little_endian ? BSWAP32(converted_value) : converted_value, dataptr);
25019
- }
25020
-
25021
- template <>
25022
- void EncodeData(data_ptr_t dataptr, double value, bool is_little_endian) {
25023
- uint64_t converted_value = EncodeDouble(value);
25024
- Store<uint64_t>(is_little_endian ? BSWAP64(converted_value) : converted_value, dataptr);
25025
- }
25026
-
25027
- template <>
25028
- void EncodeData(data_ptr_t dataptr, interval_t value, bool is_little_endian) {
25029
- EncodeData<int32_t>(dataptr, value.months, is_little_endian);
25030
- dataptr += sizeof(value.months);
25031
- EncodeData<int32_t>(dataptr, value.days, is_little_endian);
25032
- dataptr += sizeof(value.days);
25033
- EncodeData<int64_t>(dataptr, value.micros, is_little_endian);
25034
- }
25035
-
25036
- void EncodeStringDataPrefix(data_ptr_t dataptr, string_t value, idx_t prefix_len) {
25037
- auto len = value.GetSize();
25038
- memcpy(dataptr, value.GetDataUnsafe(), MinValue(len, prefix_len));
25039
- if (len < prefix_len) {
25040
- memset(dataptr + len, '\0', prefix_len - len);
25041
- }
25042
- }
25043
-
25044
- } // namespace duckdb
25045
25040
  //===----------------------------------------------------------------------===//
25046
25041
  // DuckDB
25047
25042
  //
@@ -31664,7 +31659,7 @@ void TemplatedRadixScatter(VectorData &vdata, const SelectionVector &sel, idx_t
31664
31659
  // write validity and according value
31665
31660
  if (validity.RowIsValid(source_idx)) {
31666
31661
  key_locations[i][0] = valid;
31667
- EncodeData<T>(key_locations[i] + 1, source[source_idx], is_little_endian);
31662
+ Radix::EncodeData<T>(key_locations[i] + 1, source[source_idx], is_little_endian);
31668
31663
  // invert bits if desc
31669
31664
  if (desc) {
31670
31665
  for (idx_t s = 1; s < sizeof(T) + 1; s++) {
@@ -31682,7 +31677,7 @@ void TemplatedRadixScatter(VectorData &vdata, const SelectionVector &sel, idx_t
31682
31677
  auto idx = sel.get_index(i);
31683
31678
  auto source_idx = vdata.sel->get_index(idx) + offset;
31684
31679
  // write value
31685
- EncodeData<T>(key_locations[i], source[source_idx], is_little_endian);
31680
+ Radix::EncodeData<T>(key_locations[i], source[source_idx], is_little_endian);
31686
31681
  // invert bits if desc
31687
31682
  if (desc) {
31688
31683
  for (idx_t s = 0; s < sizeof(T); s++) {
@@ -31709,7 +31704,7 @@ void RadixScatterStringVector(VectorData &vdata, const SelectionVector &sel, idx
31709
31704
  // write validity and according value
31710
31705
  if (validity.RowIsValid(source_idx)) {
31711
31706
  key_locations[i][0] = valid;
31712
- EncodeStringDataPrefix(key_locations[i] + 1, source[source_idx], prefix_len);
31707
+ Radix::EncodeStringDataPrefix(key_locations[i] + 1, source[source_idx], prefix_len);
31713
31708
  // invert bits if desc
31714
31709
  if (desc) {
31715
31710
  for (idx_t s = 1; s < prefix_len + 1; s++) {
@@ -31727,7 +31722,7 @@ void RadixScatterStringVector(VectorData &vdata, const SelectionVector &sel, idx
31727
31722
  auto idx = sel.get_index(i);
31728
31723
  auto source_idx = vdata.sel->get_index(idx) + offset;
31729
31724
  // write value
31730
- EncodeStringDataPrefix(key_locations[i], source[source_idx], prefix_len);
31725
+ Radix::EncodeStringDataPrefix(key_locations[i], source[source_idx], prefix_len);
31731
31726
  // invert bits if desc
31732
31727
  if (desc) {
31733
31728
  for (idx_t s = 0; s < prefix_len; s++) {
@@ -31854,7 +31849,7 @@ void RadixScatterStructVector(Vector &v, VectorData &vdata, idx_t vcount, const
31854
31849
  void RowOperations::RadixScatter(Vector &v, idx_t vcount, const SelectionVector &sel, idx_t ser_count,
31855
31850
  data_ptr_t *key_locations, bool desc, bool has_null, bool nulls_first,
31856
31851
  idx_t prefix_len, idx_t width, idx_t offset) {
31857
- auto is_little_endian = IsLittleEndian();
31852
+ auto is_little_endian = Radix::IsLittleEndian();
31858
31853
 
31859
31854
  VectorData vdata;
31860
31855
  v.Orrify(vcount, vdata);
@@ -40595,34 +40590,6 @@ string hugeint_t::ToString() const {
40595
40590
 
40596
40591
 
40597
40592
 
40598
- namespace duckdb {
40599
-
40600
- //! The HyperLogLog class holds a HyperLogLog counter for approximate cardinality counting
40601
- class HyperLogLog {
40602
- public:
40603
- HyperLogLog();
40604
- ~HyperLogLog();
40605
- // implicit copying of HyperLogLog is not allowed
40606
- HyperLogLog(const HyperLogLog &) = delete;
40607
-
40608
- //! Adds an element of the specified size to the HyperLogLog counter
40609
- void Add(data_ptr_t element, idx_t size);
40610
- //! Return the count of this HyperLogLog counter
40611
- idx_t Count();
40612
- //! Merge this HyperLogLog counter with another counter to create a new one
40613
- unique_ptr<HyperLogLog> Merge(HyperLogLog &other);
40614
- HyperLogLog *MergePointer(HyperLogLog &other);
40615
- //! Merge a set of HyperLogLogs to create one big one
40616
- static unique_ptr<HyperLogLog> Merge(HyperLogLog logs[], idx_t count);
40617
-
40618
- private:
40619
- HyperLogLog(void *hll);
40620
-
40621
- void *hll;
40622
- };
40623
- } // namespace duckdb
40624
-
40625
-
40626
40593
 
40627
40594
 
40628
40595
  // LICENSE_CHANGE_BEGIN
@@ -40639,41 +40606,116 @@ private:
40639
40606
 
40640
40607
 
40641
40608
 
40642
- #include <string.h>
40643
40609
  #include <stdint.h>
40610
+ #include <string.h>
40644
40611
 
40645
40612
  namespace duckdb_hll {
40646
40613
 
40647
40614
  /* Error codes */
40648
- #define HLL_C_OK 0
40649
- #define HLL_C_ERR -1
40615
+ #define HLL_C_OK 0
40616
+ #define HLL_C_ERR -1
40650
40617
 
40651
40618
  typedef struct {
40652
- void *ptr;
40619
+ void *ptr;
40653
40620
  } robj;
40654
40621
 
40655
40622
  //! Create a new empty HyperLogLog object
40656
40623
  robj *hll_create(void);
40624
+ //! Convert hll from sparse to dense
40625
+ int hllSparseToDense(robj *o);
40657
40626
  //! Destroy the specified HyperLogLog object
40658
40627
  void hll_destroy(robj *obj);
40659
- //! Add an element with the specified amount of bytes to the HyperLogLog. Returns C_ERR on failure, otherwise returns 0 if the cardinality did not change, and 1 otherwise.
40628
+ //! Add an element with the specified amount of bytes to the HyperLogLog. Returns C_ERR on failure, otherwise returns 0
40629
+ //! if the cardinality did not change, and 1 otherwise.
40660
40630
  int hll_add(robj *o, unsigned char *ele, size_t elesize);
40661
- //! Returns the estimated amount of unique elements seen by the HyperLogLog. Returns C_OK on success, or C_ERR on failure.
40631
+ //! Returns the estimated amount of unique elements seen by the HyperLogLog. Returns C_OK on success, or C_ERR on
40632
+ //! failure.
40662
40633
  int hll_count(robj *o, size_t *result);
40663
40634
  //! Merge hll_count HyperLogLog objects into a single one. Returns NULL on failure, or the new HLL object on success.
40664
40635
  robj *hll_merge(robj **hlls, size_t hll_count);
40636
+ //! Get size (in bytes) of the HLL
40637
+ uint64_t get_size();
40665
40638
 
40666
- uint64_t MurmurHash64A (const void * key, int len, unsigned int seed);
40639
+ uint64_t MurmurHash64A(const void *key, int len, unsigned int seed);
40640
+
40641
+ } // namespace duckdb_hll
40642
+
40643
+ namespace duckdb {
40644
+
40645
+ void AddToLogsInternal(VectorData &vdata, idx_t count, uint64_t indices[], uint8_t counts[], void ***logs[],
40646
+ const SelectionVector *log_sel);
40647
+
40648
+ void AddToSingleLogInternal(VectorData &vdata, idx_t count, uint64_t indices[], uint8_t counts[], void *log);
40649
+
40650
+ } // namespace duckdb
40667
40651
 
40668
- }
40669
40652
 
40670
40653
  // LICENSE_CHANGE_END
40671
40654
 
40672
40655
 
40656
+ namespace duckdb {
40657
+
40658
+ enum class HLLStorageType { UNCOMPRESSED = 1 };
40659
+
40660
+ class FieldWriter;
40661
+ class FieldReader;
40662
+
40663
+ //! The HyperLogLog class holds a HyperLogLog counter for approximate cardinality counting
40664
+ class HyperLogLog {
40665
+ public:
40666
+ HyperLogLog();
40667
+ ~HyperLogLog();
40668
+ // implicit copying of HyperLogLog is not allowed
40669
+ HyperLogLog(const HyperLogLog &) = delete;
40670
+
40671
+ //! Adds an element of the specified size to the HyperLogLog counter
40672
+ void Add(data_ptr_t element, idx_t size);
40673
+ //! Return the count of this HyperLogLog counter
40674
+ idx_t Count() const;
40675
+ //! Merge this HyperLogLog counter with another counter to create a new one
40676
+ unique_ptr<HyperLogLog> Merge(HyperLogLog &other);
40677
+ HyperLogLog *MergePointer(HyperLogLog &other);
40678
+ //! Merge a set of HyperLogLogs to create one big one
40679
+ static unique_ptr<HyperLogLog> Merge(HyperLogLog logs[], idx_t count);
40680
+ //! Get the size (in bytes) of a HLL
40681
+ static idx_t GetSize();
40682
+ //! Get pointer to the HLL
40683
+ data_ptr_t GetPtr() const;
40684
+ //! Get copy of the HLL
40685
+ unique_ptr<HyperLogLog> Copy();
40686
+ //! (De)Serialize the HLL
40687
+ void Serialize(FieldWriter &writer) const;
40688
+ static unique_ptr<HyperLogLog> Deserialize(FieldReader &reader);
40689
+
40690
+ public:
40691
+ //! Compute HLL hashes over vdata, and store them in 'hashes'
40692
+ //! Then, compute register indices and prefix lengths, and also store them in 'hashes' as a pair of uint32_t
40693
+ static void ProcessEntries(VectorData &vdata, const LogicalType &type, uint64_t hashes[], uint8_t counts[],
40694
+ idx_t count);
40695
+ //! Add the indices and counts to the logs
40696
+ static void AddToLogs(VectorData &vdata, idx_t count, uint64_t indices[], uint8_t counts[], HyperLogLog **logs[],
40697
+ const SelectionVector *log_sel);
40698
+ //! Add the indices and counts to THIS log
40699
+ void AddToLog(VectorData &vdata, idx_t count, uint64_t indices[], uint8_t counts[]);
40700
+
40701
+ private:
40702
+ explicit HyperLogLog(void *hll);
40703
+
40704
+ void *hll;
40705
+ mutex lock;
40706
+ };
40707
+ } // namespace duckdb
40708
+
40709
+
40710
+
40711
+
40712
+
40673
40713
  namespace duckdb {
40674
40714
 
40675
40715
  HyperLogLog::HyperLogLog() : hll(nullptr) {
40676
40716
  hll = duckdb_hll::hll_create(); // the handle is stored as void * and cast back to duckdb_hll::robj * where needed
40717
+ // Insert into a dense hll can be vectorized, sparse cannot, so we immediately convert
40718
+ duckdb_hll::hllSparseToDense((duckdb_hll::robj *)hll); // NOTE(review): hllSparseToDense returns int; the result is discarded here
40677
40719
  }
40678
40720
 
40679
40721
  HyperLogLog::HyperLogLog(void *hll) : hll(hll) {
@@ -40689,7 +40731,7 @@ void HyperLogLog::Add(data_ptr_t element, idx_t size) {
40689
40731
  }
40690
40732
  }
40691
40733
 
40692
- idx_t HyperLogLog::Count() {
40734
+ idx_t HyperLogLog::Count() const {
40693
40735
  // exception from size_t ban
40694
40736
  size_t result;
40695
40737
 
@@ -40736,9 +40778,206 @@ unique_ptr<HyperLogLog> HyperLogLog::Merge(HyperLogLog logs[], idx_t count) {
40736
40778
  return unique_ptr<HyperLogLog>(new HyperLogLog((void *)new_hll));
40737
40779
  }
40738
40780
 
40781
+ idx_t HyperLogLog::GetSize() {
40782
+ return duckdb_hll::get_size();
40783
+ }
40784
+
40785
+ data_ptr_t HyperLogLog::GetPtr() const {
40786
+ return (data_ptr_t)((duckdb_hll::robj *)hll)->ptr;
40787
+ }
40788
+
40789
+ unique_ptr<HyperLogLog> HyperLogLog::Copy() {
40790
+ auto result = make_unique<HyperLogLog>();
40791
+ lock_guard<mutex> guard(lock);
40792
+ memcpy(result->GetPtr(), GetPtr(), GetSize());
40793
+ D_ASSERT(result->Count() == Count());
40794
+ return result;
40795
+ }
40796
+
40797
+ void HyperLogLog::Serialize(FieldWriter &writer) const {
40798
+ writer.WriteField<HLLStorageType>(HLLStorageType::UNCOMPRESSED);
40799
+ writer.WriteBlob(GetPtr(), GetSize());
40800
+ }
40801
+
40802
+ unique_ptr<HyperLogLog> HyperLogLog::Deserialize(FieldReader &reader) {
40803
+ auto result = make_unique<HyperLogLog>();
40804
+ auto storage_type = reader.ReadRequired<HLLStorageType>();
40805
+ switch (storage_type) {
40806
+ case HLLStorageType::UNCOMPRESSED:
40807
+ reader.ReadBlob(result->GetPtr(), GetSize());
40808
+ break;
40809
+ default:
40810
+ throw SerializationException("Unknown HyperLogLog storage type!");
40811
+ }
40812
+ return result;
40813
+ }
40814
+
40815
+ //===--------------------------------------------------------------------===//
40816
+ // Vectorized HLL implementation
40817
+ //===--------------------------------------------------------------------===//
40818
//! Taken from https://nullprogram.com/blog/2018/07/31/
//! Converts 'elem' to uint64_t and mixes it with three xor-shift steps interleaved with two multiplications
template <class T>
inline uint64_t TemplatedHash(const T &elem) {
	uint64_t mixed = elem;
	mixed = (mixed ^ (mixed >> 30)) * UINT64_C(0xbf58476d1ce4e5b9);
	mixed = (mixed ^ (mixed >> 27)) * UINT64_C(0x94d049bb133111eb);
	return mixed ^ (mixed >> 31);
}
40829
+
40830
+ template <>
40831
+ inline uint64_t TemplatedHash(const hugeint_t &elem) {
40832
+ return TemplatedHash<uint64_t>(Load<uint64_t>((data_ptr_t)&elem.upper)) ^ TemplatedHash<uint64_t>(elem.lower);
40833
+ }
40834
+
40835
+ template <idx_t rest>
40836
+ inline void CreateIntegerRecursive(const data_ptr_t &data, uint64_t &x) {
40837
+ x ^= (uint64_t)data[rest - 1] << ((rest - 1) * 8);
40838
+ return CreateIntegerRecursive<rest - 1>(data, x);
40839
+ }
40840
+
40841
+ template <>
40842
+ inline void CreateIntegerRecursive<1>(const data_ptr_t &data, uint64_t &x) {
40843
+ x ^= (uint64_t)data[0];
40844
+ }
40845
+
40846
+ inline uint64_t HashOtherSize(const data_ptr_t &data, const idx_t &len) {
40847
+ uint64_t x = 0;
40848
+ switch (len & 7) {
40849
+ case 7:
40850
+ CreateIntegerRecursive<7>(data, x);
40851
+ break;
40852
+ case 6:
40853
+ CreateIntegerRecursive<6>(data, x);
40854
+ break;
40855
+ case 5:
40856
+ CreateIntegerRecursive<5>(data, x);
40857
+ break;
40858
+ case 4:
40859
+ CreateIntegerRecursive<4>(data, x);
40860
+ break;
40861
+ case 3:
40862
+ CreateIntegerRecursive<3>(data, x);
40863
+ break;
40864
+ case 2:
40865
+ CreateIntegerRecursive<2>(data, x);
40866
+ break;
40867
+ case 1:
40868
+ CreateIntegerRecursive<1>(data, x);
40869
+ break;
40870
+ case 0:
40871
+ break;
40872
+ }
40873
+ return TemplatedHash<uint64_t>(x);
40874
+ }
40875
+
40876
+ template <>
40877
+ inline uint64_t TemplatedHash(const string_t &elem) {
40878
+ data_ptr_t data = (data_ptr_t)elem.GetDataUnsafe();
40879
+ const auto &len = elem.GetSize();
40880
+ uint64_t h = 0;
40881
+ for (idx_t i = 0; i < len / 8; i += 8) {
40882
+ h ^= TemplatedHash<uint64_t>(Load<uint64_t>(data));
40883
+ data += 8;
40884
+ }
40885
+ switch (len & 7) {
40886
+ case 4:
40887
+ h ^= TemplatedHash<uint32_t>(Load<uint32_t>(data));
40888
+ break;
40889
+ case 2:
40890
+ h ^= TemplatedHash<uint16_t>(Load<uint16_t>(data));
40891
+ break;
40892
+ case 1:
40893
+ h ^= TemplatedHash<uint8_t>(Load<uint8_t>(data));
40894
+ break;
40895
+ default:
40896
+ h ^= HashOtherSize(data, len);
40897
+ }
40898
+ return h;
40899
+ }
40900
+
40901
+ template <class T>
40902
+ void TemplatedComputeHashes(VectorData &vdata, const idx_t &count, uint64_t hashes[]) {
40903
+ T *data = (T *)vdata.data;
40904
+ for (idx_t i = 0; i < count; i++) {
40905
+ auto idx = vdata.sel->get_index(i);
40906
+ if (vdata.validity.RowIsValid(idx)) {
40907
+ hashes[i] = TemplatedHash<T>(data[idx]);
40908
+ }
40909
+ }
40910
+ }
40911
+
40912
+ static void ComputeHashes(VectorData &vdata, const LogicalType &type, uint64_t hashes[], idx_t count) {
40913
+ switch (type.InternalType()) {
40914
+ case PhysicalType::BOOL:
40915
+ case PhysicalType::INT8:
40916
+ case PhysicalType::UINT8:
40917
+ return TemplatedComputeHashes<uint8_t>(vdata, count, hashes);
40918
+ case PhysicalType::INT16:
40919
+ case PhysicalType::UINT16:
40920
+ return TemplatedComputeHashes<uint16_t>(vdata, count, hashes);
40921
+ case PhysicalType::INT32:
40922
+ case PhysicalType::UINT32:
40923
+ case PhysicalType::FLOAT:
40924
+ return TemplatedComputeHashes<uint32_t>(vdata, count, hashes);
40925
+ case PhysicalType::INT64:
40926
+ case PhysicalType::UINT64:
40927
+ case PhysicalType::DOUBLE:
40928
+ return TemplatedComputeHashes<uint64_t>(vdata, count, hashes);
40929
+ case PhysicalType::INT128:
40930
+ case PhysicalType::INTERVAL:
40931
+ static_assert(sizeof(hugeint_t) == sizeof(interval_t), "ComputeHashes assumes these are the same size!");
40932
+ return TemplatedComputeHashes<hugeint_t>(vdata, count, hashes);
40933
+ case PhysicalType::VARCHAR:
40934
+ return TemplatedComputeHashes<string_t>(vdata, count, hashes);
40935
+ default:
40936
+ throw InternalException("Unimplemented type for HyperLogLog::ComputeHashes");
40937
+ }
40938
+ }
40939
+
40940
//! Taken from https://stackoverflow.com/a/72088344
//! Returns the position of the lowest set bit of 'x' through a multiply-and-lookup on a 64-entry table
static inline uint8_t CountTrailingZeros(uint64_t &x) {
	static constexpr const uint64_t MULTIPLIER = 0x03f79d71b4cb0a89;
	static constexpr const uint8_t POSITIONS[] = {0,  47, 1,  56, 48, 27, 2,  60, 57, 49, 41, 37, 28, 16, 3,  61,
	                                              54, 58, 35, 52, 50, 42, 21, 44, 38, 32, 29, 23, 17, 11, 4,  62,
	                                              46, 55, 26, 59, 40, 36, 15, 53, 34, 51, 20, 43, 31, 22, 10, 45,
	                                              25, 39, 14, 33, 19, 30, 9,  24, 13, 18, 8,  12, 7,  6,  5,  63};
	// x ^ (x - 1) sets every bit up to and including the lowest set bit of x
	const uint64_t low_mask = x ^ (x - 1);
	// the top 6 bits of the product select the table slot
	const uint64_t slot = (MULTIPLIER * low_mask) >> 58;
	return POSITIONS[slot];
}
40949
+
40950
+ static inline void ComputeIndexAndCount(uint64_t &hash, uint8_t &prefix) {
40951
+ uint64_t index = hash & ((1 << 12) - 1); /* Register index. */
40952
+ hash >>= 12; /* Remove bits used to address the register. */
40953
+ hash |= ((uint64_t)1 << (64 - 12)); /* Make sure the count will be <= Q+1. */
40954
+
40955
+ prefix = CountTrailingZeros(hash) + 1; /* Add 1 since we count the "00000...1" pattern. */
40956
+ hash = index;
40957
+ }
40958
+
40959
+ void HyperLogLog::ProcessEntries(VectorData &vdata, const LogicalType &type, uint64_t hashes[], uint8_t counts[],
40960
+ idx_t count) {
40961
+ ComputeHashes(vdata, type, hashes, count);
40962
+ for (idx_t i = 0; i < count; i++) {
40963
+ ComputeIndexAndCount(hashes[i], counts[i]);
40964
+ }
40965
+ }
40966
+
40967
+ void HyperLogLog::AddToLogs(VectorData &vdata, idx_t count, uint64_t indices[], uint8_t counts[], HyperLogLog **logs[],
40968
+ const SelectionVector *log_sel) {
40969
+ AddToLogsInternal(vdata, count, indices, counts, (void ****)logs, log_sel);
40970
+ }
40971
+
40972
+ void HyperLogLog::AddToLog(VectorData &vdata, idx_t count, uint64_t indices[], uint8_t counts[]) {
40973
+ lock_guard<mutex> guard(lock);
40974
+ AddToSingleLogInternal(vdata, count, indices, counts, hll);
40975
+ }
40976
+
40739
40977
  } // namespace duckdb
40740
40978
 
40741
40979
 
40980
+
40742
40981
  //===----------------------------------------------------------------------===//
40743
40982
  // DuckDB
40744
40983
  //
@@ -44318,6 +44557,7 @@ Value Value::Deserialize(Deserializer &main_source) {
44318
44557
  auto is_null = reader.ReadRequired<bool>();
44319
44558
  Value new_value = Value(type);
44320
44559
  if (is_null) {
44560
+ reader.Finalize();
44321
44561
  return new_value;
44322
44562
  }
44323
44563
  new_value.is_null = false;
@@ -54344,7 +54584,7 @@ ART::ART(const vector<column_t> &column_ids, const vector<unique_ptr<Expression>
54344
54584
  : Index(IndexType::ART, column_ids, unbound_expressions, constraint_type) {
54345
54585
  tree = nullptr;
54346
54586
  expression_result.Initialize(logical_types);
54347
- is_little_endian = IsLittleEndian();
54587
+ is_little_endian = Radix::IsLittleEndian();
54348
54588
  for (idx_t i = 0; i < types.size(); i++) {
54349
54589
  switch (types[i]) {
54350
54590
  case PhysicalType::BOOL:
@@ -72978,11 +73218,11 @@ static bool CanUsePerfectHashAggregate(ClientContext &context, LogicalAggregate
72978
73218
  switch (group_type.InternalType()) {
72979
73219
  case PhysicalType::INT8:
72980
73220
  stats = make_unique<NumericStatistics>(group_type, Value::MinimumValue(group_type),
72981
- Value::MaximumValue(group_type));
73221
+ Value::MaximumValue(group_type), StatisticsType::LOCAL_STATS);
72982
73222
  break;
72983
73223
  case PhysicalType::INT16:
72984
73224
  stats = make_unique<NumericStatistics>(group_type, Value::MinimumValue(group_type),
72985
- Value::MaximumValue(group_type));
73225
+ Value::MaximumValue(group_type), StatisticsType::LOCAL_STATS);
72986
73226
  break;
72987
73227
  default:
72988
73228
  // type is too large and there are no stats: skip perfect hashing
@@ -78255,7 +78495,7 @@ struct ApproxDistinctCountState {
78255
78495
  HyperLogLog *log;
78256
78496
  };
78257
78497
 
78258
- struct ApproxCountDistinctFunctionBase {
78498
+ struct ApproxCountDistinctFunction {
78259
78499
  template <class STATE>
78260
78500
  static void Initialize(STATE *state) {
78261
78501
  state->log = nullptr;
@@ -78296,109 +78536,76 @@ struct ApproxCountDistinctFunctionBase {
78296
78536
  }
78297
78537
  };
78298
78538
 
78299
- struct ApproxCountDistinctFunction : ApproxCountDistinctFunctionBase {
78300
- template <class INPUT_TYPE, class STATE, class OP>
78301
- static void Operation(STATE *state, FunctionData *bind_data, INPUT_TYPE *input, ValidityMask &mask, idx_t idx) {
78302
- if (!state->log) {
78303
- state->log = new HyperLogLog();
78304
- }
78305
- INPUT_TYPE value = input[idx];
78306
- state->log->Add((uint8_t *)&value, sizeof(value));
78307
- }
78308
- template <class INPUT_TYPE, class STATE, class OP>
78309
- static void ConstantOperation(STATE *state, FunctionData *bind_data, INPUT_TYPE *input, ValidityMask &mask,
78310
- idx_t count) {
78311
- for (idx_t i = 0; i < count; i++) {
78312
- Operation<INPUT_TYPE, STATE, OP>(state, bind_data, input, mask, 0);
78313
- }
78314
- }
78315
- };
78539
+ static void ApproxCountDistinctSimpleUpdateFunction(Vector inputs[], FunctionData *bind_data, idx_t input_count,
78540
+ data_ptr_t state, idx_t count) {
78541
+ D_ASSERT(input_count == 1);
78316
78542
 
78317
- struct ApproxCountDistinctFunctionString : ApproxCountDistinctFunctionBase {
78318
- template <class INPUT_TYPE, class STATE, class OP>
78319
- static void Operation(STATE *state, FunctionData *bind_data, INPUT_TYPE *input, ValidityMask &mask, idx_t idx) {
78320
- if (!state->log) {
78321
- state->log = new HyperLogLog();
78322
- }
78323
- auto str = input[idx].GetDataUnsafe();
78324
- auto str_len = input[idx].GetSize();
78325
- auto str_hash = Hash(str, str_len);
78326
- state->log->Add((uint8_t *)&str_hash, sizeof(str_hash));
78327
- }
78328
- template <class INPUT_TYPE, class STATE, class OP>
78329
- static void ConstantOperation(STATE *state, FunctionData *bind_data, INPUT_TYPE *input, ValidityMask &mask,
78330
- idx_t count) {
78331
- for (idx_t i = 0; i < count; i++) {
78332
- Operation<INPUT_TYPE, STATE, OP>(state, bind_data, input, mask, 0);
78333
- }
78543
+ auto agg_state = (ApproxDistinctCountState *)state;
78544
+ if (!agg_state->log) {
78545
+ agg_state->log = new HyperLogLog();
78334
78546
  }
78335
- };
78336
78547
 
78337
- template <typename INPUT_TYPE, typename RESULT_TYPE>
78338
- AggregateFunction GetApproxCountDistinctFunction(const LogicalType &input_type, const LogicalType &result_type) {
78339
- return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, INPUT_TYPE, RESULT_TYPE,
78340
- ApproxCountDistinctFunction>(input_type, result_type);
78548
+ VectorData vdata;
78549
+ inputs[0].Orrify(count, vdata);
78550
+
78551
+ uint64_t indices[STANDARD_VECTOR_SIZE];
78552
+ uint8_t counts[STANDARD_VECTOR_SIZE];
78553
+
78554
+ HyperLogLog::ProcessEntries(vdata, inputs[0].GetType(), indices, counts, count);
78555
+ agg_state->log->AddToLog(vdata, count, indices, counts);
78341
78556
  }
78342
78557
 
78343
- AggregateFunction GetApproxCountDistinctFunction(PhysicalType type) {
78344
- switch (type) {
78345
- case PhysicalType::UINT16:
78346
- return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, uint16_t, int64_t,
78347
- ApproxCountDistinctFunction>(LogicalType::UTINYINT,
78348
- LogicalType::BIGINT);
78349
- case PhysicalType::UINT32:
78350
- return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, uint32_t, int64_t,
78351
- ApproxCountDistinctFunction>(LogicalType::UINTEGER,
78352
- LogicalType::BIGINT);
78353
- case PhysicalType::UINT64:
78354
- return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, uint64_t, int64_t,
78355
- ApproxCountDistinctFunction>(LogicalType::UBIGINT,
78356
- LogicalType::BIGINT);
78357
- case PhysicalType::INT16:
78358
- return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, int16_t, int64_t,
78359
- ApproxCountDistinctFunction>(LogicalType::TINYINT,
78360
- LogicalType::BIGINT);
78361
- case PhysicalType::INT32:
78362
- return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, int32_t, int64_t,
78363
- ApproxCountDistinctFunction>(LogicalType::INTEGER,
78364
- LogicalType::BIGINT);
78365
- case PhysicalType::INT64:
78366
- return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, int64_t, int64_t,
78367
- ApproxCountDistinctFunction>(LogicalType::BIGINT,
78368
- LogicalType::BIGINT);
78369
- case PhysicalType::FLOAT:
78370
- return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, float, int64_t,
78371
- ApproxCountDistinctFunction>(LogicalType::FLOAT,
78372
- LogicalType::BIGINT);
78373
- case PhysicalType::DOUBLE:
78374
- return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, double, int64_t,
78375
- ApproxCountDistinctFunction>(LogicalType::DOUBLE,
78376
- LogicalType::BIGINT);
78377
- case PhysicalType::VARCHAR:
78378
- return AggregateFunction::UnaryAggregateDestructor<ApproxDistinctCountState, string_t, int64_t,
78379
- ApproxCountDistinctFunctionString>(LogicalType::VARCHAR,
78380
- LogicalType::BIGINT);
78558
+ static void ApproxCountDistinctUpdateFunction(Vector inputs[], FunctionData *bind_data, idx_t input_count,
78559
+ Vector &state_vector, idx_t count) {
78560
+ D_ASSERT(input_count == 1);
78381
78561
 
78382
- default:
78383
- throw InternalException("Unimplemented approximate_count aggregate");
78562
+ VectorData sdata;
78563
+ state_vector.Orrify(count, sdata);
78564
+ auto states = (ApproxDistinctCountState **)sdata.data;
78565
+
78566
+ for (idx_t i = 0; i < count; i++) {
78567
+ auto agg_state = states[sdata.sel->get_index(i)];
78568
+ if (!agg_state->log) {
78569
+ agg_state->log = new HyperLogLog();
78570
+ }
78384
78571
  }
78572
+
78573
+ VectorData vdata;
78574
+ inputs[0].Orrify(count, vdata);
78575
+
78576
+ uint64_t indices[STANDARD_VECTOR_SIZE];
78577
+ uint8_t counts[STANDARD_VECTOR_SIZE];
78578
+
78579
+ HyperLogLog::ProcessEntries(vdata, inputs[0].GetType(), indices, counts, count);
78580
+ HyperLogLog::AddToLogs(vdata, count, indices, counts, (HyperLogLog ***)states, sdata.sel);
78581
+ }
78582
+
78583
+ AggregateFunction GetApproxCountDistinctFunction(const LogicalType &input_type) {
78584
+ return AggregateFunction(
78585
+ {input_type}, LogicalTypeId::BIGINT, AggregateFunction::StateSize<ApproxDistinctCountState>,
78586
+ AggregateFunction::StateInitialize<ApproxDistinctCountState, ApproxCountDistinctFunction>,
78587
+ ApproxCountDistinctUpdateFunction,
78588
+ AggregateFunction::StateCombine<ApproxDistinctCountState, ApproxCountDistinctFunction>,
78589
+ AggregateFunction::StateFinalize<ApproxDistinctCountState, int64_t, ApproxCountDistinctFunction>,
78590
+ ApproxCountDistinctSimpleUpdateFunction, nullptr,
78591
+ AggregateFunction::StateDestroy<ApproxDistinctCountState, ApproxCountDistinctFunction>);
78385
78592
  }
78386
78593
 
78387
78594
  void ApproxCountDistinctFun::RegisterFunction(BuiltinFunctions &set) {
78388
78595
  AggregateFunctionSet approx_count("approx_count_distinct");
78389
- approx_count.AddFunction(GetApproxCountDistinctFunction(PhysicalType::UINT16));
78390
- approx_count.AddFunction(GetApproxCountDistinctFunction(PhysicalType::UINT32));
78391
- approx_count.AddFunction(GetApproxCountDistinctFunction(PhysicalType::UINT64));
78392
- approx_count.AddFunction(GetApproxCountDistinctFunction(PhysicalType::FLOAT));
78393
- approx_count.AddFunction(GetApproxCountDistinctFunction(PhysicalType::INT16));
78394
- approx_count.AddFunction(GetApproxCountDistinctFunction(PhysicalType::INT32));
78395
- approx_count.AddFunction(GetApproxCountDistinctFunction(PhysicalType::INT64));
78396
- approx_count.AddFunction(GetApproxCountDistinctFunction(PhysicalType::DOUBLE));
78397
- approx_count.AddFunction(GetApproxCountDistinctFunction(PhysicalType::VARCHAR));
78398
- approx_count.AddFunction(
78399
- GetApproxCountDistinctFunction<int64_t, int64_t>(LogicalType::TIMESTAMP, LogicalType::BIGINT));
78400
- approx_count.AddFunction(
78401
- GetApproxCountDistinctFunction<int64_t, int64_t>(LogicalType::TIMESTAMP_TZ, LogicalType::BIGINT));
78596
+ approx_count.AddFunction(GetApproxCountDistinctFunction(LogicalType::UTINYINT));
78597
+ approx_count.AddFunction(GetApproxCountDistinctFunction(LogicalType::USMALLINT));
78598
+ approx_count.AddFunction(GetApproxCountDistinctFunction(LogicalType::UINTEGER));
78599
+ approx_count.AddFunction(GetApproxCountDistinctFunction(LogicalType::UBIGINT));
78600
+ approx_count.AddFunction(GetApproxCountDistinctFunction(LogicalType::TINYINT));
78601
+ approx_count.AddFunction(GetApproxCountDistinctFunction(LogicalType::SMALLINT));
78602
+ approx_count.AddFunction(GetApproxCountDistinctFunction(LogicalType::BIGINT));
78603
+ approx_count.AddFunction(GetApproxCountDistinctFunction(LogicalType::HUGEINT));
78604
+ approx_count.AddFunction(GetApproxCountDistinctFunction(LogicalType::FLOAT));
78605
+ approx_count.AddFunction(GetApproxCountDistinctFunction(LogicalType::DOUBLE));
78606
+ approx_count.AddFunction(GetApproxCountDistinctFunction(LogicalType::VARCHAR));
78607
+ approx_count.AddFunction(GetApproxCountDistinctFunction(LogicalType::TIMESTAMP));
78608
+ approx_count.AddFunction(GetApproxCountDistinctFunction(LogicalType::TIMESTAMP_TZ));
78402
78609
  set.AddFunction(approx_count);
78403
78610
  }
78404
78611
 
@@ -87498,7 +87705,8 @@ static unique_ptr<BaseStatistics> PropagateDatePartStatistics(vector<unique_ptr<
87498
87705
  }
87499
87706
  auto min_part = OP::template Operation<T, int64_t>(min);
87500
87707
  auto max_part = OP::template Operation<T, int64_t>(max);
87501
- auto result = make_unique<NumericStatistics>(LogicalType::BIGINT, Value::BIGINT(min_part), Value::BIGINT(max_part));
87708
+ auto result = make_unique<NumericStatistics>(LogicalType::BIGINT, Value::BIGINT(min_part), Value::BIGINT(max_part),
87709
+ StatisticsType::LOCAL_STATS);
87502
87710
  if (child_stats[0]->validity_stats) {
87503
87711
  result->validity_stats = child_stats[0]->validity_stats->Copy();
87504
87712
  }
@@ -87509,7 +87717,8 @@ template <int64_t MIN, int64_t MAX>
87509
87717
  static unique_ptr<BaseStatistics> PropagateSimpleDatePartStatistics(vector<unique_ptr<BaseStatistics>> &child_stats) {
87510
87718
  // we can always propagate simple date part statistics
87511
87719
  // since the min and max can never exceed these bounds
87512
- auto result = make_unique<NumericStatistics>(LogicalType::BIGINT, Value::BIGINT(MIN), Value::BIGINT(MAX));
87720
+ auto result = make_unique<NumericStatistics>(LogicalType::BIGINT, Value::BIGINT(MIN), Value::BIGINT(MAX),
87721
+ StatisticsType::LOCAL_STATS);
87513
87722
  if (!child_stats[0]) {
87514
87723
  // if there are no child stats, we don't know
87515
87724
  result->validity_stats = make_unique<ValidityStatistics>(true);
@@ -96367,7 +96576,8 @@ static unique_ptr<BaseStatistics> PropagateNumericStats(ClientContext &context,
96367
96576
  // no potential overflow: replace with non-overflowing operator
96368
96577
  expr.function.function = GetScalarIntegerFunction<BASEOP>(expr.return_type.InternalType());
96369
96578
  }
96370
- auto stats = make_unique<NumericStatistics>(expr.return_type, move(new_min), move(new_max));
96579
+ auto stats =
96580
+ make_unique<NumericStatistics>(expr.return_type, move(new_min), move(new_max), StatisticsType::LOCAL_STATS);
96371
96581
  stats->validity_stats = ValidityStatistics::Combine(lstats.validity_stats, rstats.validity_stats);
96372
96582
  return move(stats);
96373
96583
  }
@@ -96672,7 +96882,8 @@ static unique_ptr<BaseStatistics> NegateBindStatistics(ClientContext &context, B
96672
96882
  new_min = Value(expr.return_type);
96673
96883
  new_max = Value(expr.return_type);
96674
96884
  }
96675
- auto stats = make_unique<NumericStatistics>(expr.return_type, move(new_min), move(new_max));
96885
+ auto stats =
96886
+ make_unique<NumericStatistics>(expr.return_type, move(new_min), move(new_max), StatisticsType::LOCAL_STATS);
96676
96887
  if (istats.validity_stats) {
96677
96888
  stats->validity_stats = istats.validity_stats->Copy();
96678
96889
  }
@@ -135403,10 +135614,8 @@ static unique_ptr<BaseStatistics> StatisticsOperationsNumericNumericCast(const B
135403
135614
  // overflow in cast: bailout
135404
135615
  return nullptr;
135405
135616
  }
135406
- auto stats = make_unique<NumericStatistics>(target, move(min), move(max));
135407
- if (input.validity_stats) {
135408
- stats->validity_stats = input.validity_stats->Copy();
135409
- }
135617
+ auto stats = make_unique<NumericStatistics>(target, move(min), move(max), input.stats_type);
135618
+ stats->CopyBase(*input_p);
135410
135619
  return move(stats);
135411
135620
  }
135412
135621
 
@@ -135601,12 +135810,73 @@ unique_ptr<BaseStatistics> StatisticsPropagator::PropagateExpression(BoundCompar
135601
135810
 
135602
135811
 
135603
135812
 
135813
+ //===----------------------------------------------------------------------===//
135814
+ // DuckDB
135815
+ //
135816
+ // duckdb/storage/statistics/distinct_statistics.hpp
135817
+ //
135818
+ //
135819
+ //===----------------------------------------------------------------------===//
135820
+
135821
+
135604
135822
 
135605
135823
 
135606
135824
 
135607
135825
 
135608
135826
 
135609
135827
  namespace duckdb {
135828
+ class Serializer;
135829
+ class Deserializer;
135830
+ class Vector;
135831
+
135832
+ class DistinctStatistics : public BaseStatistics {
135833
+ public:
135834
+ DistinctStatistics();
135835
+ explicit DistinctStatistics(unique_ptr<HyperLogLog> log, idx_t sample_count, idx_t total_count);
135836
+
135837
+ //! The HLL of the table
135838
+ unique_ptr<HyperLogLog> log;
135839
+ //! How many values have been sampled into the HLL
135840
+ atomic<idx_t> sample_count;
135841
+ //! How many values have been inserted (before sampling)
135842
+ atomic<idx_t> total_count;
135843
+
135844
+ public:
135845
+ void Merge(const BaseStatistics &other) override;
135846
+
135847
+ unique_ptr<BaseStatistics> Copy() const override;
135848
+
135849
+ void Serialize(Serializer &serializer) const override;
135850
+ void Serialize(FieldWriter &writer) const override;
135851
+
135852
+ static unique_ptr<DistinctStatistics> Deserialize(Deserializer &source);
135853
+ static unique_ptr<DistinctStatistics> Deserialize(FieldReader &reader);
135854
+
135855
+ void Update(Vector &update, idx_t count);
135856
+ void Update(VectorData &update_data, const LogicalType &ptype, idx_t count);
135857
+
135858
+ string ToString() const override;
135859
+ idx_t GetCount() const;
135860
+
135861
+ private:
135862
+ //! For distinct statistics we sample the input to speed up insertions
135863
+ static constexpr const double SAMPLE_RATE = 0.1;
135864
+ };
135865
+
135866
+ } // namespace duckdb
135867
+
135868
+
135869
+
135870
+
135871
+
135872
+
135873
+ namespace duckdb {
135874
+
135875
+ void UpdateDistinctStats(BaseStatistics &distinct_stats, const Value &input) {
135876
+ Vector v(input);
135877
+ auto &d_stats = (DistinctStatistics &)distinct_stats;
135878
+ d_stats.Update(v, 1);
135879
+ }
135610
135880
 
135611
135881
  unique_ptr<BaseStatistics> StatisticsPropagator::StatisticsFromValue(const Value &input) {
135612
135882
  switch (input.type().InternalType()) {
@@ -135622,13 +135892,15 @@ unique_ptr<BaseStatistics> StatisticsPropagator::StatisticsFromValue(const Value
135622
135892
  case PhysicalType::INT128:
135623
135893
  case PhysicalType::FLOAT:
135624
135894
  case PhysicalType::DOUBLE: {
135625
- auto result = make_unique<NumericStatistics>(input.type(), input, input);
135895
+ auto result = make_unique<NumericStatistics>(input.type(), input, input, StatisticsType::GLOBAL_STATS);
135626
135896
  result->validity_stats = make_unique<ValidityStatistics>(input.IsNull(), !input.IsNull());
135897
+ UpdateDistinctStats(*result->distinct_stats, input);
135627
135898
  return move(result);
135628
135899
  }
135629
135900
  case PhysicalType::VARCHAR: {
135630
- auto result = make_unique<StringStatistics>(input.type());
135901
+ auto result = make_unique<StringStatistics>(input.type(), StatisticsType::GLOBAL_STATS);
135631
135902
  result->validity_stats = make_unique<ValidityStatistics>(input.IsNull(), !input.IsNull());
135903
+ UpdateDistinctStats(*result->distinct_stats, input);
135632
135904
  if (!input.IsNull()) {
135633
135905
  auto &string_value = StringValue::Get(input);
135634
135906
  result->Update(string_t(string_value));
@@ -151469,6 +151741,7 @@ unique_ptr<QueryNode> QueryNode::Deserialize(Deserializer &main_source) {
151469
151741
  }
151470
151742
  result->modifiers = move(modifiers);
151471
151743
  result->cte_map = move(cte_map);
151744
+ reader.Finalize();
151472
151745
  return result;
151473
151746
  }
151474
151747
 
@@ -151514,6 +151787,7 @@ unique_ptr<ResultModifier> ResultModifier::Deserialize(Deserializer &source) {
151514
151787
  default:
151515
151788
  throw InternalException("Unrecognized ResultModifierType for Deserialization");
151516
151789
  }
151790
+ reader.Finalize();
151517
151791
  return result;
151518
151792
  }
151519
151793
 
@@ -151649,6 +151923,7 @@ OrderByNode OrderByNode::Deserialize(Deserializer &source) {
151649
151923
  auto type = reader.ReadRequired<OrderType>();
151650
151924
  auto null_order = reader.ReadRequired<OrderByNullType>();
151651
151925
  auto expression = reader.ReadRequiredSerializable<ParsedExpression>();
151926
+ reader.Finalize();
151652
151927
  return OrderByNode(type, null_order, move(expression));
151653
151928
  }
151654
151929
 
@@ -176309,7 +176584,7 @@ DataTable::DataTable(DatabaseInstance &db, const string &schema, const string &t
176309
176584
 
176310
176585
  AppendRowGroup(0);
176311
176586
  for (auto &type : types) {
176312
- column_stats.push_back(BaseStatistics::CreateEmpty(type));
176587
+ column_stats.push_back(BaseStatistics::CreateEmpty(type, StatisticsType::GLOBAL_STATS));
176313
176588
  }
176314
176589
  } else {
176315
176590
  D_ASSERT(column_stats.size() == types.size());
@@ -176339,7 +176614,7 @@ DataTable::DataTable(ClientContext &context, DataTable &parent, ColumnDefinition
176339
176614
  for (idx_t i = 0; i < parent.column_stats.size(); i++) {
176340
176615
  column_stats.push_back(parent.column_stats[i]->Copy());
176341
176616
  }
176342
- column_stats.push_back(BaseStatistics::CreateEmpty(new_column_type));
176617
+ column_stats.push_back(BaseStatistics::CreateEmpty(new_column_type, StatisticsType::GLOBAL_STATS));
176343
176618
 
176344
176619
  // add the column definitions from this DataTable
176345
176620
  column_definitions.emplace_back(new_column.Copy());
@@ -176448,7 +176723,8 @@ DataTable::DataTable(ClientContext &context, DataTable &parent, idx_t changed_id
176448
176723
  // the column that had its type changed will have the new statistics computed during conversion
176449
176724
  for (idx_t i = 0; i < column_definitions.size(); i++) {
176450
176725
  if (i == changed_idx) {
176451
- column_stats.push_back(BaseStatistics::CreateEmpty(column_definitions[i].type));
176726
+ column_stats.push_back(
176727
+ BaseStatistics::CreateEmpty(column_definitions[i].type, StatisticsType::GLOBAL_STATS));
176452
176728
  } else {
176453
176729
  column_stats.push_back(parent.column_stats[i]->Copy());
176454
176730
  }
@@ -176965,6 +177241,13 @@ void DataTable::Append(Transaction &transaction, DataChunk &chunk, TableAppendSt
176965
177241
  }
176966
177242
  }
176967
177243
  state.current_row += append_count;
177244
+ for (idx_t col_idx = 0; col_idx < column_stats.size(); col_idx++) {
177245
+ auto type = chunk.data[col_idx].GetType().InternalType();
177246
+ if (type == PhysicalType::LIST || type == PhysicalType::STRUCT) {
177247
+ continue;
177248
+ }
177249
+ column_stats[col_idx]->UpdateDistinctStatistics(chunk.data[col_idx], chunk.size());
177250
+ }
176968
177251
  }
176969
177252
 
176970
177253
  void DataTable::ScanTableSegment(idx_t row_start, idx_t count, const std::function<void(DataChunk &chunk)> &function) {
@@ -177535,7 +177818,7 @@ BlockPointer DataTable::Checkpoint(TableDataWriter &writer) {
177535
177818
  // FIXME: we might want to combine adjacent row groups in case they have had deletions...
177536
177819
  vector<unique_ptr<BaseStatistics>> global_stats;
177537
177820
  for (idx_t i = 0; i < column_definitions.size(); i++) {
177538
- global_stats.push_back(BaseStatistics::CreateEmpty(column_definitions[i].type));
177821
+ global_stats.push_back(column_stats[i]->Copy());
177539
177822
  }
177540
177823
 
177541
177824
  auto row_group = (RowGroup *)row_groups->GetRootSegment();
@@ -178803,14 +179086,22 @@ void SingleFileBlockManager::WriteHeader(DatabaseHeader header) {
178803
179086
 
178804
179087
 
178805
179088
 
179089
+
178806
179090
  namespace duckdb {
178807
179091
 
178808
- BaseStatistics::BaseStatistics(LogicalType type) : type(move(type)) {
179092
+ BaseStatistics::BaseStatistics(LogicalType type, StatisticsType stats_type) : type(move(type)), stats_type(stats_type) {
178809
179093
  }
178810
179094
 
178811
179095
  BaseStatistics::~BaseStatistics() {
178812
179096
  }
178813
179097
 
179098
+ void BaseStatistics::InitializeBase() {
179099
+ validity_stats = make_unique<ValidityStatistics>(false);
179100
+ if (stats_type == GLOBAL_STATS) {
179101
+ distinct_stats = make_unique<DistinctStatistics>();
179102
+ }
179103
+ }
179104
+
178814
179105
  bool BaseStatistics::CanHaveNull() const {
178815
179106
  if (!validity_stats) {
178816
179107
  // we don't know
@@ -178829,18 +179120,34 @@ bool BaseStatistics::CanHaveNoNull() const {
178829
179120
  return ((ValidityStatistics &)*validity_stats).has_no_null;
178830
179121
  }
178831
179122
 
178832
- void BaseStatistics::Merge(const BaseStatistics &other) {
178833
- D_ASSERT(type == other.type);
178834
- if (other.validity_stats) {
178835
- if (validity_stats) {
178836
- validity_stats->Merge(*other.validity_stats);
179123
+ void BaseStatistics::UpdateDistinctStatistics(Vector &v, idx_t count) {
179124
+ if (!distinct_stats) {
179125
+ return;
179126
+ }
179127
+ auto &d_stats = (DistinctStatistics &)*distinct_stats;
179128
+ d_stats.Update(v, count);
179129
+ }
179130
+
179131
+ void MergeInternal(unique_ptr<BaseStatistics> &orig, const unique_ptr<BaseStatistics> &other) {
179132
+ if (other) {
179133
+ if (orig) {
179134
+ orig->Merge(*other);
178837
179135
  } else {
178838
- validity_stats = other.validity_stats->Copy();
179136
+ orig = other->Copy();
178839
179137
  }
178840
179138
  }
178841
179139
  }
178842
179140
 
178843
- unique_ptr<BaseStatistics> BaseStatistics::CreateEmpty(LogicalType type) {
179141
+ void BaseStatistics::Merge(const BaseStatistics &other) {
179142
+ D_ASSERT(type == other.type);
179143
+ MergeInternal(validity_stats, other.validity_stats);
179144
+ if (stats_type == GLOBAL_STATS) {
179145
+ MergeInternal(distinct_stats, other.distinct_stats);
179146
+ }
179147
+ }
179148
+
179149
+ unique_ptr<BaseStatistics> BaseStatistics::CreateEmpty(LogicalType type, StatisticsType stats_type) {
179150
+ unique_ptr<BaseStatistics> result;
178844
179151
  switch (type.InternalType()) {
178845
179152
  case PhysicalType::BIT:
178846
179153
  return make_unique<ValidityStatistics>(false, false);
@@ -178856,34 +179163,49 @@ unique_ptr<BaseStatistics> BaseStatistics::CreateEmpty(LogicalType type) {
178856
179163
  case PhysicalType::INT128:
178857
179164
  case PhysicalType::FLOAT:
178858
179165
  case PhysicalType::DOUBLE:
178859
- return make_unique<NumericStatistics>(move(type));
179166
+ result = make_unique<NumericStatistics>(move(type), stats_type);
179167
+ break;
178860
179168
  case PhysicalType::VARCHAR:
178861
- return make_unique<StringStatistics>(move(type));
179169
+ result = make_unique<StringStatistics>(move(type), stats_type);
179170
+ break;
178862
179171
  case PhysicalType::STRUCT:
178863
- return make_unique<StructStatistics>(move(type));
179172
+ result = make_unique<StructStatistics>(move(type));
179173
+ break;
178864
179174
  case PhysicalType::LIST:
178865
- return make_unique<ListStatistics>(move(type));
179175
+ result = make_unique<ListStatistics>(move(type));
179176
+ break;
178866
179177
  case PhysicalType::INTERVAL:
178867
179178
  default:
178868
- auto base_stats = make_unique<BaseStatistics>(move(type));
178869
- base_stats->validity_stats = make_unique<ValidityStatistics>(false);
178870
- return base_stats;
179179
+ result = make_unique<BaseStatistics>(move(type), stats_type);
178871
179180
  }
179181
+ result->InitializeBase();
179182
+ return result;
178872
179183
  }
178873
179184
 
178874
179185
  unique_ptr<BaseStatistics> BaseStatistics::Copy() const {
178875
- auto statistics = make_unique<BaseStatistics>(type);
178876
- if (validity_stats) {
178877
- statistics->validity_stats = validity_stats->Copy();
179186
+ auto result = make_unique<BaseStatistics>(type, stats_type);
179187
+ result->CopyBase(*this);
179188
+ return result;
179189
+ }
179190
+
179191
+ void BaseStatistics::CopyBase(const BaseStatistics &orig) {
179192
+ if (orig.validity_stats) {
179193
+ validity_stats = orig.validity_stats->Copy();
179194
+ }
179195
+ if (orig.distinct_stats) {
179196
+ distinct_stats = orig.distinct_stats->Copy();
178878
179197
  }
178879
- return statistics;
178880
179198
  }
178881
179199
 
178882
179200
  void BaseStatistics::Serialize(Serializer &serializer) const {
178883
179201
  FieldWriter writer(serializer);
178884
- writer.WriteField<bool>(CanHaveNull());
178885
- writer.WriteField<bool>(CanHaveNoNull());
179202
+ ValidityStatistics(CanHaveNull(), CanHaveNoNull()).Serialize(writer);
178886
179203
  Serialize(writer);
179204
+ auto ptype = type.InternalType();
179205
+ if (ptype != PhysicalType::BIT) {
179206
+ writer.WriteField<StatisticsType>(stats_type);
179207
+ writer.WriteOptional<BaseStatistics>(distinct_stats);
179208
+ }
178887
179209
  writer.Finalize();
178888
179210
  }
178889
179211
 
@@ -178892,12 +179214,13 @@ void BaseStatistics::Serialize(FieldWriter &writer) const {
178892
179214
 
178893
179215
  unique_ptr<BaseStatistics> BaseStatistics::Deserialize(Deserializer &source, LogicalType type) {
178894
179216
  FieldReader reader(source);
178895
- bool can_have_null = reader.ReadRequired<bool>();
178896
- bool can_have_no_null = reader.ReadRequired<bool>();
179217
+ auto validity_stats = ValidityStatistics::Deserialize(reader);
178897
179218
  unique_ptr<BaseStatistics> result;
178898
- switch (type.InternalType()) {
179219
+ auto ptype = type.InternalType();
179220
+ switch (ptype) {
178899
179221
  case PhysicalType::BIT:
178900
- return ValidityStatistics::Deserialize(reader);
179222
+ result = ValidityStatistics::Deserialize(reader);
179223
+ break;
178901
179224
  case PhysicalType::BOOL:
178902
179225
  case PhysicalType::INT8:
178903
179226
  case PhysicalType::INT16:
@@ -178922,17 +179245,25 @@ unique_ptr<BaseStatistics> BaseStatistics::Deserialize(Deserializer &source, Log
178922
179245
  result = ListStatistics::Deserialize(reader, move(type));
178923
179246
  break;
178924
179247
  case PhysicalType::INTERVAL:
178925
- result = make_unique<BaseStatistics>(move(type));
179248
+ result = make_unique<BaseStatistics>(move(type), StatisticsType::LOCAL_STATS);
178926
179249
  break;
178927
179250
  default:
178928
179251
  throw InternalException("Unimplemented type for statistics deserialization");
178929
179252
  }
178930
- result->validity_stats = make_unique<ValidityStatistics>(can_have_null, can_have_no_null);
179253
+
179254
+ if (ptype != PhysicalType::BIT) {
179255
+ result->validity_stats = move(validity_stats);
179256
+ result->stats_type = reader.ReadField<StatisticsType>(StatisticsType::LOCAL_STATS);
179257
+ result->distinct_stats = reader.ReadOptional<DistinctStatistics>(nullptr);
179258
+ }
179259
+
179260
+ reader.Finalize();
178931
179261
  return result;
178932
179262
  }
178933
179263
 
178934
179264
  string BaseStatistics::ToString() const {
178935
- return StringUtil::Format("Base Statistics %s", validity_stats ? validity_stats->ToString() : "[]");
179265
+ return StringUtil::Format("%s%s", validity_stats ? validity_stats->ToString() : "",
179266
+ distinct_stats ? distinct_stats->ToString() : "");
178936
179267
  }
178937
179268
 
178938
179269
  void BaseStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_t count) const {
@@ -178953,14 +179284,104 @@ void BaseStatistics::Verify(Vector &vector, idx_t count) const {
178953
179284
 
178954
179285
 
178955
179286
 
179287
+
178956
179288
  namespace duckdb {
178957
179289
 
178958
- ListStatistics::ListStatistics(LogicalType type_p) : BaseStatistics(move(type_p)) {
178959
- D_ASSERT(type.InternalType() == PhysicalType::LIST);
179290
+ DistinctStatistics::DistinctStatistics()
179291
+ : BaseStatistics(LogicalType::INVALID, StatisticsType::LOCAL_STATS), log(make_unique<HyperLogLog>()),
179292
+ sample_count(0), total_count(0) {
179293
+ }
179294
+
179295
+ DistinctStatistics::DistinctStatistics(unique_ptr<HyperLogLog> log, idx_t sample_count, idx_t total_count)
179296
+ : BaseStatistics(LogicalType::INVALID, StatisticsType::LOCAL_STATS), log(move(log)), sample_count(sample_count),
179297
+ total_count(total_count) {
179298
+ }
179299
+
179300
+ unique_ptr<BaseStatistics> DistinctStatistics::Copy() const {
179301
+ return make_unique<DistinctStatistics>(log->Copy(), sample_count, total_count);
179302
+ }
179303
+
179304
+ void DistinctStatistics::Merge(const BaseStatistics &other_p) {
179305
+ BaseStatistics::Merge(other_p);
179306
+ auto &other = (const DistinctStatistics &)other_p;
179307
+ log->Merge(*other.log);
179308
+ sample_count += other.sample_count;
179309
+ total_count += other.total_count;
179310
+ }
179311
+
179312
+ void DistinctStatistics::Serialize(Serializer &serializer) const {
179313
+ FieldWriter writer(serializer);
179314
+ Serialize(writer);
179315
+ writer.Finalize();
179316
+ }
179317
+
179318
+ void DistinctStatistics::Serialize(FieldWriter &writer) const {
179319
+ writer.WriteField<idx_t>(sample_count);
179320
+ writer.WriteField<idx_t>(total_count);
179321
+ log->Serialize(writer);
179322
+ }
179323
+
179324
+ unique_ptr<DistinctStatistics> DistinctStatistics::Deserialize(Deserializer &source) {
179325
+ FieldReader reader(source);
179326
+ auto result = Deserialize(reader);
179327
+ reader.Finalize();
179328
+ return result;
179329
+ }
179330
+
179331
+ unique_ptr<DistinctStatistics> DistinctStatistics::Deserialize(FieldReader &reader) {
179332
+ auto sample_count = reader.ReadRequired<idx_t>();
179333
+ auto total_count = reader.ReadRequired<idx_t>();
179334
+ return make_unique<DistinctStatistics>(HyperLogLog::Deserialize(reader), sample_count, total_count);
179335
+ }
179336
+
179337
+ void DistinctStatistics::Update(Vector &v, idx_t count) {
179338
+ VectorData vdata;
179339
+ v.Orrify(count, vdata);
179340
+ Update(vdata, v.GetType(), count);
179341
+ }
179342
+
179343
+ void DistinctStatistics::Update(VectorData &vdata, const LogicalType &type, idx_t count) {
179344
+ if (count == 0) {
179345
+ return;
179346
+ }
179347
+ total_count += count;
179348
+ count = MaxValue<idx_t>(idx_t(SAMPLE_RATE * double(count)), 1);
179349
+ sample_count += count;
179350
+
179351
+ uint64_t indices[STANDARD_VECTOR_SIZE];
179352
+ uint8_t counts[STANDARD_VECTOR_SIZE];
179353
+
179354
+ HyperLogLog::ProcessEntries(vdata, type, indices, counts, count);
179355
+ log->AddToLog(vdata, count, indices, counts);
179356
+ }
179357
+
179358
+ string DistinctStatistics::ToString() const {
179359
+ return StringUtil::Format("[Approx Unique: %s]", to_string(GetCount()));
179360
+ }
179361
+
179362
+ idx_t DistinctStatistics::GetCount() const {
179363
+ // Estimate HLL count because we use sampling
179364
+ double hll_count = log->Count();
179365
+ double unique_proportion = hll_count / double(sample_count);
179366
+ double actual_sample_rate = double(sample_count) / double(total_count);
179367
+ double multiplier = double(1) + unique_proportion * (double(1) / actual_sample_rate - double(1));
179368
+ return idx_t(multiplier * hll_count);
179369
+ }
178960
179370
 
179371
+ } // namespace duckdb
179372
+
179373
+
179374
+
179375
+
179376
+
179377
+
179378
+ namespace duckdb {
179379
+
179380
+ ListStatistics::ListStatistics(LogicalType type_p) : BaseStatistics(move(type_p), StatisticsType::LOCAL_STATS) {
179381
+ D_ASSERT(type.InternalType() == PhysicalType::LIST);
179382
+ InitializeBase();
178961
179383
  auto &child_type = ListType::GetChildType(type);
178962
- child_stats = BaseStatistics::CreateEmpty(child_type);
178963
- validity_stats = make_unique<ValidityStatistics>(false);
179384
+ child_stats = BaseStatistics::CreateEmpty(child_type, StatisticsType::LOCAL_STATS);
178964
179385
  }
178965
179386
 
178966
179387
  void ListStatistics::Merge(const BaseStatistics &other_p) {
@@ -178981,10 +179402,11 @@ FilterPropagateResult ListStatistics::CheckZonemap(ExpressionType comparison_typ
178981
179402
  // LCOV_EXCL_STOP
178982
179403
 
178983
179404
  unique_ptr<BaseStatistics> ListStatistics::Copy() const {
178984
- auto copy = make_unique<ListStatistics>(type);
178985
- copy->validity_stats = validity_stats ? validity_stats->Copy() : nullptr;
178986
- copy->child_stats = child_stats ? child_stats->Copy() : nullptr;
178987
- return move(copy);
179405
+ auto result = make_unique<ListStatistics>(type);
179406
+ result->CopyBase(*this);
179407
+
179408
+ result->child_stats = child_stats ? child_stats->Copy() : nullptr;
179409
+ return move(result);
178988
179410
  }
178989
179411
 
178990
179412
  void ListStatistics::Serialize(FieldWriter &writer) const {
@@ -178995,18 +179417,12 @@ unique_ptr<BaseStatistics> ListStatistics::Deserialize(FieldReader &reader, Logi
178995
179417
  D_ASSERT(type.InternalType() == PhysicalType::LIST);
178996
179418
  auto result = make_unique<ListStatistics>(move(type));
178997
179419
  auto &child_type = ListType::GetChildType(result->type);
178998
- auto &source = reader.GetSource();
178999
- result->child_stats = BaseStatistics::Deserialize(source, child_type);
179420
+ result->child_stats = reader.ReadRequiredSerializable<BaseStatistics>(child_type);
179000
179421
  return move(result);
179001
179422
  }
179002
179423
 
179003
179424
  string ListStatistics::ToString() const {
179004
- string result;
179005
- result += " [";
179006
- result += child_stats ? child_stats->ToString() : "No Stats";
179007
- result += "]";
179008
- result += validity_stats ? validity_stats->ToString() : "";
179009
- return result;
179425
+ return StringUtil::Format("[%s]%s", child_stats ? child_stats->ToString() : "No Stats", BaseStatistics::ToString());
179010
179426
  }
179011
179427
 
179012
179428
  void ListStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_t count) const {
@@ -179052,6 +179468,7 @@ void ListStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_t co
179052
179468
 
179053
179469
 
179054
179470
 
179471
+
179055
179472
  namespace duckdb {
179056
179473
 
179057
179474
  template <>
@@ -179062,14 +179479,16 @@ template <>
179062
179479
  void NumericStatistics::Update<list_entry_t>(SegmentStatistics &stats, list_entry_t new_value) {
179063
179480
  }
179064
179481
 
179065
- NumericStatistics::NumericStatistics(LogicalType type_p) : BaseStatistics(move(type_p)) {
179482
+ NumericStatistics::NumericStatistics(LogicalType type_p, StatisticsType stats_type)
179483
+ : BaseStatistics(move(type_p), stats_type) {
179484
+ InitializeBase();
179066
179485
  min = Value::MaximumValue(type);
179067
179486
  max = Value::MinimumValue(type);
179068
- validity_stats = make_unique<ValidityStatistics>(false);
179069
179487
  }
179070
179488
 
179071
- NumericStatistics::NumericStatistics(LogicalType type_p, Value min_p, Value max_p)
179072
- : BaseStatistics(move(type_p)), min(move(min_p)), max(move(max_p)) {
179489
+ NumericStatistics::NumericStatistics(LogicalType type_p, Value min_p, Value max_p, StatisticsType stats_type)
179490
+ : BaseStatistics(move(type_p), stats_type), min(move(min_p)), max(move(max_p)) {
179491
+ InitializeBase();
179073
179492
  }
179074
179493
 
179075
179494
  void NumericStatistics::Merge(const BaseStatistics &other_p) {
@@ -179161,11 +179580,9 @@ FilterPropagateResult NumericStatistics::CheckZonemap(ExpressionType comparison_
179161
179580
  }
179162
179581
 
179163
179582
  unique_ptr<BaseStatistics> NumericStatistics::Copy() const {
179164
- auto stats = make_unique<NumericStatistics>(type, min, max);
179165
- if (validity_stats) {
179166
- stats->validity_stats = validity_stats->Copy();
179167
- }
179168
- return move(stats);
179583
+ auto result = make_unique<NumericStatistics>(type, min, max, stats_type);
179584
+ result->CopyBase(*this);
179585
+ return move(result);
179169
179586
  }
179170
179587
 
179171
179588
  bool NumericStatistics::IsConstant() const {
@@ -179180,12 +179597,11 @@ void NumericStatistics::Serialize(FieldWriter &writer) const {
179180
179597
  unique_ptr<BaseStatistics> NumericStatistics::Deserialize(FieldReader &reader, LogicalType type) {
179181
179598
  auto min = reader.ReadRequiredSerializable<Value, Value>();
179182
179599
  auto max = reader.ReadRequiredSerializable<Value, Value>();
179183
- return make_unique_base<BaseStatistics, NumericStatistics>(move(type), min, max);
179600
+ return make_unique_base<BaseStatistics, NumericStatistics>(move(type), min, max, StatisticsType::LOCAL_STATS);
179184
179601
  }
179185
179602
 
179186
179603
  string NumericStatistics::ToString() const {
179187
- return StringUtil::Format("[Min: %s, Max: %s]%s", min.ToString(), max.ToString(),
179188
- validity_stats ? validity_stats->ToString() : "");
179604
+ return StringUtil::Format("[Min: %s, Max: %s]%s", min.ToString(), max.ToString(), BaseStatistics::ToString());
179189
179605
  }
179190
179606
 
179191
179607
  template <class T>
@@ -179260,8 +179676,6 @@ void NumericStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_t
179260
179676
 
179261
179677
 
179262
179678
 
179263
-
179264
-
179265
179679
  namespace duckdb {
179266
179680
 
179267
179681
  SegmentStatistics::SegmentStatistics(LogicalType type) : type(move(type)) {
@@ -179276,8 +179690,7 @@ SegmentStatistics::SegmentStatistics(LogicalType type, unique_ptr<BaseStatistics
179276
179690
  }
179277
179691
 
179278
179692
  void SegmentStatistics::Reset() {
179279
- statistics = BaseStatistics::CreateEmpty(type);
179280
- statistics->validity_stats = make_unique<ValidityStatistics>(false);
179693
+ statistics = BaseStatistics::CreateEmpty(type, StatisticsType::LOCAL_STATS);
179281
179694
  }
179282
179695
 
179283
179696
  } // namespace duckdb
@@ -179289,7 +179702,9 @@ void SegmentStatistics::Reset() {
179289
179702
 
179290
179703
  namespace duckdb {
179291
179704
 
179292
- StringStatistics::StringStatistics(LogicalType type_p) : BaseStatistics(move(type_p)) {
179705
+ StringStatistics::StringStatistics(LogicalType type_p, StatisticsType stats_type)
179706
+ : BaseStatistics(move(type_p), stats_type) {
179707
+ InitializeBase();
179293
179708
  for (idx_t i = 0; i < MAX_STRING_MINMAX_SIZE; i++) {
179294
179709
  min[i] = 0xFF;
179295
179710
  max[i] = 0;
@@ -179297,19 +179712,17 @@ StringStatistics::StringStatistics(LogicalType type_p) : BaseStatistics(move(typ
179297
179712
  max_string_length = 0;
179298
179713
  has_unicode = false;
179299
179714
  has_overflow_strings = false;
179300
- validity_stats = make_unique<ValidityStatistics>(false);
179301
179715
  }
179302
179716
 
179303
179717
  unique_ptr<BaseStatistics> StringStatistics::Copy() const {
179304
- auto stats = make_unique<StringStatistics>(type);
179305
- memcpy(stats->min, min, MAX_STRING_MINMAX_SIZE);
179306
- memcpy(stats->max, max, MAX_STRING_MINMAX_SIZE);
179307
- stats->has_unicode = has_unicode;
179308
- stats->max_string_length = max_string_length;
179309
- if (validity_stats) {
179310
- stats->validity_stats = validity_stats->Copy();
179311
- }
179312
- return move(stats);
179718
+ auto result = make_unique<StringStatistics>(type, stats_type);
179719
+ result->CopyBase(*this);
179720
+
179721
+ memcpy(result->min, min, MAX_STRING_MINMAX_SIZE);
179722
+ memcpy(result->max, max, MAX_STRING_MINMAX_SIZE);
179723
+ result->has_unicode = has_unicode;
179724
+ result->max_string_length = max_string_length;
179725
+ return move(result);
179313
179726
  }
179314
179727
 
179315
179728
  void StringStatistics::Serialize(FieldWriter &writer) const {
@@ -179321,7 +179734,7 @@ void StringStatistics::Serialize(FieldWriter &writer) const {
179321
179734
  }
179322
179735
 
179323
179736
  unique_ptr<BaseStatistics> StringStatistics::Deserialize(FieldReader &reader, LogicalType type) {
179324
- auto stats = make_unique<StringStatistics>(move(type));
179737
+ auto stats = make_unique<StringStatistics>(move(type), StatisticsType::LOCAL_STATS);
179325
179738
  reader.ReadBlob(stats->min, MAX_STRING_MINMAX_SIZE);
179326
179739
  reader.ReadBlob(stats->max, MAX_STRING_MINMAX_SIZE);
179327
179740
  stats->has_unicode = reader.ReadRequired<bool>();
@@ -179449,8 +179862,7 @@ string StringStatistics::ToString() const {
179449
179862
  idx_t max_len = GetValidMinMaxSubstring(max);
179450
179863
  return StringUtil::Format("[Min: %s, Max: %s, Has Unicode: %s, Max String Length: %lld]%s",
179451
179864
  string((const char *)min, min_len), string((const char *)max, max_len),
179452
- has_unicode ? "true" : "false", max_string_length,
179453
- validity_stats ? validity_stats->ToString() : "");
179865
+ has_unicode ? "true" : "false", max_string_length, BaseStatistics::ToString());
179454
179866
  }
179455
179867
 
179456
179868
  void StringStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_t count) const {
@@ -179504,17 +179916,18 @@ void StringStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_t
179504
179916
 
179505
179917
 
179506
179918
 
179919
+
179507
179920
  namespace duckdb {
179508
179921
 
179509
- StructStatistics::StructStatistics(LogicalType type_p) : BaseStatistics(move(type_p)) {
179922
+ StructStatistics::StructStatistics(LogicalType type_p) : BaseStatistics(move(type_p), StatisticsType::LOCAL_STATS) {
179510
179923
  D_ASSERT(type.InternalType() == PhysicalType::STRUCT);
179924
+ InitializeBase();
179511
179925
 
179512
179926
  auto &child_types = StructType::GetChildTypes(type);
179513
179927
  child_stats.resize(child_types.size());
179514
179928
  for (idx_t i = 0; i < child_types.size(); i++) {
179515
- child_stats[i] = BaseStatistics::CreateEmpty(child_types[i].second);
179929
+ child_stats[i] = BaseStatistics::CreateEmpty(child_types[i].second, StatisticsType::LOCAL_STATS);
179516
179930
  }
179517
- validity_stats = make_unique<ValidityStatistics>(false);
179518
179931
  }
179519
179932
 
179520
179933
  void StructStatistics::Merge(const BaseStatistics &other_p) {
@@ -179538,14 +179951,13 @@ FilterPropagateResult StructStatistics::CheckZonemap(ExpressionType comparison_t
179538
179951
  // LCOV_EXCL_STOP
179539
179952
 
179540
179953
  unique_ptr<BaseStatistics> StructStatistics::Copy() const {
179541
- auto copy = make_unique<StructStatistics>(type);
179542
- if (validity_stats) {
179543
- copy->validity_stats = validity_stats->Copy();
179544
- }
179954
+ auto result = make_unique<StructStatistics>(type);
179955
+ result->CopyBase(*this);
179956
+
179545
179957
  for (idx_t i = 0; i < child_stats.size(); i++) {
179546
- copy->child_stats[i] = child_stats[i] ? child_stats[i]->Copy() : nullptr;
179958
+ result->child_stats[i] = child_stats[i] ? child_stats[i]->Copy() : nullptr;
179547
179959
  }
179548
- return move(copy);
179960
+ return move(result);
179549
179961
  }
179550
179962
 
179551
179963
  void StructStatistics::Serialize(FieldWriter &writer) const {
@@ -179591,7 +180003,7 @@ string StructStatistics::ToString() const {
179591
180003
  result += child_types[i].first + ": " + (child_stats[i] ? child_stats[i]->ToString() : "No Stats");
179592
180004
  }
179593
180005
  result += "}";
179594
- result += validity_stats ? validity_stats->ToString() : "";
180006
+ result += BaseStatistics::ToString();
179595
180007
  return result;
179596
180008
  }
179597
180009
 
@@ -179612,10 +180024,13 @@ void StructStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_t
179612
180024
 
179613
180025
 
179614
180026
 
180027
+
180028
+
179615
180029
  namespace duckdb {
179616
180030
 
179617
180031
  ValidityStatistics::ValidityStatistics(bool has_null, bool has_no_null)
179618
- : BaseStatistics(LogicalType(LogicalTypeId::VALIDITY)), has_null(has_null), has_no_null(has_no_null) {
180032
+ : BaseStatistics(LogicalType(LogicalTypeId::VALIDITY), StatisticsType::LOCAL_STATS), has_null(has_null),
180033
+ has_no_null(has_no_null) {
179619
180034
  }
179620
180035
 
179621
180036
  unique_ptr<BaseStatistics> ValidityStatistics::Combine(const unique_ptr<BaseStatistics> &lstats,
@@ -179658,7 +180073,7 @@ void ValidityStatistics::Serialize(FieldWriter &writer) const {
179658
180073
  writer.WriteField<bool>(has_no_null);
179659
180074
  }
179660
180075
 
179661
- unique_ptr<BaseStatistics> ValidityStatistics::Deserialize(FieldReader &reader) {
180076
+ unique_ptr<ValidityStatistics> ValidityStatistics::Deserialize(FieldReader &reader) {
179662
180077
  bool has_null = reader.ReadRequired<bool>();
179663
180078
  bool has_no_null = reader.ReadRequired<bool>();
179664
180079
  return make_unique<ValidityStatistics>(has_null, has_no_null);
@@ -179689,7 +180104,9 @@ void ValidityStatistics::Verify(Vector &vector, const SelectionVector &sel, idx_
179689
180104
  }
179690
180105
 
179691
180106
  string ValidityStatistics::ToString() const {
179692
- return has_null ? "[Has Null: true]" : "[Has Null: false]";
180107
+ auto has_n = has_null ? "true" : "false";
180108
+ auto has_n_n = has_no_null ? "true" : "false";
180109
+ return StringUtil::Format("[Has Null: %s, Has No Null: %s]", has_n, has_n_n);
179693
180110
  }
179694
180111
 
179695
180112
  } // namespace duckdb
@@ -180411,6 +180828,78 @@ void ColumnCheckpointState::FlushToDisk() {
180411
180828
 
180412
180829
 
180413
180830
 
180831
+
180832
+
180833
+
180834
+
180835
+
180836
+
180837
+
180838
+
180839
+ //===----------------------------------------------------------------------===//
180840
+ // DuckDB
180841
+ //
180842
+ // duckdb/storage/table/struct_column_data.hpp
180843
+ //
180844
+ //
180845
+ //===----------------------------------------------------------------------===//
180846
+
180847
+
180848
+
180849
+
180850
+
180851
+
180852
+ namespace duckdb {
180853
+
180854
+ //! Struct column data represents a struct column: it holds the sub-columns of the struct
//! and a separate validity column, and overrides the ColumnData interface.
180855
+ class StructColumnData : public ColumnData {
180856
+ public:
180857
//! "parent" is optional and defaults to nullptr.
+ StructColumnData(DataTableInfo &info, idx_t column_index, idx_t start_row, LogicalType type,
180858
+ ColumnData *parent = nullptr);
180859
+
180860
+ //! The sub-columns of the struct
180861
+ vector<unique_ptr<ColumnData>> sub_columns;
180862
+ //! The validity column data of the struct
180863
+ ValidityColumnData validity;
180864
+
180865
+ public:
180866
// Every member function below is declared with "override"; the implementations are not part of this declaration.
+ bool CheckZonemap(ColumnScanState &state, TableFilter &filter) override;
180867
+ idx_t GetMaxEntry() override;
180868
+
180869
+ void InitializeScan(ColumnScanState &state) override;
180870
+ void InitializeScanWithOffset(ColumnScanState &state, idx_t row_idx) override;
180871
+
180872
+ idx_t Scan(Transaction &transaction, idx_t vector_index, ColumnScanState &state, Vector &result) override;
180873
+ idx_t ScanCommitted(idx_t vector_index, ColumnScanState &state, Vector &result, bool allow_updates) override;
180874
+ idx_t ScanCount(ColumnScanState &state, Vector &result, idx_t count) override;
180875
+
180876
+ void InitializeAppend(ColumnAppendState &state) override;
180877
+ void Append(BaseStatistics &stats, ColumnAppendState &state, Vector &vector, idx_t count) override;
180878
+ void RevertAppend(row_t start_row) override;
180879
+ idx_t Fetch(ColumnScanState &state, row_t row_id, Vector &result) override;
180880
+ void FetchRow(Transaction &transaction, ColumnFetchState &state, row_t row_id, Vector &result,
180881
+ idx_t result_idx) override;
180882
+ void Update(Transaction &transaction, idx_t column_index, Vector &update_vector, row_t *row_ids,
180883
+ idx_t update_count) override;
180884
+ void UpdateColumn(Transaction &transaction, const vector<column_t> &column_path, Vector &update_vector,
180885
+ row_t *row_ids, idx_t update_count, idx_t depth) override;
180886
+ unique_ptr<BaseStatistics> GetUpdateStatistics() override;
180887
+
180888
+ void CommitDropColumn() override;
180889
+
180890
+ unique_ptr<ColumnCheckpointState> CreateCheckpointState(RowGroup &row_group, TableDataWriter &writer) override;
180891
+ unique_ptr<ColumnCheckpointState> Checkpoint(RowGroup &row_group, TableDataWriter &writer,
180892
+ ColumnCheckpointInfo &checkpoint_info) override;
180893
+
180894
+ void DeserializeColumn(Deserializer &source) override;
180895
+
180896
+ void GetStorageInfo(idx_t row_group_index, vector<idx_t> col_path, vector<vector<Value>> &result) override;
180897
+
180898
+ void Verify(RowGroup &parent) override;
180899
+ };
180900
+
180901
+ } // namespace duckdb
180902
+
180414
180903
  //===----------------------------------------------------------------------===//
180415
180904
  // DuckDB
180416
180905
  //
@@ -180521,77 +181010,6 @@ struct UpdateNode {
180521
181010
  } // namespace duckdb
180522
181011
 
180523
181012
 
180524
-
180525
- //===----------------------------------------------------------------------===//
180526
- // DuckDB
180527
- //
180528
- // duckdb/storage/table/struct_column_data.hpp
180529
- //
180530
- //
180531
- //===----------------------------------------------------------------------===//
180532
-
180533
-
180534
-
180535
-
180536
-
180537
-
180538
- namespace duckdb {
180539
-
180540
- //! Struct column data represents a struct
180541
- class StructColumnData : public ColumnData {
180542
- public:
180543
- StructColumnData(DataTableInfo &info, idx_t column_index, idx_t start_row, LogicalType type,
180544
- ColumnData *parent = nullptr);
180545
-
180546
- //! The sub-columns of the struct
180547
- vector<unique_ptr<ColumnData>> sub_columns;
180548
- //! The validity column data of the struct
180549
- ValidityColumnData validity;
180550
-
180551
- public:
180552
- bool CheckZonemap(ColumnScanState &state, TableFilter &filter) override;
180553
- idx_t GetMaxEntry() override;
180554
-
180555
- void InitializeScan(ColumnScanState &state) override;
180556
- void InitializeScanWithOffset(ColumnScanState &state, idx_t row_idx) override;
180557
-
180558
- idx_t Scan(Transaction &transaction, idx_t vector_index, ColumnScanState &state, Vector &result) override;
180559
- idx_t ScanCommitted(idx_t vector_index, ColumnScanState &state, Vector &result, bool allow_updates) override;
180560
- idx_t ScanCount(ColumnScanState &state, Vector &result, idx_t count) override;
180561
-
180562
- void InitializeAppend(ColumnAppendState &state) override;
180563
- void Append(BaseStatistics &stats, ColumnAppendState &state, Vector &vector, idx_t count) override;
180564
- void RevertAppend(row_t start_row) override;
180565
- idx_t Fetch(ColumnScanState &state, row_t row_id, Vector &result) override;
180566
- void FetchRow(Transaction &transaction, ColumnFetchState &state, row_t row_id, Vector &result,
180567
- idx_t result_idx) override;
180568
- void Update(Transaction &transaction, idx_t column_index, Vector &update_vector, row_t *row_ids,
180569
- idx_t update_count) override;
180570
- void UpdateColumn(Transaction &transaction, const vector<column_t> &column_path, Vector &update_vector,
180571
- row_t *row_ids, idx_t update_count, idx_t depth) override;
180572
- unique_ptr<BaseStatistics> GetUpdateStatistics() override;
180573
-
180574
- void CommitDropColumn() override;
180575
-
180576
- unique_ptr<ColumnCheckpointState> CreateCheckpointState(RowGroup &row_group, TableDataWriter &writer) override;
180577
- unique_ptr<ColumnCheckpointState> Checkpoint(RowGroup &row_group, TableDataWriter &writer,
180578
- ColumnCheckpointInfo &checkpoint_info) override;
180579
-
180580
- void DeserializeColumn(Deserializer &source) override;
180581
-
180582
- void GetStorageInfo(idx_t row_group_index, vector<idx_t> col_path, vector<vector<Value>> &result) override;
180583
-
180584
- void Verify(RowGroup &parent) override;
180585
- };
180586
-
180587
- } // namespace duckdb
180588
-
180589
-
180590
-
180591
-
180592
-
180593
-
180594
-
180595
181013
  namespace duckdb {
180596
181014
 
180597
181015
  ColumnData::ColumnData(DataTableInfo &info, idx_t column_index, idx_t start_row, LogicalType type, ColumnData *parent)
@@ -180940,7 +181358,7 @@ unique_ptr<ColumnCheckpointState> ColumnData::Checkpoint(RowGroup &row_group, Ta
180940
181358
  // scan the segments of the column data
180941
181359
  // set up the checkpoint state
180942
181360
  auto checkpoint_state = CreateCheckpointState(row_group, writer);
180943
- checkpoint_state->global_stats = BaseStatistics::CreateEmpty(type);
181361
+ checkpoint_state->global_stats = BaseStatistics::CreateEmpty(type, StatisticsType::LOCAL_STATS);
180944
181362
 
180945
181363
  if (!data.root_node) {
180946
181364
  // empty table: flush the empty list
@@ -182295,8 +182713,9 @@ unique_ptr<RowGroup> RowGroup::AddColumn(ClientContext &context, ColumnDefinitio
182295
182713
 
182296
182714
  // construct a new column data for the new column
182297
182715
  auto added_column = ColumnData::CreateColumn(GetTableInfo(), columns.size(), start, new_column.type);
182716
+ auto added_col_stats = make_shared<SegmentStatistics>(
182717
+ new_column.type, BaseStatistics::CreateEmpty(new_column.type, StatisticsType::LOCAL_STATS));
182298
182718
 
182299
- auto added_col_stats = make_shared<SegmentStatistics>(new_column.type);
182300
182719
  idx_t rows_to_write = this->count;
182301
182720
  if (rows_to_write > 0) {
182302
182721
  DataChunk dummy_chunk;
@@ -183204,7 +183623,7 @@ unique_ptr<BaseStatistics> StandardColumnData::GetUpdateStatistics() {
183204
183623
  return nullptr;
183205
183624
  }
183206
183625
  if (!stats) {
183207
- stats = BaseStatistics::CreateEmpty(type);
183626
+ stats = BaseStatistics::CreateEmpty(type, StatisticsType::GLOBAL_STATS);
183208
183627
  }
183209
183628
  stats->validity_stats = move(validity_stats);
183210
183629
  return stats;
@@ -183470,7 +183889,7 @@ void StructColumnData::UpdateColumn(Transaction &transaction, const vector<colum
183470
183889
 
183471
183890
  unique_ptr<BaseStatistics> StructColumnData::GetUpdateStatistics() {
183472
183891
  // check if any child column has updates
183473
- auto stats = BaseStatistics::CreateEmpty(type);
183892
+ auto stats = BaseStatistics::CreateEmpty(type, StatisticsType::GLOBAL_STATS);
183474
183893
  auto &struct_stats = (StructStatistics &)*stats;
183475
183894
  stats->validity_stats = validity.GetUpdateStatistics();
183476
183895
  for (idx_t i = 0; i < sub_columns.size(); i++) {
@@ -183578,6 +183997,13 @@ void StructColumnData::Verify(RowGroup &parent) {
183578
183997
 
183579
183998
  } // namespace duckdb
183580
183999
 
184000
+
184001
+
184002
+
184003
+
184004
+
184005
+
184006
+
183581
184007
  //===----------------------------------------------------------------------===//
183582
184008
  // DuckDB
183583
184009
  //
@@ -183642,11 +184068,6 @@ struct UpdateInfo {
183642
184068
 
183643
184069
  } // namespace duckdb
183644
184070
 
183645
-
183646
-
183647
-
183648
-
183649
-
183650
184071
  namespace duckdb {
183651
184072
 
183652
184073
  static UpdateSegment::initialize_update_function_t GetInitializeUpdateFunction(PhysicalType type);
@@ -219918,6 +220339,9 @@ void StringAppendF(std::string* dst, const char* format, ...) {
219918
220339
 
219919
220340
 
219920
220341
 
220342
+
220343
+
220344
+
219921
220345
  // LICENSE_CHANGE_BEGIN
219922
220346
  // The following code up to LICENSE_CHANGE_END is subject to THIRD PARTY LICENSE #7
219923
220347
  // See the end of this file for a list
@@ -220380,7 +220804,7 @@ struct hllhdr {
220380
220804
  #define HLL_INVALIDATE_CACHE(hdr) (hdr)->card[7] |= (1<<7)
220381
220805
  #define HLL_VALID_CACHE(hdr) (((hdr)->card[7] & (1<<7)) == 0)
220382
220806
 
220383
- #define HLL_P 14 /* The greater is P, the smaller the error. */
220807
+ #define HLL_P 12 /* The greater is P, the smaller the error. */
220384
220808
  #define HLL_Q (64-HLL_P) /* The number of bits of the hash value used for
220385
220809
  determining the number of leading zeros. */
220386
220810
  #define HLL_REGISTERS (1<<HLL_P) /* With P=12, 4096 registers. */
@@ -220677,7 +221101,7 @@ int hllPatLen(unsigned char *ele, size_t elesize, long *regp) {
220677
221101
  * The function always succeed, however if as a result of the operation
220678
221102
  * the approximated cardinality changed, 1 is returned. Otherwise 0
220679
221103
  * is returned. */
220680
- int hllDenseSet(uint8_t *registers, long index, uint8_t count) {
221104
+ static inline int hllDenseSet(uint8_t *registers, long index, uint8_t count) {
220681
221105
  uint8_t oldcount;
220682
221106
 
220683
221107
  HLL_DENSE_GET_REGISTER(oldcount,registers,index);
@@ -221417,8 +221841,51 @@ robj *hll_merge(robj **hlls, size_t hll_count) {
221417
221841
  }
221418
221842
  return result;
221419
221843
  }
221844
+
221845
// Returns the value of the HLL_DENSE_SIZE macro as a uint64_t.
// NOTE(review): HLL_DENSE_SIZE is defined outside this hunk; presumably it is the
// byte size of a dense-encoded HyperLogLog (header + registers) -- confirm.
+ uint64_t get_size() {
221846
+ return HLL_DENSE_SIZE;
221847
+ }
221848
+
221849
+ }
221850
+
221851
+ namespace duckdb {
221852
+
221853
// Writes a single (index, count) pair into the dense HyperLogLog behind 'log'.
//   log   - treated as a duckdb_hll::robj*; its 'ptr' member is read as the hllhdr
//   index - register index forwarded to hllDenseSet
//   count - register value forwarded to hllDenseSet
// Returns whatever hllDenseSet returns (per its own comment: 1 if the
// approximated cardinality changed, 0 otherwise).
+ static inline int AddToLog(void *log, const uint64_t &index, const uint8_t &count) {
221854
+ auto o = (duckdb_hll::robj *)log;
221855
+ duckdb_hll::hllhdr *hdr = (duckdb_hll::hllhdr *)o->ptr;
221856
// only the dense encoding is handled here; checked in debug builds via D_ASSERT
+ D_ASSERT(hdr->encoding == HLL_DENSE);
221857
// NOTE(review): the register base is passed as 'registers + 1'; the reason for the
// one-byte offset is not visible in this hunk -- confirm against the hllhdr layout.
+ return duckdb_hll::hllDenseSet(hdr->registers + 1, index, count);
221858
+ }
221859
+
221860
// Applies 'count' precomputed (indices[i], counts[i]) pairs, each to its own log.
// For every i in [0, count):
//   - the target is looked up as logs[log_sel->get_index(i)]
//   - the entry is skipped when that pointer is null, or when the input row
//     vdata.sel->get_index(i) is not valid according to vdata.validity
//   - otherwise the pointer is dereferenced twice and handed to AddToLog
// 'indices' and 'counts' are indexed by i directly, not through a selection vector.
// The int returned by AddToLog is discarded.
+ void AddToLogsInternal(VectorData &vdata, idx_t count, uint64_t indices[], uint8_t counts[], void ***logs[],
221861
+ const SelectionVector *log_sel) {
221862
+ // 'logs' is an array of pointers to AggregateStates
221863
+ // AggregateStates have a pointer to a HyperLogLog object
221864
+ // HyperLogLog objects have a pointer to a 'robj', which we need
221865
+ for (idx_t i = 0; i < count; i++) {
221866
+ auto log = logs[log_sel->get_index(i)];
221867
// NOTE(review): only 'log' itself is null-checked; '*log' is dereferenced unchecked
+ if (log && vdata.validity.RowIsValid(vdata.sel->get_index(i))) {
221868
+ AddToLog(**log, indices[i], counts[i]);
221869
+ }
221870
+ }
221871
+ }
221872
+
221873
// Applies 'count' precomputed (indices[i], counts[i]) pairs to one dense HyperLogLog.
//   log - treated as a duckdb_hll::robj*; its 'ptr' member is read as the hllhdr
// The header lookup and register base are computed once, before the loop.
// Entries whose input row vdata.sel->get_index(i) is not valid are skipped.
// The int returned by hllDenseSet is discarded.
+ void AddToSingleLogInternal(VectorData &vdata, idx_t count, uint64_t indices[], uint8_t counts[], void *log) {
221874
+ const auto o = (duckdb_hll::robj *)log;
221875
+ duckdb_hll::hllhdr *hdr = (duckdb_hll::hllhdr *)o->ptr;
221876
// only the dense encoding is handled here; checked in debug builds via D_ASSERT
+ D_ASSERT(hdr->encoding == HLL_DENSE);
221877
+
221878
// NOTE(review): same 'registers + 1' base as AddToLog; the offset is not explained
// in this hunk -- confirm against the hllhdr layout.
+ const auto registers = hdr->registers + 1;
221879
+ for (idx_t i = 0; i < count; i++) {
221880
+ if (vdata.validity.RowIsValid(vdata.sel->get_index(i))) {
221881
+ duckdb_hll::hllDenseSet(registers, indices[i], counts[i]);
221882
+ }
221883
+ }
221420
221884
  }
221421
221885
 
221886
+ } // namespace duckdb
221887
+
221888
+
221422
221889
  // LICENSE_CHANGE_END
221423
221890
 
221424
221891