duckdb 0.3.5-dev692.0 → 0.3.5-dev699.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
- "version": "0.3.5-dev692.0",
4
+ "version": "0.3.5-dev699.0",
5
5
  "description": "DuckDB node.js API",
6
6
  "gypfile": true,
7
7
  "dependencies": {
package/src/duckdb.cpp CHANGED
@@ -41296,11 +41296,11 @@ inline uint64_t TemplatedHash(const string_t &elem) {
41296
41296
  data_ptr_t data = (data_ptr_t)elem.GetDataUnsafe();
41297
41297
  const auto &len = elem.GetSize();
41298
41298
  uint64_t h = 0;
41299
- for (idx_t i = 0; i < len / 8; i += 8) {
41299
+ for (idx_t i = 0; i + sizeof(uint64_t) <= len; i += sizeof(uint64_t)) {
41300
41300
  h ^= TemplatedHash<uint64_t>(Load<uint64_t>(data));
41301
- data += 8;
41301
+ data += sizeof(uint64_t);
41302
41302
  }
41303
- switch (len & 7) {
41303
+ switch (len & (sizeof(uint64_t) - 1)) {
41304
41304
  case 4:
41305
41305
  h ^= TemplatedHash<uint32_t>(Load<uint32_t>(data));
41306
41306
  break;
@@ -181055,6 +181055,8 @@ void BaseStatistics::Verify(Vector &vector, idx_t count) const {
181055
181055
 
181056
181056
 
181057
181057
 
181058
+ #include <math.h>
181059
+
181058
181060
  namespace duckdb {
181059
181061
 
181060
181062
  DistinctStatistics::DistinctStatistics()
@@ -181115,7 +181117,7 @@ void DistinctStatistics::Update(VectorData &vdata, const LogicalType &type, idx_
181115
181117
  return;
181116
181118
  }
181117
181119
  total_count += count;
181118
- count = MaxValue<idx_t>(idx_t(SAMPLE_RATE * double(count)), 1);
181120
+ count = MinValue<idx_t>(idx_t(SAMPLE_RATE * MaxValue<idx_t>(STANDARD_VECTOR_SIZE, count)), count);
181119
181121
  sample_count += count;
181120
181122
 
181121
181123
  uint64_t indices[STANDARD_VECTOR_SIZE];
@@ -181133,12 +181135,17 @@ idx_t DistinctStatistics::GetCount() const {
181133
181135
  if (sample_count == 0 || total_count == 0) {
181134
181136
  return 0;
181135
181137
  }
181136
- // Estimate HLL count because we use sampling
181137
- double hll_count = log->Count();
181138
- double unique_proportion = hll_count / double(sample_count);
181139
- double actual_sample_rate = double(sample_count) / double(total_count);
181140
- double multiplier = double(1) + unique_proportion * (double(1) / actual_sample_rate - double(1));
181141
- return idx_t(multiplier * hll_count);
181138
+
181139
+ double u = MinValue<idx_t>(log->Count(), sample_count);
181140
+ double s = sample_count;
181141
+ double n = total_count;
181142
+
181143
+ // Assume this proportion of the sampled values occurred only once
181144
+ double u1 = pow(u / s, 2) * u;
181145
+
181146
+ // Estimate total uniques using Good-Turing estimation
181147
+ idx_t estimate = u + u1 / s * (n - s);
181148
+ return MinValue<idx_t>(estimate, total_count);
181142
181149
  }
181143
181150
 
181144
181151
  } // namespace duckdb
package/src/duckdb.hpp CHANGED
@@ -11,8 +11,8 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
11
11
  #pragma once
12
12
  #define DUCKDB_AMALGAMATION 1
13
13
  #define DUCKDB_AMALGAMATION_EXTENDED 1
14
- #define DUCKDB_SOURCE_ID "c2581dfeb"
15
- #define DUCKDB_VERSION "v0.3.5-dev692"
14
+ #define DUCKDB_SOURCE_ID "360aedc4f"
15
+ #define DUCKDB_VERSION "v0.3.5-dev699"
16
16
  //===----------------------------------------------------------------------===//
17
17
  // DuckDB
18
18
  //