duckdb 0.7.1-dev407.0 → 0.7.1-dev415.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "name": "duckdb",
3
3
  "main": "./lib/duckdb.js",
4
4
  "types": "./lib/duckdb.d.ts",
5
- "version": "0.7.1-dev407.0",
5
+ "version": "0.7.1-dev415.0",
6
6
  "description": "DuckDB node.js API",
7
7
  "gypfile": true,
8
8
  "dependencies": {
@@ -24,7 +24,7 @@ SimilarCatalogEntry SchemaCatalogEntry::GetSimilarEntry(CatalogTransaction trans
24
24
  const string &name) {
25
25
  SimilarCatalogEntry result;
26
26
  Scan(transaction.GetContext(), type, [&](CatalogEntry *entry) {
27
- auto ldist = StringUtil::LevenshteinDistance(entry->name, name);
27
+ auto ldist = StringUtil::SimilarityScore(entry->name, name);
28
28
  if (ldist < result.distance) {
29
29
  result.distance = ldist;
30
30
  result.name = entry->name;
@@ -460,7 +460,7 @@ SimilarCatalogEntry CatalogSet::SimilarEntry(CatalogTransaction transaction, con
460
460
  for (auto &kv : mapping) {
461
461
  auto mapping_value = GetMapping(transaction, kv.first);
462
462
  if (mapping_value && !mapping_value->deleted) {
463
- auto ldist = StringUtil::LevenshteinDistance(kv.first, name);
463
+ auto ldist = StringUtil::SimilarityScore(kv.first, name);
464
464
  if (ldist < result.distance) {
465
465
  result.distance = ldist;
466
466
  result.name = kv.first;
@@ -249,7 +249,7 @@ private:
249
249
  };
250
250
 
251
251
  // adapted from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#C++
252
- idx_t StringUtil::LevenshteinDistance(const string &s1_p, const string &s2_p) {
252
+ idx_t StringUtil::LevenshteinDistance(const string &s1_p, const string &s2_p, idx_t not_equal_penalty) {
253
253
  auto s1 = StringUtil::Lower(s1_p);
254
254
  auto s2 = StringUtil::Lower(s2_p);
255
255
  idx_t len1 = s1.size();
@@ -273,7 +273,7 @@ idx_t StringUtil::LevenshteinDistance(const string &s1_p, const string &s2_p) {
273
273
  // d[i][j] = std::min({ d[i - 1][j] + 1,
274
274
  // d[i][j - 1] + 1,
275
275
  // d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : 1) });
276
- int equal = s1[i - 1] == s2[j - 1] ? 0 : 1;
276
+ int equal = s1[i - 1] == s2[j - 1] ? 0 : not_equal_penalty;
277
277
  idx_t adjacent_score1 = array.Score(i - 1, j) + 1;
278
278
  idx_t adjacent_score2 = array.Score(i, j - 1) + 1;
279
279
  idx_t adjacent_score3 = array.Score(i - 1, j - 1) + equal;
@@ -285,15 +285,19 @@ idx_t StringUtil::LevenshteinDistance(const string &s1_p, const string &s2_p) {
285
285
  return array.Score(len1, len2);
286
286
  }
287
287
 
288
+ idx_t StringUtil::SimilarityScore(const string &s1, const string &s2) {
289
+ return LevenshteinDistance(s1, s2, 3);
290
+ }
291
+
288
292
  vector<string> StringUtil::TopNLevenshtein(const vector<string> &strings, const string &target, idx_t n,
289
293
  idx_t threshold) {
290
294
  vector<pair<string, idx_t>> scores;
291
295
  scores.reserve(strings.size());
292
296
  for (auto &str : strings) {
293
297
  if (target.size() < str.size()) {
294
- scores.emplace_back(str, LevenshteinDistance(str.substr(0, target.size()), target));
298
+ scores.emplace_back(str, SimilarityScore(str.substr(0, target.size()), target));
295
299
  } else {
296
- scores.emplace_back(str, LevenshteinDistance(str, target));
300
+ scores.emplace_back(str, SimilarityScore(str, target));
297
301
  }
298
302
  }
299
303
  return TopNStrings(scores, n, threshold);
@@ -1,8 +1,8 @@
1
1
  #ifndef DUCKDB_VERSION
2
- #define DUCKDB_VERSION "0.7.1-dev407"
2
+ #define DUCKDB_VERSION "0.7.1-dev415"
3
3
  #endif
4
4
  #ifndef DUCKDB_SOURCE_ID
5
- #define DUCKDB_SOURCE_ID "66ba97b5f9"
5
+ #define DUCKDB_SOURCE_ID "ae3510f069"
6
6
  #endif
7
7
  #include "duckdb/function/table/system_functions.hpp"
8
8
  #include "duckdb/main/database.hpp"
@@ -145,8 +145,15 @@ public:
145
145
  DUCKDB_API static string Replace(string source, const string &from, const string &to);
146
146
 
147
147
  //! Get the levenshtein distance from two strings
148
- DUCKDB_API static idx_t LevenshteinDistance(const string &s1, const string &s2);
149
-
148
+ //! The not_equal_penalty is the penalty given when two characters in a string are not equal
149
+ //! The regular levenshtein distance has a not equal penalty of 1, which means changing a character is as expensive
150
+ //! as adding or removing one. For similarity searches we often want to give extra weight to changing a character. For
151
+ //! example: with a not equal penalty of 1, "pg_am" is closer to "depdelay" than "depdelay_minutes" is;
152
+ //! with a not equal penalty of 3, "depdelay_minutes" is closer to "depdelay" than "pg_am" is.
153
+ DUCKDB_API static idx_t LevenshteinDistance(const string &s1, const string &s2, idx_t not_equal_penalty = 1);
154
+
155
+ //! Returns the similarity score between two strings
156
+ DUCKDB_API static idx_t SimilarityScore(const string &s1, const string &s2);
150
157
  //! Get the top-n strings (sorted by the given score distance) from a set of scores.
151
158
  //! At least one entry is returned (if there is one).
152
159
  //! Strings are only returned if they have a score less than the threshold.
@@ -44,7 +44,7 @@ vector<string> BindContext::GetSimilarBindings(const string &column_name) {
44
44
  for (auto &kv : bindings) {
45
45
  auto binding = kv.second.get();
46
46
  for (auto &name : binding->names) {
47
- idx_t distance = StringUtil::LevenshteinDistance(name, column_name);
47
+ idx_t distance = StringUtil::SimilarityScore(name, column_name);
48
48
  scores.emplace_back(binding->alias + "." + name, distance);
49
49
  }
50
50
  }
@@ -167,7 +167,8 @@ idx_t LogicalOperator::EstimateCardinality(ClientContext &context) {
167
167
  max_cardinality = MaxValue(child->EstimateCardinality(context), max_cardinality);
168
168
  }
169
169
  has_estimated_cardinality = true;
170
- return max_cardinality;
170
+ estimated_cardinality = max_cardinality;
171
+ return estimated_cardinality;
171
172
  }
172
173
 
173
174
  void LogicalOperator::Print() {