duckdb 0.7.1-dev407.0 → 0.7.1-dev415.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/duckdb/src/catalog/catalog_entry/schema_catalog_entry.cpp +1 -1
- package/src/duckdb/src/catalog/catalog_set.cpp +1 -1
- package/src/duckdb/src/common/string_util.cpp +8 -4
- package/src/duckdb/src/function/table/version/pragma_version.cpp +2 -2
- package/src/duckdb/src/include/duckdb/common/string_util.hpp +9 -2
- package/src/duckdb/src/planner/bind_context.cpp +1 -1
- package/src/duckdb/src/planner/logical_operator.cpp +2 -1
package/package.json
CHANGED
|
@@ -24,7 +24,7 @@ SimilarCatalogEntry SchemaCatalogEntry::GetSimilarEntry(CatalogTransaction trans
|
|
|
24
24
|
const string &name) {
|
|
25
25
|
SimilarCatalogEntry result;
|
|
26
26
|
Scan(transaction.GetContext(), type, [&](CatalogEntry *entry) {
|
|
27
|
-
auto ldist = StringUtil::
|
|
27
|
+
auto ldist = StringUtil::SimilarityScore(entry->name, name);
|
|
28
28
|
if (ldist < result.distance) {
|
|
29
29
|
result.distance = ldist;
|
|
30
30
|
result.name = entry->name;
|
|
@@ -460,7 +460,7 @@ SimilarCatalogEntry CatalogSet::SimilarEntry(CatalogTransaction transaction, con
|
|
|
460
460
|
for (auto &kv : mapping) {
|
|
461
461
|
auto mapping_value = GetMapping(transaction, kv.first);
|
|
462
462
|
if (mapping_value && !mapping_value->deleted) {
|
|
463
|
-
auto ldist = StringUtil::
|
|
463
|
+
auto ldist = StringUtil::SimilarityScore(kv.first, name);
|
|
464
464
|
if (ldist < result.distance) {
|
|
465
465
|
result.distance = ldist;
|
|
466
466
|
result.name = kv.first;
|
|
@@ -249,7 +249,7 @@ private:
|
|
|
249
249
|
};
|
|
250
250
|
|
|
251
251
|
// adapted from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#C++
|
|
252
|
-
idx_t StringUtil::LevenshteinDistance(const string &s1_p, const string &s2_p) {
|
|
252
|
+
idx_t StringUtil::LevenshteinDistance(const string &s1_p, const string &s2_p, idx_t not_equal_penalty) {
|
|
253
253
|
auto s1 = StringUtil::Lower(s1_p);
|
|
254
254
|
auto s2 = StringUtil::Lower(s2_p);
|
|
255
255
|
idx_t len1 = s1.size();
|
|
@@ -273,7 +273,7 @@ idx_t StringUtil::LevenshteinDistance(const string &s1_p, const string &s2_p) {
|
|
|
273
273
|
// d[i][j] = std::min({ d[i - 1][j] + 1,
|
|
274
274
|
// d[i][j - 1] + 1,
|
|
275
275
|
// d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : 1) });
|
|
276
|
-
int equal = s1[i - 1] == s2[j - 1] ? 0 :
|
|
276
|
+
int equal = s1[i - 1] == s2[j - 1] ? 0 : not_equal_penalty;
|
|
277
277
|
idx_t adjacent_score1 = array.Score(i - 1, j) + 1;
|
|
278
278
|
idx_t adjacent_score2 = array.Score(i, j - 1) + 1;
|
|
279
279
|
idx_t adjacent_score3 = array.Score(i - 1, j - 1) + equal;
|
|
@@ -285,15 +285,19 @@ idx_t StringUtil::LevenshteinDistance(const string &s1_p, const string &s2_p) {
|
|
|
285
285
|
return array.Score(len1, len2);
|
|
286
286
|
}
|
|
287
287
|
|
|
288
|
+
idx_t StringUtil::SimilarityScore(const string &s1, const string &s2) {
|
|
289
|
+
return LevenshteinDistance(s1, s2, 3);
|
|
290
|
+
}
|
|
291
|
+
|
|
288
292
|
vector<string> StringUtil::TopNLevenshtein(const vector<string> &strings, const string &target, idx_t n,
|
|
289
293
|
idx_t threshold) {
|
|
290
294
|
vector<pair<string, idx_t>> scores;
|
|
291
295
|
scores.reserve(strings.size());
|
|
292
296
|
for (auto &str : strings) {
|
|
293
297
|
if (target.size() < str.size()) {
|
|
294
|
-
scores.emplace_back(str,
|
|
298
|
+
scores.emplace_back(str, SimilarityScore(str.substr(0, target.size()), target));
|
|
295
299
|
} else {
|
|
296
|
-
scores.emplace_back(str,
|
|
300
|
+
scores.emplace_back(str, SimilarityScore(str, target));
|
|
297
301
|
}
|
|
298
302
|
}
|
|
299
303
|
return TopNStrings(scores, n, threshold);
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
#ifndef DUCKDB_VERSION
|
|
2
|
-
#define DUCKDB_VERSION "0.7.1-dev407"
|
|
2
|
+
#define DUCKDB_VERSION "0.7.1-dev415"
|
|
3
3
|
#endif
|
|
4
4
|
#ifndef DUCKDB_SOURCE_ID
|
|
5
|
-
#define DUCKDB_SOURCE_ID "
|
|
5
|
+
#define DUCKDB_SOURCE_ID "ae3510f069"
|
|
6
6
|
#endif
|
|
7
7
|
#include "duckdb/function/table/system_functions.hpp"
|
|
8
8
|
#include "duckdb/main/database.hpp"
|
|
@@ -145,8 +145,15 @@ public:
|
|
|
145
145
|
DUCKDB_API static string Replace(string source, const string &from, const string &to);
|
|
146
146
|
|
|
147
147
|
//! Get the levenshtein distance from two strings
|
|
148
|
-
|
|
149
|
-
|
|
148
|
+
//! The not_equal_penalty is the penalty given when two characters in a string are not equal
|
|
149
|
+
//! The regular levenshtein distance has a not equal penalty of 1, which means changing a character is as expensive
|
|
150
|
+
//! as adding or removing one. For similarity searches we often want to give extra weight to changing a character. For
|
|
151
|
+
//! example: with a not-equal penalty of 1, "pg_am" is closer to "depdelay" than "depdelay_minutes" is;
|
|
152
|
+
//! with a not-equal penalty of 3, "depdelay_minutes" is closer to "depdelay" than "pg_am" is
|
|
153
|
+
DUCKDB_API static idx_t LevenshteinDistance(const string &s1, const string &s2, idx_t not_equal_penalty = 1);
|
|
154
|
+
|
|
155
|
+
//! Returns the similarity score between two strings
|
|
156
|
+
DUCKDB_API static idx_t SimilarityScore(const string &s1, const string &s2);
|
|
150
157
|
//! Get the top-n strings (sorted by the given score distance) from a set of scores.
|
|
151
158
|
//! At least one entry is returned (if there is one).
|
|
152
159
|
//! Strings are only returned if they have a score less than the threshold.
|
|
@@ -44,7 +44,7 @@ vector<string> BindContext::GetSimilarBindings(const string &column_name) {
|
|
|
44
44
|
for (auto &kv : bindings) {
|
|
45
45
|
auto binding = kv.second.get();
|
|
46
46
|
for (auto &name : binding->names) {
|
|
47
|
-
idx_t distance = StringUtil::
|
|
47
|
+
idx_t distance = StringUtil::SimilarityScore(name, column_name);
|
|
48
48
|
scores.emplace_back(binding->alias + "." + name, distance);
|
|
49
49
|
}
|
|
50
50
|
}
|
|
@@ -167,7 +167,8 @@ idx_t LogicalOperator::EstimateCardinality(ClientContext &context) {
|
|
|
167
167
|
max_cardinality = MaxValue(child->EstimateCardinality(context), max_cardinality);
|
|
168
168
|
}
|
|
169
169
|
has_estimated_cardinality = true;
|
|
170
|
-
|
|
170
|
+
estimated_cardinality = max_cardinality;
|
|
171
|
+
return estimated_cardinality;
|
|
171
172
|
}
|
|
172
173
|
|
|
173
174
|
void LogicalOperator::Print() {
|