quickmatch-js 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/config.rs +19 -2
- package/src/index.js +252 -253
- package/src/lib.rs +72 -38
- package/LICENSE +0 -21
- package/README.md +0 -26
package/package.json
CHANGED
package/src/config.rs
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
|
-
const DEFAULT_SEPARATORS: &[char] = &['_', '-', ' '];
|
|
1
|
+
const DEFAULT_SEPARATORS: &[char] = &['_', '-', ' ', ':', '/'];
|
|
2
2
|
const DEFAULT_TRIGRAM_BUDGET: usize = 6;
|
|
3
3
|
const DEFAULT_LIMIT: usize = 100;
|
|
4
|
+
const DEFAULT_MIN_SCORE: usize = 2;
|
|
4
5
|
|
|
5
6
|
pub struct QuickMatchConfig {
|
|
6
7
|
/// Separators used to split words.
|
|
7
8
|
///
|
|
8
|
-
/// Default: ['_', '-', ' ']
|
|
9
|
+
/// Default: ['_', '-', ' ', ':', '/']
|
|
9
10
|
separators: &'static [char],
|
|
10
11
|
/// Maximum number of results to return.
|
|
11
12
|
///
|
|
@@ -22,6 +23,12 @@ pub struct QuickMatchConfig {
|
|
|
22
23
|
/// - High (9-15): Slower, more accurate fuzzy matching
|
|
23
24
|
/// - Max: 20
|
|
24
25
|
trigram_budget: usize,
|
|
26
|
+
/// Minimum trigram score required for fuzzy matches.
|
|
27
|
+
/// Higher values require more trigram overlap, reducing noise.
|
|
28
|
+
///
|
|
29
|
+
/// Default: 2
|
|
30
|
+
/// - Min: 1
|
|
31
|
+
min_score: usize,
|
|
25
32
|
}
|
|
26
33
|
|
|
27
34
|
impl Default for QuickMatchConfig {
|
|
@@ -30,6 +37,7 @@ impl Default for QuickMatchConfig {
|
|
|
30
37
|
separators: DEFAULT_SEPARATORS,
|
|
31
38
|
limit: DEFAULT_LIMIT,
|
|
32
39
|
trigram_budget: DEFAULT_TRIGRAM_BUDGET,
|
|
40
|
+
min_score: DEFAULT_MIN_SCORE,
|
|
33
41
|
}
|
|
34
42
|
}
|
|
35
43
|
}
|
|
@@ -54,6 +62,11 @@ impl QuickMatchConfig {
|
|
|
54
62
|
self
|
|
55
63
|
}
|
|
56
64
|
|
|
65
|
+
pub fn with_min_score(mut self, min_score: usize) -> Self {
|
|
66
|
+
self.min_score = min_score.max(1);
|
|
67
|
+
self
|
|
68
|
+
}
|
|
69
|
+
|
|
57
70
|
pub fn limit(&self) -> usize {
|
|
58
71
|
self.limit
|
|
59
72
|
}
|
|
@@ -65,4 +78,8 @@ impl QuickMatchConfig {
|
|
|
65
78
|
pub fn separators(&self) -> &[char] {
|
|
66
79
|
self.separators
|
|
67
80
|
}
|
|
81
|
+
|
|
82
|
+
pub fn min_score(&self) -> usize {
|
|
83
|
+
self.min_score
|
|
84
|
+
}
|
|
68
85
|
}
|
package/src/index.js
CHANGED
|
@@ -1,47 +1,70 @@
|
|
|
1
|
-
const DEFAULT_SEPARATORS = "_- ";
|
|
1
|
+
const DEFAULT_SEPARATORS = "_- :/";
|
|
2
2
|
const DEFAULT_TRIGRAM_BUDGET = 6;
|
|
3
3
|
const DEFAULT_LIMIT = 100;
|
|
4
|
+
const DEFAULT_MIN_SCORE = 2;
|
|
4
5
|
|
|
5
6
|
/**
|
|
6
7
|
* Configuration for QuickMatch.
|
|
7
8
|
*/
|
|
8
9
|
export class QuickMatchConfig {
|
|
9
|
-
/**
|
|
10
|
+
/**
|
|
11
|
+
* Separators used to split words.
|
|
12
|
+
* @type {string}
|
|
13
|
+
* @default "_- :/"
|
|
14
|
+
*/
|
|
10
15
|
separators = DEFAULT_SEPARATORS;
|
|
11
16
|
|
|
12
|
-
/**
|
|
17
|
+
/**
|
|
18
|
+
* Maximum number of results to return.
|
|
19
|
+
* @type {number}
|
|
20
|
+
* @default 100
|
|
21
|
+
*/
|
|
13
22
|
limit = DEFAULT_LIMIT;
|
|
14
23
|
|
|
15
|
-
/**
|
|
24
|
+
/**
|
|
25
|
+
* Budget of trigrams to process from unknown words.
|
|
26
|
+
* This budget is distributed fairly across all unknown words.
|
|
27
|
+
*
|
|
28
|
+
* - 0: Disable trigram matching (only exact word matches)
|
|
29
|
+
* - Low (3-6): Faster, less accurate fuzzy matching
|
|
30
|
+
* - High (9-15): Slower, more accurate fuzzy matching
|
|
31
|
+
* - Max: 20
|
|
32
|
+
* @type {number}
|
|
33
|
+
* @default 6
|
|
34
|
+
*/
|
|
16
35
|
trigramBudget = DEFAULT_TRIGRAM_BUDGET;
|
|
17
36
|
|
|
18
37
|
/**
|
|
19
|
-
*
|
|
20
|
-
*
|
|
38
|
+
* Minimum trigram score required for fuzzy matches.
|
|
39
|
+
* Higher values require more trigram overlap, reducing noise.
|
|
40
|
+
* @type {number}
|
|
41
|
+
* @default 2
|
|
21
42
|
*/
|
|
43
|
+
minScore = DEFAULT_MIN_SCORE;
|
|
44
|
+
|
|
45
|
+
/** @param {number} n - Max results (default: 100, min: 1) */
|
|
22
46
|
withLimit(n) {
|
|
23
47
|
this.limit = Math.max(1, n);
|
|
24
48
|
return this;
|
|
25
49
|
}
|
|
26
50
|
|
|
27
|
-
/**
|
|
28
|
-
* Set trigram budget for fuzzy matching.
|
|
29
|
-
* Higher values find more typos but cost more.
|
|
30
|
-
* @param {number} n - Budget (0-20, default: 6)
|
|
31
|
-
*/
|
|
51
|
+
/** @param {number} n - Trigram budget (0-20, default: 6) */
|
|
32
52
|
withTrigramBudget(n) {
|
|
33
53
|
this.trigramBudget = Math.max(0, Math.min(20, n));
|
|
34
54
|
return this;
|
|
35
55
|
}
|
|
36
56
|
|
|
37
|
-
/**
|
|
38
|
-
* Set word separator characters.
|
|
39
|
-
* @param {string} s - Separator characters (default: '_- ')
|
|
40
|
-
*/
|
|
57
|
+
/** @param {string} s - Separator characters (default: '_- :/') */
|
|
41
58
|
withSeparators(s) {
|
|
42
59
|
this.separators = s;
|
|
43
60
|
return this;
|
|
44
61
|
}
|
|
62
|
+
|
|
63
|
+
/** @param {number} n - Min trigram score (default: 2, min: 1) */
|
|
64
|
+
withMinScore(n) {
|
|
65
|
+
this.minScore = Math.max(1, n);
|
|
66
|
+
return this;
|
|
67
|
+
}
|
|
45
68
|
}
|
|
46
69
|
|
|
47
70
|
/**
|
|
@@ -49,9 +72,8 @@ export class QuickMatchConfig {
|
|
|
49
72
|
*/
|
|
50
73
|
export class QuickMatch {
|
|
51
74
|
/**
|
|
52
|
-
* Create a new matcher.
|
|
53
75
|
* @param {string[]} items - Items to index (should be lowercase)
|
|
54
|
-
* @param {QuickMatchConfig} [config]
|
|
76
|
+
* @param {QuickMatchConfig} [config]
|
|
55
77
|
*/
|
|
56
78
|
constructor(items, config = new QuickMatchConfig()) {
|
|
57
79
|
this.config = config;
|
|
@@ -60,263 +82,238 @@ export class QuickMatch {
|
|
|
60
82
|
this.wordIndex = new Map();
|
|
61
83
|
/** @type {Map<string, number[]>} */
|
|
62
84
|
this.trigramIndex = new Map();
|
|
85
|
+
this._sepLookup = sepLookup(config.separators);
|
|
86
|
+
this._scores = new Uint32Array(items.length);
|
|
87
|
+
/** @type {number[]} */
|
|
88
|
+
this._dirty = [];
|
|
63
89
|
|
|
64
|
-
let
|
|
65
|
-
let
|
|
66
|
-
let
|
|
67
|
-
|
|
68
|
-
const { separators } = config;
|
|
69
|
-
|
|
70
|
-
for (let itemIndex = 0; itemIndex < items.length; itemIndex++) {
|
|
71
|
-
const item = items[itemIndex];
|
|
90
|
+
let maxWordLen = 0;
|
|
91
|
+
let maxQueryLen = 0;
|
|
92
|
+
let maxWords = 0;
|
|
93
|
+
const sep = this._sepLookup;
|
|
72
94
|
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
95
|
+
for (let idx = 0; idx < items.length; idx++) {
|
|
96
|
+
const item = items[idx];
|
|
97
|
+
if (item.length > maxQueryLen) maxQueryLen = item.length;
|
|
76
98
|
|
|
77
|
-
|
|
78
|
-
let
|
|
99
|
+
const words = [];
|
|
100
|
+
let start = 0;
|
|
79
101
|
|
|
80
102
|
for (let i = 0; i <= item.length; i++) {
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
maxWordLength = word.length;
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
addToIndex(this.wordIndex, word, itemIndex);
|
|
92
|
-
addTrigramsToIndex(this.trigramIndex, word, itemIndex);
|
|
93
|
-
|
|
94
|
-
wordStart = i + 1;
|
|
95
|
-
} else if (isEndOfWord) {
|
|
96
|
-
wordStart = i + 1;
|
|
103
|
+
if (i < item.length && !sep[item.charCodeAt(i)]) continue;
|
|
104
|
+
if (i > start) {
|
|
105
|
+
const word = item.slice(start, i);
|
|
106
|
+
words.push(word);
|
|
107
|
+
if (word.length > maxWordLen) maxWordLen = word.length;
|
|
108
|
+
addToIndex(this.wordIndex, word, idx);
|
|
109
|
+
indexTrigrams(this.trigramIndex, word, idx);
|
|
97
110
|
}
|
|
111
|
+
start = i + 1;
|
|
98
112
|
}
|
|
99
113
|
|
|
100
|
-
|
|
101
|
-
|
|
114
|
+
for (let i = 0; i < words.length - 1; i++) {
|
|
115
|
+
addToIndex(this.wordIndex, words[i] + words[i + 1], idx);
|
|
102
116
|
}
|
|
117
|
+
|
|
118
|
+
if (words.length > maxWords) maxWords = words.length;
|
|
103
119
|
}
|
|
104
120
|
|
|
105
|
-
this.
|
|
106
|
-
this.
|
|
107
|
-
this.
|
|
121
|
+
this.maxWordLen = maxWordLen + 4;
|
|
122
|
+
this.maxQueryLen = maxQueryLen + 6;
|
|
123
|
+
this.maxWords = maxWords + 2;
|
|
108
124
|
}
|
|
109
125
|
|
|
110
|
-
/**
|
|
111
|
-
* Find matching items. Returns items sorted by relevance.
|
|
112
|
-
* @param {string} query - Search query
|
|
113
|
-
*/
|
|
126
|
+
/** @param {string} query */
|
|
114
127
|
matches(query) {
|
|
115
128
|
return this.matchesWith(query, this.config);
|
|
116
129
|
}
|
|
117
130
|
|
|
118
131
|
/**
|
|
119
|
-
*
|
|
120
|
-
* @param {
|
|
121
|
-
* @param {QuickMatchConfig} config - Configuration to use
|
|
132
|
+
* @param {string} query
|
|
133
|
+
* @param {QuickMatchConfig} config
|
|
122
134
|
*/
|
|
123
135
|
matchesWith(query, config) {
|
|
124
|
-
const { limit, trigramBudget
|
|
125
|
-
|
|
126
|
-
|
|
136
|
+
const { limit, trigramBudget } = config;
|
|
137
|
+
const sep =
|
|
138
|
+
config.separators === this.config.separators
|
|
139
|
+
? this._sepLookup
|
|
140
|
+
: sepLookup(config.separators);
|
|
127
141
|
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
}
|
|
131
|
-
|
|
132
|
-
const queryWords = parseWords(
|
|
133
|
-
normalizedQuery,
|
|
134
|
-
separators,
|
|
135
|
-
this.maxWordLength,
|
|
136
|
-
);
|
|
137
|
-
|
|
138
|
-
if (!queryWords.length || queryWords.length > this.maxWordCount) {
|
|
139
|
-
return [];
|
|
140
|
-
}
|
|
142
|
+
const q = normalize(query);
|
|
143
|
+
if (!q || q.length > this.maxQueryLen) return [];
|
|
141
144
|
|
|
142
|
-
const
|
|
143
|
-
|
|
145
|
+
const qwords = splitWords(q, sep, this.maxWordLen);
|
|
146
|
+
if (!qwords.length || qwords.length > this.maxWords) return [];
|
|
144
147
|
|
|
145
|
-
|
|
146
|
-
|
|
148
|
+
const known = [];
|
|
149
|
+
const unknown = [];
|
|
147
150
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
}
|
|
151
|
+
for (const w of qwords) {
|
|
152
|
+
const hits = this.wordIndex.get(w);
|
|
153
|
+
if (hits) known.push(hits);
|
|
154
|
+
else if (w.length >= 3 && unknown.length < trigramBudget) unknown.push(w);
|
|
153
155
|
}
|
|
154
156
|
|
|
155
|
-
const
|
|
156
|
-
const
|
|
157
|
-
const needsFuzzyMatching = unknownWords.length > 0 && trigramBudget > 0;
|
|
157
|
+
const pool = intersect(known);
|
|
158
|
+
const hasPool = pool.length > 0;
|
|
158
159
|
|
|
159
|
-
if (!
|
|
160
|
-
if (!
|
|
161
|
-
return this.
|
|
160
|
+
if (!unknown.length || !trigramBudget) {
|
|
161
|
+
if (!hasPool) return [];
|
|
162
|
+
return this._rank(pool, null, qwords, sep, limit);
|
|
162
163
|
}
|
|
163
164
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
if (
|
|
167
|
-
for (const
|
|
168
|
-
scores
|
|
165
|
+
// Seed scores from exact-match pool
|
|
166
|
+
const { _scores: scores, _dirty: dirty } = this;
|
|
167
|
+
if (hasPool) {
|
|
168
|
+
for (const i of pool) {
|
|
169
|
+
scores[i] = 1;
|
|
170
|
+
dirty.push(i);
|
|
169
171
|
}
|
|
170
172
|
}
|
|
171
173
|
|
|
172
|
-
const
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
});
|
|
181
|
-
|
|
182
|
-
const minScoreToInclude = Math.max(1, Math.ceil(hitCount / 2));
|
|
174
|
+
const hitCount = this._scoreTrigrams(
|
|
175
|
+
unknown,
|
|
176
|
+
trigramBudget,
|
|
177
|
+
hasPool,
|
|
178
|
+
Math.max(0, q.length - 3),
|
|
179
|
+
);
|
|
180
|
+
const minScore = Math.max(config.minScore, Math.ceil(hitCount / 2));
|
|
181
|
+
const result = this._rank(dirty, minScore, qwords, sep, limit);
|
|
183
182
|
|
|
184
|
-
|
|
183
|
+
for (const i of dirty) scores[i] = 0;
|
|
184
|
+
dirty.length = 0;
|
|
185
|
+
return result;
|
|
185
186
|
}
|
|
186
187
|
|
|
187
188
|
/**
|
|
188
189
|
* @private
|
|
189
|
-
* @param {
|
|
190
|
+
* @param {string[]} unknown
|
|
191
|
+
* @param {number} budget
|
|
192
|
+
* @param {boolean} poolOnly
|
|
193
|
+
* @param {number} minLen
|
|
190
194
|
*/
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
minItemLength,
|
|
197
|
-
}) {
|
|
198
|
-
const visitedTrigrams = new Set();
|
|
199
|
-
let budgetRemaining = budget;
|
|
200
|
-
let hitCount = 0;
|
|
195
|
+
_scoreTrigrams(unknown, budget, poolOnly, minLen) {
|
|
196
|
+
const visited = new Set();
|
|
197
|
+
const { _scores: scores, _dirty: dirty, items } = this;
|
|
198
|
+
let remaining = budget;
|
|
199
|
+
let hits = 0;
|
|
201
200
|
|
|
202
201
|
outer: for (let round = 0; round < budget; round++) {
|
|
203
|
-
for (const word of
|
|
204
|
-
if (
|
|
202
|
+
for (const word of unknown) {
|
|
203
|
+
if (remaining <= 0) break outer;
|
|
205
204
|
|
|
206
|
-
const
|
|
207
|
-
if (
|
|
205
|
+
const pos = trigramPosition(word.length, round);
|
|
206
|
+
if (pos < 0) continue;
|
|
208
207
|
|
|
209
|
-
const
|
|
210
|
-
|
|
208
|
+
const tri = word[pos] + word[pos + 1] + word[pos + 2];
|
|
209
|
+
if (visited.has(tri)) continue;
|
|
210
|
+
visited.add(tri);
|
|
211
|
+
remaining--;
|
|
211
212
|
|
|
212
|
-
|
|
213
|
-
|
|
213
|
+
const matched = this.trigramIndex.get(tri);
|
|
214
|
+
if (!matched) continue;
|
|
215
|
+
hits++;
|
|
214
216
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
scores.set(itemIndex, currentScore + 1);
|
|
217
|
+
if (poolOnly) {
|
|
218
|
+
for (let j = 0; j < matched.length; j++) {
|
|
219
|
+
const i = matched[j];
|
|
220
|
+
if (scores[i] > 0) scores[i]++;
|
|
221
|
+
}
|
|
222
|
+
} else {
|
|
223
|
+
for (let j = 0; j < matched.length; j++) {
|
|
224
|
+
const i = matched[j];
|
|
225
|
+
if (items[i].length >= minLen) {
|
|
226
|
+
if (scores[i] === 0) dirty.push(i);
|
|
227
|
+
scores[i]++;
|
|
227
228
|
}
|
|
228
|
-
} else if (this.items[itemIndex].length >= minItemLength) {
|
|
229
|
-
scores.set(itemIndex, (scores.get(itemIndex) || 0) + 1);
|
|
230
229
|
}
|
|
231
230
|
}
|
|
232
231
|
}
|
|
233
232
|
}
|
|
234
233
|
|
|
235
|
-
return
|
|
234
|
+
return hits;
|
|
236
235
|
}
|
|
237
236
|
|
|
238
237
|
/**
|
|
238
|
+
* Rank candidates by prefix match, then score, then length.
|
|
239
239
|
* @private
|
|
240
240
|
* @param {number[]} indices
|
|
241
|
+
* @param {number|null} minScore - null = no score filtering (exact-match path)
|
|
242
|
+
* @param {string[]} qwords
|
|
243
|
+
* @param {Uint8Array} sep
|
|
241
244
|
* @param {number} limit
|
|
242
245
|
*/
|
|
243
|
-
|
|
244
|
-
const { items } = this;
|
|
245
|
-
indices.sort((a, b) => items[a].length - items[b].length);
|
|
246
|
-
if (indices.length > limit) indices.length = limit;
|
|
247
|
-
return indices.map((i) => items[i]);
|
|
248
|
-
}
|
|
249
|
-
|
|
250
|
-
/**
|
|
251
|
-
* @private
|
|
252
|
-
* @param {Map<number, number>} scores
|
|
253
|
-
* @param {number} minScore
|
|
254
|
-
* @param {number} limit
|
|
255
|
-
*/
|
|
256
|
-
rankedResults(scores, minScore, limit) {
|
|
257
|
-
const { items } = this;
|
|
246
|
+
_rank(indices, minScore, qwords, sep, limit) {
|
|
247
|
+
const { items, _scores: scores } = this;
|
|
258
248
|
const results = [];
|
|
259
249
|
|
|
260
|
-
for (
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
250
|
+
for (let i = 0; i < indices.length; i++) {
|
|
251
|
+
const idx = indices[i];
|
|
252
|
+
if (minScore !== null && scores[idx] < minScore) continue;
|
|
253
|
+
results.push(idx);
|
|
264
254
|
}
|
|
265
255
|
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
}
|
|
256
|
+
const pscores = new Uint8Array(items.length);
|
|
257
|
+
for (let i = 0; i < results.length; i++) {
|
|
258
|
+
pscores[results[i]] = prefixScore(items[results[i]], qwords, sep);
|
|
259
|
+
}
|
|
270
260
|
|
|
271
|
-
|
|
261
|
+
results.sort(
|
|
262
|
+
(a, b) =>
|
|
263
|
+
pscores[b] - pscores[a] ||
|
|
264
|
+
scores[b] - scores[a] ||
|
|
265
|
+
items[a].length - items[b].length,
|
|
266
|
+
);
|
|
272
267
|
|
|
273
|
-
|
|
268
|
+
if (results.length > limit) results.length = limit;
|
|
269
|
+
return results.map((i) => items[i]);
|
|
274
270
|
}
|
|
275
271
|
}
|
|
276
272
|
|
|
273
|
+
// --- Helpers ---
|
|
274
|
+
|
|
277
275
|
/** @param {string} query */
|
|
278
|
-
function
|
|
279
|
-
let
|
|
276
|
+
function normalize(query) {
|
|
277
|
+
let out = "";
|
|
280
278
|
let start = 0;
|
|
281
279
|
let end = query.length;
|
|
282
|
-
|
|
283
280
|
while (start < end && query.charCodeAt(start) <= 32) start++;
|
|
284
281
|
while (end > start && query.charCodeAt(end - 1) <= 32) end--;
|
|
285
|
-
|
|
286
282
|
for (let i = start; i < end; i++) {
|
|
287
|
-
const
|
|
288
|
-
if (
|
|
289
|
-
|
|
290
|
-
code >= 65 && code <= 90 ? String.fromCharCode(code + 32) : query[i];
|
|
283
|
+
const c = query.charCodeAt(i);
|
|
284
|
+
if (c >= 128) continue;
|
|
285
|
+
out += c >= 65 && c <= 90 ? String.fromCharCode(c + 32) : query[i];
|
|
291
286
|
}
|
|
287
|
+
return out;
|
|
288
|
+
}
|
|
292
289
|
|
|
293
|
-
|
|
290
|
+
/** @param {string} separators */
|
|
291
|
+
function sepLookup(separators) {
|
|
292
|
+
const t = new Uint8Array(128);
|
|
293
|
+
for (let i = 0; i < separators.length; i++) {
|
|
294
|
+
const c = separators.charCodeAt(i);
|
|
295
|
+
if (c < 128) t[c] = 1;
|
|
296
|
+
}
|
|
297
|
+
return t;
|
|
294
298
|
}
|
|
295
299
|
|
|
296
300
|
/**
|
|
297
301
|
* @param {string} text
|
|
298
|
-
* @param {
|
|
299
|
-
* @param {number}
|
|
302
|
+
* @param {Uint8Array} sep
|
|
303
|
+
* @param {number} maxLen
|
|
300
304
|
*/
|
|
301
|
-
function
|
|
305
|
+
function splitWords(text, sep, maxLen) {
|
|
302
306
|
/** @type {string[]} */
|
|
303
307
|
const words = [];
|
|
304
308
|
let start = 0;
|
|
305
|
-
|
|
306
309
|
for (let i = 0; i <= text.length; i++) {
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
if (word.length <= maxLength && !words.includes(word)) {
|
|
312
|
-
words.push(word);
|
|
313
|
-
}
|
|
314
|
-
start = i + 1;
|
|
315
|
-
} else if (isEnd) {
|
|
316
|
-
start = i + 1;
|
|
310
|
+
if (i < text.length && !sep[text.charCodeAt(i)]) continue;
|
|
311
|
+
if (i > start) {
|
|
312
|
+
const w = text.slice(start, i);
|
|
313
|
+
if (w.length <= maxLen && !words.includes(w)) words.push(w);
|
|
317
314
|
}
|
|
315
|
+
start = i + 1;
|
|
318
316
|
}
|
|
319
|
-
|
|
320
317
|
return words;
|
|
321
318
|
}
|
|
322
319
|
|
|
@@ -326,102 +323,104 @@ function parseWords(text, separators, maxLength) {
|
|
|
326
323
|
* @param {number} value
|
|
327
324
|
*/
|
|
328
325
|
function addToIndex(index, key, value) {
|
|
329
|
-
const
|
|
330
|
-
if (
|
|
331
|
-
|
|
332
|
-
} else {
|
|
333
|
-
index.set(key, [value]);
|
|
334
|
-
}
|
|
326
|
+
const arr = index.get(key);
|
|
327
|
+
if (arr) arr.push(value);
|
|
328
|
+
else index.set(key, [value]);
|
|
335
329
|
}
|
|
336
330
|
|
|
337
331
|
/**
|
|
338
332
|
* @param {Map<string, number[]>} index
|
|
339
333
|
* @param {string} word
|
|
340
|
-
* @param {number}
|
|
334
|
+
* @param {number} idx
|
|
341
335
|
*/
|
|
342
|
-
function
|
|
336
|
+
function indexTrigrams(index, word, idx) {
|
|
343
337
|
if (word.length < 3) return;
|
|
344
|
-
|
|
345
338
|
for (let i = 0; i <= word.length - 3; i++) {
|
|
346
|
-
const
|
|
347
|
-
const
|
|
348
|
-
|
|
349
|
-
if (
|
|
350
|
-
index.set(trigram, [itemIndex]);
|
|
351
|
-
} else if (existing[existing.length - 1] !== itemIndex) {
|
|
352
|
-
existing.push(itemIndex);
|
|
353
|
-
}
|
|
339
|
+
const tri = word[i] + word[i + 1] + word[i + 2];
|
|
340
|
+
const arr = index.get(tri);
|
|
341
|
+
if (!arr) index.set(tri, [idx]);
|
|
342
|
+
else if (arr[arr.length - 1] !== idx) arr.push(idx);
|
|
354
343
|
}
|
|
355
344
|
}
|
|
356
345
|
|
|
357
346
|
/** @param {number[][]} arrays */
|
|
358
|
-
function
|
|
347
|
+
function intersect(arrays) {
|
|
359
348
|
if (!arrays.length) return [];
|
|
360
349
|
|
|
361
|
-
let
|
|
350
|
+
let si = 0;
|
|
362
351
|
for (let i = 1; i < arrays.length; i++) {
|
|
363
|
-
if (arrays[i].length < arrays[
|
|
364
|
-
smallestIndex = i;
|
|
365
|
-
}
|
|
352
|
+
if (arrays[i].length < arrays[si].length) si = i;
|
|
366
353
|
}
|
|
367
354
|
|
|
368
|
-
const result = arrays[
|
|
369
|
-
|
|
355
|
+
const result = arrays[si].slice();
|
|
370
356
|
for (let i = 0; i < arrays.length && result.length > 0; i++) {
|
|
371
|
-
if (i ===
|
|
372
|
-
|
|
373
|
-
let writeIndex = 0;
|
|
357
|
+
if (i === si) continue;
|
|
358
|
+
let w = 0;
|
|
374
359
|
for (let j = 0; j < result.length; j++) {
|
|
375
|
-
if (
|
|
376
|
-
result[writeIndex++] = result[j];
|
|
377
|
-
}
|
|
360
|
+
if (bsearch(arrays[i], result[j])) result[w++] = result[j];
|
|
378
361
|
}
|
|
379
|
-
result.length =
|
|
362
|
+
result.length = w;
|
|
380
363
|
}
|
|
381
|
-
|
|
382
364
|
return result;
|
|
383
365
|
}
|
|
384
366
|
|
|
385
367
|
/**
|
|
386
|
-
* @param {number[]}
|
|
387
|
-
* @param {number}
|
|
368
|
+
* @param {number[]} arr
|
|
369
|
+
* @param {number} val
|
|
388
370
|
*/
|
|
389
|
-
function
|
|
390
|
-
let
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
if (midValue === value) return true;
|
|
398
|
-
if (midValue < value) low = mid + 1;
|
|
399
|
-
else high = mid - 1;
|
|
371
|
+
function bsearch(arr, val) {
|
|
372
|
+
let lo = 0,
|
|
373
|
+
hi = arr.length - 1;
|
|
374
|
+
while (lo <= hi) {
|
|
375
|
+
const mid = (lo + hi) >> 1;
|
|
376
|
+
if (arr[mid] === val) return true;
|
|
377
|
+
if (arr[mid] < val) lo = mid + 1;
|
|
378
|
+
else hi = mid - 1;
|
|
400
379
|
}
|
|
401
|
-
|
|
402
380
|
return false;
|
|
403
381
|
}
|
|
404
382
|
|
|
405
383
|
/**
|
|
406
|
-
*
|
|
407
|
-
* @param {
|
|
384
|
+
* 2 = exact match, 1 = prefix match, 0 = no match
|
|
385
|
+
* @param {string} item
|
|
386
|
+
* @param {string[]} qwords
|
|
387
|
+
* @param {Uint8Array} sep
|
|
408
388
|
*/
|
|
409
|
-
function
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
389
|
+
function prefixScore(item, qwords, sep) {
|
|
390
|
+
let qi = 0,
|
|
391
|
+
pos = 0;
|
|
392
|
+
const len = item.length;
|
|
393
|
+
|
|
394
|
+
while (qi < qwords.length) {
|
|
395
|
+
while (pos < len && sep[item.charCodeAt(pos)]) pos++;
|
|
396
|
+
if (pos >= len) return 0;
|
|
397
|
+
|
|
398
|
+
const ws = pos;
|
|
399
|
+
while (pos < len && !sep[item.charCodeAt(pos)]) pos++;
|
|
400
|
+
|
|
401
|
+
const qw = qwords[qi];
|
|
402
|
+
if (pos - ws !== qw.length) return 0;
|
|
403
|
+
for (let j = 0; j < qw.length; j++) {
|
|
404
|
+
if (item.charCodeAt(ws + j) !== qw.charCodeAt(j)) return 0;
|
|
405
|
+
}
|
|
406
|
+
qi++;
|
|
424
407
|
}
|
|
425
408
|
|
|
426
|
-
|
|
409
|
+
while (pos < len && sep[item.charCodeAt(pos)]) pos++;
|
|
410
|
+
return pos >= len ? 2 : 1;
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
/** @param {number} len @param {number} round */
|
|
414
|
+
function trigramPosition(len, round) {
|
|
415
|
+
const max = len - 3;
|
|
416
|
+
if (max < 0) return -1;
|
|
417
|
+
if (round === 0) return 0;
|
|
418
|
+
if (round === 1 && max > 0) return max;
|
|
419
|
+
if (round === 2 && max > 1) return max >> 1;
|
|
420
|
+
if (max <= 2) return -1;
|
|
421
|
+
|
|
422
|
+
const mid = max >> 1;
|
|
423
|
+
const off = (round - 2) >> 1;
|
|
424
|
+
const pos = round & 1 ? Math.max(0, mid - off) : mid + off;
|
|
425
|
+
return pos === 0 || pos >= max || pos === mid ? -1 : pos;
|
|
427
426
|
}
|
package/src/lib.rs
CHANGED
|
@@ -36,16 +36,16 @@ impl<'a> QuickMatch<'a> {
|
|
|
36
36
|
|
|
37
37
|
for &item in items {
|
|
38
38
|
max_query_len = max_query_len.max(item.len());
|
|
39
|
-
let
|
|
40
|
-
|
|
41
|
-
word_count += 1;
|
|
42
|
-
if word.is_empty() {
|
|
43
|
-
continue;
|
|
44
|
-
}
|
|
39
|
+
let item_words: Vec<&str> = item.split(separators).filter(|w| !w.is_empty()).collect();
|
|
40
|
+
max_words = max_words.max(item_words.len());
|
|
45
41
|
|
|
46
|
-
|
|
42
|
+
for word in &item_words {
|
|
43
|
+
max_word_len = max_word_len.max(word.len());
|
|
47
44
|
|
|
48
|
-
word_index
|
|
45
|
+
word_index
|
|
46
|
+
.entry(word.to_string())
|
|
47
|
+
.or_default()
|
|
48
|
+
.insert(item);
|
|
49
49
|
|
|
50
50
|
if word.len() >= 3 {
|
|
51
51
|
let chars = word.chars().collect::<Vec<_>>();
|
|
@@ -57,13 +57,18 @@ impl<'a> QuickMatch<'a> {
|
|
|
57
57
|
}
|
|
58
58
|
}
|
|
59
59
|
}
|
|
60
|
-
|
|
60
|
+
|
|
61
|
+
// Index adjacent word pairs as compounds (e.g. "hash"+"rate" → "hashrate")
|
|
62
|
+
for pair in item_words.windows(2) {
|
|
63
|
+
let compound = format!("{}{}", pair[0], pair[1]);
|
|
64
|
+
word_index.entry(compound).or_default().insert(item);
|
|
65
|
+
}
|
|
61
66
|
}
|
|
62
67
|
|
|
63
68
|
Self {
|
|
64
69
|
max_query_len: max_query_len + 6,
|
|
65
70
|
max_word_len: max_word_len + 4,
|
|
66
|
-
max_word_count:
|
|
71
|
+
max_word_count: max_words + 2,
|
|
67
72
|
word_index,
|
|
68
73
|
trigram_index,
|
|
69
74
|
config,
|
|
@@ -71,26 +76,15 @@ impl<'a> QuickMatch<'a> {
|
|
|
71
76
|
}
|
|
72
77
|
}
|
|
73
78
|
|
|
74
|
-
///
|
|
75
|
-
/// `limit`: max number of returned matches
|
|
76
|
-
///
|
|
77
|
-
/// `max_trigrams`: max number of processed trigrams in unknown words (0-10 recommended)
|
|
78
|
-
///
|
|
79
79
|
pub fn matches(&self, query: &str) -> Vec<&'a str> {
|
|
80
80
|
self.matches_with(query, &self.config)
|
|
81
81
|
}
|
|
82
82
|
|
|
83
|
-
///
|
|
84
|
-
/// `limit`: max number of returned matches
|
|
85
|
-
///
|
|
86
|
-
/// `max_trigrams`: max number of processed trigrams in unknown words (0-10 recommended)
|
|
87
|
-
///
|
|
88
83
|
pub fn matches_with(&self, query: &str, config: &QuickMatchConfig) -> Vec<&'a str> {
|
|
89
84
|
let limit = config.limit();
|
|
90
85
|
let trigram_budget = config.trigram_budget();
|
|
91
|
-
let query_len = query.len();
|
|
92
86
|
|
|
93
|
-
if query.is_empty()
|
|
87
|
+
if query.is_empty() {
|
|
94
88
|
return vec![];
|
|
95
89
|
}
|
|
96
90
|
|
|
@@ -101,22 +95,30 @@ impl<'a> QuickMatch<'a> {
|
|
|
101
95
|
.collect::<String>()
|
|
102
96
|
.to_ascii_lowercase();
|
|
103
97
|
|
|
104
|
-
|
|
105
|
-
|
|
98
|
+
if query.is_empty() || query.len() > self.max_query_len {
|
|
99
|
+
return vec![];
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
let separators = config.separators();
|
|
103
|
+
|
|
104
|
+
let query_words: Vec<&str> = query
|
|
105
|
+
.split(separators)
|
|
106
106
|
.filter(|w| !w.is_empty() && w.len() <= self.max_word_len)
|
|
107
|
-
.collect
|
|
107
|
+
.collect();
|
|
108
|
+
|
|
109
|
+
let words: FxHashSet<&str> = query_words.iter().copied().collect();
|
|
108
110
|
|
|
109
111
|
if words.is_empty() || words.len() > self.max_word_count {
|
|
110
112
|
return vec![];
|
|
111
113
|
}
|
|
112
114
|
|
|
113
|
-
let min_len =
|
|
115
|
+
let min_len = query.len().saturating_sub(3);
|
|
114
116
|
|
|
115
117
|
let mut pool: Option<FxHashSet<*const str>> = None;
|
|
116
118
|
let mut unknown_words = Vec::new();
|
|
117
119
|
|
|
118
120
|
let mut words_to_intersect = vec![];
|
|
119
|
-
for word in words {
|
|
121
|
+
for &word in &words {
|
|
120
122
|
if let Some(items) = self.word_index.get(word) {
|
|
121
123
|
words_to_intersect.push(items)
|
|
122
124
|
} else if word.len() >= 3 && unknown_words.len() < trigram_budget {
|
|
@@ -144,17 +146,23 @@ impl<'a> QuickMatch<'a> {
|
|
|
144
146
|
let mut results: Vec<_> = pool
|
|
145
147
|
.unwrap_or_default()
|
|
146
148
|
.into_iter()
|
|
147
|
-
.map(|item|
|
|
149
|
+
.map(|item| {
|
|
150
|
+
let s = unsafe { &*item as &str };
|
|
151
|
+
(s, prefix_score(s, &query_words, separators))
|
|
152
|
+
})
|
|
148
153
|
.collect();
|
|
149
154
|
|
|
155
|
+
let cmp =
|
|
156
|
+
|a: &(&str, u8), b: &(&str, u8)| b.1.cmp(&a.1).then_with(|| a.0.len().cmp(&b.0.len()));
|
|
157
|
+
|
|
150
158
|
if results.len() > limit {
|
|
151
|
-
results.
|
|
159
|
+
results.select_nth_unstable_by(limit, cmp);
|
|
152
160
|
results.truncate(limit);
|
|
153
161
|
}
|
|
154
162
|
|
|
155
|
-
results.
|
|
163
|
+
results.sort_unstable_by(cmp);
|
|
156
164
|
|
|
157
|
-
return results;
|
|
165
|
+
return results.into_iter().map(|(item, _)| item).collect();
|
|
158
166
|
}
|
|
159
167
|
|
|
160
168
|
let mut scores: FxHashMap<*const str, usize> = FxHashMap::default();
|
|
@@ -232,26 +240,52 @@ impl<'a> QuickMatch<'a> {
|
|
|
232
240
|
}
|
|
233
241
|
}
|
|
234
242
|
|
|
235
|
-
let min_score = hit_count.div_ceil(2).max(
|
|
243
|
+
let min_score = hit_count.div_ceil(2).max(config.min_score());
|
|
236
244
|
let mut results: Vec<_> = scores
|
|
237
245
|
.into_iter()
|
|
238
246
|
.filter(|(_, s)| *s >= min_score)
|
|
239
|
-
.map(|(item, score)|
|
|
247
|
+
.map(|(item, score)| {
|
|
248
|
+
let s = unsafe { &*item as &str };
|
|
249
|
+
(s, score, prefix_score(s, &query_words, separators))
|
|
250
|
+
})
|
|
240
251
|
.collect();
|
|
241
252
|
|
|
253
|
+
let cmp = |a: &(&str, usize, u8), b: &(&str, usize, u8)| {
|
|
254
|
+
b.2.cmp(&a.2)
|
|
255
|
+
.then_with(|| b.1.cmp(&a.1))
|
|
256
|
+
.then_with(|| a.0.len().cmp(&b.0.len()))
|
|
257
|
+
};
|
|
258
|
+
|
|
242
259
|
if results.len() > limit {
|
|
243
|
-
results.select_nth_unstable_by(limit,
|
|
244
|
-
b.1.cmp(&a.1).then_with(|| a.0.len().cmp(&b.0.len()))
|
|
245
|
-
});
|
|
260
|
+
results.select_nth_unstable_by(limit, cmp);
|
|
246
261
|
results.truncate(limit);
|
|
247
262
|
}
|
|
248
263
|
|
|
249
|
-
results.sort_unstable_by(
|
|
264
|
+
results.sort_unstable_by(cmp);
|
|
250
265
|
|
|
251
266
|
results
|
|
252
267
|
.into_iter()
|
|
253
268
|
.take(limit)
|
|
254
|
-
.map(|(item, _)| item)
|
|
269
|
+
.map(|(item, _, _)| item)
|
|
255
270
|
.collect()
|
|
256
271
|
}
|
|
257
272
|
}
|
|
273
|
+
|
|
274
|
+
/// Score how well an item's word sequence matches the query as a prefix.
|
|
275
|
+
/// - 2: exact match (all words match, no extra words in item)
|
|
276
|
+
/// - 1: prefix match (item starts with query words but has more)
|
|
277
|
+
/// - 0: no prefix match
|
|
278
|
+
fn prefix_score(item: &str, query_words: &[&str], separators: &[char]) -> u8 {
|
|
279
|
+
let mut item_words = item.split(separators).filter(|w| !w.is_empty());
|
|
280
|
+
for &qw in query_words {
|
|
281
|
+
match item_words.next() {
|
|
282
|
+
Some(iw) if iw == qw => continue,
|
|
283
|
+
_ => return 0,
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
if item_words.next().is_none() {
|
|
287
|
+
2
|
|
288
|
+
} else {
|
|
289
|
+
1
|
|
290
|
+
}
|
|
291
|
+
}
|
package/LICENSE
DELETED
|
@@ -1,21 +0,0 @@
|
|
|
1
|
-
MIT License
|
|
2
|
-
|
|
3
|
-
Copyright (c) 2025 quickmatch
|
|
4
|
-
|
|
5
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
-
in the Software without restriction, including without limitation the rights
|
|
8
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
-
furnished to do so, subject to the following conditions:
|
|
11
|
-
|
|
12
|
-
The above copyright notice and this permission notice shall be included in all
|
|
13
|
-
copies or substantial portions of the Software.
|
|
14
|
-
|
|
15
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
-
SOFTWARE.
|
package/README.md
DELETED
|
@@ -1,26 +0,0 @@
|
|
|
1
|
-
# quickmatch
|
|
2
|
-
|
|
3
|
-
**Lightning-fast fuzzy string matching for Rust.**
|
|
4
|
-
|
|
5
|
-
A high-performance string matching library optimized for interactive search experiences like autocomplete, command palettes, and search-as-you-type interfaces.
|
|
6
|
-
|
|
7
|
-
[crates.io](https://crates.io/crates/quickmatch)
|
|
8
|
-
[docs.rs](https://docs.rs/quickmatch)
|
|
9
|
-
|
|
10
|
-
## Features
|
|
11
|
-
|
|
12
|
-
- **Blazing fast** - Optimized for sub-millisecond search times
|
|
13
|
-
- **Hybrid matching** - Word-level matching with trigram-based fuzzy fallback
|
|
14
|
-
- **Memory efficient** - Zero-copy string storage with pointer-based indexing
|
|
15
|
-
- **Ranked results** - Intelligent scoring based on match quality
|
|
16
|
-
- **Zero external dependencies** - Only uses `rustc-hash` for fast hashing
|
|
17
|
-
|
|
18
|
-
## Installation
|
|
19
|
-
|
|
20
|
-
```bash
|
|
21
|
-
# rust
|
|
22
|
-
cargo add quickmatch
|
|
23
|
-
|
|
24
|
-
# js
|
|
25
|
-
npm install quickmatch-js
|
|
26
|
-
```
|