@strav/search 0.3.20 → 0.3.21

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,134 @@
1
+ import type { SearchOptions } from '../../../types.ts'
2
+ import type { FieldRegistry } from './field_registry.ts'
3
+ import type { FtsExpression } from './fts_query_builder.ts'
4
+ import { compileFilter } from '../filters/filter_compiler.ts'
5
+ import { quoteIdent } from './schema.ts'
6
+ import { OPEN_SENTINEL, CLOSE_SENTINEL } from './snippet_formatter.ts'
7
+
8
/** Output of {@link compileSearch}: the page query, the count query and their bindings. */
export interface CompiledSearch {
  /** Main SELECT returning hits + score + snippets. */
  sql: string
  /** Bound parameters for the main SELECT. */
  params: unknown[]
  /** COUNT(*) variant for totalHits. */
  countSql: string
  /** Bound parameters for the count query (subset of `params`). */
  countParams: unknown[]
  /** Names of columns we asked SQLite to return for snippets, in order. */
  snippetColumns: string[]
}

/** Token budget passed as the last argument of every `snippet()` call. */
const DEFAULT_SNIPPET_TOKENS = 24

export interface QueryCompilerOptions {
  registry: FieldRegistry
  expression: FtsExpression
  search: SearchOptions
  /** Per-column BM25 weights, matching `registry.searchable` order. Defaults to all-1. */
  weights?: number[]
}

/**
 * Build the paged SELECT and the matching COUNT(*) statement for one search.
 *
 * The WHERE clause is the FTS MATCH (when the expression is not empty) AND-ed
 * with the compiled filter. Bindings are ordered MATCH value, filter values,
 * then LIMIT and OFFSET; the count query takes the same list without the last
 * two.
 *
 * @param opts Registry, compiled FTS expression, caller search options and
 *   optional BM25 weights.
 * @returns SQL text plus bindings for both queries, and the fields for which a
 *   `__snip_<field>` column was selected.
 * @throws Whatever `compileFilter` throws, and the sort error raised by
 *   `compileOrder` for a field that is not sortable.
 */
export function compileSearch(opts: QueryCompilerOptions): CompiledSearch {
  const { registry, expression, search, weights } = opts
  const filterableSet = new Set(registry.filterable)
  const sortableSet = new Set(registry.sortable)

  const filter = compileFilter(search.filter, filterableSet)

  const whereParts: string[] = []
  const matchParams: unknown[] = []

  if (!expression.isEmpty) {
    whereParts.push('fts.fts MATCH ?')
    matchParams.push(expression.match)
  }
  if (filter.sql) whereParts.push(filter.sql)

  const where = whereParts.length > 0 ? `WHERE ${whereParts.join(' AND ')}` : ''

  // BM25 score (negative = better). Defaults to weight 1.0 for every column.
  // The weights are spliced into the SQL text, so anything that is not a
  // finite number falls back to 1. An explicit 0 is kept rather than being
  // silently promoted to 1.
  const ws = (weights ?? registry.searchable.map(() => 1)).map(w => {
    const n = Number(w)
    return Number.isFinite(n) ? n : 1
  })
  const bm25Args = ws.length > 0 ? `, ${ws.join(', ')}` : ''

  const orderBy = compileOrder(search.sort, sortableSet, expression.isEmpty, bm25Args)

  // Build snippet expressions for each field the caller wants highlighted.
  // snippet() reads from the fts table, which is only joined when there is a
  // MATCH expression; for an empty expression no snippet column is selected,
  // otherwise the statement would reference `fts.fts` without `fts` in FROM.
  const wantedHighlights = expression.isEmpty
    ? []
    : pickHighlightFields(search.attributesToHighlight, registry)
  const snippetSelect = wantedHighlights
    .map(field => {
      const idx = registry.searchable.indexOf(field)
      return `snippet(fts.fts, ${idx}, '${OPEN_SENTINEL}', '${CLOSE_SENTINEL}', ' … ', ${DEFAULT_SNIPPET_TOKENS}) AS ${quoteIdent(`__snip_${field}`)}`
    })
    .join(', ')

  const perPage = Math.max(1, search.perPage ?? 20)
  const page = Math.max(1, search.page ?? 1)
  const offset = (page - 1) * perPage

  const selectCols = [
    'documents.id AS id',
    'documents.doc AS doc',
    expression.isEmpty ? '0 AS score' : `bm25(fts.fts${bm25Args}) AS score`,
  ]
  if (snippetSelect) selectCols.push(snippetSelect)

  const join = expression.isEmpty ? '' : 'JOIN fts ON fts.rowid = documents.rowid'

  const sql = `
    SELECT ${selectCols.join(', ')}
    FROM documents
    ${join}
    ${where}
    ${orderBy}
    LIMIT ? OFFSET ?
  `.trim()

  const countSql = `
    SELECT COUNT(*) AS n
    FROM documents
    ${join}
    ${where}
  `.trim()

  const allParams = [...matchParams, ...filter.params]
  const params = [...allParams, perPage, offset]

  return {
    sql,
    params,
    countSql,
    countParams: allParams,
    snippetColumns: wantedHighlights,
  }
}
102
+
103
/**
 * Build the ORDER BY clause.
 *
 * @param sort Caller sort specs of the form `field` or `field:desc`; any
 *   direction other than `desc` (case-insensitive) sorts ascending.
 * @param sortableSet Fields that may be sorted on.
 * @param matchAll True when there is no FTS expression, so `fts` is not joined.
 * @param bm25Args Pre-rendered `, w1, w2, …` weight list for `bm25()`.
 * @returns The explicit sort when one is given, otherwise rowid order for
 *   match-all queries and BM25 rank order for FTS queries.
 * @throws Error when a spec names a field that is not in `sortableSet`.
 */
function compileOrder(
  sort: string[] | undefined,
  sortableSet: ReadonlySet<string>,
  matchAll: boolean,
  bm25Args: string
): string {
  if (sort && sort.length > 0) {
    const parts: string[] = []
    for (const spec of sort) {
      const [field, dirRaw] = spec.split(':') as [string, string | undefined]
      if (!field || !sortableSet.has(field)) {
        throw new Error(
          `Field "${field}" is not in sortableAttributes. Add it to the index settings before sorting on it.`
        )
      }
      const dir = dirRaw?.toLowerCase() === 'desc' ? 'DESC' : 'ASC'
      // Qualify with the table: once `fts` is joined, a field that is both
      // searchable and sortable exists in both tables and a bare column name
      // is ambiguous. Match-all queries already read the column from
      // `documents`, which is the only table in FROM in that case.
      parts.push(`documents.${quoteIdent(field)} ${dir}`)
    }
    return `ORDER BY ${parts.join(', ')}`
  }
  if (matchAll) return 'ORDER BY documents.rowid ASC'
  return `ORDER BY bm25(fts.fts${bm25Args}) ASC`
}
126
+
127
/**
 * Narrow the requested highlight fields to those that are searchable.
 *
 * Returns an empty list when the registry uses its default text column or
 * when nothing was requested. The order of `requested` is preserved.
 */
function pickHighlightFields(
  requested: string[] | undefined,
  registry: FieldRegistry
): string[] {
  if (registry.usesDefaultTextColumn || !requested || requested.length === 0) {
    return []
  }
  return requested.filter(name => {
    const isSearchable = registry.searchable.includes(name)
    return isSearchable
  })
}
@@ -0,0 +1,99 @@
1
+ import type { Database } from 'bun:sqlite'
2
+ import type { FieldRegistry } from './field_registry.ts'
3
+
4
+ const SCHEMA_VERSION = 1
5
+
6
/**
 * Wrap a SQLite identifier (column or table) in double quotes, doubling any
 * embedded double quote.
 *
 * @throws Error when the identifier contains a NUL byte.
 */
export function quoteIdent(name: string): string {
  const hasNul = name.indexOf('\0') !== -1
  if (hasNul) {
    throw new Error(`Invalid identifier: ${name}`)
  }
  const escaped = name.split('"').join('""')
  return '"' + escaped + '"'
}
11
+
12
/**
 * Apply the per-connection PRAGMAs: WAL journal, the requested `synchronous`
 * level, in-memory temp store and foreign-key enforcement.
 *
 * @param db Open database connection.
 * @param synchronous One of `OFF`, `NORMAL` or `FULL`.
 * @throws Error when `synchronous` is not one of the three allowed values.
 *   The value is spliced into the statement text, and the compile-time union
 *   is erased at runtime, so it is checked before anything is executed.
 */
export function applyConnectionPragmas(
  db: Database,
  synchronous: 'OFF' | 'NORMAL' | 'FULL'
): void {
  if (synchronous !== 'OFF' && synchronous !== 'NORMAL' && synchronous !== 'FULL') {
    throw new Error(`Invalid synchronous mode: ${String(synchronous)}`)
  }
  db.exec('PRAGMA journal_mode = WAL')
  db.exec(`PRAGMA synchronous = ${synchronous}`)
  db.exec('PRAGMA temp_store = MEMORY')
  db.exec('PRAGMA foreign_keys = ON')
}
21
+
22
/**
 * Create the documents + FTS5 + terms_dict tables for a fresh index.
 * Idempotent: skips creation if `_meta` already exists.
 *
 * All statements run inside one transaction. `_meta` is what marks the schema
 * as present, so without the transaction a failure after `_meta` was created
 * would leave a partial schema that every later call skips over.
 *
 * @param db Open database connection.
 * @param registry Supplies the typed, filterable and searchable column names
 *   and the values persisted into `_meta`.
 */
export function createSchema(db: Database, registry: FieldRegistry): void {
  if (schemaExists(db)) return

  const typedColumns = registry.typedColumns
    .map(c => `${quoteIdent(c)} BLOB`)
    .join(', ')
  const typedColumnsClause = typedColumns ? `, ${typedColumns}` : ''
  const ftsCols = registry.searchable.map(quoteIdent).join(', ')

  const build = db.transaction(() => {
    db.exec(`
      CREATE TABLE _meta (
        key TEXT PRIMARY KEY,
        value TEXT NOT NULL
      )
    `)

    db.exec(`
      CREATE TABLE documents (
        rowid INTEGER PRIMARY KEY AUTOINCREMENT,
        id TEXT NOT NULL UNIQUE,
        doc TEXT NOT NULL${typedColumnsClause}
      )
    `)
    // NOTE(review): `id` is already declared UNIQUE above, so this explicit
    // index looks redundant. It is kept so freshly created files match
    // existing ones.
    db.exec('CREATE UNIQUE INDEX documents_id_idx ON documents(id)')

    // One index per filterable column so WHERE clauses can use it.
    for (const col of registry.filterable) {
      db.exec(`CREATE INDEX ${quoteIdent(`documents_${col}_idx`)} ON documents(${quoteIdent(col)})`)
    }

    // FTS5 default mode (no `content` option): the original text is stored in
    // FTS5 itself so `snippet()` can echo it back highlighted. The Porter
    // tokenizer applies English stemming; unicode61 normalises and folds
    // diacritics so accented input matches its ASCII form.
    db.exec(`
      CREATE VIRTUAL TABLE fts USING fts5(
        ${ftsCols},
        tokenize = 'porter unicode61 remove_diacritics 2'
      )
    `)

    // Maintain a terms dictionary for typo expansion.
    db.exec(`
      CREATE TABLE terms_dict (
        term TEXT PRIMARY KEY,
        doc_freq INTEGER NOT NULL DEFAULT 0
      )
    `)
    db.exec('CREATE INDEX terms_dict_len_idx ON terms_dict(length(term))')

    // Persist registry layout so we can detect mismatches on reopen.
    const stmt = db.prepare('INSERT INTO _meta (key, value) VALUES (?, ?)')
    stmt.run('schema_version', String(SCHEMA_VERSION))
    stmt.run('searchable', JSON.stringify(registry.searchable))
    stmt.run('filterable', JSON.stringify(registry.filterable))
    stmt.run('sortable', JSON.stringify(registry.sortable))
    stmt.run('primary_key', registry.primaryKey)
  })
  build()
}
84
+
85
/** Report whether the `_meta` table is listed in `sqlite_master`. */
function schemaExists(db: Database): boolean {
  const lookup = db.query<{ name: string }, []>(
    "SELECT name FROM sqlite_master WHERE type='table' AND name='_meta'"
  )
  const found = lookup.get()
  if (found === null) return false
  return true
}
93
+
94
/**
 * Look up one value in the `_meta` table.
 *
 * @returns The stored string, or `null` when no row exists for `key`.
 */
export function readMeta(db: Database, key: string): string | null {
  const lookup = db.query<{ value: string }, [string]>('SELECT value FROM _meta WHERE key = ?')
  const hit = lookup.get(key)
  if (!hit) return null
  return hit.value
}
@@ -0,0 +1,29 @@
1
/**
 * SQLite's snippet() wraps each hit in whatever marker strings it is given.
 * Sentinel markers are used in place of `<mark>` so that the whole snippet can
 * be HTML-escaped first and the sentinels swapped for real tags afterwards.
 */
export const OPEN_SENTINEL = 'STRAV_OPEN'
export const CLOSE_SENTINEL = 'STRAV_CLOSE'

/** Tags that replace the sentinels in the caller-facing string. */
export const OPEN_TAG = '<mark>'
export const CLOSE_TAG = '</mark>'

/**
 * Turn sentinel-wrapped snippet() output into the caller-facing string:
 * surrounding text is HTML-escaped and hits are wrapped in `<mark>...</mark>`.
 *
 * @returns The formatted string, or `''` for a null, undefined or empty input.
 */
export function formatSnippet(snippet: string | null | undefined): string {
  if (!snippet) return ''
  const escaped = escapeHtml(snippet)
  const withOpenTags = escaped.split(OPEN_SENTINEL).join(OPEN_TAG)
  return withOpenTags.split(CLOSE_SENTINEL).join(CLOSE_TAG)
}

/** Replace the five HTML-significant characters with their entities. */
function escapeHtml(input: string): string {
  return input.replace(/[&<>"']/g, ch => {
    switch (ch) {
      case '&':
        return '&amp;'
      case '<':
        return '&lt;'
      case '>':
        return '&gt;'
      case '"':
        return '&quot;'
      default:
        return '&#39;'
    }
  })
}
@@ -0,0 +1,255 @@
1
+ import { Database } from 'bun:sqlite'
2
+ import type {
3
+ SearchDocument,
4
+ SearchOptions,
5
+ SearchResult,
6
+ SearchHit,
7
+ IndexSettings,
8
+ } from '../../../types.ts'
9
+ import type { ResolvedTypoTolerance } from '../types.ts'
10
+ import { FieldRegistry } from './field_registry.ts'
11
+ import { applyConnectionPragmas, createSchema, quoteIdent } from './schema.ts'
12
+ import { compileQuery, compileQueryWithExpansions } from './fts_query_builder.ts'
13
+ import { compileSearch } from './query_compiler.ts'
14
+ import { formatSnippet } from './snippet_formatter.ts'
15
+ import { recordTerms, unrecordTerms, expandTokens } from './typo_expander.ts'
16
+
17
/** Construction options for {@link SqliteEngine}. */
export interface SqliteEngineOptions {
  path: string
  synchronous: 'OFF' | 'NORMAL' | 'FULL'
  typoTolerance: ResolvedTypoTolerance
  indexName: string
  settings?: IndexSettings
}

/**
 * One SqliteEngine wraps a single index (a single SQLite file). The driver
 * holds a Map<indexName, SqliteEngine> and lazily instantiates per index.
 */
export class SqliteEngine {
  readonly db: Database
  readonly registry: FieldRegistry
  private readonly typo: ResolvedTypoTolerance
  private readonly indexName: string

  /**
   * Open the database at `opts.path`, apply the connection PRAGMAs and create
   * the schema if it is missing.
   *
   * If any step after opening throws, the handle is closed before the error
   * is re-thrown so a failed construction does not leave the file open.
   */
  constructor(opts: SqliteEngineOptions) {
    const db = new Database(opts.path)
    let registry: FieldRegistry
    try {
      applyConnectionPragmas(db, opts.synchronous)
      registry = new FieldRegistry(opts.settings)
      createSchema(db, registry)
    } catch (err) {
      try {
        db.close()
      } catch {
        // Ignore — the setup failure is the error worth reporting
      }
      throw err
    }
    this.db = db
    this.registry = registry
    this.typo = opts.typoTolerance
    this.indexName = opts.indexName
  }

  // ── Writes ──────────────────────────────────────────────────────────────

  /** Insert or replace a single document. */
  upsert(id: string | number, document: Record<string, unknown>): void {
    this.runUpsertBatch([{ id, document }])
  }

  /** Insert or replace many documents in one transaction. No-op for an empty list. */
  upsertMany(documents: SearchDocument[]): void {
    if (documents.length === 0) return
    const batch = documents.map(d => {
      const { id, ...rest } = d
      return { id, document: rest as Record<string, unknown> }
    })
    this.runUpsertBatch(batch)
  }

  /** Delete a single document; an unknown id is ignored. */
  delete(id: string | number): void {
    this.runDeleteBatch([id])
  }

  /** Delete many documents in one transaction. No-op for an empty list. */
  deleteMany(ids: Array<string | number>): void {
    if (ids.length === 0) return
    this.runDeleteBatch(ids)
  }

  /** Remove all documents from the index, leaving the schema in place. */
  flush(): void {
    const tx = this.db.transaction(() => {
      this.db.exec('DELETE FROM fts')
      this.db.exec('DELETE FROM documents')
      this.db.exec('DELETE FROM terms_dict')
    })
    tx()
  }

  /** Force-merge FTS5 segments into one. Run periodically (e.g. nightly via CLI). */
  optimize(): void {
    this.db.exec("INSERT INTO fts(fts) VALUES('optimize')")
  }

  /** Checkpoint the WAL (best effort) and close the database. */
  close(): void {
    try {
      this.db.exec('PRAGMA wal_checkpoint(TRUNCATE)')
    } catch {
      // Ignore — closing should never throw on a checkpoint failure
    }
    this.db.close()
  }

  // ── Reads ───────────────────────────────────────────────────────────────

  /**
   * Run a search and return one page of hits plus the total match count.
   *
   * @param query Raw user query; compiled to an FTS expression, with typo
   *   expansions when typo tolerance is enabled.
   * @param options Filter, sort, paging, projection and highlight options.
   */
  search(query: string, options?: SearchOptions): SearchResult {
    const start = performance.now()
    const opts = options ?? {}
    const expression = this.buildExpression(query)

    const compiled = compileSearch({
      registry: this.registry,
      expression,
      search: opts,
    })

    const rows = this.db
      .prepare<RawHitRow, any[]>(compiled.sql)
      .all(...(compiled.params as any[]))
    const totalRow = this.db
      .prepare<{ n: number }, any[]>(compiled.countSql)
      .get(...(compiled.countParams as any[]))

    const projection = opts.attributesToRetrieve
    const hits: SearchHit[] = rows.map(row => projectHit(row, compiled.snippetColumns, projection))

    return {
      hits,
      totalHits: totalRow?.n ?? hits.length,
      page: Math.max(1, opts.page ?? 1),
      perPage: Math.max(1, opts.perPage ?? 20),
      processingTimeMs: Math.round(performance.now() - start),
    }
  }

  // ── Internals ───────────────────────────────────────────────────────────

  /**
   * Compile the query. It is recompiled with typo expansions only when typo
   * tolerance is on, the base expression has positive tokens, and the
   * dictionary yields at least one candidate.
   */
  private buildExpression(query: string) {
    const base = compileQuery(query)
    if (!this.typo.enabled || base.isEmpty || base.positiveTokens.length === 0) return base

    const expansions = expandTokens(this.db, base.positiveTokens, this.typo)
    if (expansions.size === 0) return base
    return compileQueryWithExpansions(query, expansions)
  }

  /**
   * Upsert a batch inside one transaction. For an existing id the old terms
   * are un-recorded and the FTS row is replaced under the same rowid; for a
   * new id a documents row is inserted and its rowid reused for the FTS row.
   */
  private runUpsertBatch(items: Array<{ id: string | number; document: Record<string, unknown> }>) {
    const insertDoc = this.prepareInsertDoc()
    const fetchExisting = this.db.prepare<
      { rowid: number; doc: string },
      [string]
    >('SELECT rowid, doc FROM documents WHERE id = ?')
    const insertFts = this.prepareInsertFts()
    const deleteFts = this.db.prepare('DELETE FROM fts WHERE rowid = ?')

    const tx = this.db.transaction(() => {
      for (const { id, document: doc } of items) {
        const idStr = String(id)
        const docJson = JSON.stringify({ id, ...doc })
        const ftsValues = this.registry.projectFtsValues(doc)
        const typedValues = this.registry.projectTypedValues(doc)
        const newText = this.registry.concatSearchableText(doc)

        const existing = fetchExisting.get(idStr)
        if (existing) {
          // Update path
          const oldDoc = JSON.parse(existing.doc) as Record<string, unknown>
          const oldText = this.registry.concatSearchableText(oldDoc)
          unrecordTerms(this.db, oldText)

          deleteFts.run(existing.rowid)
          insertFts.run(existing.rowid as any, ...(ftsValues as any[]))
          this.updateDocumentRow(existing.rowid, docJson, typedValues)
        } else {
          const result = insertDoc.run(idStr as any, docJson as any, ...(typedValues as any[]))
          const rowid = Number(result.lastInsertRowid)
          insertFts.run(rowid as any, ...(ftsValues as any[]))
        }

        recordTerms(this.db, newText)
      }
    })
    // NOTE(review): indexName is not used by any query here; this read is
    // presumably kept so the private field counts as used — confirm.
    void this.indexName
    tx()
  }

  /** Delete a batch inside one transaction, un-recording each document's terms first. */
  private runDeleteBatch(ids: Array<string | number>) {
    const fetchExisting = this.db.prepare<
      { rowid: number; doc: string },
      [string]
    >('SELECT rowid, doc FROM documents WHERE id = ?')
    const deleteDoc = this.db.prepare('DELETE FROM documents WHERE id = ?')
    const deleteFts = this.db.prepare('DELETE FROM fts WHERE rowid = ?')

    const tx = this.db.transaction(() => {
      for (const id of ids) {
        const idStr = String(id)
        const existing = fetchExisting.get(idStr)
        if (!existing) continue
        const oldDoc = JSON.parse(existing.doc) as Record<string, unknown>
        unrecordTerms(this.db, this.registry.concatSearchableText(oldDoc))
        deleteFts.run(existing.rowid)
        deleteDoc.run(idStr)
      }
    })
    tx()
  }

  /** Prepare the INSERT for `documents` covering id, doc and every typed column. */
  private prepareInsertDoc() {
    const cols = ['id', 'doc', ...this.registry.typedColumns.map(quoteIdent)]
    const placeholders = cols.map(() => '?').join(', ')
    return this.db.prepare(
      `INSERT INTO documents (${cols.join(', ')}) VALUES (${placeholders})`
    )
  }

  /** Overwrite the JSON payload and typed columns of an existing documents row. */
  private updateDocumentRow(rowid: number, docJson: string, typedValues: unknown[]) {
    const sets = ['doc = ?']
    for (const col of this.registry.typedColumns) sets.push(`${quoteIdent(col)} = ?`)
    const sql = `UPDATE documents SET ${sets.join(', ')} WHERE rowid = ?`
    // This runs once per updated row and the SQL text is the same every time.
    // `query` caches the compiled statement on the connection, so it is not
    // re-prepared for each row of a batch.
    this.db.query(sql).run(docJson as any, ...(typedValues as any[]), rowid as any)
  }

  /** Prepare the INSERT for `fts` covering rowid and every searchable column. */
  private prepareInsertFts() {
    const cols = ['rowid', ...this.registry.searchable.map(quoteIdent)]
    const placeholders = cols.map(() => '?').join(', ')
    return this.db.prepare(`INSERT INTO fts (${cols.join(', ')}) VALUES (${placeholders})`)
  }
}
219
+
220
/** Row shape returned by the compiled search SELECT. */
interface RawHitRow {
  id: string
  doc: string
  score: number
  /** Snippet columns are returned under `__snip_<field>` aliases. */
  [snippetCol: string]: unknown
}

/**
 * Turn a raw result row into a SearchHit.
 *
 * @param row Row from the search SELECT; `doc` holds the stored JSON document.
 * @param snippetCols Fields for which a `__snip_<field>` column was selected.
 * @param attributesToRetrieve When non-empty, only these attributes are copied
 *   into the returned document, and only if the stored document has them.
 * @returns The hit, with `highlights` set only when at least one snippet
 *   column holds a non-empty value.
 */
function projectHit(
  row: RawHitRow,
  snippetCols: string[],
  attributesToRetrieve: string[] | undefined
): SearchHit {
  const document = JSON.parse(row.doc) as Record<string, unknown>

  let projected = document
  if (attributesToRetrieve && attributesToRetrieve.length > 0) {
    const out: Record<string, unknown> = {}
    for (const attr of attributesToRetrieve) {
      // Own properties only: `in` would also match inherited members such as
      // `toString` or `constructor` and copy functions into the result.
      if (Object.prototype.hasOwnProperty.call(document, attr)) out[attr] = document[attr]
    }
    projected = out
  }

  const hit: SearchHit = { document: projected }

  if (snippetCols.length > 0) {
    const highlights: Record<string, string> = {}
    for (const col of snippetCols) {
      const raw = row[`__snip_${col}`] as string | null | undefined
      if (raw) highlights[col] = formatSnippet(raw)
    }
    if (Object.keys(highlights).length > 0) hit.highlights = highlights
  }

  return hit
}
@@ -0,0 +1,138 @@
1
+ import type { Database } from 'bun:sqlite'
2
+ import type { ResolvedTypoTolerance } from '../types.ts'
3
+
4
/**
 * Plain-text tokeniser for the terms dictionary.
 *
 * The input is lowercased, cut into maximal runs of Unicode letters and
 * digits, and runs shorter than 2 characters are discarded. No Porter
 * stemming is applied here, on purpose:
 *
 *   - Most typos are on rare/proper nouns (e.g. customer names, product SKUs)
 *     which Porter doesn't transform anyway.
 *   - Mirroring SQLite's stemmer in JS would mean shipping a Porter
 *     implementation just for the dictionary, a lot of code for a marginal
 *     gain on common-word typos.
 *
 * Candidate terms fed back into FTS5 are re-stemmed by FTS5 itself, so the
 * lookup still works.
 */
export function tokenize(text: string): string[] {
  if (!text) return []
  const runs = text.toLowerCase().match(/[\p{L}\p{N}]+/gu) ?? []
  return runs.filter(run => run.length >= 2)
}
27
+
28
/**
 * Add a document's tokens to the terms dictionary. Each distinct term is
 * inserted with a frequency of 1, or has its `doc_freq` incremented by one if
 * it is already present. Text with no tokens is a no-op.
 */
export function recordTerms(db: Database, text: string): void {
  const distinctTerms = [...new Set(tokenize(text))]
  if (distinctTerms.length === 0) return

  const bump = db.prepare(
    'INSERT INTO terms_dict (term, doc_freq) VALUES (?, 1) ON CONFLICT(term) DO UPDATE SET doc_freq = doc_freq + 1'
  )
  distinctTerms.forEach(term => {
    bump.run(term)
  })
}
39
+
40
/**
 * Decrement a document's tokens; remove rows that drop to zero.
 *
 * Each distinct term is decremented once and then purged if its count is no
 * longer positive. The purge is keyed on the term (the table's primary key)
 * so it only touches rows this call changed, instead of scanning the whole
 * dictionary for `doc_freq <= 0` on every document update or delete.
 */
export function unrecordTerms(db: Database, text: string): void {
  const unique = new Set(tokenize(text))
  if (unique.size === 0) return

  const dec = db.prepare('UPDATE terms_dict SET doc_freq = doc_freq - 1 WHERE term = ?')
  const purge = db.prepare('DELETE FROM terms_dict WHERE term = ? AND doc_freq <= 0')
  for (const term of unique) {
    dec.run(term)
    purge.run(term)
  }
}
50
+
51
/**
 * Look up near-miss dictionary terms for each query token.
 *
 * Tokens shorter than `settings.minTokenLength` are skipped. For the others,
 * dictionary terms whose length is within `settings.maxDistance` of the
 * token's length are fetched and kept when their Levenshtein distance to the
 * token is at most `settings.maxDistance`. The token itself is never a
 * candidate, and collection stops at `maxCandidates` per token.
 *
 * @returns Map from token to its candidates; tokens without candidates are
 *   absent. Empty when typo tolerance is disabled.
 */
export function expandTokens(
  db: Database,
  tokens: string[],
  settings: ResolvedTypoTolerance,
  maxCandidates = 8
): Map<string, string[]> {
  const expansions = new Map<string, string[]>()
  if (!settings.enabled) return expansions

  const termsByLength = db.prepare<{ term: string }, [number, number]>(
    'SELECT term FROM terms_dict WHERE length(term) BETWEEN ? AND ?'
  )

  for (const token of tokens) {
    if (token.length < settings.minTokenLength) continue

    const shortest = Math.max(1, token.length - settings.maxDistance)
    const longest = token.length + settings.maxDistance
    const nearMisses: string[] = []

    for (const { term } of termsByLength.all(shortest, longest)) {
      if (term === token) continue
      const withinReach = levenshtein(token, term, settings.maxDistance) <= settings.maxDistance
      if (!withinReach) continue
      nearMisses.push(term)
      if (nearMisses.length >= maxCandidates) break
    }

    if (nearMisses.length > 0) expansions.set(token, nearMisses)
  }

  return expansions
}
87
+
88
/**
 * Resolve user-provided typo tolerance settings into concrete numbers.
 *
 * `'off'` disables expansion; `undefined` and `'auto'` enable it with the
 * defaults (minimum token length 4, maximum distance 1). An object enables it
 * and overrides whichever of the two numbers it provides.
 */
export function resolveTypoTolerance(
  setting:
    | 'off'
    | 'auto'
    | { minTokenLength?: number; maxDistance?: number }
    | undefined
): ResolvedTypoTolerance {
  switch (setting) {
    case 'off':
      return { enabled: false, minTokenLength: 4, maxDistance: 1 }
    case undefined:
    case 'auto':
      return { enabled: true, minTokenLength: 4, maxDistance: 1 }
    default: {
      const minTokenLength = setting.minTokenLength ?? 4
      const maxDistance = setting.maxDistance ?? 1
      return { enabled: true, minTokenLength, maxDistance }
    }
  }
}
108
+
109
/**
 * Bounded Levenshtein distance over UTF-16 code units.
 *
 * Returns `max + 1` as soon as the distance is known to exceed `max`: either
 * the lengths differ by more than `max`, or every cell of a row is already
 * above it. Otherwise returns the value of the final cell.
 */
function levenshtein(a: string, b: string, max: number): number {
  if (a === b) return 0

  const rows = a.length
  const cols = b.length
  if (Math.abs(rows - cols) > max) return max + 1
  if (rows === 0) return cols
  if (cols === 0) return rows

  // Two rolling rows of the edit matrix; `above` starts as row 0 (0..cols).
  let above = Array.from({ length: cols + 1 }, (_, j) => j)
  let current = new Array<number>(cols + 1).fill(0)

  for (let i = 1; i <= rows; i++) {
    const unit = a.charCodeAt(i - 1)
    current[0] = i
    let smallest = i
    for (let j = 1; j <= cols; j++) {
      const deletion = above[j]! + 1
      const insertion = current[j - 1]! + 1
      const substitution = above[j - 1]! + (unit === b.charCodeAt(j - 1) ? 0 : 1)
      const cell = Math.min(deletion, insertion, substitution)
      current[j] = cell
      if (cell < smallest) smallest = cell
    }
    // No later row can drop below this row's minimum, so stop early.
    if (smallest > max) return max + 1
    const spare = above
    above = current
    current = spare
  }
  return above[cols]!
}
@@ -0,0 +1,15 @@
1
+ import { SearchError } from '../../errors.ts'
2
+
3
/** Base class for errors raised by the embedded search driver. */
export class EmbeddedSearchError extends SearchError {}

/** Raised with the index name and a description of what is wrong with it. */
export class IndexCorruptError extends EmbeddedSearchError {
  constructor(index: string, cause: string) {
    const message = `Embedded search index "${index}" is corrupt: ${cause}`
    super(message)
  }
}

/** Raised for a filter the embedded driver cannot handle. */
export class UnsupportedFilterError extends EmbeddedSearchError {
  constructor(message: string) {
    const prefixed = `Embedded driver filter is unsupported: ${message}`
    super(prefixed)
  }
}