@strav/search 0.4.30 → 1.0.0-alpha.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +20 -22
- package/src/console/index.ts +5 -0
- package/src/console/search_console_provider.ts +20 -0
- package/src/console/search_flush.ts +49 -0
- package/src/console/search_import.ts +103 -0
- package/src/console/search_list.ts +46 -0
- package/src/console/search_reindex.ts +94 -0
- package/src/drivers/meilisearch/meilisearch_driver.ts +304 -0
- package/src/drivers/memory/memory_driver.ts +344 -0
- package/src/drivers/postgres/apply_search_migration.ts +74 -0
- package/src/drivers/postgres/postgres_fts_driver.ts +493 -135
- package/src/drivers/typesense/typesense_driver.ts +345 -0
- package/src/index.ts +50 -39
- package/src/search_engine.ts +40 -25
- package/src/search_error.ts +86 -0
- package/src/search_manager.ts +112 -94
- package/src/search_provider.ts +68 -6
- package/src/searchable.ts +173 -160
- package/src/searchable_registry.ts +61 -0
- package/src/types.ts +59 -49
- package/README.md +0 -191
- package/src/commands/search_flush.ts +0 -41
- package/src/commands/search_import.ts +0 -43
- package/src/commands/search_optimize.ts +0 -52
- package/src/commands/search_rebuild.ts +0 -73
- package/src/drivers/algolia_driver.ts +0 -170
- package/src/drivers/embedded/embedded_driver.ts +0 -136
- package/src/drivers/embedded/engine/field_registry.ts +0 -97
- package/src/drivers/embedded/engine/fts_query_builder.ts +0 -184
- package/src/drivers/embedded/engine/query_compiler.ts +0 -134
- package/src/drivers/embedded/engine/schema.ts +0 -99
- package/src/drivers/embedded/engine/snippet_formatter.ts +0 -29
- package/src/drivers/embedded/engine/sqlite_engine.ts +0 -255
- package/src/drivers/embedded/engine/typo_expander.ts +0 -138
- package/src/drivers/embedded/errors.ts +0 -15
- package/src/drivers/embedded/filters/filter_compiler.ts +0 -136
- package/src/drivers/embedded/index.ts +0 -3
- package/src/drivers/embedded/storage/paths.ts +0 -23
- package/src/drivers/embedded/types.ts +0 -34
- package/src/drivers/meilisearch_driver.ts +0 -150
- package/src/drivers/null_driver.ts +0 -27
- package/src/drivers/postgres/engine/field_registry.ts +0 -116
- package/src/drivers/postgres/engine/fts_query_builder.ts +0 -105
- package/src/drivers/postgres/engine/pg_engine.ts +0 -300
- package/src/drivers/postgres/engine/query_compiler.ts +0 -165
- package/src/drivers/postgres/engine/schema.ts +0 -187
- package/src/drivers/postgres/engine/snippet_formatter.ts +0 -31
- package/src/drivers/postgres/engine/typo_expander.ts +0 -131
- package/src/drivers/postgres/errors.ts +0 -33
- package/src/drivers/postgres/filters/filter_compiler.ts +0 -138
- package/src/drivers/postgres/index.ts +0 -14
- package/src/drivers/postgres/rebuild/rebuild_inplace.ts +0 -113
- package/src/drivers/postgres/storage/identifiers.ts +0 -46
- package/src/drivers/postgres/types.ts +0 -53
- package/src/drivers/typesense_driver.ts +0 -229
- package/src/errors.ts +0 -18
- package/src/helpers.ts +0 -120
- package/stubs/config/search.ts +0 -57
- package/tsconfig.json +0 -5
|
@@ -1,136 +0,0 @@
|
|
|
1
|
-
import type { SearchEngine } from '../../search_engine.ts'
|
|
2
|
-
import type {
|
|
3
|
-
SearchDocument,
|
|
4
|
-
SearchOptions,
|
|
5
|
-
SearchResult,
|
|
6
|
-
IndexSettings,
|
|
7
|
-
DriverConfig,
|
|
8
|
-
} from '../../types.ts'
|
|
9
|
-
import { SqliteEngine } from './engine/sqlite_engine.ts'
|
|
10
|
-
import { resolveTypoTolerance } from './engine/typo_expander.ts'
|
|
11
|
-
import { resolveIndexPath, MEMORY_PATH } from './storage/paths.ts'
|
|
12
|
-
import type { EmbeddedConfig, ResolvedTypoTolerance } from './types.ts'
|
|
13
|
-
|
|
14
|
-
/**
|
|
15
|
-
* In-process full-text search driver backed by SQLite FTS5.
|
|
16
|
-
*
|
|
17
|
-
* Each index lives in its own SQLite file (or `:memory:` for tests). The
|
|
18
|
-
* driver maintains a `Map<indexName, SqliteEngine>` and creates engines
|
|
19
|
-
* lazily on first reference. This means a fresh `upsert()` against a
|
|
20
|
-
* never-created index will auto-create a default schema (single `_text`
|
|
21
|
-
* column). Callers that want per-field weights call `createIndex()` first
|
|
22
|
-
* with their `IndexSettings`.
|
|
23
|
-
*/
|
|
24
|
-
export class EmbeddedDriver implements SearchEngine {
|
|
25
|
-
readonly name = 'embedded'
|
|
26
|
-
|
|
27
|
-
private readonly config: EmbeddedConfig
|
|
28
|
-
private readonly synchronous: 'OFF' | 'NORMAL' | 'FULL'
|
|
29
|
-
private readonly typo: ResolvedTypoTolerance
|
|
30
|
-
private readonly engines = new Map<string, SqliteEngine>()
|
|
31
|
-
/** Pending settings for indexes that haven't been opened yet. */
|
|
32
|
-
private readonly pendingSettings = new Map<string, IndexSettings>()
|
|
33
|
-
|
|
34
|
-
constructor(config: DriverConfig) {
|
|
35
|
-
this.config = (config ?? {}) as EmbeddedConfig
|
|
36
|
-
this.synchronous = this.config.synchronous ?? 'NORMAL'
|
|
37
|
-
this.typo = resolveTypoTolerance(this.config.typoTolerance)
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
// ── Document operations ──────────────────────────────────────────────────
|
|
41
|
-
|
|
42
|
-
async upsert(
|
|
43
|
-
index: string,
|
|
44
|
-
id: string | number,
|
|
45
|
-
document: Record<string, unknown>
|
|
46
|
-
): Promise<void> {
|
|
47
|
-
this.engineFor(index).upsert(id, document)
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
async upsertMany(index: string, documents: SearchDocument[]): Promise<void> {
|
|
51
|
-
this.engineFor(index).upsertMany(documents)
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
async delete(index: string, id: string | number): Promise<void> {
|
|
55
|
-
this.engineFor(index).delete(id)
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
async deleteMany(index: string, ids: Array<string | number>): Promise<void> {
|
|
59
|
-
this.engineFor(index).deleteMany(ids)
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
// ── Index operations ─────────────────────────────────────────────────────
|
|
63
|
-
|
|
64
|
-
async flush(index: string): Promise<void> {
|
|
65
|
-
this.engineFor(index).flush()
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
async deleteIndex(index: string): Promise<void> {
|
|
69
|
-
const engine = this.engines.get(index)
|
|
70
|
-
if (engine) {
|
|
71
|
-
engine.close()
|
|
72
|
-
this.engines.delete(index)
|
|
73
|
-
}
|
|
74
|
-
this.pendingSettings.delete(index)
|
|
75
|
-
|
|
76
|
-
const path = resolveIndexPath(this.config, index)
|
|
77
|
-
if (path === MEMORY_PATH) return
|
|
78
|
-
|
|
79
|
-
const fs = await import('node:fs/promises')
|
|
80
|
-
for (const suffix of ['', '-wal', '-shm']) {
|
|
81
|
-
try {
|
|
82
|
-
await fs.unlink(`${path}${suffix}`)
|
|
83
|
-
} catch (err: any) {
|
|
84
|
-
if (err?.code !== 'ENOENT') throw err
|
|
85
|
-
}
|
|
86
|
-
}
|
|
87
|
-
}
|
|
88
|
-
|
|
89
|
-
async createIndex(index: string, options?: IndexSettings): Promise<void> {
|
|
90
|
-
if (options) this.pendingSettings.set(index, options)
|
|
91
|
-
// Force engine instantiation so the schema exists on disk.
|
|
92
|
-
this.engineFor(index)
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
// ── Search ───────────────────────────────────────────────────────────────
|
|
96
|
-
|
|
97
|
-
async search(index: string, query: string, options?: SearchOptions): Promise<SearchResult> {
|
|
98
|
-
return this.engineFor(index).search(query, options)
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
// ── Lifecycle ────────────────────────────────────────────────────────────
|
|
102
|
-
|
|
103
|
-
/** Close all open engines. Call from app shutdown. */
|
|
104
|
-
close(): void {
|
|
105
|
-
for (const engine of this.engines.values()) engine.close()
|
|
106
|
-
this.engines.clear()
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
/** Run FTS5 segment merge on every open index. Use from CLI for periodic ops. */
|
|
110
|
-
optimize(index?: string): void {
|
|
111
|
-
if (index) {
|
|
112
|
-
this.engineFor(index).optimize()
|
|
113
|
-
return
|
|
114
|
-
}
|
|
115
|
-
for (const engine of this.engines.values()) engine.optimize()
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
// ── Internals ────────────────────────────────────────────────────────────
|
|
119
|
-
|
|
120
|
-
private engineFor(index: string): SqliteEngine {
|
|
121
|
-
let engine = this.engines.get(index)
|
|
122
|
-
if (engine) return engine
|
|
123
|
-
|
|
124
|
-
const settings = this.pendingSettings.get(index)
|
|
125
|
-
engine = new SqliteEngine({
|
|
126
|
-
path: resolveIndexPath(this.config, index),
|
|
127
|
-
synchronous: this.synchronous,
|
|
128
|
-
typoTolerance: this.typo,
|
|
129
|
-
indexName: index,
|
|
130
|
-
settings,
|
|
131
|
-
})
|
|
132
|
-
this.engines.set(index, engine)
|
|
133
|
-
this.pendingSettings.delete(index)
|
|
134
|
-
return engine
|
|
135
|
-
}
|
|
136
|
-
}
|
|
@@ -1,97 +0,0 @@
|
|
|
1
|
-
import type { IndexSettings } from '../../../types.ts'
|
|
2
|
-
|
|
3
|
-
/** The default searchable column name used when no `searchableAttributes` are configured. */
|
|
4
|
-
export const DEFAULT_TEXT_COLUMN = '_text'
|
|
5
|
-
|
|
6
|
-
/**
|
|
7
|
-
* The schema layout for one index: which document attributes feed which FTS5
|
|
8
|
-
* column and which typed `documents` columns exist for filtering / sorting.
|
|
9
|
-
*
|
|
10
|
-
* When a caller doesn't declare `searchableAttributes`, we fall back to a
|
|
11
|
-
* single `_text` column that concatenates every string-valued field at
|
|
12
|
-
* indexing time. Users who want per-field weights opt in by passing
|
|
13
|
-
* `IndexSettings`.
|
|
14
|
-
*/
|
|
15
|
-
export class FieldRegistry {
|
|
16
|
-
/** FTS5 columns in declaration order — also the order BM25 weights apply in. */
|
|
17
|
-
readonly searchable: string[]
|
|
18
|
-
/** Filterable attributes — materialized as typed columns on `documents`. */
|
|
19
|
-
readonly filterable: string[]
|
|
20
|
-
/** Sortable attributes — materialized as typed columns on `documents`. */
|
|
21
|
-
readonly sortable: string[]
|
|
22
|
-
/** Union of filterable + sortable, deduplicated. */
|
|
23
|
-
readonly typedColumns: string[]
|
|
24
|
-
/** Primary key field name — defaults to 'id'. */
|
|
25
|
-
readonly primaryKey: string
|
|
26
|
-
|
|
27
|
-
constructor(settings?: IndexSettings) {
|
|
28
|
-
this.primaryKey = settings?.primaryKey ?? 'id'
|
|
29
|
-
this.searchable =
|
|
30
|
-
settings?.searchableAttributes && settings.searchableAttributes.length > 0
|
|
31
|
-
? [...settings.searchableAttributes]
|
|
32
|
-
: [DEFAULT_TEXT_COLUMN]
|
|
33
|
-
this.filterable = settings?.filterableAttributes ?? []
|
|
34
|
-
this.sortable = settings?.sortableAttributes ?? []
|
|
35
|
-
this.typedColumns = Array.from(new Set([...this.filterable, ...this.sortable]))
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
/** Whether this registry uses the synthesised `_text` column. */
|
|
39
|
-
get usesDefaultTextColumn(): boolean {
|
|
40
|
-
return this.searchable.length === 1 && this.searchable[0] === DEFAULT_TEXT_COLUMN
|
|
41
|
-
}
|
|
42
|
-
|
|
43
|
-
/**
|
|
44
|
-
* Project a document into the values that go into the FTS5 row.
|
|
45
|
-
* For default mode, concatenate every string-valued field.
|
|
46
|
-
* For declared mode, pick each named attribute (coerced to string).
|
|
47
|
-
*/
|
|
48
|
-
projectFtsValues(document: Record<string, unknown>): string[] {
|
|
49
|
-
if (this.usesDefaultTextColumn) {
|
|
50
|
-
const parts: string[] = []
|
|
51
|
-
for (const value of Object.values(document)) {
|
|
52
|
-
if (typeof value === 'string' && value.length > 0) parts.push(value)
|
|
53
|
-
else if (Array.isArray(value)) {
|
|
54
|
-
for (const item of value) {
|
|
55
|
-
if (typeof item === 'string' && item.length > 0) parts.push(item)
|
|
56
|
-
}
|
|
57
|
-
}
|
|
58
|
-
}
|
|
59
|
-
return [parts.join(' ')]
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
return this.searchable.map(attr => coerceText(document[attr]))
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
/**
|
|
66
|
-
* Project a document into the typed-column values stored on `documents`.
|
|
67
|
-
* Returned in the same order as `typedColumns`.
|
|
68
|
-
*/
|
|
69
|
-
projectTypedValues(document: Record<string, unknown>): unknown[] {
|
|
70
|
-
return this.typedColumns.map(attr => coerceTyped(document[attr]))
|
|
71
|
-
}
|
|
72
|
-
|
|
73
|
-
/**
|
|
74
|
-
* Concatenate every searchable attribute into one long string suitable for
|
|
75
|
-
* tokenization (used for terms-dictionary maintenance).
|
|
76
|
-
*/
|
|
77
|
-
concatSearchableText(document: Record<string, unknown>): string {
|
|
78
|
-
return this.projectFtsValues(document).join(' ')
|
|
79
|
-
}
|
|
80
|
-
}
|
|
81
|
-
|
|
82
|
-
function coerceText(value: unknown): string {
|
|
83
|
-
if (value === null || value === undefined) return ''
|
|
84
|
-
if (typeof value === 'string') return value
|
|
85
|
-
if (Array.isArray(value)) {
|
|
86
|
-
return value.map(v => coerceText(v)).filter(Boolean).join(' ')
|
|
87
|
-
}
|
|
88
|
-
if (typeof value === 'number' || typeof value === 'boolean') return String(value)
|
|
89
|
-
return ''
|
|
90
|
-
}
|
|
91
|
-
|
|
92
|
-
function coerceTyped(value: unknown): unknown {
|
|
93
|
-
if (value === null || value === undefined) return null
|
|
94
|
-
if (Array.isArray(value)) return JSON.stringify(value)
|
|
95
|
-
if (typeof value === 'object') return JSON.stringify(value)
|
|
96
|
-
return value
|
|
97
|
-
}
|
|
@@ -1,184 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Translate a user-facing query string into a sanitized FTS5 MATCH expression.
|
|
3
|
-
*
|
|
4
|
-
* Supported syntax (subset of Google-style search):
|
|
5
|
-
* - `"foo bar"` — exact phrase
|
|
6
|
-
* - `-foo` — exclude documents containing this token
|
|
7
|
-
* - `+foo` — required (default for all positive tokens — accepted for symmetry)
|
|
8
|
-
* - `foo*` — prefix match
|
|
9
|
-
*
|
|
10
|
-
* Everything else is treated as a positive ANDed token.
|
|
11
|
-
*
|
|
12
|
-
* Defends against FTS5 syntax injection by stripping or escaping any FTS5
|
|
13
|
-
* operator characters from raw user tokens. The user never gets to write a
|
|
14
|
-
* raw MATCH expression.
|
|
15
|
-
*/
|
|
16
|
-
export interface FtsExpression {
|
|
17
|
-
/** Final MATCH expression, ready to bind into a query. */
|
|
18
|
-
match: string
|
|
19
|
-
/** The positive tokens (no quotes, no operators) — useful for typo expansion. */
|
|
20
|
-
positiveTokens: string[]
|
|
21
|
-
/** Whether the expression is empty (caller may short-circuit to "match all"). */
|
|
22
|
-
isEmpty: boolean
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
interface ParsedToken {
|
|
26
|
-
text: string
|
|
27
|
-
negate: boolean
|
|
28
|
-
phrase: boolean
|
|
29
|
-
prefix: boolean
|
|
30
|
-
}
|
|
31
|
-
|
|
32
|
-
const FTS5_RESERVED = /["()*:^]/g
|
|
33
|
-
const PHRASE_RE = /"([^"]*)"/g
|
|
34
|
-
|
|
35
|
-
export function compileQuery(input: string): FtsExpression {
|
|
36
|
-
const trimmed = input.trim()
|
|
37
|
-
if (!trimmed) return { match: '', positiveTokens: [], isEmpty: true }
|
|
38
|
-
|
|
39
|
-
const tokens = parseTokens(trimmed)
|
|
40
|
-
if (tokens.length === 0) return { match: '', positiveTokens: [], isEmpty: true }
|
|
41
|
-
|
|
42
|
-
const positives: string[] = []
|
|
43
|
-
const negatives: string[] = []
|
|
44
|
-
const positiveTokens: string[] = []
|
|
45
|
-
|
|
46
|
-
for (const tok of tokens) {
|
|
47
|
-
const rendered = renderToken(tok)
|
|
48
|
-
if (!rendered) continue
|
|
49
|
-
|
|
50
|
-
if (tok.negate) {
|
|
51
|
-
negatives.push(rendered)
|
|
52
|
-
} else {
|
|
53
|
-
positives.push(rendered)
|
|
54
|
-
if (!tok.phrase && !tok.prefix) positiveTokens.push(tok.text.toLowerCase())
|
|
55
|
-
}
|
|
56
|
-
}
|
|
57
|
-
|
|
58
|
-
if (positives.length === 0 && negatives.length === 0) {
|
|
59
|
-
return { match: '', positiveTokens: [], isEmpty: true }
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
// Pure-negative queries can't be expressed in FTS5 — fall back to no-match.
|
|
63
|
-
if (positives.length === 0) {
|
|
64
|
-
return { match: '', positiveTokens: [], isEmpty: true }
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
let expr = positives.join(' AND ')
|
|
68
|
-
if (negatives.length > 0) {
|
|
69
|
-
expr = `${expr} NOT (${negatives.join(' OR ')})`
|
|
70
|
-
}
|
|
71
|
-
|
|
72
|
-
return { match: expr, positiveTokens, isEmpty: false }
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
/**
|
|
76
|
-
* Re-render a previously parsed query but with extra OR-candidates injected
|
|
77
|
-
* for each positive token. Used by the typo expander.
|
|
78
|
-
*/
|
|
79
|
-
export function compileQueryWithExpansions(
|
|
80
|
-
input: string,
|
|
81
|
-
expansions: Map<string, string[]>
|
|
82
|
-
): FtsExpression {
|
|
83
|
-
const trimmed = input.trim()
|
|
84
|
-
if (!trimmed) return { match: '', positiveTokens: [], isEmpty: true }
|
|
85
|
-
|
|
86
|
-
const tokens = parseTokens(trimmed)
|
|
87
|
-
const positives: string[] = []
|
|
88
|
-
const negatives: string[] = []
|
|
89
|
-
const positiveTokens: string[] = []
|
|
90
|
-
|
|
91
|
-
for (const tok of tokens) {
|
|
92
|
-
if (tok.negate) {
|
|
93
|
-
const r = renderToken(tok)
|
|
94
|
-
if (r) negatives.push(r)
|
|
95
|
-
continue
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
if (tok.phrase || tok.prefix) {
|
|
99
|
-
const r = renderToken(tok)
|
|
100
|
-
if (r) positives.push(r)
|
|
101
|
-
continue
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
const sanitized = sanitizeBareToken(tok.text)
|
|
105
|
-
if (!sanitized) continue
|
|
106
|
-
positiveTokens.push(sanitized.toLowerCase())
|
|
107
|
-
|
|
108
|
-
const cands = expansions.get(sanitized.toLowerCase()) ?? []
|
|
109
|
-
if (cands.length === 0) {
|
|
110
|
-
positives.push(sanitized)
|
|
111
|
-
} else {
|
|
112
|
-
const all = [sanitized, ...cands].map(t => sanitizeBareToken(t)).filter(Boolean) as string[]
|
|
113
|
-
const unique = Array.from(new Set(all))
|
|
114
|
-
positives.push(`(${unique.join(' OR ')})`)
|
|
115
|
-
}
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
if (positives.length === 0) return { match: '', positiveTokens: [], isEmpty: true }
|
|
119
|
-
|
|
120
|
-
let expr = positives.join(' AND ')
|
|
121
|
-
if (negatives.length > 0) {
|
|
122
|
-
expr = `${expr} NOT (${negatives.join(' OR ')})`
|
|
123
|
-
}
|
|
124
|
-
return { match: expr, positiveTokens, isEmpty: false }
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
function parseTokens(input: string): ParsedToken[] {
|
|
128
|
-
const tokens: ParsedToken[] = []
|
|
129
|
-
let cursor = 0
|
|
130
|
-
let working = input
|
|
131
|
-
|
|
132
|
-
// Pull out phrase tokens first to avoid splitting on inner whitespace.
|
|
133
|
-
working = working.replace(PHRASE_RE, (_, phrase, offset) => {
|
|
134
|
-
const negate = offset > 0 && input[offset - 1] === '-'
|
|
135
|
-
tokens.push({ text: phrase, negate, phrase: true, prefix: false })
|
|
136
|
-
return ' '.repeat(_.length + (negate ? 1 : 0))
|
|
137
|
-
})
|
|
138
|
-
|
|
139
|
-
for (const raw of working.split(/\s+/)) {
|
|
140
|
-
if (!raw) continue
|
|
141
|
-
let text = raw
|
|
142
|
-
let negate = false
|
|
143
|
-
let prefix = false
|
|
144
|
-
|
|
145
|
-
if (text.startsWith('-')) {
|
|
146
|
-
negate = true
|
|
147
|
-
text = text.slice(1)
|
|
148
|
-
} else if (text.startsWith('+')) {
|
|
149
|
-
text = text.slice(1)
|
|
150
|
-
}
|
|
151
|
-
if (text.endsWith('*')) {
|
|
152
|
-
prefix = true
|
|
153
|
-
text = text.slice(0, -1)
|
|
154
|
-
}
|
|
155
|
-
if (!text) continue
|
|
156
|
-
|
|
157
|
-
tokens.push({ text, negate, phrase: false, prefix })
|
|
158
|
-
}
|
|
159
|
-
|
|
160
|
-
void cursor
|
|
161
|
-
return tokens
|
|
162
|
-
}
|
|
163
|
-
|
|
164
|
-
function renderToken(tok: ParsedToken): string | null {
|
|
165
|
-
if (tok.phrase) {
|
|
166
|
-
const cleaned = tok.text.replace(/"/g, '').trim()
|
|
167
|
-
if (!cleaned) return null
|
|
168
|
-
return `"${cleaned}"`
|
|
169
|
-
}
|
|
170
|
-
const sanitized = sanitizeBareToken(tok.text)
|
|
171
|
-
if (!sanitized) return null
|
|
172
|
-
return tok.prefix ? `${sanitized}*` : sanitized
|
|
173
|
-
}
|
|
174
|
-
|
|
175
|
-
function sanitizeBareToken(token: string): string {
|
|
176
|
-
// Replace any FTS5 operator characters with a space, then collapse to one
|
|
177
|
-
// word. If only one word survives we use it bare; otherwise wrap in quotes
|
|
178
|
-
// so FTS5 treats it as a phrase rather than two ANDed tokens.
|
|
179
|
-
const cleaned = token.replace(FTS5_RESERVED, ' ').trim()
|
|
180
|
-
if (!cleaned) return ''
|
|
181
|
-
const parts = cleaned.split(/\s+/).filter(Boolean)
|
|
182
|
-
if (parts.length === 1) return parts[0]!
|
|
183
|
-
return `"${parts.join(' ')}"`
|
|
184
|
-
}
|
|
@@ -1,134 +0,0 @@
|
|
|
1
|
-
import type { SearchOptions } from '../../../types.ts'
|
|
2
|
-
import type { FieldRegistry } from './field_registry.ts'
|
|
3
|
-
import type { FtsExpression } from './fts_query_builder.ts'
|
|
4
|
-
import { compileFilter } from '../filters/filter_compiler.ts'
|
|
5
|
-
import { quoteIdent } from './schema.ts'
|
|
6
|
-
import { OPEN_SENTINEL, CLOSE_SENTINEL } from './snippet_formatter.ts'
|
|
7
|
-
|
|
8
|
-
export interface CompiledSearch {
|
|
9
|
-
/** Main SELECT returning hits + score + snippets. */
|
|
10
|
-
sql: string
|
|
11
|
-
/** Bound parameters for the main SELECT. */
|
|
12
|
-
params: unknown[]
|
|
13
|
-
/** COUNT(*) variant for totalHits. */
|
|
14
|
-
countSql: string
|
|
15
|
-
/** Bound parameters for the count query (subset of `params`). */
|
|
16
|
-
countParams: unknown[]
|
|
17
|
-
/** Names of columns we asked SQLite to return for snippets, in order. */
|
|
18
|
-
snippetColumns: string[]
|
|
19
|
-
}
|
|
20
|
-
|
|
21
|
-
const DEFAULT_SNIPPET_TOKENS = 24
|
|
22
|
-
|
|
23
|
-
export interface QueryCompilerOptions {
|
|
24
|
-
registry: FieldRegistry
|
|
25
|
-
expression: FtsExpression
|
|
26
|
-
search: SearchOptions
|
|
27
|
-
/** Per-column BM25 weights, matching `registry.searchable` order. Defaults to all-1. */
|
|
28
|
-
weights?: number[]
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
export function compileSearch(opts: QueryCompilerOptions): CompiledSearch {
|
|
32
|
-
const { registry, expression, search, weights } = opts
|
|
33
|
-
const filterableSet = new Set(registry.filterable)
|
|
34
|
-
const sortableSet = new Set(registry.sortable)
|
|
35
|
-
|
|
36
|
-
const filter = compileFilter(search.filter, filterableSet)
|
|
37
|
-
|
|
38
|
-
const whereParts: string[] = []
|
|
39
|
-
const matchParams: unknown[] = []
|
|
40
|
-
|
|
41
|
-
if (!expression.isEmpty) {
|
|
42
|
-
whereParts.push('fts.fts MATCH ?')
|
|
43
|
-
matchParams.push(expression.match)
|
|
44
|
-
}
|
|
45
|
-
if (filter.sql) whereParts.push(filter.sql)
|
|
46
|
-
|
|
47
|
-
const where = whereParts.length > 0 ? `WHERE ${whereParts.join(' AND ')}` : ''
|
|
48
|
-
|
|
49
|
-
// BM25 score (negative = better). Defaults to weight 1.0 for every column.
|
|
50
|
-
const ws = (weights ?? registry.searchable.map(() => 1)).map(w => Number(w) || 1)
|
|
51
|
-
const bm25Args = ws.length > 0 ? `, ${ws.join(', ')}` : ''
|
|
52
|
-
|
|
53
|
-
const orderBy = compileOrder(search.sort, sortableSet, expression.isEmpty, bm25Args)
|
|
54
|
-
|
|
55
|
-
// Build snippet expressions for each field the caller wants highlighted.
|
|
56
|
-
const wantedHighlights = pickHighlightFields(search.attributesToHighlight, registry)
|
|
57
|
-
const snippetSelect = wantedHighlights
|
|
58
|
-
.map(field => {
|
|
59
|
-
const idx = registry.searchable.indexOf(field)
|
|
60
|
-
return `snippet(fts.fts, ${idx}, '${OPEN_SENTINEL}', '${CLOSE_SENTINEL}', ' … ', ${DEFAULT_SNIPPET_TOKENS}) AS ${quoteIdent(`__snip_${field}`)}`
|
|
61
|
-
})
|
|
62
|
-
.join(', ')
|
|
63
|
-
|
|
64
|
-
const perPage = Math.max(1, search.perPage ?? 20)
|
|
65
|
-
const page = Math.max(1, search.page ?? 1)
|
|
66
|
-
const offset = (page - 1) * perPage
|
|
67
|
-
|
|
68
|
-
const selectCols = [
|
|
69
|
-
'documents.id AS id',
|
|
70
|
-
'documents.doc AS doc',
|
|
71
|
-
expression.isEmpty ? '0 AS score' : `bm25(fts.fts${bm25Args}) AS score`,
|
|
72
|
-
]
|
|
73
|
-
if (snippetSelect) selectCols.push(snippetSelect)
|
|
74
|
-
|
|
75
|
-
const sql = `
|
|
76
|
-
SELECT ${selectCols.join(', ')}
|
|
77
|
-
FROM documents
|
|
78
|
-
${expression.isEmpty ? '' : 'JOIN fts ON fts.rowid = documents.rowid'}
|
|
79
|
-
${where}
|
|
80
|
-
${orderBy}
|
|
81
|
-
LIMIT ? OFFSET ?
|
|
82
|
-
`.trim()
|
|
83
|
-
|
|
84
|
-
const countSql = `
|
|
85
|
-
SELECT COUNT(*) AS n
|
|
86
|
-
FROM documents
|
|
87
|
-
${expression.isEmpty ? '' : 'JOIN fts ON fts.rowid = documents.rowid'}
|
|
88
|
-
${where}
|
|
89
|
-
`.trim()
|
|
90
|
-
|
|
91
|
-
const allParams = [...matchParams, ...filter.params]
|
|
92
|
-
const params = [...allParams, perPage, offset]
|
|
93
|
-
|
|
94
|
-
return {
|
|
95
|
-
sql,
|
|
96
|
-
params,
|
|
97
|
-
countSql,
|
|
98
|
-
countParams: allParams,
|
|
99
|
-
snippetColumns: wantedHighlights,
|
|
100
|
-
}
|
|
101
|
-
}
|
|
102
|
-
|
|
103
|
-
function compileOrder(
|
|
104
|
-
sort: string[] | undefined,
|
|
105
|
-
sortableSet: ReadonlySet<string>,
|
|
106
|
-
matchAll: boolean,
|
|
107
|
-
bm25Args: string
|
|
108
|
-
): string {
|
|
109
|
-
if (sort && sort.length > 0) {
|
|
110
|
-
const parts: string[] = []
|
|
111
|
-
for (const spec of sort) {
|
|
112
|
-
const [field, dirRaw] = spec.split(':') as [string, string | undefined]
|
|
113
|
-
if (!field || !sortableSet.has(field)) {
|
|
114
|
-
throw new Error(
|
|
115
|
-
`Field "${field}" is not in sortableAttributes. Add it to the index settings before sorting on it.`
|
|
116
|
-
)
|
|
117
|
-
}
|
|
118
|
-
const dir = dirRaw?.toLowerCase() === 'desc' ? 'DESC' : 'ASC'
|
|
119
|
-
parts.push(`${quoteIdent(field)} ${dir}`)
|
|
120
|
-
}
|
|
121
|
-
return `ORDER BY ${parts.join(', ')}`
|
|
122
|
-
}
|
|
123
|
-
if (matchAll) return 'ORDER BY documents.rowid ASC'
|
|
124
|
-
return `ORDER BY bm25(fts.fts${bm25Args}) ASC`
|
|
125
|
-
}
|
|
126
|
-
|
|
127
|
-
function pickHighlightFields(
|
|
128
|
-
requested: string[] | undefined,
|
|
129
|
-
registry: FieldRegistry
|
|
130
|
-
): string[] {
|
|
131
|
-
if (registry.usesDefaultTextColumn) return []
|
|
132
|
-
if (!requested || requested.length === 0) return []
|
|
133
|
-
return requested.filter(f => registry.searchable.includes(f))
|
|
134
|
-
}
|
|
@@ -1,99 +0,0 @@
|
|
|
1
|
-
import type { Database } from 'bun:sqlite'
|
|
2
|
-
import type { FieldRegistry } from './field_registry.ts'
|
|
3
|
-
|
|
4
|
-
const SCHEMA_VERSION = 1
|
|
5
|
-
|
|
6
|
-
/** Quote a SQLite identifier (column or table). Throws on identifiers that contain a NUL byte. */
|
|
7
|
-
export function quoteIdent(name: string): string {
|
|
8
|
-
if (name.includes('\0')) throw new Error(`Invalid identifier: ${name}`)
|
|
9
|
-
return `"${name.replace(/"/g, '""')}"`
|
|
10
|
-
}
|
|
11
|
-
|
|
12
|
-
export function applyConnectionPragmas(
|
|
13
|
-
db: Database,
|
|
14
|
-
synchronous: 'OFF' | 'NORMAL' | 'FULL'
|
|
15
|
-
): void {
|
|
16
|
-
db.exec('PRAGMA journal_mode = WAL')
|
|
17
|
-
db.exec(`PRAGMA synchronous = ${synchronous}`)
|
|
18
|
-
db.exec('PRAGMA temp_store = MEMORY')
|
|
19
|
-
db.exec('PRAGMA foreign_keys = ON')
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
/**
|
|
23
|
-
* Create the documents + FTS5 + terms_dict tables for a fresh index.
|
|
24
|
-
* Idempotent: skips creation if `_meta` already exists.
|
|
25
|
-
*/
|
|
26
|
-
export function createSchema(db: Database, registry: FieldRegistry): void {
|
|
27
|
-
if (schemaExists(db)) return
|
|
28
|
-
|
|
29
|
-
db.exec(`
|
|
30
|
-
CREATE TABLE _meta (
|
|
31
|
-
key TEXT PRIMARY KEY,
|
|
32
|
-
value TEXT NOT NULL
|
|
33
|
-
)
|
|
34
|
-
`)
|
|
35
|
-
|
|
36
|
-
const typedColumns = registry.typedColumns
|
|
37
|
-
.map(c => `${quoteIdent(c)} BLOB`)
|
|
38
|
-
.join(', ')
|
|
39
|
-
const typedColumnsClause = typedColumns ? `, ${typedColumns}` : ''
|
|
40
|
-
|
|
41
|
-
db.exec(`
|
|
42
|
-
CREATE TABLE documents (
|
|
43
|
-
rowid INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
44
|
-
id TEXT NOT NULL UNIQUE,
|
|
45
|
-
doc TEXT NOT NULL${typedColumnsClause}
|
|
46
|
-
)
|
|
47
|
-
`)
|
|
48
|
-
db.exec('CREATE UNIQUE INDEX documents_id_idx ON documents(id)')
|
|
49
|
-
|
|
50
|
-
// One index per filterable column so WHERE clauses can use it.
|
|
51
|
-
for (const col of registry.filterable) {
|
|
52
|
-
db.exec(`CREATE INDEX ${quoteIdent(`documents_${col}_idx`)} ON documents(${quoteIdent(col)})`)
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
const ftsCols = registry.searchable.map(quoteIdent).join(', ')
|
|
56
|
-
// FTS5 default mode (no `content` option): the original text is stored in
|
|
57
|
-
// FTS5 itself so `snippet()` can echo it back highlighted. The Porter
|
|
58
|
-
// tokenizer applies English stemming; unicode61 normalises and folds
|
|
59
|
-
// diacritics so accented input matches its ASCII form.
|
|
60
|
-
db.exec(`
|
|
61
|
-
CREATE VIRTUAL TABLE fts USING fts5(
|
|
62
|
-
${ftsCols},
|
|
63
|
-
tokenize = 'porter unicode61 remove_diacritics 2'
|
|
64
|
-
)
|
|
65
|
-
`)
|
|
66
|
-
|
|
67
|
-
// Maintain a terms dictionary for typo expansion.
|
|
68
|
-
db.exec(`
|
|
69
|
-
CREATE TABLE terms_dict (
|
|
70
|
-
term TEXT PRIMARY KEY,
|
|
71
|
-
doc_freq INTEGER NOT NULL DEFAULT 0
|
|
72
|
-
)
|
|
73
|
-
`)
|
|
74
|
-
db.exec('CREATE INDEX terms_dict_len_idx ON terms_dict(length(term))')
|
|
75
|
-
|
|
76
|
-
// Persist registry layout so we can detect mismatches on reopen.
|
|
77
|
-
const stmt = db.prepare('INSERT INTO _meta (key, value) VALUES (?, ?)')
|
|
78
|
-
stmt.run('schema_version', String(SCHEMA_VERSION))
|
|
79
|
-
stmt.run('searchable', JSON.stringify(registry.searchable))
|
|
80
|
-
stmt.run('filterable', JSON.stringify(registry.filterable))
|
|
81
|
-
stmt.run('sortable', JSON.stringify(registry.sortable))
|
|
82
|
-
stmt.run('primary_key', registry.primaryKey)
|
|
83
|
-
}
|
|
84
|
-
|
|
85
|
-
function schemaExists(db: Database): boolean {
|
|
86
|
-
const row = db
|
|
87
|
-
.query<{ name: string }, []>(
|
|
88
|
-
"SELECT name FROM sqlite_master WHERE type='table' AND name='_meta'"
|
|
89
|
-
)
|
|
90
|
-
.get()
|
|
91
|
-
return row !== null
|
|
92
|
-
}
|
|
93
|
-
|
|
94
|
-
export function readMeta(db: Database, key: string): string | null {
|
|
95
|
-
const row = db
|
|
96
|
-
.query<{ value: string }, [string]>('SELECT value FROM _meta WHERE key = ?')
|
|
97
|
-
.get(key)
|
|
98
|
-
return row ? row.value : null
|
|
99
|
-
}
|
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* SQLite's snippet() returns text with the requested marker tokens around hits.
|
|
3
|
-
* We use sentinel markers instead of `<mark>` directly so we can safely escape
|
|
4
|
-
* any HTML in the source text first, then swap sentinels for the real tags.
|
|
5
|
-
*/
|
|
6
|
-
export const OPEN_SENTINEL = 'STRAV_OPEN'
|
|
7
|
-
export const CLOSE_SENTINEL = 'STRAV_CLOSE'
|
|
8
|
-
|
|
9
|
-
export const OPEN_TAG = '<mark>'
|
|
10
|
-
export const CLOSE_TAG = '</mark>'
|
|
11
|
-
|
|
12
|
-
/**
|
|
13
|
-
* Convert SQLite-snippet output (already wrapped in sentinels) into the
|
|
14
|
-
* caller-facing string with `<mark>...</mark>` around hits and HTML-escaped
|
|
15
|
-
* surrounding text.
|
|
16
|
-
*/
|
|
17
|
-
export function formatSnippet(snippet: string | null | undefined): string {
|
|
18
|
-
if (!snippet) return ''
|
|
19
|
-
return escapeHtml(snippet).replaceAll(OPEN_SENTINEL, OPEN_TAG).replaceAll(CLOSE_SENTINEL, CLOSE_TAG)
|
|
20
|
-
}
|
|
21
|
-
|
|
22
|
-
function escapeHtml(input: string): string {
|
|
23
|
-
return input
|
|
24
|
-
.replace(/&/g, '&amp;')
|
|
25
|
-
.replace(/</g, '&lt;')
|
|
26
|
-
.replace(/>/g, '&gt;')
|
|
27
|
-
.replace(/"/g, '&quot;')
|
|
28
|
-
.replace(/'/g, '&#39;')
|
|
29
|
-
}
|