@strav/search 0.3.20 → 0.3.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/README.md +122 -3
  2. package/package.json +4 -4
  3. package/src/commands/search_optimize.ts +52 -0
  4. package/src/commands/search_rebuild.ts +73 -0
  5. package/src/drivers/embedded/embedded_driver.ts +136 -0
  6. package/src/drivers/embedded/engine/field_registry.ts +97 -0
  7. package/src/drivers/embedded/engine/fts_query_builder.ts +184 -0
  8. package/src/drivers/embedded/engine/query_compiler.ts +134 -0
  9. package/src/drivers/embedded/engine/schema.ts +99 -0
  10. package/src/drivers/embedded/engine/snippet_formatter.ts +29 -0
  11. package/src/drivers/embedded/engine/sqlite_engine.ts +255 -0
  12. package/src/drivers/embedded/engine/typo_expander.ts +138 -0
  13. package/src/drivers/embedded/errors.ts +15 -0
  14. package/src/drivers/embedded/filters/filter_compiler.ts +136 -0
  15. package/src/drivers/embedded/index.ts +3 -0
  16. package/src/drivers/embedded/storage/paths.ts +23 -0
  17. package/src/drivers/embedded/types.ts +34 -0
  18. package/src/drivers/postgres/engine/field_registry.ts +116 -0
  19. package/src/drivers/postgres/engine/fts_query_builder.ts +105 -0
  20. package/src/drivers/postgres/engine/pg_engine.ts +300 -0
  21. package/src/drivers/postgres/engine/query_compiler.ts +165 -0
  22. package/src/drivers/postgres/engine/schema.ts +187 -0
  23. package/src/drivers/postgres/engine/snippet_formatter.ts +31 -0
  24. package/src/drivers/postgres/engine/typo_expander.ts +131 -0
  25. package/src/drivers/postgres/errors.ts +33 -0
  26. package/src/drivers/postgres/filters/filter_compiler.ts +138 -0
  27. package/src/drivers/postgres/index.ts +14 -0
  28. package/src/drivers/postgres/postgres_fts_driver.ts +184 -0
  29. package/src/drivers/postgres/rebuild/rebuild_inplace.ts +113 -0
  30. package/src/drivers/postgres/storage/identifiers.ts +46 -0
  31. package/src/drivers/postgres/types.ts +53 -0
  32. package/src/index.ts +11 -0
  33. package/src/search_manager.ts +7 -0
  34. package/stubs/config/search.ts +25 -0
@@ -0,0 +1,138 @@
1
+ import type { Database } from 'bun:sqlite'
2
+ import type { ResolvedTypoTolerance } from '../types.ts'
3
+
4
/**
 * Break free text into terms for the typo-tolerance dictionary.
 *
 * The text is lowercased and cut at every run of characters that are neither
 * Unicode letters nor Unicode digits; fragments shorter than two characters
 * are discarded.
 *
 * Porter stemming is intentionally skipped:
 *
 * - Typos mostly hit rare words and proper nouns (customer names, SKUs) that
 *   Porter leaves untouched anyway.
 * - Reproducing SQLite's stemmer in JS would mean shipping a Porter
 *   implementation only for this dictionary, for little gain on common words.
 *
 * Candidate terms are stemmed again by FTS5 when they are fed back into a
 * query, so lookups still work.
 *
 * @param text Raw text; empty input yields an empty list.
 * @returns Lowercased tokens in order of appearance (duplicates preserved).
 */
export function tokenize(text: string): string[] {
  if (!text) return []
  return text
    .toLowerCase()
    .split(/[^\p{L}\p{N}]+/u)
    .filter(piece => piece.length >= 2)
}
27
+
28
/**
 * Register a document's text in the terms dictionary.
 *
 * Each distinct token of `text` is upserted into `terms_dict`: a new term
 * starts with `doc_freq = 1`, an existing term has its `doc_freq` incremented
 * by one. Text that produces no tokens leaves the database untouched.
 */
export function recordTerms(db: Database, text: string): void {
  const distinctTerms = new Set(tokenize(text))
  if (distinctTerms.size === 0) return

  const upsertSql =
    'INSERT INTO terms_dict (term, doc_freq) VALUES (?, 1) ' +
    'ON CONFLICT(term) DO UPDATE SET doc_freq = doc_freq + 1'
  const upsert = db.prepare(upsertSql)

  distinctTerms.forEach(term => {
    upsert.run(term)
  })
}
39
+
40
/**
 * Remove a document's contribution from the terms dictionary.
 *
 * Each distinct token of `text` has its `doc_freq` decremented by one; after
 * all decrements, every row whose `doc_freq` is zero or below is deleted.
 * Text that produces no tokens leaves the database untouched.
 */
export function unrecordTerms(db: Database, text: string): void {
  const distinctTerms = new Set(tokenize(text))
  if (distinctTerms.size === 0) return

  const decrement = db.prepare('UPDATE terms_dict SET doc_freq = doc_freq - 1 WHERE term = ?')
  const removeExhausted = db.prepare('DELETE FROM terms_dict WHERE doc_freq <= 0')

  distinctTerms.forEach(term => {
    decrement.run(term)
  })
  // Single sweep at the end instead of one delete per term.
  removeExhausted.run()
}
50
+
51
/**
 * Find near-miss dictionary terms for each query token.
 *
 * For every token at least `settings.minTokenLength` long, the dictionary is
 * scanned for terms whose length is within `settings.maxDistance` of the
 * token's length, and those whose Levenshtein distance to the token is
 * ≤ `settings.maxDistance` are collected (the token itself is never listed).
 *
 * @param db            Database holding the `terms_dict` table.
 * @param tokens        Query tokens; repeated tokens are only looked up once.
 * @param settings      Resolved typo-tolerance settings.
 * @param maxCandidates Upper bound on candidates per token. A value ≤ 0
 *                      disables expansion and returns an empty map.
 * @returns Map from token to its candidates; tokens without candidates are
 *          absent from the map.
 */
export function expandTokens(
  db: Database,
  tokens: string[],
  settings: ResolvedTypoTolerance,
  maxCandidates = 8
): Map<string, string[]> {
  const out = new Map<string, string[]>()
  // Nothing to do when expansion is off or the caller asked for no candidates;
  // bail out before touching the database.
  if (!settings.enabled || maxCandidates <= 0) return out

  const stmt = db.prepare<{ term: string }, [number, number]>(
    'SELECT term FROM terms_dict WHERE length(term) BETWEEN ? AND ?'
  )

  // A repeated token would produce the same candidates again; remember what
  // has been looked up so each distinct token costs one dictionary scan.
  const visited = new Set<string>()

  for (const token of tokens) {
    if (token.length < settings.minTokenLength) continue
    if (visited.has(token)) continue
    visited.add(token)

    // Terms whose length differs by more than maxDistance cannot be in range.
    const minLen = Math.max(1, token.length - settings.maxDistance)
    const maxLen = token.length + settings.maxDistance

    const candidates: string[] = []
    for (const row of stmt.all(minLen, maxLen)) {
      if (row.term === token) continue
      if (levenshtein(token, row.term, settings.maxDistance) <= settings.maxDistance) {
        candidates.push(row.term)
        if (candidates.length >= maxCandidates) break
      }
    }
    if (candidates.length > 0) out.set(token, candidates)
  }

  return out
}
87
+
88
/**
 * Turn the user-facing typo-tolerance option into concrete settings.
 *
 * - `'off'` → disabled (the numeric fields still carry the defaults).
 * - `'auto'` or `undefined` → enabled with the defaults.
 * - settings object → enabled; each missing field falls back to its default.
 *
 * Defaults are `minTokenLength = 4` and `maxDistance = 1`.
 */
export function resolveTypoTolerance(
  setting:
    | 'off'
    | 'auto'
    | { minTokenLength?: number; maxDistance?: number }
    | undefined
): ResolvedTypoTolerance {
  const fallbackMinTokenLength = 4
  const fallbackMaxDistance = 1

  const isPreset = setting === 'off' || setting === 'auto' || setting === undefined
  if (isPreset) {
    return {
      enabled: setting !== 'off',
      minTokenLength: fallbackMinTokenLength,
      maxDistance: fallbackMaxDistance,
    }
  }

  const minTokenLength = setting.minTokenLength ?? fallbackMinTokenLength
  const maxDistance = setting.maxDistance ?? fallbackMaxDistance
  return { enabled: true, minTokenLength, maxDistance }
}
108
+
109
/**
 * Levenshtein distance with an early exit.
 *
 * Uses the two-row dynamic-programming formulation. As soon as every cell of
 * a row is greater than `max`, the final distance must exceed `max` as well,
 * so `max + 1` is returned without finishing the table. The same value is
 * returned up front when the lengths alone differ by more than `max`.
 *
 * Comparison is done on UTF-16 code units, which the original author deemed
 * sufficient for the (ASCII-ish) corpora this driver targets.
 *
 * @returns The distance, or `max + 1` when one of the early exits fires.
 */
function levenshtein(a: string, b: string, max: number): number {
  if (a === b) return 0

  const rows = a.length
  const cols = b.length
  if (Math.abs(rows - cols) > max) return max + 1
  if (rows === 0) return cols
  if (cols === 0) return rows

  // `above` is the previous DP row, `current` the one being filled.
  let above: number[] = Array.from({ length: cols + 1 }, (_, j) => j)
  let current: number[] = new Array<number>(cols + 1).fill(0)

  for (let i = 1; i <= rows; i++) {
    const unit = a.charCodeAt(i - 1)
    current[0] = i
    let smallest = i

    for (let j = 1; j <= cols; j++) {
      const deletion = above[j]! + 1
      const insertion = current[j - 1]! + 1
      const substitution = above[j - 1]! + (unit === b.charCodeAt(j - 1) ? 0 : 1)
      const cell = Math.min(deletion, insertion, substitution)
      current[j] = cell
      if (cell < smallest) smallest = cell
    }

    // Row minima never decrease, so the budget is already blown.
    if (smallest > max) return max + 1

    const recycled = above
    above = current
    current = recycled
  }

  return above[cols]!
}
@@ -0,0 +1,15 @@
1
+ import { SearchError } from '../../errors.ts'
2
+
3
/** Common base class for the errors defined by the embedded driver. */
export class EmbeddedSearchError extends SearchError {}

/**
 * Error whose message reports that a named embedded index is corrupt.
 *
 * @param index Name of the affected index (quoted in the message).
 * @param cause Human-readable description appended to the message.
 */
export class IndexCorruptError extends EmbeddedSearchError {
  constructor(index: string, cause: string) {
    super('Embedded search index "' + index + '" is corrupt: ' + cause)
  }
}

/**
 * Error for filters the embedded driver cannot compile.
 *
 * @param message Detail appended after the fixed message prefix.
 */
export class UnsupportedFilterError extends EmbeddedSearchError {
  constructor(message: string) {
    super('Embedded driver filter is unsupported: ' + message)
  }
}
@@ -0,0 +1,136 @@
1
+ import { UnsupportedFilterError } from '../errors.ts'
2
+ import { quoteIdent } from '../engine/schema.ts'
3
+
4
/** Result of compiling a filter: a SQL fragment plus its bound values. */
export interface CompiledFilter {
  /**
   * Fragment meant to be placed inside a WHERE clause; it never starts with
   * the `WHERE` keyword. An empty string means "no filter".
   */
  sql: string
  /** Values for the `?` placeholders in `sql`, in placeholder order. */
  params: unknown[]
}
10
+
11
/**
 * Compile a filter object into a parameterized SQL WHERE fragment.
 *
 * Each key of the filter contributes one or more clauses, all joined with
 * `AND`. Accepted value shapes per key:
 *
 * - `undefined` → the key is skipped entirely
 * - `null` → `key IS NULL`
 * - primitive (string/number/boolean) → `key = ?`
 * - array → `key IN (?, ?, ?)`; an empty array compiles to `1 = 0`
 * - operator object with keys from `eq, neq, gt, gte, lt, lte, in, nin` →
 *   one clause per operator
 *
 * Every key with a defined value must be a member of `filterableAttributes`.
 *
 * @param filter               Filter object; a falsy value yields an empty
 *                             fragment.
 * @param filterableAttributes Keys that may be filtered on.
 * @returns The SQL fragment (without `WHERE`) and its bound parameters.
 * @throws UnsupportedFilterError for a non-empty string filter, a key that is
 *         not filterable, or a value of an unsupported shape.
 */
export function compileFilter(
  filter: Record<string, unknown> | string | undefined,
  filterableAttributes: ReadonlySet<string>
): CompiledFilter {
  if (!filter) return { sql: '', params: [] }

  if (typeof filter === 'string') {
    throw new UnsupportedFilterError(
      'Raw string filters are not supported by the embedded driver. ' +
        'Pass an object like `{ status: "published" }` instead.'
    )
  }

  const conditions: string[] = []
  const bindings: unknown[] = []

  for (const [key, value] of Object.entries(filter)) {
    if (value === undefined) continue

    if (!filterableAttributes.has(key)) {
      throw new UnsupportedFilterError(
        `Field "${key}" is not in filterableAttributes. ` +
          'Add it to the index settings before filtering on it.'
      )
    }

    const column = quoteIdent(key)

    if (value === null) {
      conditions.push(`${column} IS NULL`)
    } else if (Array.isArray(value)) {
      if (value.length > 0) {
        const slots = value.map(() => '?').join(', ')
        conditions.push(`${column} IN (${slots})`)
        bindings.push(...value.map(coerce))
      } else {
        // An empty IN-list can never match anything.
        conditions.push('1 = 0')
      }
    } else if (isOperatorObject(value)) {
      for (const [op, operand] of Object.entries(value)) {
        const part = compileOperator(column, op, operand)
        conditions.push(part.sql)
        bindings.push(...part.params)
      }
    } else if (isPrimitive(value)) {
      conditions.push(`${column} = ?`)
      bindings.push(coerce(value))
    } else {
      throw new UnsupportedFilterError(
        `Unsupported filter value for key "${key}": ${JSON.stringify(value)}`
      )
    }
  }

  return { sql: conditions.join(' AND '), params: bindings }
}
89
+
90
/** Operator names accepted inside an operator object such as `{ gt: 1, lte: 5 }`. */
const OPERATORS = new Set(['eq', 'neq', 'gt', 'gte', 'lt', 'lte', 'in', 'nin'])

/**
 * Tell whether `value` is an operator object: a non-array object with at
 * least one own key, all of whose keys are known operators.
 *
 * Objects without own enumerable keys (`{}`, `Date`, `Map`, …) are rejected.
 * `every` on an empty key list is vacuously true, so without the explicit
 * length check such values would be accepted and contribute no clause at
 * all, silently dropping the filter.
 */
function isOperatorObject(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== 'object' || Array.isArray(value)) return false
  const keys = Object.keys(value)
  return keys.length > 0 && keys.every(k => OPERATORS.has(k))
}
96
+
97
/**
 * Compile one `operator: value` pair for an already-quoted column.
 *
 * `eq` / `neq` against `null` (or `undefined`) compile to `IS NULL` /
 * `IS NOT NULL`: in SQL, `col = NULL` and `col <> NULL` never evaluate to
 * true, so binding NULL to a comparison would match no rows at all.
 *
 * `in` with a non-array or empty list compiles to the always-false `1 = 0`;
 * `nin` with a non-array or empty list compiles to the always-true `1 = 1`.
 *
 * @param col   Quoted column identifier, spliced verbatim into the SQL.
 * @param op    Operator name (`eq`, `neq`, `gt`, `gte`, `lt`, `lte`, `in`, `nin`).
 * @param value Operand; bound as a parameter after `coerce`.
 * @throws UnsupportedFilterError when `op` is not a known operator.
 */
function compileOperator(col: string, op: string, value: unknown): CompiledFilter {
  switch (op) {
    case 'eq':
      if (value === null || value === undefined) return { sql: `${col} IS NULL`, params: [] }
      return { sql: `${col} = ?`, params: [coerce(value)] }
    case 'neq':
      if (value === null || value === undefined) return { sql: `${col} IS NOT NULL`, params: [] }
      return { sql: `${col} <> ?`, params: [coerce(value)] }
    case 'gt':
      return { sql: `${col} > ?`, params: [coerce(value)] }
    case 'gte':
      return { sql: `${col} >= ?`, params: [coerce(value)] }
    case 'lt':
      return { sql: `${col} < ?`, params: [coerce(value)] }
    case 'lte':
      return { sql: `${col} <= ?`, params: [coerce(value)] }
    case 'in': {
      if (!Array.isArray(value) || value.length === 0) return { sql: '1 = 0', params: [] }
      const ph = value.map(() => '?').join(', ')
      return { sql: `${col} IN (${ph})`, params: value.map(coerce) }
    }
    case 'nin': {
      if (!Array.isArray(value) || value.length === 0) return { sql: '1 = 1', params: [] }
      const ph = value.map(() => '?').join(', ')
      return { sql: `${col} NOT IN (${ph})`, params: value.map(coerce) }
    }
    default:
      throw new UnsupportedFilterError(`Unknown operator "${op}"`)
  }
}

/** True for strings, numbers and booleans; `null` and `undefined` are not primitives here. */
function isPrimitive(value: unknown): boolean {
  return (
    typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean'
  )
}

/**
 * Convert a JS value into the form that is bound as a SQL parameter:
 * `null`/`undefined` become `null`, booleans become `1`/`0`, and anything
 * else is passed through unchanged.
 */
function coerce(value: unknown): unknown {
  if (value === null || value === undefined) return null
  if (typeof value === 'boolean') return value ? 1 : 0
  return value
}
@@ -0,0 +1,3 @@
1
+ export { EmbeddedDriver } from './embedded_driver.ts'
2
+ export type { EmbeddedConfig, TypoToleranceMode, TypoToleranceSettings } from './types.ts'
3
+ export { EmbeddedSearchError, IndexCorruptError, UnsupportedFilterError } from './errors.ts'
@@ -0,0 +1,23 @@
1
+ import { join, isAbsolute, resolve } from 'node:path'
2
+ import { mkdirSync } from 'node:fs'
3
+ import type { EmbeddedConfig } from '../types.ts'
4
+
5
const MEMORY = ':memory:'

/**
 * Compute the SQLite file path for an index.
 *
 * The root directory comes from `config.path` (default `./storage/search`).
 * When the root is `:memory:` that string is returned as-is and nothing is
 * created on disk. Otherwise a relative root is resolved against the current
 * working directory, the directory is created (recursively) if missing, and
 * the returned path is `<dir>/<index>.sqlite`, with every character of the
 * index name outside `[a-zA-Z0-9_.-]` replaced by `_`.
 */
export function resolveIndexPath(config: EmbeddedConfig, index: string): string {
  const root = config.path ?? './storage/search'
  if (root === MEMORY) return MEMORY

  let dir: string
  if (isAbsolute(root)) {
    dir = root
  } else {
    dir = resolve(process.cwd(), root)
  }
  mkdirSync(dir, { recursive: true })

  // Keep the index name from introducing path separators or odd characters.
  const fileName = `${index.replace(/[^a-zA-Z0-9_.-]/g, '_')}.sqlite`
  return join(dir, fileName)
}

export const MEMORY_PATH = MEMORY
@@ -0,0 +1,34 @@
1
+ import type { DriverConfig } from '../../types.ts'
2
+
3
/** Preset typo-tolerance modes. */
export type TypoToleranceMode = 'off' | 'auto'

/** Fine-grained typo-tolerance options; omitted fields use the defaults. */
export interface TypoToleranceSettings {
  /**
   * Tokens shorter than this are never fuzzy-expanded.
   * Defaults to 4.
   */
  minTokenLength?: number
  /**
   * Largest Levenshtein distance that still counts as a match.
   * Defaults to 1; 2 is supported but slower.
   */
  maxDistance?: number
}

/** Configuration accepted by the embedded driver. */
export interface EmbeddedConfig extends DriverConfig {
  driver: string
  /**
   * Directory that holds one `.sqlite` file per index.
   * Use `:memory:` for tests.
   */
  path?: string
  /**
   * Value for SQLite's `synchronous` pragma.
   * Defaults to 'NORMAL' (crash-safe, sub-second write loss possible).
   */
  synchronous?: 'OFF' | 'NORMAL' | 'FULL'
  /**
   * Typo tolerance: `'off'` disables it, `'auto'` uses the defaults, and an
   * object allows fine-grained control.
   */
  typoTolerance?: TypoToleranceMode | TypoToleranceSettings
}

/** Typo-tolerance settings once defaults have been applied. */
export interface ResolvedTypoTolerance {
  enabled: boolean
  minTokenLength: number
  maxDistance: number
}

/** Internal shape of a row read from the documents table. */
export interface DocumentRow {
  rowid: number
  id: string
  doc: string
}
@@ -0,0 +1,116 @@
1
+ import type { PgIndexSettings } from '../types.ts'
2
+
3
/** Searchable column name used when no `searchableAttributes` are configured. */
export const DEFAULT_TEXT_COLUMN = '_text'

/** tsvector weight labels, in the order they are handed out to searchable attributes. */
const WEIGHT_TIERS = ['A', 'B', 'C', 'D'] as const
type WeightTier = (typeof WEIGHT_TIERS)[number]

/** Postgres column types a typed column may be declared with; `text` is the conservative default. */
type PgType =
  | 'text'
  | 'integer'
  | 'bigint'
  | 'double precision'
  | 'boolean'
  | 'timestamptz'

/** Description of one typed column derived from the JSONB document. */
export interface TypedColumnSpec {
  /** Attribute (and column) name. */
  name: string
  /** Declared Postgres type of the column. */
  pgType: PgType
  /** JSONB extraction expression: `(doc->>'name')::pgType` (cast suppressed for text). */
  expression: string
}
19
+
20
/**
 * The schema layout for one index: which document attributes feed which
 * tsvector segment + weight, and which typed columns exist for filter/sort.
 *
 * Mirrors `embedded/engine/field_registry.ts` so the two drivers project
 * documents identically. Differences:
 * - Per-attribute weight tier (A/B/C/D) is explicit.
 * - Typed columns are emitted as `GENERATED ALWAYS AS (...) STORED` SQL.
 *
 * All attribute lists are copied from the settings at construction time, so
 * later mutation of the caller's arrays does not affect the registry.
 */
export class FieldRegistry {
  /** Searchable attributes, or `[DEFAULT_TEXT_COLUMN]` when none are configured. */
  readonly searchable: string[]
  /** Weight tier per searchable attribute. */
  readonly weights: Map<string, WeightTier>
  /** Attributes that may be filtered on. */
  readonly filterable: string[]
  /** Attributes that may be sorted on. */
  readonly sortable: string[]
  /** One spec per distinct filterable/sortable attribute, filterable first. */
  readonly typedColumns: TypedColumnSpec[]
  /** Primary-key attribute name (default `id`). */
  readonly primaryKey: string
  /** Text-search language; the index setting wins over the constructor argument. */
  readonly language: string

  /**
   * @param settings Index settings; every part is optional.
   * @param language Fallback language when `settings.language` is unset.
   */
  constructor(settings?: PgIndexSettings, language = 'english') {
    this.primaryKey = settings?.primaryKey ?? 'id'
    this.language = settings?.language ?? language
    this.searchable =
      settings?.searchableAttributes && settings.searchableAttributes.length > 0
        ? [...settings.searchableAttributes]
        : [DEFAULT_TEXT_COLUMN]

    // An explicit weight wins; otherwise tiers follow declaration order, and
    // everything from the fourth attribute on shares the last tier.
    this.weights = new Map()
    for (let i = 0; i < this.searchable.length; i++) {
      const attr = this.searchable[i]!
      const tier = (settings?.weights?.[attr] ?? WEIGHT_TIERS[Math.min(i, 3)]) as WeightTier
      this.weights.set(attr, tier)
    }

    // Copy, like `searchable`, instead of aliasing the caller's arrays.
    this.filterable = [...(settings?.filterableAttributes ?? [])]
    this.sortable = [...(settings?.sortableAttributes ?? [])]

    // An attribute that is both filterable and sortable gets a single column.
    const seen = new Set<string>()
    const typed: TypedColumnSpec[] = []
    for (const attr of [...this.filterable, ...this.sortable]) {
      if (seen.has(attr)) continue
      seen.add(attr)
      typed.push({ name: attr, pgType: 'text', expression: `(doc->>${literal(attr)})` })
    }
    this.typedColumns = typed
  }

  /** True when the only searchable column is the synthetic default one. */
  get usesDefaultTextColumn(): boolean {
    return this.searchable.length === 1 && this.searchable[0] === DEFAULT_TEXT_COLUMN
  }

  /**
   * Project a document into [text, tier] pairs for tsvector construction.
   * Default mode collapses every string into one A-weighted blob.
   */
  projectFtsSegments(document: Record<string, unknown>): Array<{ text: string; tier: WeightTier }> {
    if (this.usesDefaultTextColumn) {
      return [{ text: collectStrings(document), tier: 'A' }]
    }
    return this.searchable.map(attr => ({
      text: coerceText(document[attr]),
      tier: this.weights.get(attr)!,
    }))
  }

  /** Single string spanning all searchable text (for terms-dict tokenization). */
  concatSearchableText(document: Record<string, unknown>): string {
    return this.projectFtsSegments(document)
      .map(s => s.text)
      .filter(Boolean)
      .join(' ')
  }
}
92
+
93
/** Wrap `value` in single quotes as a SQL string literal, doubling any embedded single quote. */
function literal(value: string): string {
  const escaped = value.split("'").join("''")
  return "'" + escaped + "'"
}
96
+
97
/**
 * Render an attribute value as searchable text.
 *
 * Strings pass through, numbers and booleans are stringified, arrays are
 * flattened recursively with their non-empty parts joined by a space.
 * Everything else (null, undefined, plain objects, …) becomes ''.
 */
function coerceText(value: unknown): string {
  switch (typeof value) {
    case 'string':
      return value
    case 'number':
    case 'boolean':
      return String(value)
    case 'object': {
      // typeof null is 'object' too; both null and non-array objects yield ''.
      if (!Array.isArray(value)) return ''
      const pieces: string[] = []
      for (const entry of value) {
        const rendered = coerceText(entry)
        if (rendered) pieces.push(rendered)
      }
      return pieces.join(' ')
    }
    default:
      return ''
  }
}
104
+
105
/**
 * Join every non-empty string found at the top level of `document`, plus
 * every non-empty string that sits directly inside a top-level array, using
 * single spaces. Deeper nesting and non-string values are ignored.
 */
function collectStrings(document: Record<string, unknown>): string {
  return Object.values(document)
    .flatMap(value => (Array.isArray(value) ? (value as unknown[]) : [value]))
    .filter((item): item is string => typeof item === 'string' && item.length > 0)
    .join(' ')
}
@@ -0,0 +1,105 @@
1
+ import { quoteLiteral } from '../storage/identifiers.ts'
2
+
3
/**
 * Outcome of analysing a user-facing query string for use with
 * `websearch_to_tsquery`.
 *
 * `websearch_to_tsquery` understands Google-style syntax by itself:
 * - `"foo bar"` — phrase
 * - `-foo` — exclude
 * - `OR`/`AND` — boolean (case-insensitive)
 *
 * Prefix matching (`foo*`) is not part of that syntax, so prefix tokens are
 * collected separately and emitted as an extra `to_tsquery('foo:*')` that is
 * ORed onto the result.
 */
export interface ParsedQuery {
  /** Query text handed to `websearch_to_tsquery`. */
  websearch: string
  /** Bare tokens (no quotes or operators) that typo expansion works on. */
  positiveTokens: string[]
  /** Stems taken from `foo*` syntax; each goes to its own `to_tsquery`. */
  prefixTokens: string[]
  /** True when the input contained nothing but whitespace. */
  isEmpty: boolean
}
25
+
26
const PHRASE_RE = /"([^"]*)"/g

/**
 * Analyse a user query.
 *
 * The trimmed input is returned unchanged as `websearch`. In addition the
 * words outside quoted phrases are classified:
 *
 * - words starting with `-` are exclusions and are ignored here — they must
 *   not become positive or prefix tokens, otherwise the excluded word (or
 *   its typo variants) would be ORed back into the final query;
 * - a leading `+` is dropped;
 * - `foo*` contributes the stem `foo` to `prefixTokens`;
 * - the keywords `AND` / `OR` (any case) are skipped;
 * - everything else is lowercased, stripped of characters other than
 *   letters, digits, `_` and `-`, and kept as a positive token when at
 *   least two characters remain.
 *
 * @param input Raw query string.
 * @returns The parsed query; `isEmpty` is true for blank input.
 */
export function parseQuery(input: string): ParsedQuery {
  const trimmed = input.trim()
  if (!trimmed) {
    return { websearch: '', positiveTokens: [], prefixTokens: [], isEmpty: true }
  }

  const normalise = (word: string): string =>
    word.toLowerCase().replace(/[^\p{L}\p{N}_-]/gu, '')

  const positiveTokens: string[] = []
  const prefixTokens: string[] = []

  // Strip phrases first so we don't tokenize their inner whitespace.
  const scratch = trimmed.replace(PHRASE_RE, ' ')
  for (const raw of scratch.split(/\s+/)) {
    if (!raw) continue

    // Exclusions are handled by websearch_to_tsquery alone.
    if (raw.startsWith('-')) continue

    const text = raw.startsWith('+') ? raw.slice(1) : raw

    if (text.endsWith('*')) {
      const stem = normalise(text.slice(0, -1))
      if (stem) prefixTokens.push(stem)
      continue
    }

    const upper = text.toUpperCase()
    if (upper === 'AND' || upper === 'OR') continue

    const norm = normalise(text)
    if (norm.length >= 2) positiveTokens.push(norm)
  }

  return { websearch: trimmed, positiveTokens, prefixTokens, isEmpty: false }
}
55
+
56
/**
 * Assemble the tsquery SQL expression for a parsed query.
 *
 * Up to three kinds of fragments are produced, in this order, and combined
 * with the tsquery `||` (OR) operator:
 *
 * 1. `websearch_to_tsquery` over the user's query text, when non-empty;
 * 2. one `to_tsquery` per prefix token, bound as `stem:*`;
 * 3. one `to_tsquery` per positive token that has typo expansions, bound as
 *    the sanitised candidates joined with ` | `.
 *
 * User text is always passed through numbered placeholders. The language is
 * embedded as a quoted literal cast to `regconfig`, since it is a per-index,
 * server-controlled value rather than user input.
 *
 * @param parsed     Output of `parseQuery`.
 * @param expansions Typo candidates keyed by positive token.
 * @param language   Text-search configuration name.
 * @param startAt    Number of placeholders the caller has already used; the
 *                   first placeholder emitted here is `$<startAt + 1>`.
 * @returns The SQL expression, its bound values, and how many placeholders
 *          were consumed. With no fragments: empty SQL, no params, count 0.
 */
export function buildTsqueryExpression(
  parsed: ParsedQuery,
  expansions: Map<string, string[]>,
  language: string,
  startAt = 0
): { sql: string; params: string[]; paramCount: number } {
  const lang = `${quoteLiteral(language)}::regconfig`
  const bound: string[] = []
  const pieces: string[] = []

  // Bind one value and emit the matching function call; the placeholder
  // index is derived from how many values have been bound so far.
  const emit = (fn: string, value: string): void => {
    bound.push(value)
    pieces.push(`${fn}(${lang}, $${startAt + bound.length})`)
  }

  if (parsed.websearch) emit('websearch_to_tsquery', parsed.websearch)

  parsed.prefixTokens.forEach(stem => emit('to_tsquery', `${stem}:*`))

  for (const token of parsed.positiveTokens) {
    const alternatives = expansions.get(token)
    if (!alternatives || alternatives.length === 0) continue
    const joined = alternatives.map(sanitiseTsTerm).filter(Boolean).join(' | ')
    if (joined) emit('to_tsquery', joined)
  }

  return pieces.length === 0
    ? { sql: '', params: [], paramCount: 0 }
    : { sql: pieces.join(' || '), params: bound, paramCount: bound.length }
}
101
+
102
/**
 * Reduce a term to what may safely appear in a hand-built tsquery: the
 * lowercased letters, digits, `_` and `-` of the input, in order.
 */
function sanitiseTsTerm(term: string): string {
  const kept = term.toLowerCase().match(/[\p{L}\p{N}_-]+/gu)
  return kept === null ? '' : kept.join('')
}
+ }