@strav/search 0.3.20 → 0.3.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34):
  1. package/README.md +122 -3
  2. package/package.json +4 -4
  3. package/src/commands/search_optimize.ts +52 -0
  4. package/src/commands/search_rebuild.ts +73 -0
  5. package/src/drivers/embedded/embedded_driver.ts +136 -0
  6. package/src/drivers/embedded/engine/field_registry.ts +97 -0
  7. package/src/drivers/embedded/engine/fts_query_builder.ts +184 -0
  8. package/src/drivers/embedded/engine/query_compiler.ts +134 -0
  9. package/src/drivers/embedded/engine/schema.ts +99 -0
  10. package/src/drivers/embedded/engine/snippet_formatter.ts +29 -0
  11. package/src/drivers/embedded/engine/sqlite_engine.ts +255 -0
  12. package/src/drivers/embedded/engine/typo_expander.ts +138 -0
  13. package/src/drivers/embedded/errors.ts +15 -0
  14. package/src/drivers/embedded/filters/filter_compiler.ts +136 -0
  15. package/src/drivers/embedded/index.ts +3 -0
  16. package/src/drivers/embedded/storage/paths.ts +23 -0
  17. package/src/drivers/embedded/types.ts +34 -0
  18. package/src/drivers/postgres/engine/field_registry.ts +116 -0
  19. package/src/drivers/postgres/engine/fts_query_builder.ts +105 -0
  20. package/src/drivers/postgres/engine/pg_engine.ts +300 -0
  21. package/src/drivers/postgres/engine/query_compiler.ts +165 -0
  22. package/src/drivers/postgres/engine/schema.ts +187 -0
  23. package/src/drivers/postgres/engine/snippet_formatter.ts +31 -0
  24. package/src/drivers/postgres/engine/typo_expander.ts +131 -0
  25. package/src/drivers/postgres/errors.ts +33 -0
  26. package/src/drivers/postgres/filters/filter_compiler.ts +138 -0
  27. package/src/drivers/postgres/index.ts +14 -0
  28. package/src/drivers/postgres/postgres_fts_driver.ts +184 -0
  29. package/src/drivers/postgres/rebuild/rebuild_inplace.ts +113 -0
  30. package/src/drivers/postgres/storage/identifiers.ts +46 -0
  31. package/src/drivers/postgres/types.ts +53 -0
  32. package/src/index.ts +11 -0
  33. package/src/search_manager.ts +7 -0
  34. package/stubs/config/search.ts +25 -0
@@ -0,0 +1,300 @@
1
+ import type { SQL } from 'bun'
2
+ import type {
3
+ SearchDocument,
4
+ SearchOptions,
5
+ SearchResult,
6
+ SearchHit,
7
+ } from '../../../types.ts'
8
+ import type { PgIndexSettings, ResolvedTypoTolerance } from '../types.ts'
9
+ import { FieldRegistry } from './field_registry.ts'
10
+ import { ensureIndexTable, dropIndex as dropIndexSchema } from './schema.ts'
11
+ import { parseQuery, buildTsqueryExpression } from './fts_query_builder.ts'
12
+ import { compileSearch } from './query_compiler.ts'
13
+ import { formatSnippet } from './snippet_formatter.ts'
14
+ import {
15
+ expandTokens,
16
+ hasFuzzystrmatch,
17
+ recordTerms,
18
+ unrecordTerms,
19
+ } from './typo_expander.ts'
20
+ import {
21
+ indexTableName,
22
+ termsTableName,
23
+ quoteIdent,
24
+ quoteLiteral,
25
+ } from '../storage/identifiers.ts'
26
+ import { rebuildInPlace, type RebuildOptions } from '../rebuild/rebuild_inplace.ts'
27
+
28
+ export interface PgEngineOptions {
29
+ sql: SQL
30
+ schema: string
31
+ index: string
32
+ language: string
33
+ typoTolerance: ResolvedTypoTolerance
34
+ ginFastUpdate: boolean
35
+ workMem: string | null
36
+ settings?: PgIndexSettings
37
+ }
38
+
39
+ /** Byte cap applied to text before it is sent to to_tsvector. NOTE(review): Postgres limits a tsvector to under 1 MB (lexemes + positions) — confirm whether it errors or truncates. */
40
+ const MAX_TEXT_BYTES = 900_000
41
+
42
/**
 * Search engine bound to a single Postgres-backed index: one PgEngine wraps
 * one index table.
 *
 * Each write call runs in one transaction and keeps three things in step:
 * the stored JSONB document, its `fts` tsvector, and — when typo tolerance is
 * enabled — the terms dictionary maintained via `recordTerms` /
 * `unrecordTerms`.
 */
export class PgEngine {
  readonly registry: FieldRegistry
  private readonly sql: SQL
  private readonly schema: string
  private readonly index: string
  private readonly typo: ResolvedTypoTolerance
  private readonly ginFastUpdate: boolean
  private readonly workMem: string | null
  private readonly tableName: string
  /** `null` until `ensure()` has probed for the fuzzystrmatch extension. */
  private fuzzyAvailable: boolean | null = null
  private ensured = false

  constructor(opts: PgEngineOptions) {
    this.sql = opts.sql
    this.schema = opts.schema
    this.index = opts.index
    this.typo = opts.typoTolerance
    this.ginFastUpdate = opts.ginFastUpdate
    this.workMem = opts.workMem
    this.registry = new FieldRegistry(opts.settings, opts.language)
    this.tableName = indexTableName(opts.schema, opts.index)
  }

  /** Lazy: ensure the table + indexes + trigger exist. Idempotent. */
  async ensure(): Promise<void> {
    if (this.ensured) return
    await ensureIndexTable(this.sql, this.schema, this.index, this.registry, this.ginFastUpdate)
    if (this.typo.enabled && this.fuzzyAvailable === null) {
      this.fuzzyAvailable = await hasFuzzystrmatch(this.sql)
    }
    this.ensured = true
  }

  // ── Writes ──────────────────────────────────────────────────────────────

  /** Insert or replace a single document. Delegates to `upsertMany`. */
  async upsert(id: string | number, document: Record<string, unknown>): Promise<void> {
    await this.upsertMany([{ id, ...document }])
  }

  /**
   * Insert or replace documents, all inside one transaction.
   *
   * With typo tolerance enabled, the terms of a previously stored version are
   * unrecorded before the new version's terms are recorded.
   */
  async upsertMany(documents: SearchDocument[]): Promise<void> {
    if (documents.length === 0) return
    await this.ensure()

    await this.sql.begin(async (tx: SQL) => {
      for (const raw of documents) {
        const { id, ...rest } = raw
        const fields = rest as Record<string, unknown>
        const idStr = String(id)
        // Bun's SQL treats stringified JSON as a JSONB string value (double-
        // encoding the JSON). Passing the object directly lets it generate
        // proper JSONB so `doc->>'field'` works for the typed generated cols.
        const doc = { id, ...fields }

        // The previous version is only needed to unrecord its terms, so skip
        // the lookup entirely when typo tolerance is off.
        if (this.typo.enabled) {
          const oldRows = (await tx.unsafe(
            `SELECT doc FROM ${this.tableName} WHERE id = $1`,
            [idStr]
          )) as Array<{ doc: Record<string, unknown> | string }>
          const previous = oldRows[0]
          if (previous) {
            await unrecordTerms(tx, this.schema, this.index, this.recordedTextOfStored(previous.doc))
          }
        }

        const ftsExpr = this.buildFtsExpression(fields)
        const sqlStr =
          `INSERT INTO ${this.tableName} (id, doc, fts) VALUES ($1, $2, ${ftsExpr.sql}) ` +
          `ON CONFLICT (id) DO UPDATE SET doc = EXCLUDED.doc, fts = EXCLUDED.fts`
        await tx.unsafe(sqlStr, [idStr, doc as any, ...ftsExpr.params])

        if (this.typo.enabled) {
          await recordTerms(tx, this.schema, this.index, this.recordedText(fields))
        }
      }
    })
  }

  /** Delete a single document. Delegates to `deleteMany`. */
  async delete(id: string | number): Promise<void> {
    await this.deleteMany([id])
  }

  /**
   * Delete documents by id inside one transaction, unrecording their terms
   * first when typo tolerance is enabled.
   */
  async deleteMany(ids: Array<string | number>): Promise<void> {
    if (ids.length === 0) return
    await this.ensure()

    await this.sql.begin(async (tx: SQL) => {
      const idStrs = ids.map(String)
      const placeholders = idStrs.map((_, i) => `$${i + 1}`).join(', ')

      if (this.typo.enabled) {
        const rows = (await tx.unsafe(
          `SELECT doc FROM ${this.tableName} WHERE id IN (${placeholders})`,
          idStrs
        )) as Array<{ doc: Record<string, unknown> | string }>
        for (const row of rows) {
          await unrecordTerms(tx, this.schema, this.index, this.recordedTextOfStored(row.doc))
        }
      }

      await tx.unsafe(
        `DELETE FROM ${this.tableName} WHERE id IN (${placeholders})`,
        idStrs
      )
    })
  }

  /** Remove every document (and, with typo tolerance, every recorded term). */
  async flush(): Promise<void> {
    await this.ensure()
    await this.sql.begin(async (tx: SQL) => {
      await tx.unsafe(`TRUNCATE ${this.tableName}`)
      if (this.typo.enabled) {
        await tx.unsafe(`TRUNCATE ${termsTableName(this.schema, this.index)}`)
      }
    })
  }

  /** Drop the index's database objects; the next operation re-creates them via `ensure()`. */
  async drop(): Promise<void> {
    await dropIndexSchema(this.sql, this.schema, this.index)
    this.ensured = false
  }

  // ── Reads ───────────────────────────────────────────────────────────────

  /**
   * Run a search and return one page of hits plus the total match count.
   *
   * The hits query and the count query run in the same transaction so that an
   * optional `SET LOCAL work_mem` applies to both and to nothing else.
   */
  async search(query: string, options?: SearchOptions): Promise<SearchResult> {
    await this.ensure()
    const start = performance.now()
    const opts = options ?? {}
    const parsed = parseQuery(query)

    const expansions = await this.maybeExpand(parsed.positiveTokens)
    const tsquery = buildTsqueryExpression(parsed, expansions, this.registry.language)

    const compiled = compileSearch({
      registry: this.registry,
      schema: this.schema,
      index: this.index,
      tsquery: { sql: tsquery.sql, params: tsquery.params },
      search: opts,
    })

    const result = await this.sql.begin(async (tx: SQL) => {
      if (this.workMem) {
        await tx.unsafe(`SET LOCAL work_mem = ${quoteLiteral(this.workMem)}`)
      }
      const rows = (await tx.unsafe(compiled.sql, compiled.params)) as RawHitRow[]
      const totalRows = (await tx.unsafe(compiled.countSql, compiled.countParams)) as Array<{
        n: number
      }>
      return { rows, total: totalRows[0]?.n ?? rows.length }
    })

    const projection = opts.attributesToRetrieve
    const hits: SearchHit[] = result.rows.map(row =>
      projectHit(row, compiled.snippetColumns, projection)
    )

    return {
      hits,
      totalHits: result.total,
      page: Math.max(1, opts.page ?? 1),
      perPage: Math.max(1, opts.perPage ?? 20),
      processingTimeMs: Math.round(performance.now() - start),
    }
  }

  /** REINDEX the GIN index. Periodic maintenance for write-heavy indexes. */
  async optimize(): Promise<void> {
    await this.ensure()
    // NOTE(review): the index is created as `${bareIndexTable(index)}_fts_gin`
    // in schema.ts; this assumes bareIndexTable(index) is `search_<index>` — confirm.
    const ginName = `${quoteIdent(this.schema)}.${quoteIdent(`search_${this.index}_fts_gin`)}`
    await this.sql.unsafe(`REINDEX INDEX ${ginName}`)
  }

  /**
   * Recompute every row's `fts` using the current registry's language + weight
   * scheme. Delegates to `rebuildInPlace`; tier selection and size limits are
   * implemented there (not visible from this class).
   */
  async rebuild(options?: RebuildOptions) {
    await this.ensure()
    return rebuildInPlace(this.sql, this.schema, this.index, this.registry, options)
  }

  // ── Internals ───────────────────────────────────────────────────────────

  /**
   * The exact text whose terms are recorded for a document's fields (the
   * document without its `id`). Record and unrecord must both go through this
   * so the terms dictionary stays balanced.
   */
  private recordedText(fields: Record<string, unknown>): string {
    return truncate(this.registry.concatSearchableText(fields))
  }

  /**
   * Same as `recordedText`, starting from a stored `doc` value. Stored docs
   * carry `id` alongside the fields, so it is stripped first to mirror what
   * the write path recorded.
   */
  private recordedTextOfStored(stored: Record<string, unknown> | string): string {
    const { id: _storedId, ...fields } = parseDoc(stored)
    return this.recordedText(fields)
  }

  /**
   * Build the SQL expression (and its bound texts) that computes `fts` for a
   * document: one weighted `to_tsvector` per registry segment, concatenated.
   */
  private buildFtsExpression(document: Record<string, unknown>): {
    sql: string
    params: string[]
  } {
    const segments = this.registry.projectFtsSegments(document)
    // No segments would leave an empty expression and invalid SQL
    // (`VALUES ($1, $2, )`); store an explicit empty tsvector instead.
    if (segments.length === 0) return { sql: `''::tsvector`, params: [] }

    const lang = `${quoteLiteral(this.registry.language)}::regconfig`
    const params: string[] = []
    const fragments = segments.map(seg => {
      params.push(truncate(seg.text))
      // The `+ 2` accounts for the leading id ($1) and doc ($2) bindings that
      // callers prepend. Caller MUST keep those positions stable.
      return `setweight(to_tsvector(${lang}, $${params.length + 2}), '${seg.tier}')`
    })
    return { sql: fragments.join(' || '), params }
  }

  /** Typo-expand the positive query tokens; empty map when disabled or no tokens. */
  private async maybeExpand(tokens: string[]): Promise<Map<string, string[]>> {
    if (!this.typo.enabled || tokens.length === 0) return new Map()
    return expandTokens(
      this.sql,
      this.schema,
      this.index,
      tokens,
      this.typo,
      this.fuzzyAvailable === true
    )
  }
}
253
+
254
+ interface RawHitRow {
255
+ id: string
256
+ doc: Record<string, unknown> | string
257
+ score: number
258
+ [snippetCol: string]: unknown
259
+ }
260
+
261
/**
 * Turn a raw result row into a SearchHit.
 *
 * @param row                  Row from the compiled search query.
 * @param snippetCols          Field names whose `__snip_<field>` columns were selected.
 * @param attributesToRetrieve Optional whitelist of document keys to return;
 *                             when empty or undefined the whole document is returned.
 * @returns The hit, with `highlights` set only if at least one snippet is non-empty.
 */
function projectHit(
  row: RawHitRow,
  snippetCols: string[],
  attributesToRetrieve: string[] | undefined
): SearchHit {
  const document = parseDoc(row.doc)

  let projected = document
  if (attributesToRetrieve && attributesToRetrieve.length > 0) {
    // Own-property check: `in` would also match inherited members such as
    // `toString` / `constructor`. Object.fromEntries defines plain own
    // properties, so a requested `__proto__` key cannot hit the prototype setter.
    projected = Object.fromEntries(
      attributesToRetrieve
        .filter(attr => Object.prototype.hasOwnProperty.call(document, attr))
        .map(attr => [attr, document[attr]] as [string, unknown])
    )
  }

  const hit: SearchHit = { document: projected }

  if (snippetCols.length > 0) {
    const highlights: Record<string, string> = {}
    for (const col of snippetCols) {
      const raw = row[`__snip_${col}`] as string | null | undefined
      if (raw) highlights[col] = formatSnippet(raw)
    }
    if (Object.keys(highlights).length > 0) hit.highlights = highlights
  }

  return hit
}
290
+
291
+ function parseDoc(doc: Record<string, unknown> | string): Record<string, unknown> {
292
+ if (typeof doc === 'string') return JSON.parse(doc) as Record<string, unknown>
293
+ return doc
294
+ }
295
+
296
/**
 * Cap `text` at MAX_TEXT_BYTES of UTF-8.
 *
 * The cut is made on a character boundary, so the result is always valid
 * UTF-8 and never longer than the cap. Slicing by string length is not
 * enough: `String.prototype.slice` counts UTF-16 code units, and multi-byte
 * text can be several times larger in bytes than in code units.
 *
 * @param text Input text.
 * @returns `text` unchanged when it already fits, otherwise its longest prefix
 *          that encodes to at most MAX_TEXT_BYTES bytes.
 */
function truncate(text: string): string {
  if (Buffer.byteLength(text, 'utf8') <= MAX_TEXT_BYTES) return text
  const bytes = Buffer.from(text, 'utf8')
  let end = MAX_TEXT_BYTES
  // `bytes[end]` is the first byte dropped. If it is a UTF-8 continuation
  // byte (10xxxxxx) the cut would split a character, so back up to its start.
  while (end > 0 && ((bytes[end] ?? 0) & 0xc0) === 0x80) end--
  return bytes.subarray(0, end).toString('utf8')
}
@@ -0,0 +1,165 @@
1
+ import type { SearchOptions } from '../../../types.ts'
2
+ import type { FieldRegistry } from './field_registry.ts'
3
+ import { compileFilter } from '../filters/filter_compiler.ts'
4
+ import { quoteIdent, quoteLiteral, indexTableName } from '../storage/identifiers.ts'
5
+
6
+ export interface CompiledSearch {
7
+ /** Main SELECT returning hits + score + snippets. */
8
+ sql: string
9
+ /** Bound parameters for the SELECT. */
10
+ params: unknown[]
11
+ /** COUNT(*) variant for totalHits (uses the same MATCH + filter, no rank/snippets). */
12
+ countSql: string
13
+ countParams: unknown[]
14
+ /** Field names for which a ts_headline column was requested; each comes back as `__snip_<field>`. */
15
+ snippetColumns: string[]
16
+ }
17
+
18
+ const DEFAULT_HEADLINE_OPTIONS =
19
+ 'StartSel=<mark>,StopSel=</mark>,MaxWords=35,MinWords=15,ShortWord=0,HighlightAll=false,MaxFragments=2'
20
+
21
+ /** ts_rank_cd normalization bitmask. 1 = divide by 1+log(doc length), 32 = rank/(rank+1). */
22
+ const DEFAULT_RANK_FLAGS = 1 | 32
23
+
24
+ export interface QueryCompilerOptions {
25
+ registry: FieldRegistry
26
+ schema: string
27
+ index: string
28
+ /** Output of buildTsqueryExpression — already starts at placeholder 1. */
29
+ tsquery: { sql: string; params: string[] }
30
+ search: SearchOptions
31
+ }
32
+
33
/**
 * Compile a search request into the paged hits query and its COUNT(*) companion.
 *
 * Parameter layout of the hits query: the tsquery's parameters first, then the
 * filter's, then LIMIT and OFFSET. The count query binds only the tsquery and
 * filter parameters.
 *
 * `page` and `perPage` are normalized to positive integers (defaults 1 and 20)
 * so that fractional, NaN or infinite values never reach LIMIT/OFFSET.
 *
 * @throws Error from `compileOrder` when a sort field is not sortable.
 */
export function compileSearch(opts: QueryCompilerOptions): CompiledSearch {
  const { registry, schema, index, tsquery, search } = opts
  const filterableSet = new Set(registry.filterable)
  const sortableSet = new Set(registry.sortable)
  const hasQuery = Boolean(tsquery.sql)
  const table = indexTableName(schema, index)

  // Third argument: number of parameters already taken by the tsquery
  // (presumably the filter's placeholder offset — see compileFilter).
  const filter = compileFilter(search.filter, filterableSet, tsquery.params.length)
  const params: unknown[] = [...tsquery.params, ...filter.params]

  const whereParts: string[] = []
  if (hasQuery) whereParts.push(`fts @@ q.query`)
  if (filter.sql) whereParts.push(filter.sql)
  const where = whereParts.length > 0 ? `WHERE ${whereParts.join(' AND ')}` : ''

  const orderBy = compileOrder(search.sort, sortableSet, !hasQuery)

  const perPage = toPositiveInt(search.perPage, 20)
  const page = toPositiveInt(search.page, 1)
  const offset = (page - 1) * perPage

  const limitPh = `$${params.length + 1}`
  const offsetPh = `$${params.length + 2}`
  params.push(perPage, offset)

  const wantedHighlights = pickHighlightFields(search.attributesToHighlight, registry)
  const lang = `${quoteLiteral(registry.language)}::regconfig`

  // The ranked CTE: filter + order + LIMIT, returns top-K rows + score only.
  // ts_headline runs only on this top-K slice (huge perf win — ts_headline
  // re-tokenizes raw text per row).
  const cte = hasQuery
    ? `WITH q AS (SELECT (${tsquery.sql}) AS query),
ranked AS (
  SELECT id, doc, ts_rank_cd(fts, q.query, ${DEFAULT_RANK_FLAGS}) AS score
  FROM ${table}, q
  ${where}
  ${orderBy}
  LIMIT ${limitPh} OFFSET ${offsetPh}
)`
    : `WITH ranked AS (
  SELECT id, doc, 0::real AS score
  FROM ${table}
  ${where}
  ${orderBy}
  LIMIT ${limitPh} OFFSET ${offsetPh}
)`

  // Without a query there is nothing to highlight; an empty tsquery keeps the
  // ts_headline call well-formed.
  const headlineQuery = hasQuery ? '(SELECT query FROM q)' : 'plainto_tsquery(' + lang + ", '')"
  const snippetCols = wantedHighlights.map(field => {
    return `ts_headline(${lang}, coalesce(doc->>${quoteLiteral(field)}, ''), ` +
      `${headlineQuery}, ` +
      `${quoteLiteral(DEFAULT_HEADLINE_OPTIONS)}) AS ${quoteIdent(`__snip_${field}`)}`
  })

  const selectCols = ['id', 'doc', 'score', ...snippetCols]

  // Re-emit ORDER BY in the outer SELECT — Postgres doesn't preserve row
  // order across CTE boundaries.
  const outerOrderBy = compileOuterOrder(search.sort, sortableSet, !hasQuery)
  const sql = `${cte}
SELECT ${selectCols.join(', ')}
FROM ranked
${outerOrderBy}`

  // Count uses the MATCH + filter, but no rank/snippet/limit.
  const countSql = hasQuery
    ? `SELECT COUNT(*)::int AS n FROM ${table}, ` +
      `(SELECT (${tsquery.sql}) AS query) q ${where}`
    : `SELECT COUNT(*)::int AS n FROM ${table} ${where}`

  const countParams = [...tsquery.params, ...filter.params]

  return {
    sql,
    params,
    countSql,
    countParams,
    snippetColumns: wantedHighlights,
  }
}

/** Coerce a paging value to an integer >= 1; missing or non-finite values yield `fallback`. */
function toPositiveInt(value: number | undefined, fallback: number): number {
  if (value == null || !Number.isFinite(value)) return fallback
  return Math.max(1, Math.floor(value))
}
111
+
112
/**
 * ORDER BY clause for the inner (ranked) query.
 *
 * Every ordering that is not already unique ends with `id ASC`. Without a
 * unique tiebreaker, rows with equal sort keys or equal scores may come back
 * in a different order on each execution, so LIMIT/OFFSET pages can overlap
 * or skip rows.
 *
 * @param sort        Sort specs of the form `field` or `field:desc` (direction
 *                    is case-insensitive; anything other than `desc` means ASC).
 * @param sortableSet Fields that may be sorted on.
 * @param matchAll    True when there is no text query (no score to rank by).
 * @throws Error when a sort field is empty or not in `sortableSet`.
 */
function compileOrder(
  sort: string[] | undefined,
  sortableSet: ReadonlySet<string>,
  matchAll: boolean
): string {
  if (sort && sort.length > 0) {
    const parts = sort.map(spec => {
      const [field, dirRaw] = spec.split(':') as [string, string | undefined]
      if (!field || !sortableSet.has(field)) {
        throw new Error(
          `Field "${field}" is not in sortableAttributes. Add it to the index settings before sorting on it.`
        )
      }
      const dir = dirRaw?.toLowerCase() === 'desc' ? 'DESC' : 'ASC'
      return `${quoteIdent(field)} ${dir}`
    })
    parts.push('id ASC')
    return `ORDER BY ${parts.join(', ')}`
  }
  if (matchAll) return 'ORDER BY id ASC'
  return 'ORDER BY score DESC, id ASC'
}
134
+
135
/**
 * ORDER BY for the outer SELECT — references columns visible on `ranked`.
 *
 * Non-unique orderings end with `id ASC` so the order of rows within a page
 * is deterministic when sort keys or scores tie.
 *
 * Unlike the inner ordering, unknown or empty sort fields are skipped rather
 * than rejected; if none remain the result is an empty string.
 */
function compileOuterOrder(
  sort: string[] | undefined,
  sortableSet: ReadonlySet<string>,
  matchAll: boolean
): string {
  if (sort && sort.length > 0) {
    // The CTE only exposes id, doc, score — sortable columns aren't in scope,
    // so we sort by `doc->>'field'` lexically. Same lex semantics as the
    // typed generated columns (which are TEXT) used inside the CTE.
    const parts: string[] = []
    for (const spec of sort) {
      const [field, dirRaw] = spec.split(':') as [string, string | undefined]
      if (!field || !sortableSet.has(field)) continue
      const dir = dirRaw?.toLowerCase() === 'desc' ? 'DESC' : 'ASC'
      parts.push(`(doc->>${quoteLiteral(field)}) ${dir}`)
    }
    if (parts.length === 0) return ''
    parts.push('id ASC')
    return `ORDER BY ${parts.join(', ')}`
  }
  if (matchAll) return 'ORDER BY id ASC'
  return 'ORDER BY score DESC, id ASC'
}
157
+
158
/**
 * Choose which requested fields get a highlight snippet.
 *
 * Returns the requested fields that are searchable, in first-seen order and
 * without duplicates (each entry becomes one `ts_headline` call per row, so a
 * repeated field would only repeat work). Returns an empty list when the
 * registry uses the default text column or nothing was requested.
 */
function pickHighlightFields(
  requested: string[] | undefined,
  registry: FieldRegistry
): string[] {
  if (registry.usesDefaultTextColumn) return []
  if (!requested || requested.length === 0) return []
  const searchable = new Set(registry.searchable)
  return [...new Set(requested)].filter(field => searchable.has(field))
}
@@ -0,0 +1,187 @@
1
+ import type { SQL } from 'bun'
2
+ import { quoteIdent, quoteLiteral, indexTableName, termsTableName, metaTableName, bareIndexTable, bareTermsTable } from '../storage/identifiers.ts'
3
+ import { MissingExtensionError } from '../errors.ts'
4
+ import type { FieldRegistry } from './field_registry.ts'
5
+ import type { ResolvedTypoTolerance } from '../types.ts'
6
+
7
+ const SCHEMA_VERSION = 1
8
+
9
+ /**
10
+ * Idempotent: ensures the search schema, the shared `_meta` table, and the
11
+ * required extensions exist. Called once per driver instantiation.
12
+ */
13
+ export async function ensureSchemaAndExtensions(
14
+ sql: SQL,
15
+ schema: string,
16
+ typo: ResolvedTypoTolerance
17
+ ): Promise<void> {
18
+ await sql.unsafe(`CREATE SCHEMA IF NOT EXISTS ${quoteIdent(schema)}`)
19
+
20
+ await sql.unsafe(`
21
+ CREATE TABLE IF NOT EXISTS ${metaTableName(schema)} (
22
+ index_name TEXT NOT NULL,
23
+ key TEXT NOT NULL,
24
+ value TEXT NOT NULL,
25
+ PRIMARY KEY (index_name, key)
26
+ )
27
+ `)
28
+
29
+ if (typo.enabled) {
30
+ try {
31
+ await sql.unsafe('CREATE EXTENSION IF NOT EXISTS pg_trgm')
32
+ } catch {
33
+ throw new MissingExtensionError('pg_trgm')
34
+ }
35
+ // fuzzystrmatch is optional — used to re-rank trigram candidates with a
36
+ // bounded Levenshtein. If absent we silently fall back to trigram-only.
37
+ try {
38
+ await sql.unsafe('CREATE EXTENSION IF NOT EXISTS fuzzystrmatch')
39
+ } catch {
40
+ // ignore
41
+ }
42
+ }
43
+ }
44
+
45
/**
 * Create the per-index table, its GIN index, one index per typed column, the
 * FTS trigger, the terms dictionary and the `_meta` rows — or do nothing when
 * the index table already exists.
 *
 * Everything runs in a single transaction. The existence check on the index
 * table is the only guard against re-running, so a failure part-way through
 * must not leave the table behind: otherwise later calls would return early
 * and the trigger, terms table and meta rows would never be created.
 *
 * @returns true if the table was newly created, false if it already existed.
 */
export async function ensureIndexTable(
  sql: SQL,
  schema: string,
  index: string,
  registry: FieldRegistry,
  ginFastUpdate: boolean
): Promise<boolean> {
  return sql.begin(async (tx: SQL) => {
    if (await tableExists(tx, schema, bareIndexTable(index))) return false

    const typedColsDdl = registry.typedColumns
      .map(c => `, ${quoteIdent(c.name)} TEXT GENERATED ALWAYS AS (${c.expression}) STORED`)
      .join('')

    await tx.unsafe(`
      CREATE TABLE ${indexTableName(schema, index)} (
        id TEXT PRIMARY KEY,
        doc JSONB NOT NULL,
        fts tsvector NOT NULL DEFAULT ''::tsvector${typedColsDdl}
      )
    `)

    await tx.unsafe(
      `CREATE INDEX ${quoteIdent(`${bareIndexTable(index)}_fts_gin`)} ` +
        `ON ${indexTableName(schema, index)} USING gin(fts) ` +
        `WITH (fastupdate = ${ginFastUpdate ? 'on' : 'off'})`
    )

    for (const col of registry.typedColumns) {
      await tx.unsafe(
        `CREATE INDEX ${quoteIdent(`${bareIndexTable(index)}_${col.name}_idx`)} ` +
          `ON ${indexTableName(schema, index)}(${quoteIdent(col.name)})`
      )
    }

    // Belt-and-suspenders: if anyone INSERTs without computing fts, recompute it
    // from doc using the current language + weight scheme. The driver always
    // sets fts itself; the trigger only fires when the caller didn't.
    await ensureFtsTrigger(tx, schema, index, registry)

    await ensureTermsDict(tx, schema, index)

    await tx.unsafe(
      `INSERT INTO ${metaTableName(schema)} (index_name, key, value) VALUES ` +
        `($1, 'schema_version', $2), ($1, 'language', $3), ($1, 'searchable', $4), ` +
        `($1, 'filterable', $5), ($1, 'sortable', $6) ` +
        `ON CONFLICT (index_name, key) DO NOTHING`,
      [
        index,
        String(SCHEMA_VERSION),
        registry.language,
        JSON.stringify(registry.searchable),
        JSON.stringify(registry.filterable),
        JSON.stringify(registry.sortable),
      ]
    )

    return true
  })
}
108
+
109
+ export async function ensureTermsDict(sql: SQL, schema: string, index: string): Promise<void> {
110
+ const exists = await tableExists(sql, schema, bareTermsTable(index))
111
+ if (exists) return
112
+
113
+ await sql.unsafe(`
114
+ CREATE TABLE ${termsTableName(schema, index)} (
115
+ term TEXT PRIMARY KEY,
116
+ doc_freq INTEGER NOT NULL DEFAULT 0
117
+ )
118
+ `)
119
+ await sql.unsafe(
120
+ `CREATE INDEX ${quoteIdent(`${bareTermsTable(index)}_trgm`)} ` +
121
+ `ON ${termsTableName(schema, index)} USING gin (term gin_trgm_ops)`
122
+ )
123
+ await sql.unsafe(
124
+ `CREATE INDEX ${quoteIdent(`${bareTermsTable(index)}_len`)} ` +
125
+ `ON ${termsTableName(schema, index)} (length(term))`
126
+ )
127
+ }
128
+
129
+ async function ensureFtsTrigger(
130
+ sql: SQL,
131
+ schema: string,
132
+ index: string,
133
+ registry: FieldRegistry
134
+ ): Promise<void> {
135
+ const fnName = `${bareIndexTable(index)}_fts_trigger`
136
+ const lang = quoteLiteral(registry.language)
137
+ const segments = registry.usesDefaultTextColumn
138
+ ? `setweight(to_tsvector(${lang}::regconfig, ${defaultTextProjection()}), 'A')`
139
+ : registry.searchable
140
+ .map(attr => {
141
+ const weight = registry.weights.get(attr)!
142
+ return `setweight(to_tsvector(${lang}::regconfig, coalesce(NEW.doc->>${quoteLiteral(attr)}, '')), '${weight}')`
143
+ })
144
+ .join(' || ')
145
+
146
+ await sql.unsafe(`
147
+ CREATE OR REPLACE FUNCTION ${quoteIdent(schema)}.${quoteIdent(fnName)}() RETURNS trigger AS $$
148
+ BEGIN
149
+ IF NEW.fts IS NULL OR NEW.fts = ''::tsvector THEN
150
+ NEW.fts := ${segments};
151
+ END IF;
152
+ RETURN NEW;
153
+ END;
154
+ $$ LANGUAGE plpgsql
155
+ `)
156
+
157
+ await sql.unsafe(`
158
+ CREATE TRIGGER ${quoteIdent(`${bareIndexTable(index)}_fts_trg`)}
159
+ BEFORE INSERT OR UPDATE ON ${indexTableName(schema, index)}
160
+ FOR EACH ROW EXECUTE FUNCTION ${quoteIdent(schema)}.${quoteIdent(fnName)}()
161
+ `)
162
+ }
163
+
164
+ function defaultTextProjection(): string {
165
+ // For default mode (no searchableAttributes) the trigger fallback joins the text form
166
+ // of every top-level value in NEW.doc (jsonb_each_text is not limited to strings) — TODO confirm this matches the field_registry default.
167
+ return `(SELECT coalesce(string_agg(value, ' '), '') FROM jsonb_each_text(NEW.doc))`
168
+ }
169
+
170
+ async function tableExists(sql: SQL, schema: string, table: string): Promise<boolean> {
171
+ const rows = (await sql.unsafe(
172
+ `SELECT 1 AS present FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace ` +
173
+ `WHERE n.nspname = $1 AND c.relname = $2 AND c.relkind = 'r' LIMIT 1`,
174
+ [schema, table]
175
+ )) as Array<Record<string, unknown>>
176
+ return rows.length > 0
177
+ }
178
+
179
+ /** Drop a single index's tables and trigger function. Idempotent. */
180
+ export async function dropIndex(sql: SQL, schema: string, index: string): Promise<void> {
181
+ await sql.unsafe(`DROP TABLE IF EXISTS ${indexTableName(schema, index)} CASCADE`)
182
+ await sql.unsafe(`DROP TABLE IF EXISTS ${termsTableName(schema, index)} CASCADE`)
183
+ await sql.unsafe(
184
+ `DROP FUNCTION IF EXISTS ${quoteIdent(schema)}.${quoteIdent(`${bareIndexTable(index)}_fts_trigger`)}() CASCADE`
185
+ )
186
+ await sql.unsafe(`DELETE FROM ${metaTableName(schema)} WHERE index_name = $1`, [index])
187
+ }
@@ -0,0 +1,31 @@
1
/**
 * `ts_headline` returns text with literal `<mark>` / `</mark>` markers around
 * matched terms. The surrounding text comes from the source document — which
 * may itself contain HTML the caller didn't escape. We HTML-escape the
 * snippet, then restore the marker tags.
 *
 * The markers are protected during escaping by swapping them for control
 * characters. Those characters are stripped from the input first, so nothing
 * in the source text can impersonate a marker placeholder.
 *
 * Known limitation: a literal `<mark>` written in the source document cannot
 * be told apart from a marker emitted by `ts_headline` and is kept as a tag.
 */
const OPEN_TAG = '<mark>'
const CLOSE_TAG = '</mark>'
const OPEN_PLACEHOLDER = '\u0001'
const CLOSE_PLACEHOLDER = '\u0002'
/** Matches any placeholder character already present in the input. */
const STRAY_PLACEHOLDERS = /[\u0001\u0002]/g

/**
 * Escape a `ts_headline` snippet for safe HTML output while keeping its
 * `<mark>` highlight tags.
 *
 * @param snippet Raw snippet text, or null/undefined when there is none.
 * @returns The escaped snippet; an empty string for null, undefined or ''.
 */
export function formatSnippet(snippet: string | null | undefined): string {
  if (!snippet) return ''
  const swapped = snippet
    .replace(STRAY_PLACEHOLDERS, '')
    .split(OPEN_TAG)
    .join(OPEN_PLACEHOLDER)
    .split(CLOSE_TAG)
    .join(CLOSE_PLACEHOLDER)
  return escapeHtml(swapped)
    .split(OPEN_PLACEHOLDER)
    .join(OPEN_TAG)
    .split(CLOSE_PLACEHOLDER)
    .join(CLOSE_TAG)
}

/** Escape the five HTML-significant characters (`&` first, so entities are not double-escaped). */
function escapeHtml(input: string): string {
  return input
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;')
}