@strav/search 0.3.20 → 0.3.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +122 -3
- package/package.json +4 -4
- package/src/commands/search_optimize.ts +52 -0
- package/src/commands/search_rebuild.ts +73 -0
- package/src/drivers/embedded/embedded_driver.ts +136 -0
- package/src/drivers/embedded/engine/field_registry.ts +97 -0
- package/src/drivers/embedded/engine/fts_query_builder.ts +184 -0
- package/src/drivers/embedded/engine/query_compiler.ts +134 -0
- package/src/drivers/embedded/engine/schema.ts +99 -0
- package/src/drivers/embedded/engine/snippet_formatter.ts +29 -0
- package/src/drivers/embedded/engine/sqlite_engine.ts +255 -0
- package/src/drivers/embedded/engine/typo_expander.ts +138 -0
- package/src/drivers/embedded/errors.ts +15 -0
- package/src/drivers/embedded/filters/filter_compiler.ts +136 -0
- package/src/drivers/embedded/index.ts +3 -0
- package/src/drivers/embedded/storage/paths.ts +23 -0
- package/src/drivers/embedded/types.ts +34 -0
- package/src/drivers/postgres/engine/field_registry.ts +116 -0
- package/src/drivers/postgres/engine/fts_query_builder.ts +105 -0
- package/src/drivers/postgres/engine/pg_engine.ts +300 -0
- package/src/drivers/postgres/engine/query_compiler.ts +165 -0
- package/src/drivers/postgres/engine/schema.ts +187 -0
- package/src/drivers/postgres/engine/snippet_formatter.ts +31 -0
- package/src/drivers/postgres/engine/typo_expander.ts +131 -0
- package/src/drivers/postgres/errors.ts +33 -0
- package/src/drivers/postgres/filters/filter_compiler.ts +138 -0
- package/src/drivers/postgres/index.ts +14 -0
- package/src/drivers/postgres/postgres_fts_driver.ts +184 -0
- package/src/drivers/postgres/rebuild/rebuild_inplace.ts +113 -0
- package/src/drivers/postgres/storage/identifiers.ts +46 -0
- package/src/drivers/postgres/types.ts +53 -0
- package/src/index.ts +11 -0
- package/src/search_manager.ts +7 -0
- package/stubs/config/search.ts +25 -0
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
import type { SQL } from 'bun'
|
|
2
|
+
import type {
|
|
3
|
+
SearchDocument,
|
|
4
|
+
SearchOptions,
|
|
5
|
+
SearchResult,
|
|
6
|
+
SearchHit,
|
|
7
|
+
} from '../../../types.ts'
|
|
8
|
+
import type { PgIndexSettings, ResolvedTypoTolerance } from '../types.ts'
|
|
9
|
+
import { FieldRegistry } from './field_registry.ts'
|
|
10
|
+
import { ensureIndexTable, dropIndex as dropIndexSchema } from './schema.ts'
|
|
11
|
+
import { parseQuery, buildTsqueryExpression } from './fts_query_builder.ts'
|
|
12
|
+
import { compileSearch } from './query_compiler.ts'
|
|
13
|
+
import { formatSnippet } from './snippet_formatter.ts'
|
|
14
|
+
import {
|
|
15
|
+
expandTokens,
|
|
16
|
+
hasFuzzystrmatch,
|
|
17
|
+
recordTerms,
|
|
18
|
+
unrecordTerms,
|
|
19
|
+
} from './typo_expander.ts'
|
|
20
|
+
import {
|
|
21
|
+
indexTableName,
|
|
22
|
+
termsTableName,
|
|
23
|
+
quoteIdent,
|
|
24
|
+
quoteLiteral,
|
|
25
|
+
} from '../storage/identifiers.ts'
|
|
26
|
+
import { rebuildInPlace, type RebuildOptions } from '../rebuild/rebuild_inplace.ts'
|
|
27
|
+
|
|
28
|
+
/** Construction-time configuration for a PgEngine bound to one index. */
export interface PgEngineOptions {
  /** Bun SQL client used for every statement the engine issues. */
  sql: SQL
  /** Postgres schema that holds the index's tables. */
  schema: string
  /** Logical index name. */
  index: string
  /** Text-search language handed to the field registry. */
  language: string
  /** Resolved typo-tolerance settings; `enabled` gates term bookkeeping and query expansion. */
  typoTolerance: ResolvedTypoTolerance
  /** Forwarded to index-table creation (GIN `fastupdate` storage parameter). */
  ginFastUpdate: boolean
  /** When non-null, applied with `SET LOCAL work_mem` inside each search transaction. */
  workMem: string | null
  /** Optional per-index settings forwarded to the field registry. */
  settings?: PgIndexSettings
}
|
|
38
|
+
|
|
39
|
+
/** Postgres caps a tsvector (lexemes + positions) at just under 1 MB. Cap each input text below that to be safe. */
|
|
40
|
+
const MAX_TEXT_BYTES = 900_000
|
|
41
|
+
|
|
42
|
+
/**
 * One PgEngine wraps a single index: it lazily provisions the index's schema
 * objects, writes and deletes documents (keeping the typo-tolerance terms
 * dictionary in step when typo tolerance is enabled), and runs searches.
 */
export class PgEngine {
  readonly registry: FieldRegistry
  private readonly sql: SQL
  private readonly schema: string
  private readonly index: string
  private readonly typo: ResolvedTypoTolerance
  private readonly ginFastUpdate: boolean
  private readonly workMem: string | null
  private readonly tableName: string
  /** `null` until probed by `ensure()`; only probed when typo tolerance is enabled. */
  private fuzzyAvailable: boolean | null = null
  private ensured = false

  constructor(opts: PgEngineOptions) {
    this.sql = opts.sql
    this.schema = opts.schema
    this.index = opts.index
    this.typo = opts.typoTolerance
    this.ginFastUpdate = opts.ginFastUpdate
    this.workMem = opts.workMem
    this.registry = new FieldRegistry(opts.settings, opts.language)
    this.tableName = indexTableName(opts.schema, opts.index)
  }

  /** Lazy: ensure the table + indexes + trigger exist. Idempotent. */
  async ensure(): Promise<void> {
    if (this.ensured) return
    await ensureIndexTable(this.sql, this.schema, this.index, this.registry, this.ginFastUpdate)
    if (this.typo.enabled && this.fuzzyAvailable === null) {
      this.fuzzyAvailable = await hasFuzzystrmatch(this.sql)
    }
    this.ensured = true
  }

  // ── Writes ──────────────────────────────────────────────────────────────

  /**
   * Insert or replace a single document.
   *
   * The explicit `id` argument always wins over an `id` key that may be
   * present inside `document`.
   */
  async upsert(id: string | number, document: Record<string, unknown>): Promise<void> {
    await this.upsertMany([{ ...document, id }])
  }

  /**
   * Insert or replace a batch of documents inside one transaction.
   *
   * When typo tolerance is enabled, the terms of any previously stored
   * version are un-recorded before the new version's terms are recorded.
   */
  async upsertMany(documents: SearchDocument[]): Promise<void> {
    if (documents.length === 0) return
    await this.ensure()

    await this.sql.begin(async (tx: SQL) => {
      for (const raw of documents) {
        const { id, ...rest } = raw
        const fields = rest as Record<string, unknown>
        const idStr = String(id)
        // Bun's SQL treats stringified JSON as a JSONB string value (double-
        // encoding the JSON). Passing the object directly lets it generate
        // proper JSONB so `doc->>'field'` works for the typed generated cols.
        const doc = { id, ...fields }

        // The previous version is only needed for term bookkeeping, so skip
        // the extra round-trip entirely when typo tolerance is off.
        if (this.typo.enabled) {
          const oldRows = (await tx.unsafe(
            `SELECT doc FROM ${this.tableName} WHERE id = $1`,
            [idStr]
          )) as Array<{ doc: Record<string, unknown> | string }>
          const previous = oldRows[0]
          if (previous) {
            await unrecordTerms(tx, this.schema, this.index, this.termText(parseDoc(previous.doc)))
          }
        }

        const ftsExpr = this.buildFtsExpression(fields)
        const sqlStr =
          `INSERT INTO ${this.tableName} (id, doc, fts) VALUES ($1, $2, ${ftsExpr.sql}) ` +
          `ON CONFLICT (id) DO UPDATE SET doc = EXCLUDED.doc, fts = EXCLUDED.fts`
        await tx.unsafe(sqlStr, [idStr, doc as any, ...ftsExpr.params])

        if (this.typo.enabled) {
          await recordTerms(tx, this.schema, this.index, this.termText(fields))
        }
      }
    })
  }

  /** Delete a single document by id. */
  async delete(id: string | number): Promise<void> {
    await this.deleteMany([id])
  }

  /**
   * Delete a batch of documents inside one transaction, un-recording their
   * terms first when typo tolerance is enabled.
   */
  async deleteMany(ids: Array<string | number>): Promise<void> {
    if (ids.length === 0) return
    await this.ensure()

    await this.sql.begin(async (tx: SQL) => {
      const idStrs = ids.map(String)
      const placeholders = idStrs.map((_, i) => `$${i + 1}`).join(', ')

      if (this.typo.enabled) {
        const rows = (await tx.unsafe(
          `SELECT doc FROM ${this.tableName} WHERE id IN (${placeholders})`,
          idStrs
        )) as Array<{ doc: Record<string, unknown> | string }>
        for (const r of rows) {
          await unrecordTerms(tx, this.schema, this.index, this.termText(parseDoc(r.doc)))
        }
      }

      await tx.unsafe(
        `DELETE FROM ${this.tableName} WHERE id IN (${placeholders})`,
        idStrs
      )
    })
  }

  /** Remove every document (and, with typo tolerance, every recorded term). */
  async flush(): Promise<void> {
    await this.ensure()
    await this.sql.begin(async (tx: SQL) => {
      await tx.unsafe(`TRUNCATE ${this.tableName}`)
      if (this.typo.enabled) {
        await tx.unsafe(`TRUNCATE ${termsTableName(this.schema, this.index)}`)
      }
    })
  }

  /** Drop the index's schema objects; the next operation re-creates them via `ensure()`. */
  async drop(): Promise<void> {
    await dropIndexSchema(this.sql, this.schema, this.index)
    this.ensured = false
  }

  // ── Reads ───────────────────────────────────────────────────────────────

  /**
   * Run a search and return one page of hits plus the total match count.
   *
   * The hit query and the count query run in the same transaction so that an
   * optional `SET LOCAL work_mem` applies to both.
   */
  async search(query: string, options?: SearchOptions): Promise<SearchResult> {
    await this.ensure()
    const start = performance.now()
    const opts = options ?? {}
    const parsed = parseQuery(query)

    const expansions = await this.maybeExpand(parsed.positiveTokens)
    const tsquery = buildTsqueryExpression(parsed, expansions, this.registry.language)

    const compiled = compileSearch({
      registry: this.registry,
      schema: this.schema,
      index: this.index,
      tsquery: { sql: tsquery.sql, params: tsquery.params },
      search: opts,
    })

    const result = await this.sql.begin(async (tx: SQL) => {
      if (this.workMem) {
        await tx.unsafe(`SET LOCAL work_mem = ${quoteLiteral(this.workMem)}`)
      }
      const rows = (await tx.unsafe(compiled.sql, compiled.params)) as RawHitRow[]
      const totalRows = (await tx.unsafe(compiled.countSql, compiled.countParams)) as Array<{
        n: number
      }>
      return { rows, total: totalRows[0]?.n ?? rows.length }
    })

    const projection = opts.attributesToRetrieve
    const hits: SearchHit[] = result.rows.map(row =>
      projectHit(row, compiled.snippetColumns, projection)
    )

    return {
      hits,
      totalHits: result.total,
      page: Math.max(1, opts.page ?? 1),
      perPage: Math.max(1, opts.perPage ?? 20),
      processingTimeMs: Math.round(performance.now() - start),
    }
  }

  /** REINDEX the GIN index. Periodic maintenance for write-heavy indexes. */
  async optimize(): Promise<void> {
    await this.ensure()
    // NOTE(review): this name must match the GIN index created alongside the
    // index table; it assumes the bare table name is `search_<index>` — confirm.
    const ginName = `${quoteIdent(this.schema)}.${quoteIdent(`search_${this.index}_fts_gin`)}`
    await this.sql.unsafe(`REINDEX INDEX ${ginName}`)
  }

  /**
   * Recompute every row's `fts` using the current registry's language + weight
   * scheme. Auto-picks tier (in-place vs batched) by row count; throws on
   * tables larger than the supported tier-2 ceiling.
   */
  async rebuild(options?: RebuildOptions) {
    await this.ensure()
    return rebuildInPlace(this.sql, this.schema, this.index, this.registry, options)
  }

  // ── Internals ───────────────────────────────────────────────────────────

  /**
   * Text whose terms are recorded / un-recorded for a document.
   *
   * Both directions go through this helper so they always agree: the `id`
   * key is dropped (stored docs carry it, incoming field sets do not) and the
   * text is truncated identically.
   */
  private termText(document: Record<string, unknown>): string {
    const { id: _id, ...fields } = document
    return truncate(this.registry.concatSearchableText(fields))
  }

  /**
   * Build the SQL expression (and its bound texts) that computes `fts` for a
   * document. Placeholders start at `$3` because callers bind id and doc first.
   */
  private buildFtsExpression(document: Record<string, unknown>): {
    sql: string
    params: string[]
  } {
    const segments = this.registry.projectFtsSegments(document)
    const lang = `${quoteLiteral(this.registry.language)}::regconfig`
    const params: string[] = []
    const fragments = segments.map(seg => {
      params.push(truncate(seg.text))
      return `setweight(to_tsvector(${lang}, $${params.length + 2}), '${seg.tier}')`
    })
    // The `+2` above accounts for the leading id ($1) and doc ($2) bindings
    // that callers prepend. Caller MUST keep those positions stable.
    // With no segments, emit an empty tsvector rather than an empty (and
    // therefore syntactically invalid) expression.
    const sql = fragments.length > 0 ? fragments.join(' || ') : `''::tsvector`
    return { sql, params }
  }

  /** Expand query tokens with typo candidates; a no-op map when typo tolerance is off. */
  private async maybeExpand(tokens: string[]): Promise<Map<string, string[]>> {
    if (!this.typo.enabled || tokens.length === 0) return new Map()
    return expandTokens(
      this.sql,
      this.schema,
      this.index,
      tokens,
      this.typo,
      this.fuzzyAvailable === true
    )
  }
}
|
|
253
|
+
|
|
254
|
+
/** Shape of one row returned by the compiled hit query. */
interface RawHitRow {
  id: string
  /** Stored document; may arrive as a parsed object or as a JSON string. */
  doc: Record<string, unknown> | string
  score: number
  /** Headline columns are read from here under `__snip_<field>` keys. */
  [snippetCol: string]: unknown
}
|
|
260
|
+
|
|
261
|
+
/**
 * Turn a raw result row into a `SearchHit`.
 *
 * @param row - Row returned by the compiled hit query.
 * @param snippetCols - Field names whose `__snip_<field>` column should be read as a highlight.
 * @param attributesToRetrieve - When non-empty, only these own keys of the document are kept.
 * @returns The hit, with `highlights` set only if at least one snippet is non-empty.
 */
function projectHit(
  row: RawHitRow,
  snippetCols: string[],
  attributesToRetrieve: string[] | undefined
): SearchHit {
  const document = parseDoc(row.doc)

  let projected = document
  if (attributesToRetrieve && attributesToRetrieve.length > 0) {
    const out: Record<string, unknown> = {}
    for (const attr of attributesToRetrieve) {
      // Own-property check: `in` would also match inherited members such as
      // `constructor` or `toString` and leak them into the projection.
      if (Object.prototype.hasOwnProperty.call(document, attr)) out[attr] = document[attr]
    }
    projected = out
  }

  const hit: SearchHit = { document: projected }

  if (snippetCols.length > 0) {
    const highlights: Record<string, string> = {}
    for (const col of snippetCols) {
      const raw = row[`__snip_${col}`] as string | null | undefined
      if (raw) highlights[col] = formatSnippet(raw)
    }
    if (Object.keys(highlights).length > 0) hit.highlights = highlights
  }

  return hit
}
|
|
290
|
+
|
|
291
|
+
/** Return the document as an object, JSON-parsing it first when it arrives as a string. */
function parseDoc(doc: Record<string, unknown> | string): Record<string, unknown> {
  return typeof doc === 'string' ? (JSON.parse(doc) as Record<string, unknown>) : doc
}
|
|
295
|
+
|
|
296
|
+
/**
 * Cap `text` at `maxBytes` bytes of UTF-8.
 *
 * The cut is made on a character boundary, so the result never ends in a
 * partial multi-byte sequence and never exceeds the byte budget. (Slicing by
 * string length is not a safe substitute: one UTF-16 code unit can encode to
 * up to three UTF-8 bytes.)
 *
 * @param text - Input text.
 * @param maxBytes - Byte budget; defaults to `MAX_TEXT_BYTES`.
 * @returns `text` unchanged when it fits, otherwise its longest prefix that fits.
 */
function truncate(text: string, maxBytes: number = MAX_TEXT_BYTES): string {
  if (Buffer.byteLength(text, 'utf8') <= maxBytes) return text
  const bytes = Buffer.from(text, 'utf8')
  let end = Math.max(0, Math.floor(maxBytes))
  // `bytes[end]` is the first byte that would be dropped. While it is a UTF-8
  // continuation byte (10xxxxxx) the cut sits inside a character: back up.
  while (end > 0 && (bytes[end]! & 0xc0) === 0x80) end--
  return bytes.toString('utf8', 0, end)
}
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
import type { SearchOptions } from '../../../types.ts'
|
|
2
|
+
import type { FieldRegistry } from './field_registry.ts'
|
|
3
|
+
import { compileFilter } from '../filters/filter_compiler.ts'
|
|
4
|
+
import { quoteIdent, quoteLiteral, indexTableName } from '../storage/identifiers.ts'
|
|
5
|
+
|
|
6
|
+
/** Output of `compileSearch`: the hit query, the count query, and their bindings. */
export interface CompiledSearch {
  /** Main SELECT returning hits + score + snippets. */
  sql: string
  /** Bound parameters for the SELECT. */
  params: unknown[]
  /** COUNT(*) variant for totalHits (uses the same MATCH + filter, no rank/snippets). */
  countSql: string
  /** Bound parameters for the COUNT query. */
  countParams: unknown[]
  /** Field names for which a headline column (`__snip_<field>`) was requested from PG. */
  snippetColumns: string[]
}
|
|
17
|
+
|
|
18
|
+
const DEFAULT_HEADLINE_OPTIONS =
|
|
19
|
+
'StartSel=<mark>,StopSel=</mark>,MaxWords=35,MinWords=15,ShortWord=0,HighlightAll=false,MaxFragments=2'
|
|
20
|
+
|
|
21
|
+
/** ts_rank_cd normalization bitmask. 1 = divide by 1+log(doc length), 32 = rank/(rank+1). */
|
|
22
|
+
const DEFAULT_RANK_FLAGS = 1 | 32
|
|
23
|
+
|
|
24
|
+
/** Inputs to `compileSearch`. */
export interface QueryCompilerOptions {
  /** Supplies language plus the searchable / filterable / sortable field lists. */
  registry: FieldRegistry
  /** Postgres schema holding the index table. */
  schema: string
  /** Logical index name. */
  index: string
  /** Output of buildTsqueryExpression — already starts at placeholder 1. */
  tsquery: { sql: string; params: string[] }
  /** Caller-supplied search options (filter, sort, paging, highlights). */
  search: SearchOptions
}
|
|
32
|
+
|
|
33
|
+
/**
 * Compile a search request into two statements: a paged hit query and a
 * matching COUNT query.
 *
 * Placeholder layout of the hit query: tsquery params, then filter params,
 * then LIMIT and OFFSET. The count query binds only the first two groups.
 *
 * @param opts - Registry, target index, pre-built tsquery and search options.
 * @returns SQL text plus bindings for both statements, and the highlighted field names.
 */
export function compileSearch(opts: QueryCompilerOptions): CompiledSearch {
  const { registry, schema, index, tsquery, search } = opts
  const hasQuery = Boolean(tsquery.sql)
  const filterableSet = new Set(registry.filterable)
  const sortableSet = new Set(registry.sortable)

  // Filter placeholders continue numbering after the tsquery's own.
  const filter = compileFilter(search.filter, filterableSet, tsquery.params.length)
  const countParams = [...tsquery.params, ...filter.params]

  const conditions: string[] = []
  if (hasQuery) conditions.push(`fts @@ q.query`)
  if (filter.sql) conditions.push(filter.sql)
  const where = conditions.length === 0 ? '' : `WHERE ${conditions.join(' AND ')}`

  const orderBy = compileOrder(search.sort, sortableSet, !hasQuery)

  const perPage = Math.max(1, search.perPage ?? 20)
  const page = Math.max(1, search.page ?? 1)
  const limitPh = `$${countParams.length + 1}`
  const offsetPh = `$${countParams.length + 2}`
  const params: unknown[] = [...countParams, perPage, (page - 1) * perPage]

  const wantedHighlights = pickHighlightFields(search.attributesToHighlight, registry)
  const lang = `${quoteLiteral(registry.language)}::regconfig`
  const table = indexTableName(schema, index)

  // The ranked CTE: filter + order + LIMIT, returns top-K rows + score only.
  // ts_headline then runs only on that top-K slice in the outer SELECT.
  let cte: string
  if (hasQuery) {
    cte = `WITH q AS (SELECT (${tsquery.sql}) AS query),
ranked AS (
SELECT id, doc, ts_rank_cd(fts, q.query, ${DEFAULT_RANK_FLAGS}) AS score
FROM ${table}, q
${where}
${orderBy}
LIMIT ${limitPh} OFFSET ${offsetPh}
)`
  } else {
    cte = `WITH ranked AS (
SELECT id, doc, 0::real AS score
FROM ${table}
${where}
${orderBy}
LIMIT ${limitPh} OFFSET ${offsetPh}
)`
  }

  // Without a text query there is nothing to highlight against, so an empty
  // tsquery is passed to ts_headline.
  const headlineQuery = hasQuery ? '(SELECT query FROM q)' : `plainto_tsquery(${lang}, '')`
  const selectCols = ['id', 'doc', 'score']
  for (const field of wantedHighlights) {
    selectCols.push(
      `ts_headline(${lang}, coalesce(doc->>${quoteLiteral(field)}, ''), ` +
        `${headlineQuery}, ` +
        `${quoteLiteral(DEFAULT_HEADLINE_OPTIONS)}) AS ${quoteIdent(`__snip_${field}`)}`
    )
  }

  // Re-emit ORDER BY in the outer SELECT — Postgres doesn't preserve row
  // order across CTE boundaries.
  const outerOrderBy = compileOuterOrder(search.sort, sortableSet, !hasQuery)
  const sql = `${cte}
SELECT ${selectCols.join(', ')}
FROM ranked
${outerOrderBy}`

  // Count uses the MATCH + filter, but no rank/snippet/limit.
  const countSql = hasQuery
    ? `SELECT COUNT(*)::int AS n FROM ${table}, ` + `(SELECT (${tsquery.sql}) AS query) q ${where}`
    : `SELECT COUNT(*)::int AS n FROM ${table} ${where}`

  return { sql, params, countSql, countParams, snippetColumns: wantedHighlights }
}
|
|
111
|
+
|
|
112
|
+
/**
 * ORDER BY for the inner (`ranked`) query, which is the one LIMIT/OFFSET
 * applies to.
 *
 * `id ASC` is appended as a final tie-breaker wherever the primary ordering
 * is not already unique, so rows with equal sort keys (or equal scores)
 * cannot shuffle between pages.
 *
 * @param sort - Specs of the form `field` or `field:asc|desc`; anything other than `desc` sorts ascending.
 * @param sortableSet - Fields that may be sorted on.
 * @param matchAll - True when there is no text query, so there is no score to rank by.
 * @returns A complete `ORDER BY …` clause.
 * @throws Error when a requested field is not sortable.
 */
function compileOrder(
  sort: string[] | undefined,
  sortableSet: ReadonlySet<string>,
  matchAll: boolean
): string {
  if (sort && sort.length > 0) {
    const parts: string[] = []
    for (const spec of sort) {
      const [field, dirRaw] = spec.split(':') as [string, string | undefined]
      if (!field || !sortableSet.has(field)) {
        throw new Error(
          `Field "${field}" is not in sortableAttributes. Add it to the index settings before sorting on it.`
        )
      }
      const dir = dirRaw?.toLowerCase() === 'desc' ? 'DESC' : 'ASC'
      parts.push(`${quoteIdent(field)} ${dir}`)
    }
    parts.push('id ASC')
    return `ORDER BY ${parts.join(', ')}`
  }
  if (matchAll) return 'ORDER BY id ASC'
  return 'ORDER BY score DESC, id ASC'
}
|
|
134
|
+
|
|
135
|
+
/**
 * ORDER BY for the outer SELECT — references columns visible on `ranked`.
 *
 * `id ASC` is appended as a final tie-breaker wherever the primary ordering
 * is not already unique, so rows with equal keys come back in a
 * deterministic order within the page.
 *
 * @param sort - Specs of the form `field` or `field:asc|desc`; non-sortable fields are skipped here.
 * @param sortableSet - Fields that may be sorted on.
 * @param matchAll - True when there is no text query, so there is no score to rank by.
 * @returns An `ORDER BY …` clause, or an empty string when `sort` contains no usable field.
 */
function compileOuterOrder(
  sort: string[] | undefined,
  sortableSet: ReadonlySet<string>,
  matchAll: boolean
): string {
  if (sort && sort.length > 0) {
    // The CTE only exposes id, doc, score — sortable columns aren't in scope,
    // so we sort by `doc->>'field'` lexically. Same lex semantics as the
    // typed generated columns (which are TEXT) used inside the CTE.
    const parts: string[] = []
    for (const spec of sort) {
      const [field, dirRaw] = spec.split(':') as [string, string | undefined]
      if (!field || !sortableSet.has(field)) continue
      const dir = dirRaw?.toLowerCase() === 'desc' ? 'DESC' : 'ASC'
      parts.push(`(doc->>${quoteLiteral(field)}) ${dir}`)
    }
    if (parts.length === 0) return ''
    parts.push('id ASC')
    return `ORDER BY ${parts.join(', ')}`
  }
  if (matchAll) return 'ORDER BY id ASC'
  return 'ORDER BY score DESC, id ASC'
}
|
|
157
|
+
|
|
158
|
+
/**
 * Decide which requested fields get a highlight column.
 *
 * Nothing is highlighted when the registry uses the default text column or
 * when no fields were requested; otherwise only requested fields that are
 * searchable are kept, in request order.
 */
function pickHighlightFields(
  requested: string[] | undefined,
  registry: FieldRegistry
): string[] {
  if (registry.usesDefaultTextColumn || !requested || requested.length === 0) return []
  const picked: string[] = []
  for (const field of requested) {
    if (registry.searchable.includes(field)) picked.push(field)
  }
  return picked
}
|
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
import type { SQL } from 'bun'
|
|
2
|
+
import { quoteIdent, quoteLiteral, indexTableName, termsTableName, metaTableName, bareIndexTable, bareTermsTable } from '../storage/identifiers.ts'
|
|
3
|
+
import { MissingExtensionError } from '../errors.ts'
|
|
4
|
+
import type { FieldRegistry } from './field_registry.ts'
|
|
5
|
+
import type { ResolvedTypoTolerance } from '../types.ts'
|
|
6
|
+
|
|
7
|
+
const SCHEMA_VERSION = 1
|
|
8
|
+
|
|
9
|
+
/**
 * Idempotent: ensures the search schema and the shared `_meta` table exist,
 * and — when typo tolerance is enabled — the extensions it relies on.
 *
 * @param sql - Bun SQL client.
 * @param schema - Schema to create if missing.
 * @param typo - Resolved typo-tolerance settings; extensions are only touched when `enabled`.
 * @throws MissingExtensionError when `pg_trgm` cannot be created.
 */
export async function ensureSchemaAndExtensions(
  sql: SQL,
  schema: string,
  typo: ResolvedTypoTolerance
): Promise<void> {
  const createSchema = `CREATE SCHEMA IF NOT EXISTS ${quoteIdent(schema)}`
  await sql.unsafe(createSchema)

  const createMeta = `
    CREATE TABLE IF NOT EXISTS ${metaTableName(schema)} (
      index_name TEXT NOT NULL,
      key TEXT NOT NULL,
      value TEXT NOT NULL,
      PRIMARY KEY (index_name, key)
    )
  `
  await sql.unsafe(createMeta)

  if (!typo.enabled) return

  // pg_trgm is mandatory for typo tolerance: surface its absence explicitly.
  try {
    await sql.unsafe('CREATE EXTENSION IF NOT EXISTS pg_trgm')
  } catch {
    throw new MissingExtensionError('pg_trgm')
  }

  // fuzzystrmatch is optional — used to re-rank trigram candidates with a
  // bounded Levenshtein. If absent we silently fall back to trigram-only.
  try {
    await sql.unsafe('CREATE EXTENSION IF NOT EXISTS fuzzystrmatch')
  } catch {
    // ignore
  }
}
|
|
44
|
+
|
|
45
|
+
/**
 * Create the per-index table, its GIN index, one index per typed column, the
 * fts trigger, the terms dictionary and the `_meta` rows — unless the index
 * table already exists, in which case nothing is done.
 *
 * @param sql - Bun SQL client.
 * @param schema - Schema holding the index.
 * @param index - Logical index name.
 * @param registry - Supplies typed columns, language and field lists.
 * @param ginFastUpdate - Value for the GIN index's `fastupdate` storage parameter.
 * @returns true if the table was newly created, false if it already existed.
 */
export async function ensureIndexTable(
  sql: SQL,
  schema: string,
  index: string,
  registry: FieldRegistry,
  ginFastUpdate: boolean
): Promise<boolean> {
  const bareTable = bareIndexTable(index)
  if (await tableExists(sql, schema, bareTable)) return false

  // Each typed column is a STORED generated TEXT column appended to the DDL.
  let typedColsDdl = ''
  for (const c of registry.typedColumns) {
    typedColsDdl += `, ${quoteIdent(c.name)} TEXT GENERATED ALWAYS AS (${c.expression}) STORED`
  }

  const table = indexTableName(schema, index)
  await sql.unsafe(`
    CREATE TABLE ${table} (
      id TEXT PRIMARY KEY,
      doc JSONB NOT NULL,
      fts tsvector NOT NULL DEFAULT ''::tsvector${typedColsDdl}
    )
  `)

  const fastUpdate = ginFastUpdate ? 'on' : 'off'
  await sql.unsafe(
    `CREATE INDEX ${quoteIdent(`${bareTable}_fts_gin`)} ` +
      `ON ${table} USING gin(fts) ` +
      `WITH (fastupdate = ${fastUpdate})`
  )

  for (const col of registry.typedColumns) {
    const indexName = quoteIdent(`${bareTable}_${col.name}_idx`)
    await sql.unsafe(`CREATE INDEX ${indexName} ` + `ON ${table}(${quoteIdent(col.name)})`)
  }

  // Belt-and-suspenders: if anyone INSERTs without computing fts, recompute it
  // from doc using the current language + weight scheme. The driver always
  // sets fts itself; the trigger only fires when the caller didn't.
  await ensureFtsTrigger(sql, schema, index, registry)

  await ensureTermsDict(sql, schema, index)

  // Seed the index's metadata; rows that already exist are left untouched.
  const metaValues = [
    index,
    String(SCHEMA_VERSION),
    registry.language,
    JSON.stringify(registry.searchable),
    JSON.stringify(registry.filterable),
    JSON.stringify(registry.sortable),
  ]
  await sql.unsafe(
    `INSERT INTO ${metaTableName(schema)} (index_name, key, value) VALUES ` +
      `($1, 'schema_version', $2), ($1, 'language', $3), ($1, 'searchable', $4), ` +
      `($1, 'filterable', $5), ($1, 'sortable', $6) ` +
      `ON CONFLICT (index_name, key) DO NOTHING`,
    metaValues
  )

  return true
}
|
|
108
|
+
|
|
109
|
+
/**
 * Create the index's terms dictionary table plus its trigram and length
 * indexes, unless the table already exists.
 */
export async function ensureTermsDict(sql: SQL, schema: string, index: string): Promise<void> {
  const bareTable = bareTermsTable(index)
  if (await tableExists(sql, schema, bareTable)) return

  const table = termsTableName(schema, index)
  await sql.unsafe(`
    CREATE TABLE ${table} (
      term TEXT PRIMARY KEY,
      doc_freq INTEGER NOT NULL DEFAULT 0
    )
  `)

  // Trigram GIN index over the terms.
  const trgmIndex = quoteIdent(`${bareTable}_trgm`)
  await sql.unsafe(`CREATE INDEX ${trgmIndex} ` + `ON ${table} USING gin (term gin_trgm_ops)`)

  // Expression index on term length.
  const lenIndex = quoteIdent(`${bareTable}_len`)
  await sql.unsafe(`CREATE INDEX ${lenIndex} ` + `ON ${table} (length(term))`)
}
|
|
128
|
+
|
|
129
|
+
/**
 * Install the trigger function and the BEFORE INSERT OR UPDATE trigger that
 * fill in `fts` from `doc` whenever a row arrives with a NULL or empty `fts`.
 *
 * The function is created with CREATE OR REPLACE; the trigger itself with a
 * plain CREATE TRIGGER.
 */
async function ensureFtsTrigger(
  sql: SQL,
  schema: string,
  index: string,
  registry: FieldRegistry
): Promise<void> {
  const bareTable = bareIndexTable(index)
  const fnName = `${bareTable}_fts_trigger`
  const lang = quoteLiteral(registry.language)

  // Default mode weights the whole document 'A'; otherwise each searchable
  // attribute contributes its own weighted tsvector, concatenated with `||`.
  let segments: string
  if (registry.usesDefaultTextColumn) {
    segments = `setweight(to_tsvector(${lang}::regconfig, ${defaultTextProjection()}), 'A')`
  } else {
    const perAttribute: string[] = []
    for (const attr of registry.searchable) {
      const weight = registry.weights.get(attr)!
      perAttribute.push(
        `setweight(to_tsvector(${lang}::regconfig, coalesce(NEW.doc->>${quoteLiteral(attr)}, '')), '${weight}')`
      )
    }
    segments = perAttribute.join(' || ')
  }

  const qualifiedFn = `${quoteIdent(schema)}.${quoteIdent(fnName)}`
  await sql.unsafe(`
    CREATE OR REPLACE FUNCTION ${qualifiedFn}() RETURNS trigger AS $$
    BEGIN
      IF NEW.fts IS NULL OR NEW.fts = ''::tsvector THEN
        NEW.fts := ${segments};
      END IF;
      RETURN NEW;
    END;
    $$ LANGUAGE plpgsql
  `)

  await sql.unsafe(`
    CREATE TRIGGER ${quoteIdent(`${bareTable}_fts_trg`)}
    BEFORE INSERT OR UPDATE ON ${indexTableName(schema, index)}
    FOR EACH ROW EXECUTE FUNCTION ${qualifiedFn}()
  `)
}
|
|
163
|
+
|
|
164
|
+
/**
 * SQL sub-select used by the trigger fallback in default mode (no
 * searchableAttributes): joins every top-level value of `NEW.doc`, rendered
 * as text, with single spaces; yields '' when there is nothing to join.
 */
function defaultTextProjection(): string {
  const aggregated = `coalesce(string_agg(value, ' '), '')`
  return `(SELECT ${aggregated} FROM jsonb_each_text(NEW.doc))`
}
|
|
169
|
+
|
|
170
|
+
/**
 * Check the catalog for an ordinary table (`relkind = 'r'`) with the given
 * name in the given schema.
 */
async function tableExists(sql: SQL, schema: string, table: string): Promise<boolean> {
  const lookup =
    `SELECT 1 AS present FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace ` +
    `WHERE n.nspname = $1 AND c.relname = $2 AND c.relkind = 'r' LIMIT 1`
  const found = (await sql.unsafe(lookup, [schema, table])) as Array<Record<string, unknown>>
  return found.length !== 0
}
|
|
178
|
+
|
|
179
|
+
/**
 * Drop a single index's tables and trigger function, then delete its rows
 * from the shared `_meta` table. Every DROP uses IF EXISTS, so repeating the
 * call is harmless.
 */
export async function dropIndex(sql: SQL, schema: string, index: string): Promise<void> {
  const docsTable = indexTableName(schema, index)
  await sql.unsafe(`DROP TABLE IF EXISTS ${docsTable} CASCADE`)

  const termsTable = termsTableName(schema, index)
  await sql.unsafe(`DROP TABLE IF EXISTS ${termsTable} CASCADE`)

  const triggerFn = `${quoteIdent(schema)}.${quoteIdent(`${bareIndexTable(index)}_fts_trigger`)}`
  await sql.unsafe(`DROP FUNCTION IF EXISTS ${triggerFn}() CASCADE`)

  await sql.unsafe(`DELETE FROM ${metaTableName(schema)} WHERE index_name = $1`, [index])
}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
 * `ts_headline` returns text with literal `<mark>` / `</mark>` markers around
 * matched terms. The surrounding text comes from the source document — which
 * may itself contain HTML the caller didn't escape. We HTML-escape the
 * snippet, then restore the marker tags.
 */
const OPEN_TAG = '<mark>'
const CLOSE_TAG = '</mark>'
/** Control character that delimits the placeholders; stripped from input first so it cannot collide. */
const SENTINEL = '\u0001'
const OPEN_PLACEHOLDER = `${SENTINEL}STRAV_OPEN${SENTINEL}`
const CLOSE_PLACEHOLDER = `${SENTINEL}STRAV_CLOSE${SENTINEL}`

/** Replace every occurrence of a literal substring (no regex semantics). */
function replaceEvery(text: string, search: string, replacement: string): string {
  return text.split(search).join(replacement)
}

/**
 * Make a `ts_headline` snippet safe to embed in HTML while keeping its
 * `<mark>` highlight tags.
 *
 * @param snippet - Raw headline text; null, undefined and '' all yield ''.
 * @returns The snippet with `&`, `<`, `>`, `"` and `'` escaped everywhere except in the marker tags.
 */
export function formatSnippet(snippet: string | null | undefined): string {
  if (!snippet) return ''
  // Drop stray sentinel characters so document text can never forge a
  // placeholder, swap the marker tags for placeholders, escape, swap back.
  let text = replaceEvery(snippet, SENTINEL, '')
  text = replaceEvery(text, OPEN_TAG, OPEN_PLACEHOLDER)
  text = replaceEvery(text, CLOSE_TAG, CLOSE_PLACEHOLDER)
  text = escapeHtml(text)
  text = replaceEvery(text, OPEN_PLACEHOLDER, OPEN_TAG)
  return replaceEvery(text, CLOSE_PLACEHOLDER, CLOSE_TAG)
}

/** Entity for each character that is unsafe in HTML text or attribute values. */
const HTML_ESCAPES: Record<string, string> = {
  '&': '&amp;',
  '<': '&lt;',
  '>': '&gt;',
  '"': '&quot;',
  "'": '&#39;',
}

/** Escape `&`, `<`, `>`, `"` and `'` in a single pass. */
function escapeHtml(input: string): string {
  return input.replace(/[&<>"']/g, ch => HTML_ESCAPES[ch] ?? ch)
}
|