@strav/search 0.3.21 → 0.3.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -51,7 +51,8 @@ await search.delete('posts', ['1'])
51
51
 
52
52
  ## Drivers
53
53
 
54
- - **Embedded** — in-process SQLite FTS5, zero deps, recommended for self-host / SMB
54
+ - **Embedded** — in-process SQLite FTS5, zero deps, recommended for self-host / SMB (~50k–500k docs)
55
+ - **Postgres FTS** — tsvector + GIN + pg_trgm, drop-in upgrade for higher volume (1M–100M docs)
55
56
  - **Meilisearch** — fast, typo-tolerant, self-hosted
56
57
  - **Typesense** — open-source, instant search
57
58
  - **Algolia** — hosted search-as-a-service
@@ -91,6 +92,52 @@ embedded: {
91
92
 
92
93
  Select it as the default with `SEARCH_DRIVER=embedded`.
93
94
 
95
+ ### Postgres FTS driver
96
+
97
+ Higher-volume tier (1M–100M docs per index) backed by your existing Postgres. Same `SearchEngine` interface as the embedded driver — drop-in swap by changing one config line.
98
+
99
+ Features:
100
+
101
+ - BM25-shaped ranking via `ts_rank_cd(fts, q, 1 | 32)` with per-field weights (`A`/`B`/`C`/`D`)
102
+ - `websearch_to_tsquery` Google-style queries plus prefix (`type*`)
103
+ - Multi-language stemming via Postgres text-search configurations (`english`, `french`, ...) — set per index
104
+ - Levenshtein-near typo tolerance via `pg_trgm` + optional `fuzzystrmatch`
105
+ - `<mark>`-highlighted snippets via `ts_headline`, computed only on the top-K to keep latency bounded
106
+ - Object-form filters with `eq`/`neq`/`gt`/`gte`/`lt`/`lte`/`in`/`nin` against generated typed columns
107
+ - One table per index in a dedicated `strav_search` schema (auto-created)
108
+
109
+ Requirements:
110
+
111
+ - Postgres ≥ 15
112
+ - `pg_trgm` extension (auto-`CREATE EXTENSION IF NOT EXISTS` on first use; superuser or owner privilege)
113
+ - `fuzzystrmatch` is optional — if present, typo expansion re-ranks trigram candidates with bounded Levenshtein for higher precision
114
+
115
+ Configuration:
116
+
117
+ ```ts
118
+ postgres: {
119
+ driver: 'postgres-fts',
120
+ // Optional: pass a Bun SQL handle. Falls back to @strav/database's Database.raw.
121
+ // connection: db.sql,
122
+ schema: env('SEARCH_PG_SCHEMA', 'strav_search'),
123
+ language: env('SEARCH_PG_LANGUAGE', 'english'),
124
+ typoTolerance: env('SEARCH_TYPO_TOLERANCE', 'auto'),
125
+ workMem: env('SEARCH_PG_WORK_MEM', '64MB'),
126
+ gin: { fastupdate: false }, // better tail latency
127
+ }
128
+ ```
129
+
130
+ Select it with `SEARCH_DRIVER=postgres`.
131
+
132
+ Limitations for v1:
133
+
134
+ - A settings change (e.g. adding a new searchable attribute) requires `bun strav search:rebuild <model>`. The rebuild tier is picked by row count: in-place UPDATE under 100k rows, batched UPDATE up to 10M rows. The dual-table swap needed above 10M rows is deferred to v1.1; until then the command fails with a clear error for tables that large.
135
+ - Adding a new `filterableAttribute` on an existing large table currently rewrites the whole heap (`ALTER TABLE ADD COLUMN ... GENERATED ... STORED`). Plan an offline window for big tables in v1.
136
+ - One language per index — mixed-locale indexes deferred.
137
+ - Object-form filters only; raw SQL filter strings rejected.
138
+
139
+ Ranking note: `ts_rank_cd` is BM25-*shaped* (length normalisation + bounded mapping), not strict BM25. For the size and shape of corpora the driver targets, the difference is small in practice; the embedded driver remains the answer when strict BM25 matters and the corpus fits.
140
+
94
141
  Model example with per-field weights (column order determines BM25 weight — title first = highest):
95
142
 
96
143
  ```ts
@@ -132,6 +179,7 @@ You run `bun strav search:import Ticket` once to populate the index, then model
132
179
  bun strav search:import <model> # Import all records for a model
133
180
  bun strav search:flush <model> # Flush all documents from an index
134
181
  bun strav search:optimize <model> # (embedded) Merge FTS5 segments; run periodically
182
+ bun strav search:rebuild <model> # (postgres) Recompute fts after settings change
135
183
  ```
136
184
 
137
185
  ## Documentation
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@strav/search",
3
- "version": "0.3.21",
3
+ "version": "0.3.23",
4
4
  "type": "module",
5
5
  "description": "Full-text search for the Strav framework",
6
6
  "license": "MIT",
@@ -18,9 +18,9 @@
18
18
  "tsconfig.json"
19
19
  ],
20
20
  "peerDependencies": {
21
- "@strav/kernel": "0.3.21",
22
- "@strav/database": "0.3.21",
23
- "@strav/cli": "0.3.21"
21
+ "@strav/kernel": "0.3.23",
22
+ "@strav/database": "0.3.23",
23
+ "@strav/cli": "0.3.23"
24
24
  },
25
25
  "scripts": {
26
26
  "test": "bun test tests/",
@@ -0,0 +1,73 @@
1
+ import type { Command } from 'commander'
2
+ import chalk from 'chalk'
3
+ import { bootstrap, shutdown } from '@strav/cli'
4
+ import { BaseModel } from '@strav/database'
5
+ import SearchManager from '../search_manager.ts'
6
+ import { PostgresFtsDriver } from '../drivers/postgres/index.ts'
7
+
8
/**
 * Register the `search:rebuild <model>` command on the CLI program.
 *
 * The command imports the model module from a path relative to the current
 * working directory, checks that it exposes `searchableAs()`, verifies that
 * the active engine is a `PostgresFtsDriver`, and then asks the engine to
 * rebuild the index while printing progress to stdout.
 *
 * Failures set `process.exitCode = 1` and return instead of calling
 * `process.exit(1)`: exiting from inside the `try`/`catch` would skip the
 * `finally` block and leave the database handle open.
 *
 * @param program - The commander program the command is attached to.
 */
export function register(program: Command): void {
  program
    .command('search:rebuild <model>')
    .description("Recompute a model's fts column in place (postgres-fts driver only)")
    .option('--no-reindex', 'Skip the GIN REINDEX after the rebuild')
    .option('--pause <ms>', 'Pause between batches in tier-2 mode (default 50)', '50')
    .action(async (modelPath: string, options: { reindex: boolean; pause: string }) => {
      // Report a failure and mark the process as failed without terminating
      // it, so that cleanup in `finally` still runs.
      const fail = (message: string): void => {
        console.error(chalk.red(message))
        process.exitCode = 1
      }

      // Validate before touching the database: `Number('abc')` is NaN, which
      // would otherwise be forwarded to the engine as the batch pause.
      const pauseMs = Number(options.pause)
      if (!Number.isFinite(pauseMs) || pauseMs < 0) {
        fail(
          `Invalid --pause value "${options.pause}": expected a non-negative number of milliseconds.`
        )
        return
      }

      let db: Awaited<ReturnType<typeof bootstrap>>['db'] | undefined
      try {
        const { db: database, config } = await bootstrap()
        db = database

        // Constructed only for their side effects.
        // NOTE(review): presumably these register the connection and the
        // search config globally — confirm against BaseModel / SearchManager.
        new BaseModel(db)
        new SearchManager(config)

        const resolved = require.resolve(`${process.cwd()}/${modelPath}`)
        const module = await import(resolved)
        // Prefer the default export; otherwise fall back to the first export.
        const ModelClass = module.default ?? (Object.values(module)[0] as any)

        if (typeof ModelClass?.searchableAs !== 'function') {
          fail(`Model "${modelPath}" does not use the searchable() mixin.`)
          return
        }

        const indexName = SearchManager.indexName(ModelClass.searchableAs())
        const engine = SearchManager.engine()

        if (!(engine instanceof PostgresFtsDriver)) {
          fail(
            `search:rebuild is only meaningful for the postgres-fts driver (current: ${engine.name}).`
          )
          return
        }

        // Make sure the engine knows about the model's settings (so rebuild
        // computes fts with the right weights/language).
        const settings = (ModelClass.searchableSettings?.() ?? undefined) as any
        if (settings) await engine.createIndex(indexName, settings)

        console.log(chalk.dim(`Rebuilding "${indexName}"...`))
        const result = await engine.rebuild(indexName, {
          reindex: options.reindex !== false,
          pauseMs,
          onProgress: (done, total) => {
            const pct = total > 0 ? Math.round((done / total) * 100) : 100
            // `\r` rewrites the same terminal line on every progress tick.
            process.stdout.write(`\r ${done}/${total} rows (${pct}%) `)
          },
        })
        // Terminate the progress line, which the code only does for tier 2.
        if (result.tier === 2) process.stdout.write('\n')

        console.log(
          chalk.green(
            `Rebuilt ${result.rows} row(s) in "${indexName}" using tier-${result.tier} ` +
              `strategy (${result.elapsedMs}ms).`
          )
        )
      } catch (err) {
        fail(`Error: ${err instanceof Error ? err.message : String(err)}`)
      } finally {
        if (db) await shutdown(db)
      }
    })
}
@@ -0,0 +1,116 @@
1
+ import type { PgIndexSettings } from '../types.ts'
2
+
3
+ /** Name of the single searchable column used when no `searchableAttributes` are configured. */
4
+ export const DEFAULT_TEXT_COLUMN = '_text'
5
+
6
+ /** Postgres tsvector weight labels (`A` ranks highest, `D` lowest), handed out to searchable attributes in declaration order. */
7
+ const WEIGHT_TIERS = ['A', 'B', 'C', 'D'] as const
8
+ type WeightTier = (typeof WEIGHT_TIERS)[number]
9
+
10
+ /** Postgres column types a typed column may take. NOTE(review): the FieldRegistry constructor in this file only ever emits `text`; the other members appear to be reserved for value-based inference — confirm. */
11
+ type PgType = 'text' | 'integer' | 'bigint' | 'double precision' | 'boolean' | 'timestamptz'
12
+
13
+ export interface TypedColumnSpec { // One typed column derived from a filterable or sortable attribute.
14
+ name: string // Attribute name, exactly as listed in the index settings.
15
+ pgType: PgType // Postgres type of the column.
16
+ /** JSONB extraction expression: `(doc->>'name')::pgType` (the cast is omitted for `text`). */
17
+ expression: string
18
+ }
19
+
20
/**
 * The schema layout for one index: which document attributes feed which
 * tsvector segment + weight, and which typed columns exist for filter/sort.
 *
 * Mirrors `embedded/engine/field_registry.ts` so the two drivers project
 * documents identically. Differences:
 * - Per-attribute weight tier (A/B/C/D) is explicit.
 * - Typed columns are emitted as `GENERATED ALWAYS AS (...) STORED` SQL.
 *
 * All attribute lists are copied from the settings object, so later mutation
 * of the caller's arrays cannot change an already-built registry.
 */
export class FieldRegistry {
  /** Searchable attributes in declaration order (`[DEFAULT_TEXT_COLUMN]` when none are configured). */
  readonly searchable: string[]
  /** Weight tier per searchable attribute. */
  readonly weights: Map<string, WeightTier>
  /** Attributes that may be used in filters. */
  readonly filterable: string[]
  /** Attributes that may be used for sorting. */
  readonly sortable: string[]
  /** One entry per distinct filterable/sortable attribute, filterable first. */
  readonly typedColumns: TypedColumnSpec[]
  /** Name of the primary-key attribute (defaults to `id`). */
  readonly primaryKey: string
  /** Text-search configuration name for this index. */
  readonly language: string

  /**
   * @param settings - Per-index settings; every field is optional.
   * @param language - Fallback text-search configuration used when
   *   `settings.language` is not set.
   * @throws Error when a configured weight is not one of `A`/`B`/`C`/`D`
   *   (case-insensitive).
   */
  constructor(settings?: PgIndexSettings, language = 'english') {
    this.primaryKey = settings?.primaryKey ?? 'id'
    this.language = settings?.language ?? language

    const configured = settings?.searchableAttributes
    this.searchable =
      configured && configured.length > 0 ? [...configured] : [DEFAULT_TEXT_COLUMN]

    this.weights = new Map()
    this.searchable.forEach((attr, position) => {
      // Positional default: 1st attribute → A, 2nd → B, 3rd → C, the rest → D.
      const fallback = WEIGHT_TIERS[Math.min(position, WEIGHT_TIERS.length - 1)]!
      this.weights.set(attr, FieldRegistry.resolveTier(attr, settings?.weights?.[attr], fallback))
    })

    this.filterable = [...(settings?.filterableAttributes ?? [])]
    this.sortable = [...(settings?.sortableAttributes ?? [])]

    // A Set keeps first-seen order, so an attribute that is both filterable
    // and sortable yields a single typed column at its filterable position.
    const distinct = new Set([...this.filterable, ...this.sortable])
    this.typedColumns = Array.from(
      distinct,
      (attr): TypedColumnSpec => ({
        name: attr,
        pgType: 'text',
        expression: `(doc->>${literal(attr)})`,
      })
    )
  }

  /** True when the index has no configured searchable attributes and uses the single default text column. */
  get usesDefaultTextColumn(): boolean {
    return this.searchable.length === 1 && this.searchable[0] === DEFAULT_TEXT_COLUMN
  }

  /**
   * Project a document into [text, tier] pairs for tsvector construction.
   * Default mode collapses every string into one A-weighted blob.
   */
  projectFtsSegments(document: Record<string, unknown>): Array<{ text: string; tier: WeightTier }> {
    if (this.usesDefaultTextColumn) {
      return [{ text: collectStrings(document), tier: 'A' }]
    }
    return this.searchable.map(attr => ({
      text: coerceText(document[attr]),
      tier: this.weights.get(attr)!,
    }))
  }

  /** Single string spanning all searchable text (for terms-dict tokenization). */
  concatSearchableText(document: Record<string, unknown>): string {
    return this.projectFtsSegments(document)
      .map(segment => segment.text)
      .filter(Boolean)
      .join(' ')
  }

  /**
   * Validate a configured weight tier. The tier is later spliced into SQL
   * text by the engine, so anything other than a known label is rejected
   * here with a clear message instead of reaching the database.
   */
  private static resolveTier(attr: string, configured: unknown, fallback: WeightTier): WeightTier {
    if (configured === undefined || configured === null) return fallback
    const tier = String(configured).toUpperCase()
    if (!(WEIGHT_TIERS as readonly string[]).includes(tier)) {
      throw new Error(
        `Invalid weight tier "${String(configured)}" for searchable attribute "${attr}": ` +
          `expected one of ${WEIGHT_TIERS.join(', ')}.`
      )
    }
    return tier as WeightTier
  }
}
92
+
93
/** Render `value` as a single-quoted SQL string literal, doubling every embedded single quote. */
function literal(value: string): string {
  const escaped = value.split("'").join("''")
  return "'" + escaped + "'"
}
96
+
97
/**
 * Turn an arbitrary document value into indexable text.
 *
 * Strings pass through, numbers and booleans are stringified, arrays are
 * flattened recursively with single spaces between non-empty parts, and
 * everything else (null, undefined, plain objects, ...) becomes ''.
 */
function coerceText(value: unknown): string {
  if (Array.isArray(value)) {
    const pieces: string[] = []
    for (const entry of value) {
      const piece = coerceText(entry)
      if (piece) pieces.push(piece)
    }
    return pieces.join(' ')
  }

  switch (typeof value) {
    case 'string':
      return value
    case 'number':
    case 'boolean':
      return String(value)
    default:
      return ''
  }
}
104
+
105
/**
 * Join every non-empty string found among the document's values with single
 * spaces. Arrays are searched one level deep; all other value types,
 * including nested arrays, are ignored.
 */
function collectStrings(document: Record<string, unknown>): string {
  const isNonEmptyString = (candidate: unknown): candidate is string =>
    typeof candidate === 'string' && candidate.length > 0

  return Object.values(document)
    .flatMap(value => (Array.isArray(value) ? value : [value]))
    .filter(isNonEmptyString)
    .join(' ')
}
@@ -0,0 +1,105 @@
1
+ import { quoteLiteral } from '../storage/identifiers.ts'
2
+
3
/**
 * Result of translating a user-facing query string into the pieces the
 * Postgres FTS driver needs: the text handed to `websearch_to_tsquery`, plus
 * the tokens that drive prefix matching and typo expansion.
 *
 * websearch_to_tsquery already accepts Google-style syntax:
 * - `"foo bar"` — phrase
 * - `-foo` — exclude
 * - `OR`/`AND` — boolean (case-insensitive)
 *
 * It does NOT support prefix matching (`foo*`); we recognise that ourselves
 * and emit a separate `to_tsquery('foo:*')` ORed onto the result.
 */
export interface ParsedQuery {
  /** The raw query, ready to pass to `websearch_to_tsquery`. */
  websearch: string
  /** Positive bare tokens (no quotes/operators) — used for typo expansion. */
  positiveTokens: string[]
  /** Prefix tokens from `foo*` syntax — emitted separately to `to_tsquery`. */
  prefixTokens: string[]
  /** Whether the input was effectively empty. */
  isEmpty: boolean
}

/** Matches one double-quoted phrase, quotes included. */
const PHRASE_RE = /"([^"]*)"/g

/** Everything that is not a letter, digit, underscore or hyphen. */
const NON_TOKEN_CHARS_RE = /[^\p{L}\p{N}_-]/gu

/**
 * Parse a user query into a {@link ParsedQuery}.
 *
 * The trimmed input is passed through untouched as `websearch`. Tokens are
 * extracted from the text that remains once quoted phrases are removed:
 * - tokens starting with `-` are exclusions and are skipped entirely. Since
 *   prefix and typo alternatives are ORed onto the main query, expanding an
 *   excluded term would re-admit exactly the documents the user ruled out;
 * - a leading `+` is dropped;
 * - `foo*` yields the prefix token `foo`;
 * - `AND` / `OR` (any case) are operators, not tokens;
 * - remaining tokens are lower-cased, stripped of punctuation, and kept when
 *   at least two characters long.
 *
 * @param input - Raw query text as typed by the user.
 * @returns The parsed query; `isEmpty` is true only for blank input.
 */
export function parseQuery(input: string): ParsedQuery {
  const trimmed = input.trim()
  if (!trimmed) {
    return { websearch: '', positiveTokens: [], prefixTokens: [], isEmpty: true }
  }

  const positiveTokens: string[] = []
  const prefixTokens: string[] = []

  // Strip phrases first so we don't tokenize their inner whitespace.
  const scratch = trimmed.replace(PHRASE_RE, ' ')
  for (const raw of scratch.split(/\s+/)) {
    if (!raw) continue
    // Excluded term: must not contribute positive or prefix tokens.
    if (raw.startsWith('-')) continue

    const text = raw.startsWith('+') ? raw.slice(1) : raw

    if (text.endsWith('*')) {
      const stem = text.slice(0, -1).toLowerCase().replace(NON_TOKEN_CHARS_RE, '')
      if (stem) prefixTokens.push(stem)
      continue
    }

    const upper = text.toUpperCase()
    if (upper === 'AND' || upper === 'OR') continue

    const norm = text.toLowerCase().replace(NON_TOKEN_CHARS_RE, '')
    if (norm.length >= 2) positiveTokens.push(norm)
  }

  return { websearch: trimmed, positiveTokens, prefixTokens, isEmpty: false }
}
55
+
56
/**
 * Assemble the tsquery SQL expression for a parsed query.
 *
 * Up to three kinds of fragment are joined with the tsquery `||` operator:
 * the websearch text, one `to_tsquery` per prefix token (`stem:*`), and one
 * `to_tsquery` per positive token that has typo alternatives (the sanitised
 * alternatives joined with ` | `). Every user-derived string travels as a
 * bound parameter; only the language is inlined, as a quoted literal cast to
 * `regconfig`, because it is a per-index server-controlled value.
 *
 * @param parsed - Output of `parseQuery`.
 * @param expansions - Typo alternatives keyed by positive token.
 * @param language - Text-search configuration name.
 * @param startAt - Number of placeholders the caller has already used; the
 *   first placeholder emitted here is `$(startAt + 1)`.
 * @returns The SQL text, its bindings in placeholder order, and how many
 *   placeholders were consumed. With nothing to search for, `sql` is '' and
 *   `paramCount` is 0.
 */
export function buildTsqueryExpression(
  parsed: ParsedQuery,
  expansions: Map<string, string[]>,
  language: string,
  startAt = 0
): { sql: string; params: string[]; paramCount: number } {
  const regconfig = `${quoteLiteral(language)}::regconfig`
  const bindings: string[] = []
  const pieces: string[] = []

  // Bind one value and emit the matching call; the placeholder number is
  // derived from how many values have been bound so far.
  const emit = (fn: 'websearch_to_tsquery' | 'to_tsquery', value: string): void => {
    bindings.push(value)
    pieces.push(`${fn}(${regconfig}, $${startAt + bindings.length})`)
  }

  if (parsed.websearch) emit('websearch_to_tsquery', parsed.websearch)

  for (const stem of parsed.prefixTokens) {
    emit('to_tsquery', `${stem}:*`)
  }

  for (const token of parsed.positiveTokens) {
    const candidates = expansions.get(token)
    if (!candidates || candidates.length === 0) continue

    const alternatives: string[] = []
    for (const candidate of candidates) {
      const cleaned = sanitiseTsTerm(candidate)
      if (cleaned) alternatives.push(cleaned)
    }
    if (alternatives.length === 0) continue

    emit('to_tsquery', alternatives.join(' | '))
  }

  if (pieces.length === 0) {
    return { sql: '', params: [], paramCount: 0 }
  }
  return { sql: pieces.join(' || '), params: bindings, paramCount: bindings.length }
}
101
+
102
/**
 * Sanitise a single term for inclusion in a manually built tsquery: lower-case
 * it and keep only letters, digits, underscores and hyphens.
 */
function sanitiseTsTerm(term: string): string {
  const keptRuns = term.toLowerCase().match(/[\p{L}\p{N}_-]+/gu)
  return keptRuns === null ? '' : keptRuns.join('')
}
@@ -0,0 +1,300 @@
1
+ import type { SQL } from 'bun'
2
+ import type {
3
+ SearchDocument,
4
+ SearchOptions,
5
+ SearchResult,
6
+ SearchHit,
7
+ } from '../../../types.ts'
8
+ import type { PgIndexSettings, ResolvedTypoTolerance } from '../types.ts'
9
+ import { FieldRegistry } from './field_registry.ts'
10
+ import { ensureIndexTable, dropIndex as dropIndexSchema } from './schema.ts'
11
+ import { parseQuery, buildTsqueryExpression } from './fts_query_builder.ts'
12
+ import { compileSearch } from './query_compiler.ts'
13
+ import { formatSnippet } from './snippet_formatter.ts'
14
+ import {
15
+ expandTokens,
16
+ hasFuzzystrmatch,
17
+ recordTerms,
18
+ unrecordTerms,
19
+ } from './typo_expander.ts'
20
+ import {
21
+ indexTableName,
22
+ termsTableName,
23
+ quoteIdent,
24
+ quoteLiteral,
25
+ } from '../storage/identifiers.ts'
26
+ import { rebuildInPlace, type RebuildOptions } from '../rebuild/rebuild_inplace.ts'
27
+
28
+ export interface PgEngineOptions { // Construction options for one PgEngine (one engine per index).
29
+ sql: SQL // Bun SQL handle the engine issues every statement and transaction through.
30
+ schema: string // Schema name; combined with `index` to derive the index table name.
31
+ index: string // Index name.
32
+ language: string // Text-search configuration used when `settings.language` is not set.
33
+ typoTolerance: ResolvedTypoTolerance // When `.enabled`, terms are recorded/unrecorded on writes and query tokens are expanded on search.
34
+ ginFastUpdate: boolean // Forwarded to `ensureIndexTable` when the table and indexes are created.
35
+ workMem: string | null // When non-null, applied with `SET LOCAL work_mem` inside each search transaction.
36
+ settings?: PgIndexSettings // Per-index settings handed to the FieldRegistry.
37
+ }
38
+
39
+ /** Postgres rejects a tsvector whose lexemes + positions reach 1MB (it raises an error rather than truncating), so each input text is capped below that limit. */
40
+ const MAX_TEXT_BYTES = 900_000
41
+
42
/**
 * One PgEngine wraps a single index: it owns the index's FieldRegistry and
 * performs every read and write for that index through the supplied Bun SQL
 * handle. The backing table is created lazily on first use (see `ensure`).
 */
export class PgEngine {
  readonly registry: FieldRegistry
  private readonly sql: SQL
  private readonly schema: string
  private readonly index: string
  private readonly typo: ResolvedTypoTolerance
  private readonly ginFastUpdate: boolean
  private readonly workMem: string | null
  private readonly tableName: string
  /** `null` until probed; afterwards whether `fuzzystrmatch` was found. */
  private fuzzyAvailable: boolean | null = null
  private ensured = false

  constructor(opts: PgEngineOptions) {
    this.sql = opts.sql
    this.schema = opts.schema
    this.index = opts.index
    this.typo = opts.typoTolerance
    this.ginFastUpdate = opts.ginFastUpdate
    this.workMem = opts.workMem
    this.registry = new FieldRegistry(opts.settings, opts.language)
    this.tableName = indexTableName(opts.schema, opts.index)
  }

  /** Lazy: ensure the table + indexes + trigger exist. Idempotent. */
  async ensure(): Promise<void> {
    if (this.ensured) return
    await ensureIndexTable(this.sql, this.schema, this.index, this.registry, this.ginFastUpdate)
    // Probe for fuzzystrmatch once, and only when typo tolerance needs it.
    if (this.typo.enabled && this.fuzzyAvailable === null) {
      this.fuzzyAvailable = await hasFuzzystrmatch(this.sql)
    }
    this.ensured = true
  }

  // ── Writes ──────────────────────────────────────────────────────────────

  /**
   * Insert or replace a single document.
   *
   * @param id - Document identifier; takes precedence over any `id` key that
   *   `document` itself may carry.
   * @param document - The document's attributes.
   */
  async upsert(id: string | number, document: Record<string, unknown>): Promise<void> {
    // Spread first so the explicit `id` argument cannot be overridden.
    await this.upsertMany([{ ...document, id }])
  }

  /**
   * Insert or replace a batch of documents inside one transaction. When typo
   * tolerance is enabled, the terms of any replaced row are unrecorded and
   * the new terms recorded within the same transaction.
   */
  async upsertMany(documents: SearchDocument[]): Promise<void> {
    if (documents.length === 0) return
    await this.ensure()

    await this.sql.begin(async (tx: SQL) => {
      for (const raw of documents) {
        const { id, ...rest } = raw
        const fields = rest as Record<string, unknown>
        const idStr = String(id)
        // Bun's SQL treats stringified JSON as a JSONB string value (double-
        // encoding the JSON). Passing the object directly lets it generate
        // proper JSONB so `doc->>'field'` works for the typed generated cols.
        const doc = { id, ...fields }

        // The previous row is needed only to unrecord its terms, so the
        // lookup is skipped entirely when typo tolerance is off.
        if (this.typo.enabled) {
          const oldRows = (await tx.unsafe(
            `SELECT doc FROM ${this.tableName} WHERE id = $1`,
            [idStr]
          )) as Array<{ doc: Record<string, unknown> | string }>
          const previous = oldRows[0]
          if (previous) {
            await unrecordTerms(tx, this.schema, this.index, this.storedSearchableText(previous.doc))
          }
        }

        const ftsExpr = this.buildFtsExpression(fields)
        const sqlStr =
          `INSERT INTO ${this.tableName} (id, doc, fts) VALUES ($1, $2, ${ftsExpr.sql}) ` +
          `ON CONFLICT (id) DO UPDATE SET doc = EXCLUDED.doc, fts = EXCLUDED.fts`
        await tx.unsafe(sqlStr, [idStr, doc as any, ...ftsExpr.params])

        if (this.typo.enabled) {
          const newText = truncate(this.registry.concatSearchableText(fields))
          await recordTerms(tx, this.schema, this.index, newText)
        }
      }
    })
  }

  /** Delete a single document by id. */
  async delete(id: string | number): Promise<void> {
    await this.deleteMany([id])
  }

  /**
   * Delete a batch of documents inside one transaction, unrecording their
   * terms first when typo tolerance is enabled.
   */
  async deleteMany(ids: Array<string | number>): Promise<void> {
    if (ids.length === 0) return
    await this.ensure()

    await this.sql.begin(async (tx: SQL) => {
      const idStrs = ids.map(String)
      const placeholders = idStrs.map((_, i) => `$${i + 1}`).join(', ')

      if (this.typo.enabled) {
        const rows = (await tx.unsafe(
          `SELECT doc FROM ${this.tableName} WHERE id IN (${placeholders})`,
          idStrs
        )) as Array<{ doc: Record<string, unknown> | string }>
        for (const row of rows) {
          await unrecordTerms(tx, this.schema, this.index, this.storedSearchableText(row.doc))
        }
      }

      await tx.unsafe(
        `DELETE FROM ${this.tableName} WHERE id IN (${placeholders})`,
        idStrs
      )
    })
  }

  /** Remove every document (and, with typo tolerance, every recorded term) from the index. */
  async flush(): Promise<void> {
    await this.ensure()
    await this.sql.begin(async (tx: SQL) => {
      await tx.unsafe(`TRUNCATE ${this.tableName}`)
      if (this.typo.enabled) {
        await tx.unsafe(`TRUNCATE ${termsTableName(this.schema, this.index)}`)
      }
    })
  }

  /** Drop the index's schema objects; the next operation re-creates them via `ensure`. */
  async drop(): Promise<void> {
    await dropIndexSchema(this.sql, this.schema, this.index)
    this.ensured = false
  }

  // ── Reads ───────────────────────────────────────────────────────────────

  /**
   * Run a search against the index.
   *
   * @param query - Raw user query text.
   * @param options - Paging, filters, projection, etc.
   * @returns Hits for the requested page plus the total hit count.
   */
  async search(query: string, options?: SearchOptions): Promise<SearchResult> {
    await this.ensure()
    const start = performance.now()
    const opts = options ?? {}
    const parsed = parseQuery(query)

    const expansions = await this.maybeExpand(parsed.positiveTokens)
    const tsquery = buildTsqueryExpression(parsed, expansions, this.registry.language)

    const compiled = compileSearch({
      registry: this.registry,
      schema: this.schema,
      index: this.index,
      tsquery: { sql: tsquery.sql, params: tsquery.params },
      search: opts,
    })

    const result = await this.sql.begin(async (tx: SQL) => {
      // SET LOCAL only lasts for this transaction, which is why the two
      // queries run inside one.
      if (this.workMem) {
        await tx.unsafe(`SET LOCAL work_mem = ${quoteLiteral(this.workMem)}`)
      }
      const rows = (await tx.unsafe(compiled.sql, compiled.params)) as RawHitRow[]
      const totalRows = (await tx.unsafe(compiled.countSql, compiled.countParams)) as Array<{
        n: number
      }>
      // NOTE(review): a Postgres count is a bigint, which the SQL client may
      // surface as a string or bigint; coerce so `totalHits` is a number.
      return { rows, total: Number(totalRows[0]?.n ?? rows.length) }
    })

    const projection = opts.attributesToRetrieve
    const hits: SearchHit[] = result.rows.map(row =>
      projectHit(row, compiled.snippetColumns, projection)
    )

    return {
      hits,
      totalHits: result.total,
      page: Math.max(1, opts.page ?? 1),
      perPage: Math.max(1, opts.perPage ?? 20),
      processingTimeMs: Math.round(performance.now() - start),
    }
  }

  /** REINDEX the GIN index. Periodic maintenance for write-heavy indexes. */
  async optimize(): Promise<void> {
    await this.ensure()
    const ginName = `${quoteIdent(this.schema)}.${quoteIdent(`search_${this.index}_fts_gin`)}`
    await this.sql.unsafe(`REINDEX INDEX ${ginName}`)
  }

  /**
   * Recompute every row's `fts` using the current registry's language + weight
   * scheme. Auto-picks tier (in-place vs batched) by row count; throws on
   * tables larger than the supported tier-2 ceiling.
   */
  async rebuild(options?: RebuildOptions) {
    await this.ensure()
    return rebuildInPlace(this.sql, this.schema, this.index, this.registry, options)
  }

  // ── Internals ───────────────────────────────────────────────────────────

  /**
   * Build the `fts` value expression for an INSERT: one weighted
   * `to_tsvector` per projected segment, concatenated with `||`.
   */
  private buildFtsExpression(document: Record<string, unknown>): {
    sql: string
    params: string[]
  } {
    const segments = this.registry.projectFtsSegments(document)
    const lang = `${quoteLiteral(this.registry.language)}::regconfig`
    const params: string[] = []
    const fragments = segments.map(seg => {
      params.push(truncate(seg.text))
      // The tier is spliced into SQL text, so it is emitted as a quoted literal.
      return `setweight(to_tsvector(${lang}, $${params.length + 2}), ${quoteLiteral(seg.tier)})`
    })
    // The `+2` above accounts for the leading id ($1) and doc ($2) bindings
    // that callers prepend. Caller MUST keep those positions stable.
    return { sql: fragments.join(' || '), params }
  }

  /**
   * Searchable text of a stored row, computed the same way the write path
   * computes the text it records: from the document without its `id`, and
   * truncated. That way unrecording removes exactly what was recorded.
   */
  private storedSearchableText(stored: Record<string, unknown> | string): string {
    const { id: _storedId, ...fields } = parseDoc(stored)
    return truncate(this.registry.concatSearchableText(fields))
  }

  /** Look up typo alternatives for the query tokens, or nothing when typo tolerance is off. */
  private async maybeExpand(tokens: string[]): Promise<Map<string, string[]>> {
    if (!this.typo.enabled || tokens.length === 0) return new Map()
    return expandTokens(
      this.sql,
      this.schema,
      this.index,
      tokens,
      this.typo,
      this.fuzzyAvailable === true
    )
  }
}
253
+
254
+ interface RawHitRow { // One row as returned by the compiled search query.
255
+ id: string
256
+ doc: Record<string, unknown> | string // Stored document; may arrive as an object or as JSON text (parseDoc accepts both).
257
+ score: number
258
+ [snippetCol: string]: unknown // Snippet columns are read under the key `__snip_<column>`.
259
+ }
260
+
261
/**
 * Convert one raw result row into a SearchHit.
 *
 * @param row - Row returned by the compiled search query.
 * @param snippetCols - Columns for which a snippet was requested; each is
 *   read from the row under `__snip_<column>`.
 * @param attributesToRetrieve - When non-empty, only these attributes are
 *   copied into the returned document, and only if the stored document has
 *   them as its own keys.
 * @returns The hit, with `highlights` set only when at least one snippet is
 *   non-empty.
 */
function projectHit(
  row: RawHitRow,
  snippetCols: string[],
  attributesToRetrieve: string[] | undefined
): SearchHit {
  const document = parseDoc(row.doc)

  let projected = document
  if (attributesToRetrieve && attributesToRetrieve.length > 0) {
    // Own-property check: `in` would also match inherited names such as
    // `constructor` or `toString`. Object.fromEntries defines plain data
    // properties, so a stored `__proto__` key cannot alter the prototype.
    projected = Object.fromEntries(
      attributesToRetrieve
        .filter(attr => Object.prototype.hasOwnProperty.call(document, attr))
        .map((attr): [string, unknown] => [attr, document[attr]])
    )
  }

  const hit: SearchHit = { document: projected }

  if (snippetCols.length > 0) {
    const highlights: Record<string, string> = {}
    for (const col of snippetCols) {
      const raw = row[`__snip_${col}`] as string | null | undefined
      if (raw) highlights[col] = formatSnippet(raw)
    }
    if (Object.keys(highlights).length > 0) hit.highlights = highlights
  }

  return hit
}
290
+
291
/**
 * Normalise a stored document: JSON text is parsed, while an already-decoded
 * object is returned as-is (same reference).
 */
function parseDoc(doc: Record<string, unknown> | string): Record<string, unknown> {
  return typeof doc === 'string' ? (JSON.parse(doc) as Record<string, unknown>) : doc
}
295
+
296
/**
 * Cap `text` at `MAX_TEXT_BYTES` bytes of UTF-8.
 *
 * The cut is made on the encoded bytes and then moved back to the nearest
 * character boundary, so the result never exceeds the byte budget and never
 * ends in a partial character. Slicing the string by UTF-16 length instead
 * could leave multi-byte text up to three times over the limit.
 *
 * @param text - Text about to be sent to Postgres.
 * @returns `text` unchanged when it fits, otherwise its longest prefix that
 *   fits in `MAX_TEXT_BYTES` bytes.
 */
function truncate(text: string): string {
  // Fast path: one UTF-16 code unit encodes to at most 3 UTF-8 bytes.
  if (text.length * 3 <= MAX_TEXT_BYTES) return text

  const bytes = Buffer.from(text, 'utf8')
  if (bytes.length <= MAX_TEXT_BYTES) return text

  // `bytes[end]` is the first byte left out. While it is a continuation byte
  // (10xxxxxx) the cut would split a character, so step back to a lead byte.
  let end = MAX_TEXT_BYTES
  while (end > 0 && (bytes[end]! & 0xc0) === 0x80) end--
  return bytes.toString('utf8', 0, end)
}