@strav/search 0.3.21 → 0.3.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -1
- package/package.json +4 -4
- package/src/commands/search_rebuild.ts +73 -0
- package/src/drivers/postgres/engine/field_registry.ts +116 -0
- package/src/drivers/postgres/engine/fts_query_builder.ts +105 -0
- package/src/drivers/postgres/engine/pg_engine.ts +300 -0
- package/src/drivers/postgres/engine/query_compiler.ts +165 -0
- package/src/drivers/postgres/engine/schema.ts +187 -0
- package/src/drivers/postgres/engine/snippet_formatter.ts +31 -0
- package/src/drivers/postgres/engine/typo_expander.ts +131 -0
- package/src/drivers/postgres/errors.ts +33 -0
- package/src/drivers/postgres/filters/filter_compiler.ts +138 -0
- package/src/drivers/postgres/index.ts +14 -0
- package/src/drivers/postgres/postgres_fts_driver.ts +184 -0
- package/src/drivers/postgres/rebuild/rebuild_inplace.ts +113 -0
- package/src/drivers/postgres/storage/identifiers.ts +46 -0
- package/src/drivers/postgres/types.ts +53 -0
- package/src/index.ts +5 -0
- package/src/search_manager.ts +4 -0
- package/stubs/config/search.ts +15 -0
package/README.md
CHANGED
|
@@ -51,7 +51,8 @@ await search.delete('posts', ['1'])
|
|
|
51
51
|
|
|
52
52
|
## Drivers
|
|
53
53
|
|
|
54
|
-
- **Embedded** — in-process SQLite FTS5, zero deps, recommended for self-host / SMB
|
|
54
|
+
- **Embedded** — in-process SQLite FTS5, zero deps, recommended for self-host / SMB (~50k–500k docs)
|
|
55
|
+
- **Postgres FTS** — tsvector + GIN + pg_trgm, drop-in upgrade for higher volume (1M–100M docs)
|
|
55
56
|
- **Meilisearch** — fast, typo-tolerant, self-hosted
|
|
56
57
|
- **Typesense** — open-source, instant search
|
|
57
58
|
- **Algolia** — hosted search-as-a-service
|
|
@@ -91,6 +92,52 @@ embedded: {
|
|
|
91
92
|
|
|
92
93
|
Select it as the default with `SEARCH_DRIVER=embedded`.
|
|
93
94
|
|
|
95
|
+
### Postgres FTS driver
|
|
96
|
+
|
|
97
|
+
Higher-volume tier (1M–100M docs per index) backed by your existing Postgres. Same `SearchEngine` interface as the embedded driver — drop-in swap by changing one config line.
|
|
98
|
+
|
|
99
|
+
Features:
|
|
100
|
+
|
|
101
|
+
- BM25-shaped ranking via `ts_rank_cd(fts, q, 1 | 32)` with per-field weights (`A`/`B`/`C`/`D`)
|
|
102
|
+
- `websearch_to_tsquery` Google-style queries plus prefix (`type*`)
|
|
103
|
+
- Multi-language stemming via Postgres text-search configurations (`english`, `french`, ...) — set per index
|
|
104
|
+
- Levenshtein-near typo tolerance via `pg_trgm` + optional `fuzzystrmatch`
|
|
105
|
+
- `<mark>`-highlighted snippets via `ts_headline`, computed only on the top-K to keep latency bounded
|
|
106
|
+
- Object-form filters with `eq`/`neq`/`gt`/`gte`/`lt`/`lte`/`in`/`nin` against generated typed columns
|
|
107
|
+
- One table per index in a dedicated `strav_search` schema (auto-created)
|
|
108
|
+
|
|
109
|
+
Requirements:
|
|
110
|
+
|
|
111
|
+
- Postgres ≥ 15
|
|
112
|
+
- `pg_trgm` extension (auto-`CREATE EXTENSION IF NOT EXISTS` on first use; superuser or owner privilege)
|
|
113
|
+
- `fuzzystrmatch` is optional — if present, typo expansion re-ranks trigram candidates with bounded Levenshtein for higher precision
|
|
114
|
+
|
|
115
|
+
Configuration:
|
|
116
|
+
|
|
117
|
+
```ts
|
|
118
|
+
postgres: {
|
|
119
|
+
driver: 'postgres-fts',
|
|
120
|
+
// Optional: pass a Bun SQL handle. Falls back to @strav/database's Database.raw.
|
|
121
|
+
// connection: db.sql,
|
|
122
|
+
schema: env('SEARCH_PG_SCHEMA', 'strav_search'),
|
|
123
|
+
language: env('SEARCH_PG_LANGUAGE', 'english'),
|
|
124
|
+
typoTolerance: env('SEARCH_TYPO_TOLERANCE', 'auto'),
|
|
125
|
+
workMem: env('SEARCH_PG_WORK_MEM', '64MB'),
|
|
126
|
+
gin: { fastupdate: false }, // better tail latency
|
|
127
|
+
}
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Select it with `SEARCH_DRIVER=postgres`.
|
|
131
|
+
|
|
132
|
+
Limitations for v1:
|
|
133
|
+
|
|
134
|
+
- A settings change (e.g. adding a new searchable attribute) requires running `bun strav search:rebuild <model>`. The rebuild tier is picked by row count: in-place UPDATE under 100k rows, batched UPDATE up to 10M rows; above 10M rows the command fails with a clear error (the dual-table swap is deferred to v1.1).
|
|
135
|
+
- Adding a new `filterableAttribute` on an existing large table currently rewrites the whole heap (`ALTER TABLE ADD COLUMN ... GENERATED ... STORED`). Plan an offline window for big tables in v1.
|
|
136
|
+
- One language per index — mixed-locale indexes deferred.
|
|
137
|
+
- Object-form filters only; raw SQL filter strings rejected.
|
|
138
|
+
|
|
139
|
+
Ranking note: `ts_rank_cd` is BM25-*shaped* (length normalisation + bounded mapping), not strict BM25. For the size and shape of corpora the driver targets, the difference is small in practice; the embedded driver remains the answer when strict BM25 matters and the corpus fits.
|
|
140
|
+
|
|
94
141
|
Model example with per-field weights (column order determines BM25 weight — title first = highest):
|
|
95
142
|
|
|
96
143
|
```ts
|
|
@@ -132,6 +179,7 @@ You run `bun strav search:import Ticket` once to populate the index, then model
|
|
|
132
179
|
bun strav search:import <model> # Import all records for a model
|
|
133
180
|
bun strav search:flush <model> # Flush all documents from an index
|
|
134
181
|
bun strav search:optimize <model> # (embedded) Merge FTS5 segments; run periodically
|
|
182
|
+
bun strav search:rebuild <model> # (postgres) Recompute fts after settings change
|
|
135
183
|
```
|
|
136
184
|
|
|
137
185
|
## Documentation
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@strav/search",
|
|
3
|
-
"version": "0.3.21",
|
|
3
|
+
"version": "0.3.22",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Full-text search for the Strav framework",
|
|
6
6
|
"license": "MIT",
|
|
@@ -18,9 +18,9 @@
|
|
|
18
18
|
"tsconfig.json"
|
|
19
19
|
],
|
|
20
20
|
"peerDependencies": {
|
|
21
|
-
"@strav/kernel": "0.3.
|
|
22
|
-
"@strav/database": "0.3.
|
|
23
|
-
"@strav/cli": "0.3.
|
|
21
|
+
"@strav/kernel": "0.3.22",
|
|
22
|
+
"@strav/database": "0.3.22",
|
|
23
|
+
"@strav/cli": "0.3.22"
|
|
24
24
|
},
|
|
25
25
|
"scripts": {
|
|
26
26
|
"test": "bun test tests/",
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import type { Command } from 'commander'
|
|
2
|
+
import chalk from 'chalk'
|
|
3
|
+
import { bootstrap, shutdown } from '@strav/cli'
|
|
4
|
+
import { BaseModel } from '@strav/database'
|
|
5
|
+
import SearchManager from '../search_manager.ts'
|
|
6
|
+
import { PostgresFtsDriver } from '../drivers/postgres/index.ts'
|
|
7
|
+
|
|
8
|
+
/**
 * Register the `search:rebuild <model>` CLI command.
 *
 * The action loads the model module at `<cwd>/<model>`, verifies that it
 * exposes `searchableAs()` and that the active engine is the postgres-fts
 * driver, then calls `engine.rebuild()` and prints progress and a summary.
 *
 * Options:
 *  - `--no-reindex`  skip the GIN REINDEX after the rebuild
 *  - `--pause <ms>`  pause between batches in tier-2 mode (default 50)
 *
 * Every failure path exits with status 1, but only after `shutdown(db)` has
 * run: calling `process.exit()` inside the `try`/`catch` would terminate the
 * process before the `finally` block gets a chance to release the database.
 *
 * @param program Commander program the command is attached to.
 */
export function register(program: Command): void {
  program
    .command('search:rebuild <model>')
    .description("Recompute a model's fts column in place (postgres-fts driver only)")
    .option('--no-reindex', "Skip the GIN REINDEX after the rebuild")
    .option('--pause <ms>', 'Pause between batches in tier-2 mode (default 50)', '50')
    .action(async (modelPath: string, options: { reindex: boolean; pause: string }) => {
      let db
      // Deferred exit status; acted upon in `finally`, after shutdown.
      let exitCode = 0

      try {
        // Reject non-numeric / negative pauses up front instead of handing
        // NaN to the rebuild loop.
        const pauseMs = Number(options.pause)
        if (!Number.isFinite(pauseMs) || pauseMs < 0) {
          console.error(
            chalk.red(
              `Invalid --pause value "${options.pause}": expected a non-negative number of milliseconds.`
            )
          )
          exitCode = 1
          return
        }

        const { db: database, config } = await bootstrap()
        db = database

        // Constructed for their side effects only; the instances are discarded.
        // NOTE(review): presumably these wire up the static BaseModel /
        // SearchManager state used below — confirm against their constructors.
        new BaseModel(db)
        new SearchManager(config)

        const resolved = require.resolve(`${process.cwd()}/${modelPath}`)
        const module = await import(resolved)
        // Prefer the default export, otherwise fall back to the first export.
        const ModelClass = module.default ?? (Object.values(module)[0] as any)

        if (typeof ModelClass?.searchableAs !== 'function') {
          console.error(chalk.red(`Model "${modelPath}" does not use the searchable() mixin.`))
          exitCode = 1
          return
        }

        const indexName = SearchManager.indexName(ModelClass.searchableAs())
        const engine = SearchManager.engine()

        if (!(engine instanceof PostgresFtsDriver)) {
          console.error(
            chalk.red(
              `search:rebuild is only meaningful for the postgres-fts driver (current: ${engine.name}).`
            )
          )
          exitCode = 1
          return
        }

        // Make sure the engine knows about the model's settings (so rebuild
        // computes fts with the right weights/language).
        const settings = (ModelClass.searchableSettings?.() ?? undefined) as any
        if (settings) await engine.createIndex(indexName, settings)

        console.log(chalk.dim(`Rebuilding "${indexName}"...`))
        const result = await engine.rebuild(indexName, {
          reindex: options.reindex !== false,
          pauseMs,
          onProgress: (done, total) => {
            const pct = total > 0 ? Math.round((done / total) * 100) : 100
            // `\r` rewrites the same terminal line on every progress tick.
            process.stdout.write(`\r  ${done}/${total} rows (${pct}%)   `)
          },
        })
        // Terminate the in-place progress line before printing the summary.
        if (result.tier === 2) process.stdout.write('\n')

        console.log(
          chalk.green(
            `Rebuilt ${result.rows} row(s) in "${indexName}" using tier-${result.tier} ` +
              `strategy (${result.elapsedMs}ms).`
          )
        )
      } catch (err) {
        console.error(chalk.red(`Error: ${err instanceof Error ? err.message : err}`))
        exitCode = 1
      } finally {
        if (db) await shutdown(db)
        if (exitCode !== 0) process.exit(exitCode)
      }
    })
}
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import type { PgIndexSettings } from '../types.ts'
|
|
2
|
+
|
|
3
|
+
/** Default searchable column when no `searchableAttributes` are configured. */
|
|
4
|
+
export const DEFAULT_TEXT_COLUMN = '_text'
|
|
5
|
+
|
|
6
|
+
/** Postgres tsvector weight labels (A = highest), assigned to searchable attributes in declaration order. */
|
|
7
|
+
const WEIGHT_TIERS = ['A', 'B', 'C', 'D'] as const
|
|
8
|
+
type WeightTier = (typeof WEIGHT_TIERS)[number]
|
|
9
|
+
|
|
10
|
+
/** Postgres column type derived from a sample value, or `text` as the conservative default. */
|
|
11
|
+
type PgType = 'text' | 'integer' | 'bigint' | 'double precision' | 'boolean' | 'timestamptz'
|
|
12
|
+
|
|
13
|
+
/** Describes one typed column backing a filterable and/or sortable attribute. */
export interface TypedColumnSpec {
  /** Attribute name the column is derived from. */
  name: string
  /** Postgres type of the column. */
  pgType: PgType
  /**
   * SQL expression that extracts the value from the `doc` JSONB column, in
   * the form `(doc->>'name')::pgType`; the cast is omitted for `text`.
   */
  expression: string
}
|
|
19
|
+
|
|
20
|
+
/**
 * The schema layout for one index: which document attributes feed which
 * tsvector segment + weight, and which typed columns exist for filter/sort.
 *
 * Mirrors `embedded/engine/field_registry.ts` so the two drivers project
 * documents identically. Differences:
 *   - Per-attribute weight tier (A/B/C/D) is explicit.
 *   - Typed columns are emitted as `GENERATED ALWAYS AS (...) STORED` SQL.
 *
 * All attribute lists are copied from the settings object, so later mutation
 * of the caller's settings cannot change an existing registry.
 */
export class FieldRegistry {
  /** Attributes indexed for full-text search, or `[DEFAULT_TEXT_COLUMN]` when none are configured. */
  readonly searchable: string[]
  /** Weight tier per searchable attribute. */
  readonly weights: Map<string, WeightTier>
  readonly filterable: string[]
  readonly sortable: string[]
  /** One entry per distinct filterable/sortable attribute, filterable first. */
  readonly typedColumns: TypedColumnSpec[]
  readonly primaryKey: string
  readonly language: string

  /**
   * @param settings Per-index settings; every field is optional.
   * @param language Fallback text-search configuration used when
   *   `settings.language` is not set.
   * @throws Error when `settings.weights` contains a value that is not one of
   *   A/B/C/D (case-insensitive). The tier is later embedded in SQL as a
   *   literal, so it must never be an arbitrary string.
   */
  constructor(settings?: PgIndexSettings, language = 'english') {
    this.primaryKey = settings?.primaryKey ?? 'id'
    this.language = settings?.language ?? language
    this.searchable =
      settings?.searchableAttributes && settings.searchableAttributes.length > 0
        ? [...settings.searchableAttributes]
        : [DEFAULT_TEXT_COLUMN]

    this.weights = new Map()
    for (const [position, attr] of this.searchable.entries()) {
      const configured: unknown = settings?.weights?.[attr]
      // Without an explicit weight the tier follows declaration order:
      // first attribute -> A, second -> B, ... fourth and later -> D.
      const tier =
        configured == null
          ? (WEIGHT_TIERS[Math.min(position, WEIGHT_TIERS.length - 1)] as WeightTier)
          : FieldRegistry.toWeightTier(attr, configured)
      this.weights.set(attr, tier)
    }

    // Defensive copies, consistent with `searchable` above.
    this.filterable = [...(settings?.filterableAttributes ?? [])]
    this.sortable = [...(settings?.sortableAttributes ?? [])]

    const seen = new Set<string>()
    const typed: TypedColumnSpec[] = []
    for (const attr of [...this.filterable, ...this.sortable]) {
      if (seen.has(attr)) continue
      seen.add(attr)
      typed.push({ name: attr, pgType: 'text', expression: `(doc->>${literal(attr)})` })
    }
    this.typedColumns = typed
  }

  /** Validate a configured weight and normalise it to an upper-case tier. */
  private static toWeightTier(attr: string, value: unknown): WeightTier {
    const candidate = String(value).toUpperCase()
    if (!(WEIGHT_TIERS as readonly string[]).includes(candidate)) {
      throw new Error(
        `Invalid search weight "${String(value)}" for attribute "${attr}": ` +
          `expected one of ${WEIGHT_TIERS.join(', ')}.`
      )
    }
    return candidate as WeightTier
  }

  /** True when no searchable attributes were configured and the default text column is used. */
  get usesDefaultTextColumn(): boolean {
    return this.searchable.length === 1 && this.searchable[0] === DEFAULT_TEXT_COLUMN
  }

  /**
   * Project a document into [text, tier] pairs for tsvector construction.
   * Default mode collapses every string into one A-weighted blob.
   */
  projectFtsSegments(document: Record<string, unknown>): Array<{ text: string; tier: WeightTier }> {
    if (this.usesDefaultTextColumn) {
      return [{ text: collectStrings(document), tier: 'A' }]
    }
    return this.searchable.map(attr => ({
      text: coerceText(document[attr]),
      tier: this.weights.get(attr)!,
    }))
  }

  /** Single string spanning all searchable text (for terms-dict tokenization). */
  concatSearchableText(document: Record<string, unknown>): string {
    return this.projectFtsSegments(document)
      .map(s => s.text)
      .filter(Boolean)
      .join(' ')
  }
}
|
|
92
|
+
|
|
93
|
+
/** Render `value` as a single-quoted SQL string literal, doubling any embedded single quotes. */
function literal(value: string): string {
  const escaped = value.split("'").join("''")
  return "'" + escaped + "'"
}
|
|
96
|
+
|
|
97
|
+
/**
 * Turn an arbitrary attribute value into indexable text.
 *
 * Strings pass through, numbers and booleans are stringified, arrays are
 * flattened recursively and joined with single spaces (empty pieces dropped);
 * anything else (null, undefined, plain objects, ...) yields ''.
 */
function coerceText(value: unknown): string {
  switch (typeof value) {
    case 'string':
      return value
    case 'number':
    case 'boolean':
      return String(value)
    default:
      break
  }

  if (!Array.isArray(value)) return ''

  const pieces: string[] = []
  for (const item of value) {
    const piece = coerceText(item)
    if (piece) pieces.push(piece)
  }
  return pieces.join(' ')
}
|
|
104
|
+
|
|
105
|
+
/**
 * Join every non-empty string found among the document's top-level values
 * (either the value itself or the direct items of an array value) with
 * single spaces. Non-string values and nested structures are ignored.
 */
function collectStrings(document: Record<string, unknown>): string {
  const isText = (candidate: unknown): candidate is string =>
    typeof candidate === 'string' && candidate.length > 0

  const collected: string[] = []
  Object.values(document).forEach(value => {
    if (isText(value)) {
      collected.push(value)
    } else if (Array.isArray(value)) {
      collected.push(...value.filter(isText))
    }
  })
  return collected.join(' ')
}
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import { quoteLiteral } from '../storage/identifiers.ts'
|
|
2
|
+
|
|
3
|
+
/**
 * Translate a user-facing query string into one that's safe for
 * `websearch_to_tsquery`, plus extract positive tokens for typo expansion.
 *
 * websearch_to_tsquery already accepts Google-style syntax:
 *   - `"foo bar"` — phrase
 *   - `-foo`      — exclude
 *   - `OR`/`AND`  — boolean (case-insensitive)
 *
 * It does NOT support prefix matching (`foo*`); we recognise that ourselves
 * and emit a separate `to_tsquery('foo:*')` ORed onto the result.
 */
export interface ParsedQuery {
  /** The raw query, ready to pass to `websearch_to_tsquery`. */
  websearch: string
  /** Positive bare tokens (no quotes/operators/exclusions) — used for typo expansion. */
  positiveTokens: string[]
  /** Prefix tokens from `foo*` syntax — emitted separately to `to_tsquery`. */
  prefixTokens: string[]
  /** Whether the input was effectively empty. */
  isEmpty: boolean
}

/** Matches a double-quoted phrase, including an empty one. */
const PHRASE_RE = /"([^"]*)"/g

/**
 * Split a raw query into the pieces the tsquery builder needs.
 *
 * Quoted phrases are removed before tokenising. Each remaining
 * whitespace-separated token is then classified:
 *   - `-foo` / `-foo*` are exclusions and are skipped entirely. Their
 *     alternatives get ORed onto the final tsquery, so keeping them would
 *     re-admit exactly the documents the user asked to exclude.
 *   - `foo*` contributes a prefix token (a leading `+` is ignored).
 *   - `AND` / `OR` (any case) are operators and are skipped.
 *   - anything else is lower-cased, stripped of characters other than
 *     letters, digits, `_` and `-`, and kept when at least 2 characters remain.
 *
 * @param input Raw user query.
 * @returns The trimmed query plus extracted tokens; `isEmpty` is true when the
 *   input is blank.
 */
export function parseQuery(input: string): ParsedQuery {
  const trimmed = input.trim()
  if (!trimmed) {
    return { websearch: '', positiveTokens: [], prefixTokens: [], isEmpty: true }
  }

  const normalise = (token: string): string =>
    token.toLowerCase().replace(/[^\p{L}\p{N}_-]/gu, '')

  const positiveTokens: string[] = []
  const prefixTokens: string[] = []

  // Strip phrases first so we don't tokenize their inner whitespace.
  const scratch = trimmed.replace(PHRASE_RE, ' ')
  for (const raw of scratch.split(/\s+/)) {
    if (!raw) continue
    // Excluded terms must not feed typo or prefix expansion.
    if (raw.startsWith('-')) continue

    const text = raw.startsWith('+') ? raw.slice(1) : raw
    if (text.endsWith('*')) {
      const stem = normalise(text.slice(0, -1))
      if (stem) prefixTokens.push(stem)
      continue
    }

    const upper = text.toUpperCase()
    if (upper === 'AND' || upper === 'OR') continue

    const norm = normalise(text)
    if (norm.length >= 2) positiveTokens.push(norm)
  }

  return { websearch: trimmed, positiveTokens, prefixTokens, isEmpty: false }
}
|
|
55
|
+
|
|
56
|
+
/**
 * Assemble the tsquery SQL for a parsed query.
 *
 * The result ORs together (`||`), in this order:
 *   1. `websearch_to_tsquery` over the raw query, when it is non-empty;
 *   2. one `to_tsquery` per prefix token, bound as `stem:*`;
 *   3. one `to_tsquery` per positive token that has typo-expansion
 *      candidates, bound as the sanitised candidates joined with ` | `.
 *
 * User text is always passed as a bound parameter. The language is embedded
 * as a quoted literal since it's a per-index server-controlled value, not
 * user input.
 *
 * @param parsed     Output of `parseQuery`.
 * @param expansions Typo-expansion candidates keyed by positive token.
 * @param language   Text-search configuration name.
 * @param startAt    Number of placeholders the caller has already used; the
 *   first placeholder emitted here is `$<startAt + 1>`.
 * @returns The SQL expression, its bindings, and `paramCount` so the caller
 *   can continue numbering for filter/limit/offset. With nothing to search
 *   for, `sql` is '' and `paramCount` is 0.
 */
export function buildTsqueryExpression(
  parsed: ParsedQuery,
  expansions: Map<string, string[]>,
  language: string,
  startAt = 0
): { sql: string; params: string[]; paramCount: number } {
  const regconfig = `${quoteLiteral(language)}::regconfig`
  const bindings: string[] = []
  const clauses: string[] = []

  // Bind `value` and append a call to `fn` that references its placeholder.
  const addClause = (fn: 'websearch_to_tsquery' | 'to_tsquery', value: string): void => {
    bindings.push(value)
    clauses.push(`${fn}(${regconfig}, $${startAt + bindings.length})`)
  }

  if (parsed.websearch) addClause('websearch_to_tsquery', parsed.websearch)

  for (const stem of parsed.prefixTokens) addClause('to_tsquery', `${stem}:*`)

  for (const token of parsed.positiveTokens) {
    const alternatives = (expansions.get(token) ?? []).map(sanitiseTsTerm).filter(Boolean)
    if (alternatives.length > 0) addClause('to_tsquery', alternatives.join(' | '))
  }

  if (clauses.length === 0) {
    return { sql: '', params: [], paramCount: 0 }
  }
  return { sql: clauses.join(' || '), params: bindings, paramCount: bindings.length }
}
|
|
101
|
+
|
|
102
|
+
/**
 * Sanitise a single term for inclusion in a manually built tsquery: lower-case
 * it and keep only letters, digits, `_` and `-`.
 */
function sanitiseTsTerm(term: string): string {
  const kept = term.toLowerCase().match(/[\p{L}\p{N}_-]/gu)
  return kept === null ? '' : kept.join('')
}
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
import type { SQL } from 'bun'
|
|
2
|
+
import type {
|
|
3
|
+
SearchDocument,
|
|
4
|
+
SearchOptions,
|
|
5
|
+
SearchResult,
|
|
6
|
+
SearchHit,
|
|
7
|
+
} from '../../../types.ts'
|
|
8
|
+
import type { PgIndexSettings, ResolvedTypoTolerance } from '../types.ts'
|
|
9
|
+
import { FieldRegistry } from './field_registry.ts'
|
|
10
|
+
import { ensureIndexTable, dropIndex as dropIndexSchema } from './schema.ts'
|
|
11
|
+
import { parseQuery, buildTsqueryExpression } from './fts_query_builder.ts'
|
|
12
|
+
import { compileSearch } from './query_compiler.ts'
|
|
13
|
+
import { formatSnippet } from './snippet_formatter.ts'
|
|
14
|
+
import {
|
|
15
|
+
expandTokens,
|
|
16
|
+
hasFuzzystrmatch,
|
|
17
|
+
recordTerms,
|
|
18
|
+
unrecordTerms,
|
|
19
|
+
} from './typo_expander.ts'
|
|
20
|
+
import {
|
|
21
|
+
indexTableName,
|
|
22
|
+
termsTableName,
|
|
23
|
+
quoteIdent,
|
|
24
|
+
quoteLiteral,
|
|
25
|
+
} from '../storage/identifiers.ts'
|
|
26
|
+
import { rebuildInPlace, type RebuildOptions } from '../rebuild/rebuild_inplace.ts'
|
|
27
|
+
|
|
28
|
+
/** Construction options for a `PgEngine`. */
export interface PgEngineOptions {
  /** Bun SQL handle used for every statement the engine issues. */
  sql: SQL
  /** Schema that holds the index tables. */
  schema: string
  /** Name of the index this engine wraps. */
  index: string
  /** Default text-search configuration; `settings.language` takes precedence when set. */
  language: string
  /** Resolved typo-tolerance settings; when disabled no terms dictionary is maintained. */
  typoTolerance: ResolvedTypoTolerance
  /** Passed through to index-table creation for the GIN index. */
  ginFastUpdate: boolean
  /** When non-null, applied via `SET LOCAL work_mem` for each search. */
  workMem: string | null
  /** Optional per-index settings (searchable/filterable/sortable attributes, weights, ...). */
  settings?: PgIndexSettings
}
|
|
38
|
+
|
|
39
|
+
/** Postgres limits a tsvector (lexemes + positions) to just under 1 MB. Cap input text below that to be safe. */
|
|
40
|
+
const MAX_TEXT_BYTES = 900_000
|
|
41
|
+
|
|
42
|
+
/**
 * One PgEngine wraps a single index.
 *
 * Each document is stored as one row `(id, doc, fts)`: `doc` is the JSON
 * document (including its `id`) and `fts` is a weighted tsvector built from
 * the registry's searchable segments. When typo tolerance is enabled, the
 * searchable text of every written or removed document is also recorded in /
 * removed from a per-index terms dictionary.
 */
export class PgEngine {
  readonly registry: FieldRegistry
  private readonly sql: SQL
  private readonly schema: string
  private readonly index: string
  private readonly typo: ResolvedTypoTolerance
  private readonly ginFastUpdate: boolean
  private readonly workMem: string | null
  private readonly tableName: string
  /** `null` until probed by `ensure()`; only probed when typo tolerance is enabled. */
  private fuzzyAvailable: boolean | null = null
  private ensured = false

  constructor(opts: PgEngineOptions) {
    this.sql = opts.sql
    this.schema = opts.schema
    this.index = opts.index
    this.typo = opts.typoTolerance
    this.ginFastUpdate = opts.ginFastUpdate
    this.workMem = opts.workMem
    this.registry = new FieldRegistry(opts.settings, opts.language)
    this.tableName = indexTableName(opts.schema, opts.index)
  }

  /** Lazy: ensure the table + indexes + trigger exist. Idempotent. */
  async ensure(): Promise<void> {
    if (this.ensured) return
    await ensureIndexTable(this.sql, this.schema, this.index, this.registry, this.ginFastUpdate)
    if (this.typo.enabled && this.fuzzyAvailable === null) {
      this.fuzzyAvailable = await hasFuzzystrmatch(this.sql)
    }
    this.ensured = true
  }

  // ── Writes ──────────────────────────────────────────────────────────────

  /**
   * Insert or replace one document.
   *
   * The explicit `id` argument always wins over an `id` property that may
   * also be present on `document`.
   */
  async upsert(id: string | number, document: Record<string, unknown>): Promise<void> {
    await this.upsertMany([{ ...document, id }])
  }

  /**
   * Insert or replace a batch of documents inside one transaction.
   * With typo tolerance enabled, the previous version's terms are unrecorded
   * before the new version's terms are recorded.
   */
  async upsertMany(documents: SearchDocument[]): Promise<void> {
    if (documents.length === 0) return
    await this.ensure()

    await this.sql.begin(async (tx: SQL) => {
      for (const raw of documents) {
        const { id, ...rest } = raw
        const fields = rest as Record<string, unknown>
        const idStr = String(id)
        // Bun's SQL treats stringified JSON as a JSONB string value (double-
        // encoding the JSON). Passing the object directly lets it generate
        // proper JSONB so `doc->>'field'` works for the typed generated cols.
        const doc = { id, ...fields }

        // The previous row is only needed to keep the terms dictionary in
        // sync, so skip the lookup when typo tolerance is off (as deleteMany does).
        if (this.typo.enabled) {
          const oldRows = (await tx.unsafe(
            `SELECT doc FROM ${this.tableName} WHERE id = $1`,
            [idStr]
          )) as Array<{ doc: Record<string, unknown> | string }>
          if (oldRows.length > 0) {
            const oldText = this.termsText(parseDoc(oldRows[0]!.doc))
            await unrecordTerms(tx, this.schema, this.index, oldText)
          }
        }

        const ftsExpr = this.buildFtsExpression(fields)
        const sqlStr =
          `INSERT INTO ${this.tableName} (id, doc, fts) VALUES ($1, $2, ${ftsExpr.sql}) ` +
          `ON CONFLICT (id) DO UPDATE SET doc = EXCLUDED.doc, fts = EXCLUDED.fts`
        await tx.unsafe(sqlStr, [idStr, doc as any, ...ftsExpr.params])

        if (this.typo.enabled) {
          await recordTerms(tx, this.schema, this.index, this.termsText(fields))
        }
      }
    })
  }

  /** Remove one document by id. */
  async delete(id: string | number): Promise<void> {
    await this.deleteMany([id])
  }

  /**
   * Remove a batch of documents inside one transaction, unrecording their
   * terms first when typo tolerance is enabled.
   */
  async deleteMany(ids: Array<string | number>): Promise<void> {
    if (ids.length === 0) return
    await this.ensure()

    await this.sql.begin(async (tx: SQL) => {
      const idStrs = ids.map(String)
      const placeholders = idStrs.map((_, i) => `$${i + 1}`).join(', ')

      if (this.typo.enabled) {
        const rows = (await tx.unsafe(
          `SELECT doc FROM ${this.tableName} WHERE id IN (${placeholders})`,
          idStrs
        )) as Array<{ doc: Record<string, unknown> | string }>
        for (const r of rows) {
          await unrecordTerms(tx, this.schema, this.index, this.termsText(parseDoc(r.doc)))
        }
      }

      await tx.unsafe(
        `DELETE FROM ${this.tableName} WHERE id IN (${placeholders})`,
        idStrs
      )
    })
  }

  /** Remove every document (and, with typo tolerance, every recorded term) from the index. */
  async flush(): Promise<void> {
    await this.ensure()
    await this.sql.begin(async (tx: SQL) => {
      await tx.unsafe(`TRUNCATE ${this.tableName}`)
      if (this.typo.enabled) {
        await tx.unsafe(`TRUNCATE ${termsTableName(this.schema, this.index)}`)
      }
    })
  }

  /** Drop the index's storage; the next operation re-creates it via `ensure()`. */
  async drop(): Promise<void> {
    await dropIndexSchema(this.sql, this.schema, this.index)
    this.ensured = false
  }

  // ── Reads ───────────────────────────────────────────────────────────────

  /**
   * Run a search and return one page of hits plus the total hit count.
   *
   * The hit query and the count query run in the same transaction so that
   * `SET LOCAL work_mem` (when configured) applies to both and is reset when
   * the transaction ends.
   */
  async search(query: string, options?: SearchOptions): Promise<SearchResult> {
    await this.ensure()
    const start = performance.now()
    const opts = options ?? {}
    const parsed = parseQuery(query)

    const expansions = await this.maybeExpand(parsed.positiveTokens)
    const tsquery = buildTsqueryExpression(parsed, expansions, this.registry.language)

    const compiled = compileSearch({
      registry: this.registry,
      schema: this.schema,
      index: this.index,
      tsquery: { sql: tsquery.sql, params: tsquery.params },
      search: opts,
    })

    const result = await this.sql.begin(async (tx: SQL) => {
      if (this.workMem) {
        await tx.unsafe(`SET LOCAL work_mem = ${quoteLiteral(this.workMem)}`)
      }
      const rows = (await tx.unsafe(compiled.sql, compiled.params)) as RawHitRow[]
      const totalRows = (await tx.unsafe(compiled.countSql, compiled.countParams)) as Array<{
        n: number
      }>
      // Fall back to the page size if the count query returned no row.
      return { rows, total: totalRows[0]?.n ?? rows.length }
    })

    const projection = opts.attributesToRetrieve
    const hits: SearchHit[] = result.rows.map(row =>
      projectHit(row, compiled.snippetColumns, projection)
    )

    return {
      hits,
      totalHits: result.total,
      page: Math.max(1, opts.page ?? 1),
      perPage: Math.max(1, opts.perPage ?? 20),
      processingTimeMs: Math.round(performance.now() - start),
    }
  }

  /** REINDEX the GIN index. Periodic maintenance for write-heavy indexes. */
  async optimize(): Promise<void> {
    await this.ensure()
    const ginName = `${quoteIdent(this.schema)}.${quoteIdent(`search_${this.index}_fts_gin`)}`
    await this.sql.unsafe(`REINDEX INDEX ${ginName}`)
  }

  /**
   * Recompute every row's `fts` using the current registry's language + weight
   * scheme. Auto-picks tier (in-place vs batched) by row count; throws on
   * tables larger than the supported tier-2 ceiling.
   */
  async rebuild(options?: RebuildOptions) {
    await this.ensure()
    return rebuildInPlace(this.sql, this.schema, this.index, this.registry, options)
  }

  // ── Internals ───────────────────────────────────────────────────────────

  /**
   * Text fed to the terms dictionary for a document.
   *
   * Recording and unrecording must see exactly the same text, so both go
   * through this helper: `id` is removed (stored docs carry it, the fields
   * indexed on write do not) and the result is capped like the indexed text.
   */
  private termsText(document: Record<string, unknown>): string {
    const { id: _id, ...fields } = document
    return truncate(this.registry.concatSearchableText(fields))
  }

  /**
   * Build the `fts` value expression for an INSERT: one
   * `setweight(to_tsvector(lang, $n), tier)` per searchable segment, joined
   * with `||`, plus the text bindings in placeholder order.
   */
  private buildFtsExpression(document: Record<string, unknown>): {
    sql: string
    params: string[]
  } {
    const segments = this.registry.projectFtsSegments(document)
    const lang = `${quoteLiteral(this.registry.language)}::regconfig`
    const params: string[] = []
    const fragments = segments.map(seg => {
      params.push(truncate(seg.text))
      return `setweight(to_tsvector(${lang}, $${params.length + 2}), '${seg.tier}')`
    })
    // The `+2` above accounts for the leading id ($1) and doc ($2) bindings
    // that callers prepend. Caller MUST keep those positions stable.
    return { sql: fragments.join(' || '), params }
  }

  /** Look up typo-expansion candidates; empty when typo tolerance is off or there are no tokens. */
  private async maybeExpand(tokens: string[]): Promise<Map<string, string[]>> {
    if (!this.typo.enabled || tokens.length === 0) return new Map()
    return expandTokens(
      this.sql,
      this.schema,
      this.index,
      tokens,
      this.typo,
      this.fuzzyAvailable === true
    )
  }
}
|
|
253
|
+
|
|
254
|
+
/** Shape of one row returned by the compiled hit query. */
interface RawHitRow {
  /** Document id as stored in the index table. */
  id: string
  /** Stored document; may arrive already parsed or as a JSON string. */
  doc: Record<string, unknown> | string
  /** Ranking score produced by the query. */
  score: number
  /** Extra columns, notably the `__snip_<attribute>` snippet columns. */
  [snippetCol: string]: unknown
}
|
|
260
|
+
|
|
261
|
+
/**
 * Convert a raw result row into a `SearchHit`.
 *
 * @param row                  Row returned by the hit query.
 * @param snippetCols          Attributes for which a `__snip_<attr>` column was selected.
 * @param attributesToRetrieve When non-empty, only these attributes (if present
 *   on the document) are kept; otherwise the whole document is returned.
 * @returns The hit; `highlights` is set only when at least one snippet is non-empty.
 */
function projectHit(
  row: RawHitRow,
  snippetCols: string[],
  attributesToRetrieve: string[] | undefined
): SearchHit {
  const stored = parseDoc(row.doc)
  const wanted = attributesToRetrieve ?? []

  let visible = stored
  if (wanted.length > 0) {
    const subset: Record<string, unknown> = {}
    wanted.forEach(attr => {
      if (attr in stored) subset[attr] = stored[attr]
    })
    visible = subset
  }

  const hit: SearchHit = { document: visible }

  const highlights: Record<string, string> = {}
  let found = false
  for (const col of snippetCols) {
    const snippet = row[`__snip_${col}`] as string | null | undefined
    if (!snippet) continue
    highlights[col] = formatSnippet(snippet)
    found = true
  }
  if (found) hit.highlights = highlights

  return hit
}
|
|
290
|
+
|
|
291
|
+
/** Return the document as an object, JSON-parsing it first when it arrives as a string. */
function parseDoc(doc: Record<string, unknown> | string): Record<string, unknown> {
  return typeof doc === 'string' ? (JSON.parse(doc) as Record<string, unknown>) : doc
}
|
|
295
|
+
|
|
296
|
+
/**
 * Cap `text` at `maxBytes` bytes of UTF-8.
 *
 * The cut is made on a character boundary: slicing by string length would
 * count UTF-16 code units rather than bytes (so multi-byte text could still
 * exceed the limit) and could split a surrogate pair.
 *
 * @param text     Text to cap.
 * @param maxBytes Maximum UTF-8 size of the result; defaults to `MAX_TEXT_BYTES`.
 * @returns `text` unchanged when it already fits, otherwise its longest prefix
 *   of whole characters that encodes to at most `maxBytes` bytes.
 */
function truncate(text: string, maxBytes: number = MAX_TEXT_BYTES): string {
  // Fast path: a UTF-16 code unit never encodes to more than 3 UTF-8 bytes.
  if (text.length * 3 <= maxBytes) return text

  const bytes = Buffer.from(text, 'utf8')
  if (bytes.length <= maxBytes) return text

  // Back up while the first dropped byte is a continuation byte (10xxxxxx),
  // i.e. while the cut would land in the middle of a multi-byte character.
  let end = Math.max(0, Math.floor(maxBytes))
  while (end > 0 && (bytes[end]! & 0xc0) === 0x80) end--
  return bytes.toString('utf8', 0, end)
}
|