@strav/search 0.3.20 → 0.3.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +122 -3
- package/package.json +4 -4
- package/src/commands/search_optimize.ts +52 -0
- package/src/commands/search_rebuild.ts +73 -0
- package/src/drivers/embedded/embedded_driver.ts +136 -0
- package/src/drivers/embedded/engine/field_registry.ts +97 -0
- package/src/drivers/embedded/engine/fts_query_builder.ts +184 -0
- package/src/drivers/embedded/engine/query_compiler.ts +134 -0
- package/src/drivers/embedded/engine/schema.ts +99 -0
- package/src/drivers/embedded/engine/snippet_formatter.ts +29 -0
- package/src/drivers/embedded/engine/sqlite_engine.ts +255 -0
- package/src/drivers/embedded/engine/typo_expander.ts +138 -0
- package/src/drivers/embedded/errors.ts +15 -0
- package/src/drivers/embedded/filters/filter_compiler.ts +136 -0
- package/src/drivers/embedded/index.ts +3 -0
- package/src/drivers/embedded/storage/paths.ts +23 -0
- package/src/drivers/embedded/types.ts +34 -0
- package/src/drivers/postgres/engine/field_registry.ts +116 -0
- package/src/drivers/postgres/engine/fts_query_builder.ts +105 -0
- package/src/drivers/postgres/engine/pg_engine.ts +300 -0
- package/src/drivers/postgres/engine/query_compiler.ts +165 -0
- package/src/drivers/postgres/engine/schema.ts +187 -0
- package/src/drivers/postgres/engine/snippet_formatter.ts +31 -0
- package/src/drivers/postgres/engine/typo_expander.ts +131 -0
- package/src/drivers/postgres/errors.ts +33 -0
- package/src/drivers/postgres/filters/filter_compiler.ts +138 -0
- package/src/drivers/postgres/index.ts +14 -0
- package/src/drivers/postgres/postgres_fts_driver.ts +184 -0
- package/src/drivers/postgres/rebuild/rebuild_inplace.ts +113 -0
- package/src/drivers/postgres/storage/identifiers.ts +46 -0
- package/src/drivers/postgres/types.ts +53 -0
- package/src/index.ts +11 -0
- package/src/search_manager.ts +7 -0
- package/stubs/config/search.ts +25 -0
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import type { SQL } from 'bun'
|
|
2
|
+
import { termsTableName } from '../storage/identifiers.ts'
|
|
3
|
+
import type { ResolvedTypoTolerance } from '../types.ts'
|
|
4
|
+
|
|
5
|
+
/**
 * Tokeniser used for terms-dictionary maintenance (the original note says it
 * mirrors the embedded driver).
 *
 * Lowercases the text, splits it on every run of characters that are neither
 * Unicode letters nor Unicode numbers, and keeps only the pieces that are at
 * least two characters long. Falsy input yields an empty array.
 */
export function tokenize(text: string): string[] {
  if (!text) return []
  return text
    .toLowerCase()
    .split(/[^\p{L}\p{N}]+/u)
    .filter(piece => piece.length >= 2)
}
|
|
14
|
+
|
|
15
|
+
/**
 * Record a document's terms in the index's terms dictionary.
 *
 * The text is tokenised and de-duplicated, so each distinct term is sent once
 * per call. New terms are inserted; for terms that already exist the
 * `ON CONFLICT` clause adds one to `doc_freq`. Does nothing when the text
 * produces no tokens.
 */
export async function recordTerms(
  sql: SQL,
  schema: string,
  index: string,
  text: string
): Promise<void> {
  const terms = [...new Set(tokenize(text))]
  if (terms.length === 0) return

  const table = termsTableName(schema, index)

  // One "($n)" value row per term, numbered from 1 to match `terms`.
  const valueRows: string[] = []
  for (let n = 1; n <= terms.length; n++) valueRows.push(`($${n})`)

  await sql.unsafe(
    `INSERT INTO ${table} (term) VALUES ${valueRows.join(', ')} ` +
      `ON CONFLICT (term) DO UPDATE SET doc_freq = ${table}.doc_freq + 1`,
    terms
  )
}
|
|
32
|
+
|
|
33
|
+
/**
 * Remove a document's contribution from the index's terms dictionary.
 *
 * Subtracts one from `doc_freq` for every distinct token of the text, then
 * deletes every row of the terms table whose `doc_freq` is zero or below.
 * Does nothing when the text produces no tokens.
 */
export async function unrecordTerms(
  sql: SQL,
  schema: string,
  index: string,
  text: string
): Promise<void> {
  const terms = [...new Set(tokenize(text))]
  if (terms.length === 0) return

  const table = termsTableName(schema, index)
  const slots = terms.map((_, position) => `$${position + 1}`).join(', ')

  await sql.unsafe(
    `UPDATE ${table} SET doc_freq = doc_freq - 1 WHERE term IN (${slots})`,
    terms
  )
  // Purge is table-wide, not limited to the terms touched above.
  await sql.unsafe(`DELETE FROM ${table} WHERE doc_freq <= 0`)
}
|
|
50
|
+
|
|
51
|
+
/**
 * Look up near-miss dictionary terms for each query token.
 *
 * Every distinct token that is at least `settings.minTokenLength` characters
 * long is matched against the index's terms table using pg_trgm
 * `similarity()`; terms equal to the token are excluded. When
 * `hasFuzzystrmatch` is true, the 32 most similar terms are additionally
 * filtered by `levenshtein(term, token) <= settings.maxDistance` and returned
 * closest-first (trigram similarity breaks ties), so the `maxCandidates` cut
 * keeps the best matches rather than an arbitrary subset.
 *
 * @param sql - Connection used to run the lookups.
 * @param schema - Schema that holds the terms table.
 * @param index - Search index name.
 * @param tokens - Query tokens to expand; duplicates are queried only once.
 * @param settings - Resolved typo-tolerance settings.
 * @param hasFuzzystrmatch - Whether `levenshtein()` may be used in the query.
 * @param maxCandidates - Maximum number of expansions returned per token.
 * @returns Map from token to its expansions; tokens without any are omitted.
 */
export async function expandTokens(
  sql: SQL,
  schema: string,
  index: string,
  tokens: string[],
  settings: ResolvedTypoTolerance,
  hasFuzzystrmatch: boolean,
  maxCandidates = 8
): Promise<Map<string, string[]>> {
  const out = new Map<string, string[]>()
  if (!settings.enabled || tokens.length === 0) return out

  // The statement text is identical for every token; build it on first use so
  // calls that never reach a query do not touch the table name at all.
  let query: string | undefined

  for (const token of new Set(tokens)) {
    if (token.length < settings.minTokenLength) continue

    if (query === undefined) {
      const table = termsTableName(schema, index)
      // similarity() is compared against the threshold explicitly instead of
      // using pg_trgm's `%` operator, whose threshold is a session setting;
      // this way the caller's session configuration is never modified.
      query = hasFuzzystrmatch
        ? `WITH cands AS (
SELECT term, similarity(term, $1) AS sim FROM ${table}
WHERE similarity(term, $1) >= $2 AND term <> $1
ORDER BY sim DESC
LIMIT 32
)
SELECT term FROM cands
WHERE levenshtein(term, $1) <= $3
ORDER BY levenshtein(term, $1), sim DESC
LIMIT $4`
        : `SELECT term FROM ${table}
WHERE similarity(term, $1) >= $2 AND term <> $1
ORDER BY similarity(term, $1) DESC
LIMIT $3`
    }

    const params = hasFuzzystrmatch
      ? [token, settings.similarity, settings.maxDistance, maxCandidates]
      : [token, settings.similarity, maxCandidates]

    const rows = (await sql.unsafe(query, params)) as Array<{ term: string }>

    if (rows.length > 0) out.set(token, rows.map(r => r.term))
  }

  return out
}
|
|
98
|
+
|
|
99
|
+
/**
 * Turn the user-facing typo-tolerance setting into concrete numbers.
 *
 * - `'off'` disables expansion (the numeric fields still carry the defaults).
 * - `'auto'` or `undefined` enables it with the defaults.
 * - An object enables it, with each missing field falling back to its default.
 *
 * Defaults: `minTokenLength` 4, `maxDistance` 1, `similarity` 0.4.
 */
export function resolveTypoTolerance(
  setting:
    | 'off'
    | 'auto'
    | { minTokenLength?: number; maxDistance?: number; similarity?: number }
    | undefined
): ResolvedTypoTolerance {
  const enabled = setting !== 'off'

  if (setting === 'off' || setting === 'auto' || setting === undefined) {
    return { enabled, minTokenLength: 4, maxDistance: 1, similarity: 0.4 }
  }

  const { minTokenLength, maxDistance, similarity } = setting
  return {
    enabled,
    minTokenLength: minTokenLength ?? 4,
    maxDistance: maxDistance ?? 1,
    similarity: similarity ?? 0.4,
  }
}
|
|
120
|
+
|
|
121
|
+
/**
 * Detect whether `levenshtein(text, text)` can actually be called.
 *
 * The exact signature is resolved with `to_regprocedure`, which follows the
 * connection's `search_path`. A bare name match on `pg_proc` would also
 * report success for a same-named function in a schema that is not on the
 * path (or with a different signature), and the later `levenshtein(...)` call
 * would then fail at query time.
 *
 * @param sql - Connection used for the probe.
 * @returns `true` when the function resolves; `false` when it does not or
 *   when the probe query itself fails (best-effort detection).
 */
export async function hasFuzzystrmatch(sql: SQL): Promise<boolean> {
  try {
    const rows = (await sql.unsafe(
      `SELECT 1 WHERE to_regprocedure('levenshtein(text,text)') IS NOT NULL`
    )) as Array<Record<string, unknown>>
    return rows.length > 0
  } catch {
    // Deliberately best-effort: any failure means "do not rely on levenshtein".
    return false
  }
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { SearchError } from '../../errors.ts'
|
|
2
|
+
|
|
3
|
+
/** Base error type of the postgres-fts driver; it adds nothing to `SearchError`. */
export class PostgresFtsError extends SearchError {}
|
|
4
|
+
|
|
5
|
+
/**
 * Error for a Postgres extension the driver needs but cannot use. The message
 * names the extension and tells the operator how to install it or how to
 * avoid needing it.
 */
export class MissingExtensionError extends PostgresFtsError {
  /** @param extension - Name of the missing extension, interpolated into the message. */
  constructor(extension: string) {
    const requirement = `Postgres extension "${extension}" is required by the postgres-fts driver. `
    const remedy = `Run \`CREATE EXTENSION ${extension}\` as a superuser, or set typoTolerance: 'off' if you can't.`
    super(requirement + remedy)
  }
}
|
|
13
|
+
|
|
14
|
+
/**
 * Error carrying a caller-supplied message about an index rebuild that the
 * driver will not perform; the message is passed through unchanged.
 */
export class RebuildRequiredError extends PostgresFtsError {
  /** @param message - Full error text. */
  constructor(message: string) {
    super(message)
  }
}
|
|
19
|
+
|
|
20
|
+
/**
 * Error for a filter the postgres-fts driver cannot compile. The supplied
 * detail is appended to a fixed prefix.
 */
export class UnsupportedFilterError extends PostgresFtsError {
  /** @param message - Detail describing what is unsupported. */
  constructor(message: string) {
    const text = `Postgres-fts driver filter is unsupported: ${message}`
    super(text)
  }
}
|
|
25
|
+
|
|
26
|
+
/**
 * Error raised when the driver has no Postgres connection to work with. The
 * message lists the two ways of providing one.
 */
export class MissingConnectionError extends PostgresFtsError {
  constructor() {
    const sentences = [
      'PostgresFtsDriver has no Postgres connection. ',
      'Pass `connection` in the driver config, or bootstrap @strav/database first so Database.raw is available.',
    ]
    super(sentences.join(''))
  }
}
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import { UnsupportedFilterError } from '../errors.ts'
|
|
2
|
+
import { quoteIdent } from '../storage/identifiers.ts'
|
|
3
|
+
|
|
4
|
+
/** Result of compiling a filter object into SQL. */
export interface CompiledFilter {
  /**
   * Fragment to splice into a WHERE clause, without the leading `WHERE`.
   * Empty string when there is nothing to filter on.
   */
  sql: string
  /** Values to bind, in the order their `$N` placeholders appear in `sql`. */
  params: unknown[]
  /**
   * How many placeholders this fragment consumed, so the caller can offset
   * the numbering of any placeholders it adds afterwards.
   */
  paramCount: number
}
|
|
12
|
+
|
|
13
|
+
/** Keys accepted inside a per-field operator object. */
const OPERATORS = new Set([
  'eq',
  'neq',
  'gt',
  'gte',
  'lt',
  'lte',
  'in',
  'nin',
])
|
|
14
|
+
|
|
15
|
+
/**
 * Compile a filter object into a parameterized SQL WHERE fragment.
 * Per the original note, this mirrors the embedded driver's contract: same
 * operator set, same result shape.
 *
 * Per-key handling, with all clauses joined by `AND`:
 * - `undefined` is skipped
 * - `null` becomes `IS NULL`
 * - an array becomes `IN (...)`, or `1 = 0` when the array is empty
 * - an operator object contributes one clause per operator
 * - a string, number or boolean becomes an equality test
 *
 * Placeholders are numbered from `startAt + 1` so the fragment can be combined
 * with bindings the caller has already used.
 *
 * @param filter - Filter object; a falsy value compiles to an empty fragment.
 * @param filterableAttributes - Field names that may be filtered on.
 * @param startAt - Number of placeholders already taken by the caller.
 * @throws UnsupportedFilterError for raw string filters, for fields that are
 *   not filterable, and for values of an unsupported shape.
 */
export function compileFilter(
  filter: Record<string, unknown> | string | undefined,
  filterableAttributes: ReadonlySet<string>,
  startAt = 0
): CompiledFilter {
  if (!filter) return { sql: '', params: [], paramCount: 0 }

  if (typeof filter === 'string') {
    throw new UnsupportedFilterError(
      'Raw string filters are not supported by the postgres-fts driver. ' +
        'Pass an object like `{ status: "published" }` instead.'
    )
  }

  const fragments: string[] = []
  const bindings: unknown[] = []
  let lastPlaceholder = startAt

  const nextPlaceholder = (): string => {
    lastPlaceholder += 1
    return `$${lastPlaceholder}`
  }

  for (const [field, condition] of Object.entries(filter)) {
    if (condition === undefined) continue

    if (!filterableAttributes.has(field)) {
      throw new UnsupportedFilterError(
        `Field "${field}" is not in filterableAttributes. Add it to the index settings before filtering on it.`
      )
    }

    const column = quoteIdent(field)

    if (condition === null) {
      fragments.push(`${column} IS NULL`)
    } else if (Array.isArray(condition)) {
      if (condition.length > 0) {
        const slots = condition.map(() => nextPlaceholder()).join(', ')
        fragments.push(`${column} IN (${slots})`)
        bindings.push(...condition.map(coerce))
      } else {
        // An empty list can never match anything.
        fragments.push('1 = 0')
      }
    } else if (isOperatorObject(condition)) {
      for (const [op, operand] of Object.entries(condition)) {
        const piece = compileOperator(column, op, operand, nextPlaceholder)
        fragments.push(piece.sql)
        bindings.push(...piece.params)
      }
    } else if (isPrimitive(condition)) {
      fragments.push(`${column} = ${nextPlaceholder()}`)
      bindings.push(coerce(condition))
    } else {
      throw new UnsupportedFilterError(
        `Unsupported filter value for key "${field}": ${JSON.stringify(condition)}`
      )
    }
  }

  return {
    sql: fragments.join(' AND '),
    params: bindings,
    paramCount: lastPlaceholder - startAt,
  }
}
|
|
90
|
+
|
|
91
|
+
/**
 * Compile one `{ operator: value }` pair for a column into a SQL clause.
 *
 * `eq` / `neq` with a `null` operand compile to `IS NULL` / `IS NOT NULL`:
 * a bound NULL compared with `=` or `<>` never matches in SQL, and this keeps
 * `{ field: { eq: null } }` consistent with the top-level `{ field: null }`
 * form, which also compiles to `IS NULL`.
 *
 * `in` with a non-array or empty operand compiles to the always-false
 * `1 = 0`; `nin` with a non-array or empty operand compiles to the
 * always-true `1 = 1`.
 *
 * @param col - Already-quoted column identifier.
 * @param op - Operator key.
 * @param value - Operand for the operator.
 * @param ph - Returns the next `$N` placeholder; called once per bound value.
 * @returns The clause and the values to bind, in placeholder order.
 * @throws UnsupportedFilterError when `op` is not a known operator.
 */
function compileOperator(
  col: string,
  op: string,
  value: unknown,
  ph: () => string
): { sql: string; params: unknown[] } {
  switch (op) {
    case 'eq':
      if (value === null) return { sql: `${col} IS NULL`, params: [] }
      return { sql: `${col} = ${ph()}`, params: [coerce(value)] }
    case 'neq':
      if (value === null) return { sql: `${col} IS NOT NULL`, params: [] }
      return { sql: `${col} <> ${ph()}`, params: [coerce(value)] }
    case 'gt':
      return { sql: `${col} > ${ph()}`, params: [coerce(value)] }
    case 'gte':
      return { sql: `${col} >= ${ph()}`, params: [coerce(value)] }
    case 'lt':
      return { sql: `${col} < ${ph()}`, params: [coerce(value)] }
    case 'lte':
      return { sql: `${col} <= ${ph()}`, params: [coerce(value)] }
    case 'in': {
      if (!Array.isArray(value) || value.length === 0) return { sql: '1 = 0', params: [] }
      const placeholders = value.map(() => ph()).join(', ')
      return { sql: `${col} IN (${placeholders})`, params: value.map(coerce) }
    }
    case 'nin': {
      if (!Array.isArray(value) || value.length === 0) return { sql: '1 = 1', params: [] }
      const placeholders = value.map(() => ph()).join(', ')
      return { sql: `${col} NOT IN (${placeholders})`, params: value.map(coerce) }
    }
    default:
      throw new UnsupportedFilterError(`Unknown operator "${op}"`)
  }
}
|
|
124
|
+
|
|
125
|
+
/**
 * Tell whether a filter value is an operator object such as `{ gte: 1, lt: 5 }`.
 *
 * Only plain objects qualify (prototype `Object.prototype` or `null`), and
 * every own enumerable key must be a known operator. The plain-object check
 * matters because instances like `Date` or `Map` have no enumerable keys:
 * they would pass the key check vacuously and the filter on that field would
 * be dropped without any error instead of being rejected as unsupported.
 */
function isOperatorObject(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== 'object' || Array.isArray(value)) return false
  const proto: unknown = Object.getPrototypeOf(value)
  if (proto !== Object.prototype && proto !== null) return false
  return Object.keys(value).every(k => OPERATORS.has(k))
}
|
|
129
|
+
|
|
130
|
+
/** True for the scalar kinds a filter can compare directly: string, number or boolean. */
function isPrimitive(value: unknown): boolean {
  switch (typeof value) {
    case 'string':
    case 'number':
    case 'boolean':
      return true
    default:
      return false
  }
}
|
|
133
|
+
|
|
134
|
+
/**
 * Normalise a filter value before it is bound as a query parameter:
 * `null`/`undefined` become `null`, booleans become `1`/`0`, and anything else
 * is passed through untouched.
 */
function coerce(value: unknown): unknown {
  if (value == null) return null
  if (value === true) return 1
  if (value === false) return 0
  return value
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
export { PostgresFtsDriver } from './postgres_fts_driver.ts'
|
|
2
|
+
export type {
|
|
3
|
+
PostgresFtsConfig,
|
|
4
|
+
TypoToleranceMode,
|
|
5
|
+
TypoToleranceSettings,
|
|
6
|
+
PgIndexSettings,
|
|
7
|
+
} from './types.ts'
|
|
8
|
+
export {
|
|
9
|
+
PostgresFtsError,
|
|
10
|
+
MissingExtensionError,
|
|
11
|
+
RebuildRequiredError,
|
|
12
|
+
UnsupportedFilterError,
|
|
13
|
+
MissingConnectionError,
|
|
14
|
+
} from './errors.ts'
|
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
import type { SQL } from 'bun'
|
|
2
|
+
import type { SearchEngine } from '../../search_engine.ts'
|
|
3
|
+
import type {
|
|
4
|
+
SearchDocument,
|
|
5
|
+
SearchOptions,
|
|
6
|
+
SearchResult,
|
|
7
|
+
IndexSettings,
|
|
8
|
+
DriverConfig,
|
|
9
|
+
} from '../../types.ts'
|
|
10
|
+
import { PgEngine } from './engine/pg_engine.ts'
|
|
11
|
+
import { ensureSchemaAndExtensions } from './engine/schema.ts'
|
|
12
|
+
import { resolveTypoTolerance } from './engine/typo_expander.ts'
|
|
13
|
+
import type { PostgresFtsConfig, PgIndexSettings, ResolvedTypoTolerance } from './types.ts'
|
|
14
|
+
import { MissingConnectionError } from './errors.ts'
|
|
15
|
+
|
|
16
|
+
/** Schema used when the driver config does not name one. */
const DEFAULT_SCHEMA = 'strav_search'
/** Text-search language used when neither the index nor the config sets one. */
const DEFAULT_LANGUAGE = 'english'
/** `workMem` value used when the config leaves it undefined (an explicit `null` is kept). */
const DEFAULT_WORK_MEM = '64MB'

/**
 * Postgres-backed full-text search driver. Implements the same `SearchEngine`
 * interface as the embedded SQLite driver — drop-in swap by config.
 *
 * Sized for higher-volume workloads (1M-100M docs per index) using `tsvector`
 * + GIN + `pg_trgm` for typo tolerance + `ts_headline` for snippets.
 *
 * Connection: pass `connection` (a Bun `SQL` instance) in the driver config,
 * or rely on `Database.raw` from `@strav/database` (must be bootstrapped).
 *
 * One `PgEngine` is created lazily per index and cached for the lifetime of
 * the driver instance.
 */
export class PostgresFtsDriver implements SearchEngine {
  readonly name = 'postgres-fts'

  private readonly config: PostgresFtsConfig
  private readonly schemaName: string
  private readonly defaultLanguage: string
  private readonly typo: ResolvedTypoTolerance
  private readonly ginFastUpdate: boolean
  private readonly workMem: string | null
  /** Engines already created, keyed by index name. */
  private readonly engines = new Map<string, PgEngine>()
  /** Settings handed to `createIndex` that no engine has consumed yet. */
  private readonly pendingSettings = new Map<string, PgIndexSettings>()
  /** In-flight or completed bootstrap; reset to `null` when an attempt fails. */
  private bootstrapped: Promise<void> | null = null
  private resolvedSql: SQL | null = null

  constructor(config: DriverConfig) {
    this.config = (config ?? {}) as PostgresFtsConfig
    this.schemaName = this.config.schema ?? DEFAULT_SCHEMA
    this.defaultLanguage = this.config.language ?? DEFAULT_LANGUAGE
    this.typo = resolveTypoTolerance(this.config.typoTolerance)
    this.ginFastUpdate = this.config.gin?.fastupdate ?? false
    // An explicit `null` is preserved; only `undefined` falls back to the default.
    this.workMem =
      this.config.workMem === null
        ? null
        : (this.config.workMem ?? DEFAULT_WORK_MEM)
  }

  // ── Document operations ──────────────────────────────────────────────────

  async upsert(
    index: string,
    id: string | number,
    document: Record<string, unknown>
  ): Promise<void> {
    await (await this.engineFor(index)).upsert(id, document)
  }

  async upsertMany(index: string, documents: SearchDocument[]): Promise<void> {
    await (await this.engineFor(index)).upsertMany(documents)
  }

  async delete(index: string, id: string | number): Promise<void> {
    await (await this.engineFor(index)).delete(id)
  }

  async deleteMany(index: string, ids: Array<string | number>): Promise<void> {
    await (await this.engineFor(index)).deleteMany(ids)
  }

  // ── Index operations ─────────────────────────────────────────────────────

  async flush(index: string): Promise<void> {
    await (await this.engineFor(index)).flush()
  }

  /**
   * Drop an index. Uses the cached engine when there is one; otherwise drops
   * directly through the schema helper without instantiating an engine.
   * Pending settings for the index are discarded either way.
   */
  async deleteIndex(index: string): Promise<void> {
    const engine = this.engines.get(index)
    if (engine) {
      await engine.drop()
      this.engines.delete(index)
    } else {
      // Drop directly without instantiating an engine.
      const sql = this.resolveSql()
      const { dropIndex } = await import('./engine/schema.ts')
      await dropIndex(sql, this.schemaName, index)
    }
    this.pendingSettings.delete(index)
  }

  /**
   * Create (or ensure) an index. Settings passed here are stored and consumed
   * when the engine for the index is constructed.
   */
  async createIndex(index: string, options?: IndexSettings): Promise<void> {
    if (options) this.pendingSettings.set(index, options as PgIndexSettings)
    const engine = await this.engineFor(index)
    await engine.ensure()
  }

  // ── Search ───────────────────────────────────────────────────────────────

  async search(index: string, query: string, options?: SearchOptions): Promise<SearchResult> {
    return (await this.engineFor(index)).search(query, options)
  }

  // ── Lifecycle ────────────────────────────────────────────────────────────

  /** Run REINDEX on every open index, or just one if specified. */
  async optimize(index?: string): Promise<void> {
    if (index) {
      await (await this.engineFor(index)).optimize()
      return
    }
    for (const engine of this.engines.values()) await engine.optimize()
  }

  /**
   * Rebuild a single index's `fts` column in place. Use after changing
   * `searchableAttributes` or weights — without it, existing rows keep the
   * old fts values.
   */
  async rebuild(
    index: string,
    options?: { reindex?: boolean; pauseMs?: number; onProgress?: (done: number, total: number) => void }
  ): Promise<{ tier: 1 | 2; rows: number; elapsedMs: number }> {
    return (await this.engineFor(index)).rebuild(options)
  }

  // ── Internals ────────────────────────────────────────────────────────────

  /** Return the cached engine for an index, creating it after bootstrap if needed. */
  private async engineFor(index: string): Promise<PgEngine> {
    const cached = this.engines.get(index)
    if (cached) return cached

    await this.bootstrap()

    // A concurrent caller may have created the engine while bootstrap was
    // awaited; reuse it so each index has exactly one engine instance.
    const raced = this.engines.get(index)
    if (raced) return raced

    const settings = this.pendingSettings.get(index)
    const engine = new PgEngine({
      sql: this.resolveSql(),
      schema: this.schemaName,
      index,
      language: settings?.language ?? this.defaultLanguage,
      typoTolerance: this.typo,
      ginFastUpdate: this.ginFastUpdate,
      workMem: this.workMem,
      settings,
    })
    this.engines.set(index, engine)
    this.pendingSettings.delete(index)
    return engine
  }

  /**
   * Resolve the SQL connection (config.connection or Database.raw fallback).
   *
   * @throws MissingConnectionError when no connection is configured and
   *   `@strav/database` cannot be loaded or exposes no `Database.raw`.
   */
  private resolveSql(): SQL {
    if (this.resolvedSql) return this.resolvedSql
    if (this.config.connection) {
      this.resolvedSql = this.config.connection
      return this.resolvedSql
    }

    let raw: SQL | null | undefined
    try {
      // Lazy require to avoid a hard dep at import time.
      const databaseModule = require('@strav/database')
      const Database = databaseModule.default ?? databaseModule.Database
      raw = Database.raw as SQL | null | undefined
    } catch {
      throw new MissingConnectionError()
    }

    // The module can load while `Database.raw` is still unset; fail with the
    // descriptive error instead of handing an undefined connection onwards.
    if (!raw) throw new MissingConnectionError()

    this.resolvedSql = raw
    return raw
  }

  /**
   * Ensure schema + extensions exist. A successful (or still running) attempt
   * is shared by all callers; a failed attempt is forgotten so the next call
   * retries instead of replaying the same rejection forever.
   */
  private bootstrap(): Promise<void> {
    if (this.bootstrapped) return this.bootstrapped

    const attempt: Promise<void> = ensureSchemaAndExtensions(
      this.resolveSql(),
      this.schemaName,
      this.typo
    ).catch((error: unknown) => {
      if (this.bootstrapped === attempt) this.bootstrapped = null
      throw error
    })

    this.bootstrapped = attempt
    return attempt
  }
}
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import type { SQL } from 'bun'
|
|
2
|
+
import type { FieldRegistry } from '../engine/field_registry.ts'
|
|
3
|
+
import { indexTableName, quoteLiteral, quoteIdent } from '../storage/identifiers.ts'
|
|
4
|
+
import { RebuildRequiredError } from '../errors.ts'
|
|
5
|
+
|
|
6
|
+
/** Row count up to which (inclusive) a rebuild runs as one UPDATE statement. */
const TIER1_MAX = 100_000

/** Row count up to which (inclusive) a rebuild runs in batches; larger indexes are refused. */
const TIER2_MAX = 10_000_000

/** Number of rows selected and updated per batch during a batched rebuild. */
const BATCH_SIZE = 5_000
|
|
12
|
+
|
|
13
|
+
/** Options accepted by an in-place rebuild. */
export interface RebuildOptions {
  /** Run REINDEX on the GIN index once the rebuild is done. Defaults to true. */
  reindex?: boolean
  /** Milliseconds to sleep after each batch (batched rebuild only). Defaults to 50. */
  pauseMs?: number
  /** Called after each batch with the rows processed so far and the total row count. */
  onProgress?: (done: number, total: number) => void
}
|
|
21
|
+
|
|
22
|
+
/**
 * Recompute the `fts` column of every row of an index, using the language and
 * weights of the given registry. The strategy depends on the row count:
 *   - up to 100k rows  → a single UPDATE (tier 1)
 *   - up to 10M rows   → UPDATEs in id-ordered batches with a pause after
 *                        each batch (tier 2)
 *   - more than 10M    → refused with RebuildRequiredError
 *
 * @param sql - Connection used for all statements.
 * @param schema - Schema holding the index table.
 * @param index - Search index name.
 * @param registry - Source of the language and per-attribute weights.
 * @param options - See `RebuildOptions`.
 * @returns The tier used, the row count measured up front, and the elapsed
 *   time in whole milliseconds.
 * @throws RebuildRequiredError when the index has more than 10M rows.
 */
export async function rebuildInPlace(
  sql: SQL,
  schema: string,
  index: string,
  registry: FieldRegistry,
  options: RebuildOptions = {}
): Promise<{ tier: 1 | 2; rows: number; elapsedMs: number }> {
  const shouldReindex = options.reindex ?? true
  const batchPauseMs = options.pauseMs ?? 50
  const table = indexTableName(schema, index)
  const startedAt = performance.now()
  const elapsed = (): number => Math.round(performance.now() - startedAt)

  const counted = (await sql.unsafe(
    `SELECT COUNT(*)::bigint AS n FROM ${table}`
  )) as Array<{ n: string | number }>
  const total = Number(counted[0]?.n ?? 0)

  if (total > TIER2_MAX) {
    throw new RebuildRequiredError(
      `Index "${index}" has ${total} rows (>${TIER2_MAX}). ` +
        `In-place / batched rebuild is unsafe at this scale. ` +
        `Use the v1.1 dual-table swap strategy (not yet shipped).`
    )
  }

  const ftsExpr = buildSetFtsExpression(registry)

  if (total <= TIER1_MAX) {
    await sql.unsafe(`UPDATE ${table} SET fts = ${ftsExpr}`)
    if (shouldReindex) await reindexGin(sql, schema, index)
    return { tier: 1, rows: total, elapsedMs: elapsed() }
  }

  // Tier 2: walk the table in id order, one batch per iteration.
  let lastId: string | null = null
  let processed = 0

  for (;;) {
    const resumeClause = lastId === null ? '' : `WHERE id > $1`
    const resumeParams = lastId === null ? [] : [lastId]
    const page = (await sql.unsafe(
      `SELECT id FROM ${table} ${resumeClause} ORDER BY id LIMIT ${BATCH_SIZE}`,
      resumeParams
    )) as Array<{ id: string }>
    if (page.length === 0) break

    const ids = page.map(row => row.id)
    const slots = ids.map((_, position) => `$${position + 1}`).join(', ')
    await sql.unsafe(
      `UPDATE ${table} SET fts = ${ftsExpr} WHERE id IN (${slots})`,
      ids
    )

    processed += page.length
    lastId = ids[ids.length - 1]!
    options.onProgress?.(processed, total)
    // Pause between batches (the original comment gives autovacuum as the reason).
    if (batchPauseMs > 0) await new Promise(resolve => setTimeout(resolve, batchPauseMs))
  }

  if (shouldReindex) await reindexGin(sql, schema, index)
  return { tier: 2, rows: total, elapsedMs: elapsed() }
}
|
|
91
|
+
|
|
92
|
+
/**
 * Build the SQL expression assigned to the `fts` column.
 *
 * When the registry uses the default text column, all text values of `doc`
 * are concatenated and indexed with weight 'A'. Otherwise one weighted
 * `to_tsvector` term is produced per searchable attribute and the terms are
 * joined with `||`.
 */
function buildSetFtsExpression(registry: FieldRegistry): string {
  const config = `${quoteLiteral(registry.language)}::regconfig`

  if (!registry.usesDefaultTextColumn) {
    const terms: string[] = []
    for (const attr of registry.searchable) {
      const weight = registry.weights.get(attr)!
      terms.push(
        `setweight(to_tsvector(${config}, coalesce(doc->>${quoteLiteral(attr)}, '')), '${weight}')`
      )
    }
    return terms.join(' || ')
  }

  return (
    `setweight(to_tsvector(${config}, ` +
    `(SELECT coalesce(string_agg(value, ' '), '') FROM jsonb_each_text(doc))), 'A')`
  )
}
|
|
109
|
+
|
|
110
|
+
/** Run `REINDEX INDEX` on the index named `search_<index>_fts_gin` in the given schema. */
async function reindexGin(sql: SQL, schema: string, index: string): Promise<void> {
  const quotedSchema = quoteIdent(schema)
  const quotedIndex = quoteIdent(`search_${index}_fts_gin`)
  await sql.unsafe(`REINDEX INDEX ${quotedSchema}.${quotedIndex}`)
}
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
import { PostgresFtsError } from '../errors.ts'
|
|
2
|
+
|
|
3
|
+
/** Maximum identifier length, in bytes, accepted by `quoteIdent`. */
const PG_IDENT_MAX = 63

/**
 * Double-quote a Postgres identifier (schema, table, column), doubling any
 * embedded double quotes.
 *
 * @throws PostgresFtsError when the name contains a NUL character or is
 *   longer than 63 bytes in UTF-8.
 */
export function quoteIdent(name: string): string {
  if (name.indexOf('\0') !== -1) {
    throw new PostgresFtsError(`Invalid identifier: contains NUL byte.`)
  }

  const byteLength = Buffer.byteLength(name, 'utf8')
  if (byteLength > PG_IDENT_MAX) {
    throw new PostgresFtsError(
      `Identifier "${name}" exceeds Postgres' ${PG_IDENT_MAX}-byte limit.`
    )
  }

  const escaped = name.split('"').join('""')
  return '"' + escaped + '"'
}
|
|
18
|
+
|
|
19
|
+
/**
 * Wrap a value in single quotes as a SQL string literal, doubling any embedded
 * single quotes (used inside DDL options).
 */
export function quoteLiteral(value: string): string {
  const escaped = value.split("'").join("''")
  return "'" + escaped + "'"
}
|
|
23
|
+
|
|
24
|
+
/** Schema-qualified, quoted name of the table `search_<index>` for a search index. */
export function indexTableName(schema: string, index: string): string {
  const quotedSchema = quoteIdent(schema)
  const quotedTable = quoteIdent(`search_${index}`)
  return `${quotedSchema}.${quotedTable}`
}
|
|
28
|
+
|
|
29
|
+
/** Schema-qualified, quoted name of the terms-dictionary table `search_<index>_terms`. */
export function termsTableName(schema: string, index: string): string {
  const quotedSchema = quoteIdent(schema)
  const quotedTable = quoteIdent(`search_${index}_terms`)
  return `${quotedSchema}.${quotedTable}`
}
|
|
33
|
+
|
|
34
|
+
/**
 * Schema-qualified, quoted name of the `_meta` table. Per the original note it
 * is a single shared table whose rows are keyed by (index_name, key).
 */
export function metaTableName(schema: string): string {
  const quotedSchema = quoteIdent(schema)
  const quotedTable = quoteIdent('_meta')
  return `${quotedSchema}.${quotedTable}`
}
|
|
38
|
+
|
|
39
|
+
/** Unquoted table name `search_<index>` (the original note suggests it for pg_class lookups). */
export function bareIndexTable(index: string): string {
  const prefix = 'search_'
  return `${prefix}${index}`
}
|
|
43
|
+
|
|
44
|
+
/** Unquoted terms-dictionary table name `search_<index>_terms`. */
export function bareTermsTable(index: string): string {
  const prefix = 'search_'
  const suffix = '_terms'
  return `${prefix}${index}${suffix}`
}
|