@strav/search 0.4.31 → 1.0.0-alpha.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. package/package.json +20 -22
  2. package/src/console/index.ts +5 -0
  3. package/src/console/search_console_provider.ts +20 -0
  4. package/src/console/search_flush.ts +49 -0
  5. package/src/console/search_import.ts +103 -0
  6. package/src/console/search_list.ts +46 -0
  7. package/src/console/search_reindex.ts +94 -0
  8. package/src/drivers/meilisearch/meilisearch_driver.ts +304 -0
  9. package/src/drivers/memory/memory_driver.ts +344 -0
  10. package/src/drivers/postgres/apply_search_migration.ts +74 -0
  11. package/src/drivers/postgres/postgres_fts_driver.ts +493 -135
  12. package/src/drivers/typesense/typesense_driver.ts +345 -0
  13. package/src/index.ts +50 -39
  14. package/src/search_engine.ts +40 -25
  15. package/src/search_error.ts +86 -0
  16. package/src/search_manager.ts +112 -94
  17. package/src/search_provider.ts +68 -6
  18. package/src/searchable.ts +173 -160
  19. package/src/searchable_registry.ts +61 -0
  20. package/src/types.ts +59 -49
  21. package/README.md +0 -191
  22. package/src/commands/search_flush.ts +0 -41
  23. package/src/commands/search_import.ts +0 -43
  24. package/src/commands/search_optimize.ts +0 -52
  25. package/src/commands/search_rebuild.ts +0 -73
  26. package/src/drivers/algolia_driver.ts +0 -170
  27. package/src/drivers/embedded/embedded_driver.ts +0 -136
  28. package/src/drivers/embedded/engine/field_registry.ts +0 -97
  29. package/src/drivers/embedded/engine/fts_query_builder.ts +0 -184
  30. package/src/drivers/embedded/engine/query_compiler.ts +0 -134
  31. package/src/drivers/embedded/engine/schema.ts +0 -99
  32. package/src/drivers/embedded/engine/snippet_formatter.ts +0 -29
  33. package/src/drivers/embedded/engine/sqlite_engine.ts +0 -255
  34. package/src/drivers/embedded/engine/typo_expander.ts +0 -138
  35. package/src/drivers/embedded/errors.ts +0 -15
  36. package/src/drivers/embedded/filters/filter_compiler.ts +0 -136
  37. package/src/drivers/embedded/index.ts +0 -3
  38. package/src/drivers/embedded/storage/paths.ts +0 -23
  39. package/src/drivers/embedded/types.ts +0 -34
  40. package/src/drivers/meilisearch_driver.ts +0 -150
  41. package/src/drivers/null_driver.ts +0 -27
  42. package/src/drivers/postgres/engine/field_registry.ts +0 -116
  43. package/src/drivers/postgres/engine/fts_query_builder.ts +0 -105
  44. package/src/drivers/postgres/engine/pg_engine.ts +0 -300
  45. package/src/drivers/postgres/engine/query_compiler.ts +0 -165
  46. package/src/drivers/postgres/engine/schema.ts +0 -187
  47. package/src/drivers/postgres/engine/snippet_formatter.ts +0 -31
  48. package/src/drivers/postgres/engine/typo_expander.ts +0 -131
  49. package/src/drivers/postgres/errors.ts +0 -33
  50. package/src/drivers/postgres/filters/filter_compiler.ts +0 -138
  51. package/src/drivers/postgres/index.ts +0 -14
  52. package/src/drivers/postgres/rebuild/rebuild_inplace.ts +0 -113
  53. package/src/drivers/postgres/storage/identifiers.ts +0 -46
  54. package/src/drivers/postgres/types.ts +0 -53
  55. package/src/drivers/typesense_driver.ts +0 -229
  56. package/src/errors.ts +0 -18
  57. package/src/helpers.ts +0 -120
  58. package/stubs/config/search.ts +0 -57
  59. package/tsconfig.json +0 -5
@@ -0,0 +1,344 @@
1
+ /**
2
+ * `MemoryDriver` — in-process `SearchEngine` backed by `Map`s.
3
+ *
4
+ * Two real use cases:
5
+ *
6
+ * 1. **Tests.** Apps exercise their search code without a
7
+ * Meilisearch / Typesense / Postgres dependency. Reset
8
+ * between tests via `new MemoryDriver()`.
9
+ * 2. **Local dev / low-volume self-host.** Up to a few
10
+ * thousand documents the O(N) scan-per-query is fine.
11
+ *
12
+ * Scoring is BM25 with the standard `k1 = 1.2` / `b = 0.75`
13
+ * parameters, computed over the configured
14
+ * `searchableAttributes` (or every string field when settings
15
+ * aren't supplied). Tokenization is lowercase, splits on
16
+ * non-alphanumeric characters; no stemming, no stop-word
17
+ * removal — apps that need stronger linguistics flip to the
18
+ * Meilisearch or Postgres driver.
19
+ *
20
+ * Out of scope:
21
+ *
22
+ * - **Multitenancy.** Single Map, no scope.
23
+ * - **Persistence.** Documents die with the process.
24
+ * - **Typo tolerance.** Exact-token match only.
25
+ */
26
+
27
+ import { IndexNotFoundError, SearchQueryError } from '../../search_error.ts'
28
+ import type { SearchEngine } from '../../search_engine.ts'
29
+ import type {
30
+ IndexSettings,
31
+ SearchDocument,
32
+ SearchHit,
33
+ SearchOptions,
34
+ SearchResult,
35
+ } from '../../types.ts'
36
+
37
/**
 * One indexed document together with the token data derived from it at
 * write time, so queries never have to re-tokenize stored text.
 */
interface StoredDoc {
  /** Primary key exactly as supplied by the caller. */
  id: string | number
  /** Snapshot of the document's fields as they were stored. */
  document: Record<string, unknown>
  /** Token list for each indexed attribute, keyed by attribute name. */
  tokens: Map<string, string[]>
  /** Number of tokens summed over all indexed attributes (BM25 length normalization). */
  length: number
}
45
+
46
/**
 * Everything the driver keeps for a single index: its settings, the stored
 * documents and the corpus-wide statistics BM25 needs.
 */
interface IndexBucket {
  /** Settings supplied through `createIndex`. */
  settings: IndexSettings
  /** Stored documents keyed by the stringified primary key. */
  docs: Map<string, StoredDoc>
  /** Sum of every stored document's `length`; divided by `docs.size` to get the BM25 average length. */
  totalLength: number
  /** Document frequency per term: how many stored documents contain it (BM25 IDF). */
  df: Map<string, number>
}
54
+
55
/** BM25 `k1` parameter used in the term-frequency part of the score. */
const BM25_K1 = 1.2

/** BM25 `b` parameter weighting the document-length normalization. */
const BM25_B = 0.75
57
+
58
/**
 * In-process `SearchEngine` that keeps every index in a `Map`.
 *
 * Documents are tokenized at write time; each query scans all documents of
 * the target index and ranks them with BM25. Nothing is persisted.
 */
export class MemoryDriver implements SearchEngine {
  readonly name = 'memory'

  /** All indexes known to this driver instance, keyed by index name. */
  private readonly indexes = new Map<string, IndexBucket>()

  // ─── Index lifecycle ────────────────────────────────────────────────────

  /**
   * Create an index, or merge `settings` into an existing one.
   *
   * Calling this for an existing index keeps its documents. Documents that
   * are already stored are NOT re-tokenized, so a changed
   * `searchableAttributes` only affects documents written afterwards.
   */
  async createIndex(index: string, settings: IndexSettings = {}): Promise<void> {
    const existing = this.indexes.get(index)
    if (existing) {
      // Idempotent — later settings win key by key.
      existing.settings = { ...existing.settings, ...settings }
      return
    }
    this.indexes.set(index, {
      settings,
      docs: new Map(),
      totalLength: 0,
      df: new Map(),
    })
  }

  /** Drop an index and everything in it. Unknown names are ignored. */
  async deleteIndex(index: string): Promise<void> {
    this.indexes.delete(index)
  }

  /** Remove every document from an index but keep its settings. Unknown names are ignored. */
  async flush(index: string): Promise<void> {
    const bucket = this.indexes.get(index)
    if (!bucket) return
    bucket.docs.clear()
    bucket.totalLength = 0
    bucket.df.clear()
  }

  // ─── Writes ─────────────────────────────────────────────────────────────

  /**
   * Insert or replace one document.
   *
   * The explicit `id` argument is authoritative: it overrides any `id` field
   * carried inside `document`, so the record is stored under the same key
   * that `delete(index, id)` will later look up.
   *
   * @throws IndexNotFoundError when `index` has not been created.
   */
  async upsert(
    index: string,
    id: string | number,
    document: Record<string, unknown>,
  ): Promise<void> {
    const bucket = this.requireBucket(index)
    // Spread first, then `id`, so a stray `document.id` cannot re-key the record.
    this.upsertInto(bucket, { ...document, id } as SearchDocument)
  }

  /**
   * Insert or replace several documents, each keyed by its own `id` field.
   *
   * @throws IndexNotFoundError when `index` has not been created.
   */
  async upsertMany(index: string, documents: readonly SearchDocument[]): Promise<void> {
    const bucket = this.requireBucket(index)
    for (const doc of documents) this.upsertInto(bucket, doc)
  }

  /**
   * Remove one document; a missing id is a no-op.
   *
   * @throws IndexNotFoundError when `index` has not been created.
   */
  async delete(index: string, id: string | number): Promise<void> {
    const bucket = this.requireBucket(index)
    this.removeFrom(bucket, String(id))
  }

  /**
   * Remove several documents; missing ids are skipped.
   *
   * @throws IndexNotFoundError when `index` has not been created.
   */
  async deleteMany(index: string, ids: readonly (string | number)[]): Promise<void> {
    const bucket = this.requireBucket(index)
    for (const id of ids) this.removeFrom(bucket, String(id))
  }

  // ─── Reads ──────────────────────────────────────────────────────────────

  /**
   * Rank the documents of `index` against `query` and return one page.
   *
   * - `options.filter`, when given, must be a flat key/value object; every
   *   key must strictly equal the document's value.
   * - An empty query (no tokens) returns every filtered document with
   *   score 0, in insertion order.
   * - Otherwise only documents with a positive BM25 score are returned,
   *   best score first.
   * - `page` and `perPage` are clamped to a minimum of 1 (defaults 1 / 20).
   *
   * @throws IndexNotFoundError when `index` has not been created.
   * @throws SearchQueryError when `filter` is not a plain object.
   */
  async search(index: string, query: string, options: SearchOptions = {}): Promise<SearchResult> {
    const bucket = this.requireBucket(index)
    const start = performance.now()

    const page = Math.max(1, options.page ?? 1)
    const perPage = Math.max(1, options.perPage ?? 20)
    const filter = options.filter

    if (filter !== undefined && (typeof filter !== 'object' || Array.isArray(filter))) {
      throw new SearchQueryError(
        'MemoryDriver: `filter` must be a flat key/value object. String filters are driver-native and not portable.',
      )
    }

    const terms = tokenize(query)
    const docCount = bucket.docs.size
    const avgdl = docCount === 0 ? 0 : bucket.totalLength / docCount

    type Scored = { doc: StoredDoc; score: number; perAttribute: Map<string, number[]> }
    const scored: Scored[] = []

    for (const doc of bucket.docs.values()) {
      if (filter && !matchesFilter(doc.document, filter)) continue

      // Token positions of matches per attribute, consumed by highlight generation.
      const perAttribute = new Map<string, number[]>()

      if (terms.length === 0) {
        // Empty query → every filtered document qualifies, unranked.
        scored.push({ doc, score: 0, perAttribute })
        continue
      }

      let score = 0
      for (const term of terms) {
        const df = bucket.df.get(term) ?? 0
        if (df === 0) continue
        const idf = Math.log(1 + (docCount - df + 0.5) / (df + 0.5))

        // Term frequency is counted across all indexed attributes of the document.
        let tf = 0
        for (const [attr, attrTokens] of doc.tokens) {
          for (let i = 0; i < attrTokens.length; i++) {
            if (attrTokens[i] !== term) continue
            tf++
            let positions = perAttribute.get(attr)
            if (!positions) {
              positions = []
              perAttribute.set(attr, positions)
            }
            positions.push(i)
          }
        }
        if (tf === 0) continue

        const dl = doc.length
        const denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (avgdl === 0 ? 0 : dl / avgdl))
        score += idf * ((tf * (BM25_K1 + 1)) / denom)
      }

      // A non-empty query only returns documents that matched at least one term.
      if (score <= 0) continue
      scored.push({ doc, score, perAttribute })
    }

    scored.sort((a, b) => b.score - a.score)

    const totalHits = scored.length
    const startIdx = (page - 1) * perPage
    const slice = scored.slice(startIdx, startIdx + perPage)

    const hits: SearchHit[] = slice.map(({ doc, perAttribute }) => {
      const projected = projectAttributes(doc.document, options.attributesToRetrieve)
      const hit: SearchHit = { document: projected }
      if (options.attributesToHighlight && options.attributesToHighlight.length > 0) {
        hit.highlights = buildHighlights(doc, perAttribute, options.attributesToHighlight)
      }
      return hit
    })

    return {
      hits,
      totalHits,
      page,
      perPage,
      processingTimeMs: performance.now() - start,
    }
  }

  // ─── Internals ──────────────────────────────────────────────────────────

  /** Look up an index or throw `IndexNotFoundError`. */
  private requireBucket(index: string): IndexBucket {
    const bucket = this.indexes.get(index)
    if (!bucket) throw new IndexNotFoundError(index, this.name)
    return bucket
  }

  /**
   * Store `document` under `String(document.id)`, tokenizing its searchable
   * string attributes and updating the bucket's BM25 statistics.
   */
  private upsertInto(bucket: IndexBucket, document: SearchDocument): void {
    const key = String(document.id)
    // Remove the previous version first so df / totalLength stay consistent.
    if (bucket.docs.has(key)) this.removeFrom(bucket, key)

    const searchable = resolveSearchableAttributes(document, bucket.settings)
    const tokens = new Map<string, string[]>()
    let length = 0
    const seenTerms = new Set<string>()

    for (const attr of searchable) {
      const value = document[attr]
      // Only non-empty strings are indexed; other value types are stored but not searchable.
      if (typeof value !== 'string' || value.length === 0) continue
      const attrTokens = tokenize(value)
      tokens.set(attr, attrTokens)
      length += attrTokens.length
      for (const t of attrTokens) seenTerms.add(t)
    }

    // Document frequency counts each term once per document, however often it occurs.
    for (const t of seenTerms) {
      bucket.df.set(t, (bucket.df.get(t) ?? 0) + 1)
    }

    bucket.docs.set(key, {
      id: document.id,
      document: { ...document },
      tokens,
      length,
    })
    bucket.totalLength += length
  }

  /** Remove the document stored under `key` (if any) and undo its contribution to the statistics. */
  private removeFrom(bucket: IndexBucket, key: string): void {
    const doc = bucket.docs.get(key)
    if (!doc) return
    bucket.docs.delete(key)
    bucket.totalLength -= doc.length
    const seenTerms = new Set<string>()
    for (const list of doc.tokens.values()) for (const t of list) seenTerms.add(t)
    for (const t of seenTerms) {
      const next = (bucket.df.get(t) ?? 0) - 1
      // Drop exhausted terms so `df` does not keep growing with dead entries.
      if (next <= 0) bucket.df.delete(t)
      else bucket.df.set(t, next)
    }
  }
}
258
+
259
/**
 * Decide which attributes of `document` get tokenized.
 *
 * A non-empty `settings.searchableAttributes` is returned as-is. Otherwise
 * the list is inferred: every key of the document whose value is a string,
 * except `id`, in the document's own key order.
 */
function resolveSearchableAttributes(
  document: SearchDocument,
  settings: IndexSettings,
): string[] {
  const configured = settings.searchableAttributes
  if (configured && configured.length > 0) return configured

  const inferred: string[] = []
  for (const key of Object.keys(document)) {
    if (key === 'id') continue
    if (typeof document[key] === 'string') inferred.push(key)
  }
  return inferred
}
268
+
269
/**
 * Return the subset of `document` a hit should expose.
 *
 * With no `attributes` (undefined or empty) a shallow copy of the whole
 * document is returned. Otherwise only the listed attributes that are own
 * properties of the document are copied, and `id` is always included when
 * the document has one.
 *
 * Only own properties are considered: a requested name such as `toString`
 * or `constructor` must not pull inherited `Object.prototype` members into
 * the result.
 */
function projectAttributes(
  document: Record<string, unknown>,
  attributes: string[] | undefined,
): Record<string, unknown> {
  if (!attributes || attributes.length === 0) return { ...document }

  const hasOwn = (target: object, key: string): boolean =>
    Object.prototype.hasOwnProperty.call(target, key)

  const out: Record<string, unknown> = {}
  for (const attr of attributes) {
    if (hasOwn(document, attr)) out[attr] = document[attr]
  }
  // Always include the primary key.
  if (hasOwn(document, 'id') && !hasOwn(out, 'id')) out.id = document.id
  return out
}
282
+
283
/**
 * Produce the highlighted text for each requested attribute of a hit.
 *
 * Attributes whose stored value is not a string are omitted. A string
 * attribute without recorded match positions is returned unchanged;
 * otherwise its matched tokens are wrapped via `wrapMatches`.
 */
function buildHighlights(
  doc: StoredDoc,
  perAttribute: Map<string, number[]>,
  attributes: string[],
): Record<string, string> {
  const result: Record<string, string> = {}
  for (const name of attributes) {
    const text = doc.document[name]
    if (typeof text !== 'string') continue

    const matched = perAttribute.get(name)
    const hasMatches = matched !== undefined && matched.length > 0
    result[name] = hasMatches ? wrapMatches(text, new Set(matched)) : text
  }
  return result
}
301
+
302
/**
 * Wrap matched tokens in the original string with `<mark>` tags.
 *
 * The source is re-scanned for runs of letters/digits so the output keeps
 * the original casing and every separator character untouched.
 *
 * `positionsToHighlight` holds token indexes as produced by lower-casing
 * the text and splitting on non-alphanumerics. Lower-casing can turn one
 * character into a letter plus a combining mark (e.g. `İ` → `i` + U+0307),
 * which splits a single run of the original text into several tokens. The
 * position counter therefore advances by the number of tokens each run
 * yields, not by one, so later positions stay aligned. A run is wrapped
 * (as a whole) when any of its tokens is a match.
 *
 * @param source Original attribute text.
 * @param positionsToHighlight Token indexes within `source` to mark.
 * @returns `source` with the matched runs wrapped in `<mark>…</mark>`.
 */
function wrapMatches(source: string, positionsToHighlight: Set<number>): string {
  const out: string[] = []
  let cursor = 0
  let position = 0

  for (const match of source.matchAll(/[\p{L}\p{N}]+/gu)) {
    const run = match[0]
    const runStart = match.index ?? 0

    // Copy the separator text between the previous run and this one verbatim.
    out.push(source.slice(cursor, runStart))

    // How many index tokens this run corresponds to (normally exactly one).
    const tokenCount = run
      .toLowerCase()
      .split(/[^\p{L}\p{N}]+/u)
      .filter((t) => t.length > 0).length

    let matched = false
    for (let offset = 0; offset < tokenCount; offset++) {
      if (positionsToHighlight.has(position + offset)) {
        matched = true
        break
      }
    }

    out.push(matched ? `<mark>${run}</mark>` : run)
    cursor = runStart + run.length
    position += tokenCount
  }

  out.push(source.slice(cursor))
  return out.join('')
}
327
+
328
/**
 * Split text into lower-cased tokens.
 *
 * A token is a maximal run of Unicode letters and digits; everything else
 * acts as a separator. Empty input yields an empty list.
 */
function tokenize(input: string): string[] {
  if (!input) return []
  const runs = input.toLowerCase().match(/[\p{L}\p{N}]+/gu)
  return runs === null ? [] : [...runs]
}
335
+
336
/**
 * Check a document against a flat equality filter.
 *
 * Every key in `filter` must be strictly equal (`===`) to the document's
 * value for that key. An empty filter matches every document.
 */
function matchesFilter(
  document: Record<string, unknown>,
  filter: Record<string, unknown>,
): boolean {
  return Object.keys(filter).every((field) => document[field] === filter[field])
}
@@ -0,0 +1,74 @@
1
+ /**
2
+ * `applySearchMigration` — provision the schema + meta table the
3
+ * `PostgresFtsDriver` reads from.
4
+ *
5
+ * Apps that use the postgres-fts driver drop one call into a
6
+ * migration's `up()`:
7
+ *
8
+ * ```ts
9
+ * import { applySearchMigration } from '@strav/search'
10
+ *
11
+ * export const migration: Migration = {
12
+ * name: '20260601000000_create_search_schema',
13
+ * async up(db) {
14
+ * await applySearchMigration(db)
15
+ * },
16
+ * async down(db) {
17
+ * await db.execute(`DROP SCHEMA "strav_search" CASCADE`)
18
+ * },
19
+ * }
20
+ * ```
21
+ *
22
+ * Per-index tables are NOT created here — the driver creates
23
+ * them lazily the first time `createIndex(name, …)` is called.
24
+ * This helper just ensures the namespace + `_meta` table exist
25
+ * so the driver can persist settings between processes.
26
+ *
27
+ * The default schema is `strav_search`. Apps that want a
28
+ * different schema name pass `{ schema: 'app_search' }` and use
29
+ * the same value in their `config.search` driver entry.
30
+ */
31
+
32
+ import type { DatabaseExecutor } from '@strav/database'
33
+
34
/** Options accepted by `applySearchMigration`. */
export interface ApplySearchMigrationOptions {
  /**
   * Name of the Postgres schema to provision.
   * Falls back to `'strav_search'` when omitted.
   */
  schema?: string
}
38
+
39
/** Schema name used when `applySearchMigration` is called without a `schema` option. */
export const DEFAULT_SEARCH_SCHEMA = 'strav_search'
40
+
41
/**
 * Create the search schema and its `_meta` table if they do not exist yet.
 *
 * Both statements use `IF NOT EXISTS`, so running the helper again against
 * a database that already has them does not fail on the existing objects.
 *
 * @param db Executor the two DDL statements are sent through.
 * @param options Optional overrides; `schema` defaults to `DEFAULT_SEARCH_SCHEMA`.
 * @throws Error (from `validateIdentifier`) when the schema name does not
 *   match `/^[a-z_][a-z0-9_]*$/`; nothing is executed in that case.
 */
+ export async function applySearchMigration(
42
+ db: DatabaseExecutor,
43
+ options: ApplySearchMigrationOptions = {},
44
+ ): Promise<void> {
45
// Validate before interpolating: the name is spliced into the DDL text below.
+ const schema = validateIdentifier(options.schema ?? DEFAULT_SEARCH_SCHEMA, 'schema')
46
+
47
+ await db.execute(`CREATE SCHEMA IF NOT EXISTS "${schema}"`)
48
// `_meta` holds one row per index name: settings (jsonb), language and timestamps.
+ await db.execute(
49
+ `CREATE TABLE IF NOT EXISTS "${schema}"."_meta" (
50
+ "index_name" text PRIMARY KEY,
51
+ "settings" jsonb NOT NULL DEFAULT '{}'::jsonb,
52
+ "language" text NOT NULL DEFAULT 'english',
53
+ "created_at" timestamptz NOT NULL DEFAULT now(),
54
+ "updated_at" timestamptz NOT NULL DEFAULT now()
55
+ )`,
56
+ )
57
+ }
58
+
59
/**
 * Guard for names that get spliced into SQL text (schema, index and
 * attribute names).
 *
 * Accepts only identifiers made of lowercase ASCII letters, digits and
 * underscores that do not start with a digit, which keeps them safe to
 * place inside double quotes without any escaping.
 *
 * @param identifier The candidate name.
 * @param kind What the name denotes (e.g. `'schema'`); used in the error message only.
 * @returns `identifier` unchanged when it is valid.
 * @throws Error when `identifier` does not match `/^[a-z_][a-z0-9_]*$/`.
 */
export function validateIdentifier(identifier: string, kind: string): string {
  const isSafe = /^[a-z_][a-z0-9_]*$/.test(identifier)
  if (isSafe) return identifier

  const shown = JSON.stringify(identifier)
  throw new Error(
    `PostgresFtsDriver: invalid ${kind} ${shown} — must match /^[a-z_][a-z0-9_]*$/.`,
  )
}