npm - @strav/search - Versions diffs - 0.4.31 → 1.0.0-alpha.31 - Mend

@strav/search 0.4.31 → 1.0.0-alpha.31

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

package/package.json +20 -22
package/src/console/index.ts +5 -0
package/src/console/search_console_provider.ts +20 -0
package/src/console/search_flush.ts +49 -0
package/src/console/search_import.ts +103 -0
package/src/console/search_list.ts +46 -0
package/src/console/search_reindex.ts +94 -0
package/src/drivers/meilisearch/meilisearch_driver.ts +304 -0
package/src/drivers/memory/memory_driver.ts +344 -0
package/src/drivers/postgres/apply_search_migration.ts +74 -0
package/src/drivers/postgres/postgres_fts_driver.ts +493 -135
package/src/drivers/typesense/typesense_driver.ts +345 -0
package/src/index.ts +50 -39
package/src/search_engine.ts +40 -25
package/src/search_error.ts +86 -0
package/src/search_manager.ts +112 -94
package/src/search_provider.ts +68 -6
package/src/searchable.ts +173 -160
package/src/searchable_registry.ts +61 -0
package/src/types.ts +59 -49
package/README.md +0 -191
package/src/commands/search_flush.ts +0 -41
package/src/commands/search_import.ts +0 -43
package/src/commands/search_optimize.ts +0 -52
package/src/commands/search_rebuild.ts +0 -73
package/src/drivers/algolia_driver.ts +0 -170
package/src/drivers/embedded/embedded_driver.ts +0 -136
package/src/drivers/embedded/engine/field_registry.ts +0 -97
package/src/drivers/embedded/engine/fts_query_builder.ts +0 -184
package/src/drivers/embedded/engine/query_compiler.ts +0 -134
package/src/drivers/embedded/engine/schema.ts +0 -99
package/src/drivers/embedded/engine/snippet_formatter.ts +0 -29
package/src/drivers/embedded/engine/sqlite_engine.ts +0 -255
package/src/drivers/embedded/engine/typo_expander.ts +0 -138
package/src/drivers/embedded/errors.ts +0 -15
package/src/drivers/embedded/filters/filter_compiler.ts +0 -136
package/src/drivers/embedded/index.ts +0 -3
package/src/drivers/embedded/storage/paths.ts +0 -23
package/src/drivers/embedded/types.ts +0 -34
package/src/drivers/meilisearch_driver.ts +0 -150
package/src/drivers/null_driver.ts +0 -27
package/src/drivers/postgres/engine/field_registry.ts +0 -116
package/src/drivers/postgres/engine/fts_query_builder.ts +0 -105
package/src/drivers/postgres/engine/pg_engine.ts +0 -300
package/src/drivers/postgres/engine/query_compiler.ts +0 -165
package/src/drivers/postgres/engine/schema.ts +0 -187
package/src/drivers/postgres/engine/snippet_formatter.ts +0 -31
package/src/drivers/postgres/engine/typo_expander.ts +0 -131
package/src/drivers/postgres/errors.ts +0 -33
package/src/drivers/postgres/filters/filter_compiler.ts +0 -138
package/src/drivers/postgres/index.ts +0 -14
package/src/drivers/postgres/rebuild/rebuild_inplace.ts +0 -113
package/src/drivers/postgres/storage/identifiers.ts +0 -46
package/src/drivers/postgres/types.ts +0 -53
package/src/drivers/typesense_driver.ts +0 -229
package/src/errors.ts +0 -18
package/src/helpers.ts +0 -120
package/stubs/config/search.ts +0 -57
package/tsconfig.json +0 -5

package/src/drivers/memory/memory_driver.ts ADDED Viewed

@@ -0,0 +1,344 @@
+/**
+ * `MemoryDriver` — in-process `SearchEngine` backed by `Map`s.
+ *
+ * Two real use cases:
+ *
+ *   1. **Tests.** Apps exercise their search code without a
+ *      Meilisearch / Typesense / Postgres dependency. Reset
+ *      between tests via `new MemoryDriver()`.
+ *   2. **Local dev / low-volume self-host.** Up to a few
+ *      thousand documents the O(N) scan-per-query is fine.
+ *
+ * Scoring is BM25 with the standard `k1 = 1.2` / `b = 0.75`
+ * parameters, computed over the configured
+ * `searchableAttributes` (or every string field when settings
+ * aren't supplied). Tokenization is lowercase, splits on
+ * non-alphanumeric characters; no stemming, no stop-word
+ * removal — apps that need stronger linguistics flip to the
+ * Meilisearch or Postgres driver.
+ *
+ * Out of scope:
+ *
+ *   - **Multitenancy.** Single Map, no scope.
+ *   - **Persistence.** Documents die with the process.
+ *   - **Typo tolerance.** Exact-token match only.
+ */
+import { IndexNotFoundError, SearchQueryError } from '../../search_error.ts'
+import type { SearchEngine } from '../../search_engine.ts'
+import type {
+  IndexSettings,
+  SearchDocument,
+  SearchHit,
+  SearchOptions,
+  SearchResult,
+} from '../../types.ts'
+/** One indexed document plus the token data derived from it at write time. */
+interface StoredDoc {
+  /** Primary key exactly as the caller supplied it. */
+  id: string | number
+  /** Shallow copy of the document taken when it was upserted. */
+  document: Record<string, unknown>
+  /** Tokens of each indexed string attribute, keyed by attribute name. */
+  tokens: Map<string, string[]>
+  /** Number of tokens summed over `tokens`; drives BM25 length normalization. */
+  length: number
+}
+/** Everything the driver keeps for a single index: documents and BM25 statistics. */
+interface IndexBucket {
+  /** Settings supplied to `createIndex`, merged across repeated calls. */
+  settings: IndexSettings
+  /** Stored documents keyed by the stringified primary key. */
+  docs: Map<string, StoredDoc>
+  /** Running sum of every stored document's `length`; numerator of the BM25 average length. */
+  totalLength: number
+  /** Number of stored documents containing each term; input to the BM25 IDF. */
+  df: Map<string, number>
+}
+/** BM25 `k1`: how quickly repeated occurrences of a term stop adding score. */
+const BM25_K1 = 1.2
+/** BM25 `b`: how strongly a document's length is normalized against the average. */
+const BM25_B = 0.75
+/**
+ * In-process `SearchEngine` that keeps every index in a `Map` and ranks
+ * matches with BM25 over exact, lowercased tokens.
+ *
+ * Each index owns its documents plus the aggregate statistics BM25 needs
+ * (summed document length and per-term document frequency). Every write
+ * keeps those statistics in step with the stored documents.
+ */
+export class MemoryDriver implements SearchEngine {
+  readonly name = 'memory'
+  private readonly indexes = new Map<string, IndexBucket>()
+  // ─── Index lifecycle ────────────────────────────────────────────────────
+  /**
+   * Create `index`, or shallow-merge `settings` into it when it already exists.
+   *
+   * Documents that are already stored are not re-tokenized when the settings
+   * of an existing index change.
+   */
+  async createIndex(index: string, settings: IndexSettings = {}): Promise<void> {
+    const existing = this.indexes.get(index)
+    if (existing) {
+      // Idempotent — later settings win key by key.
+      existing.settings = { ...existing.settings, ...settings }
+      return
+    }
+    this.indexes.set(index, {
+      settings,
+      docs: new Map(),
+      totalLength: 0,
+      df: new Map(),
+    })
+  }
+  /** Drop `index` together with its documents. Unknown indexes are ignored. */
+  async deleteIndex(index: string): Promise<void> {
+    this.indexes.delete(index)
+  }
+  /** Remove every document from `index` but keep its settings. Unknown indexes are ignored. */
+  async flush(index: string): Promise<void> {
+    const bucket = this.indexes.get(index)
+    if (!bucket) return
+    bucket.docs.clear()
+    bucket.totalLength = 0
+    bucket.df.clear()
+  }
+  // ─── Writes ─────────────────────────────────────────────────────────────
+  /**
+   * Insert or replace one document.
+   *
+   * The explicit `id` argument always wins over an `id` property carried by
+   * `document`, so the entry is stored under — and can later be removed with —
+   * the key the caller passed.
+   *
+   * @throws IndexNotFoundError when `index` has not been created.
+   */
+  async upsert(
+    index: string,
+    id: string | number,
+    document: Record<string, unknown>,
+  ): Promise<void> {
+    const bucket = this.requireBucket(index)
+    // Spread first, then `id`: the reverse order would let `document.id` override the argument.
+    this.upsertInto(bucket, { ...document, id } as SearchDocument)
+  }
+  /**
+   * Insert or replace several documents, in order.
+   *
+   * @throws IndexNotFoundError when `index` has not been created.
+   */
+  async upsertMany(index: string, documents: readonly SearchDocument[]): Promise<void> {
+    const bucket = this.requireBucket(index)
+    for (const doc of documents) this.upsertInto(bucket, doc)
+  }
+  /**
+   * Remove one document; a missing id is a no-op.
+   *
+   * @throws IndexNotFoundError when `index` has not been created.
+   */
+  async delete(index: string, id: string | number): Promise<void> {
+    const bucket = this.requireBucket(index)
+    this.removeFrom(bucket, String(id))
+  }
+  /**
+   * Remove several documents; missing ids are skipped.
+   *
+   * @throws IndexNotFoundError when `index` has not been created.
+   */
+  async deleteMany(index: string, ids: readonly (string | number)[]): Promise<void> {
+    const bucket = this.requireBucket(index)
+    for (const id of ids) this.removeFrom(bucket, String(id))
+  }
+  // ─── Reads ──────────────────────────────────────────────────────────────
+  /**
+   * Rank the documents of `index` against `query` and return one page.
+   *
+   * - `page` and `perPage` default to 1 and 20 and are clamped to at least 1.
+   * - A query without any token returns every document that passes the
+   *   filter, each with score 0.
+   * - Otherwise only documents containing at least one query token are
+   *   returned, highest BM25 score first.
+   * - `totalHits` counts matches before pagination.
+   *
+   * @throws IndexNotFoundError when `index` has not been created.
+   * @throws SearchQueryError when `options.filter` is not a plain object.
+   */
+  async search(index: string, query: string, options: SearchOptions = {}): Promise<SearchResult> {
+    const bucket = this.requireBucket(index)
+    const start = performance.now()
+    const page = Math.max(1, options.page ?? 1)
+    const perPage = Math.max(1, options.perPage ?? 20)
+    const filter = options.filter
+    if (filter !== undefined && (typeof filter !== 'object' || Array.isArray(filter))) {
+      throw new SearchQueryError(
+        'MemoryDriver: `filter` must be a flat key/value object. String filters are driver-native and not portable.',
+      )
+    }
+    const terms = tokenize(query)
+    const docCount = bucket.docs.size
+    const avgdl = docCount === 0 ? 0 : bucket.totalLength / docCount
+    // IDF depends only on index-wide statistics, so resolve it once per query
+    // term instead of once per (document, term). Terms the index has never
+    // seen cannot match and are dropped here.
+    const weightedTerms: { term: string; idf: number }[] = []
+    for (const term of terms) {
+      const df = bucket.df.get(term) ?? 0
+      if (df === 0) continue
+      weightedTerms.push({ term, idf: Math.log(1 + (docCount - df + 0.5) / (df + 0.5)) })
+    }
+    type Scored = { doc: StoredDoc; score: number; perAttribute: Map<string, number[]> }
+    const scored: Scored[] = []
+    for (const doc of bucket.docs.values()) {
+      if (filter && !matchesFilter(doc.document, filter)) continue
+      // Per-attribute token positions for highlight generation.
+      const perAttribute = new Map<string, number[]>()
+      if (terms.length === 0) {
+        // Empty query → return all filtered docs with score 0.
+        scored.push({ doc, score: 0, perAttribute })
+        continue
+      }
+      // Length normalization is a property of the document, not of the term.
+      const lengthNorm =
+        BM25_K1 * (1 - BM25_B + BM25_B * (avgdl === 0 ? 0 : doc.length / avgdl))
+      let score = 0
+      for (const { term, idf } of weightedTerms) {
+        let tf = 0
+        for (const [attr, attrTokens] of doc.tokens) {
+          for (let i = 0; i < attrTokens.length; i++) {
+            if (attrTokens[i] !== term) continue
+            tf++
+            let positions = perAttribute.get(attr)
+            if (!positions) {
+              positions = []
+              perAttribute.set(attr, positions)
+            }
+            positions.push(i)
+          }
+        }
+        if (tf === 0) continue
+        score += idf * ((tf * (BM25_K1 + 1)) / (tf + lengthNorm))
+      }
+      // No query term occurs in this document.
+      if (score <= 0) continue
+      scored.push({ doc, score, perAttribute })
+    }
+    // The sort is stable, so equally scored documents keep insertion order.
+    scored.sort((a, b) => b.score - a.score)
+    const totalHits = scored.length
+    const startIdx = (page - 1) * perPage
+    const slice = scored.slice(startIdx, startIdx + perPage)
+    const hits: SearchHit[] = slice.map(({ doc, perAttribute }) => {
+      const projected = projectAttributes(doc.document, options.attributesToRetrieve)
+      const hit: SearchHit = { document: projected }
+      if (options.attributesToHighlight && options.attributesToHighlight.length > 0) {
+        hit.highlights = buildHighlights(doc, perAttribute, options.attributesToHighlight)
+      }
+      return hit
+    })
+    return {
+      hits,
+      totalHits,
+      page,
+      perPage,
+      processingTimeMs: performance.now() - start,
+    }
+  }
+  // ─── Internals ──────────────────────────────────────────────────────────
+  /** Look up the bucket for `index`, throwing `IndexNotFoundError` when it does not exist. */
+  private requireBucket(index: string): IndexBucket {
+    const bucket = this.indexes.get(index)
+    if (!bucket) throw new IndexNotFoundError(index, this.name)
+    return bucket
+  }
+  /**
+   * Tokenize `document`, store it under its stringified id and fold it into
+   * the bucket's BM25 statistics. An existing entry with the same id is
+   * removed first.
+   */
+  private upsertInto(bucket: IndexBucket, document: SearchDocument): void {
+    const key = String(document.id)
+    // Remove the previous version first so df / totalLength stay consistent.
+    if (bucket.docs.has(key)) this.removeFrom(bucket, key)
+    const searchable = resolveSearchableAttributes(document, bucket.settings)
+    const tokens = new Map<string, string[]>()
+    let length = 0
+    // Document frequency counts documents, not occurrences — dedupe terms first.
+    const seenTerms = new Set<string>()
+    for (const attr of searchable) {
+      const value = document[attr]
+      // Only non-empty strings are indexed; every other value is skipped.
+      if (typeof value !== 'string' || value.length === 0) continue
+      const attrTokens = tokenize(value)
+      tokens.set(attr, attrTokens)
+      length += attrTokens.length
+      for (const t of attrTokens) seenTerms.add(t)
+    }
+    for (const t of seenTerms) {
+      bucket.df.set(t, (bucket.df.get(t) ?? 0) + 1)
+    }
+    bucket.docs.set(key, {
+      id: document.id,
+      document: { ...document },
+      tokens,
+      length,
+    })
+    bucket.totalLength += length
+  }
+  /** Delete the document stored under `key` and subtract it from the bucket's BM25 statistics. */
+  private removeFrom(bucket: IndexBucket, key: string): void {
+    const doc = bucket.docs.get(key)
+    if (!doc) return
+    bucket.docs.delete(key)
+    bucket.totalLength -= doc.length
+    const seenTerms = new Set<string>()
+    for (const list of doc.tokens.values()) for (const t of list) seenTerms.add(t)
+    for (const t of seenTerms) {
+      const next = (bucket.df.get(t) ?? 0) - 1
+      // Drop exhausted terms so `df` only holds terms that still occur.
+      if (next <= 0) bucket.df.delete(t)
+      else bucket.df.set(t, next)
+    }
+  }
+}
+/**
+ * Decide which attributes of `document` get tokenized.
+ *
+ * A non-empty `settings.searchableAttributes` is returned as-is. Otherwise
+ * every key other than `id` whose value is a string is used.
+ */
+function resolveSearchableAttributes(
+  document: SearchDocument,
+  settings: IndexSettings,
+): string[] {
+  const configured = settings.searchableAttributes
+  if (configured && configured.length > 0) return configured
+  const inferred: string[] = []
+  for (const key of Object.keys(document)) {
+    if (key === 'id') continue
+    if (typeof document[key] === 'string') inferred.push(key)
+  }
+  return inferred
+}
+/**
+ * Build the document returned in a hit.
+ *
+ * Without `attributes` (or with an empty list) the result is a shallow copy
+ * of `document`. Otherwise it holds the requested attributes that exist on
+ * the document, plus `id` whenever the document has one.
+ */
+function projectAttributes(
+  document: Record<string, unknown>,
+  attributes: string[] | undefined,
+): Record<string, unknown> {
+  if (!attributes || attributes.length === 0) return { ...document }
+  const picked: Record<string, unknown> = {}
+  attributes.forEach((name) => {
+    if (name in document) picked[name] = document[name]
+  })
+  // The primary key travels with every hit, requested or not.
+  if (!('id' in picked) && 'id' in document) picked.id = document.id
+  return picked
+}
+/**
+ * Produce the highlighted form of each requested attribute.
+ *
+ * Attributes whose stored value is not a string are left out. A string
+ * attribute without recorded match positions is returned unchanged;
+ * otherwise its matched tokens are wrapped by `wrapMatches`.
+ */
+function buildHighlights(
+  doc: StoredDoc,
+  perAttribute: Map<string, number[]>,
+  attributes: string[],
+): Record<string, string> {
+  const result: Record<string, string> = {}
+  for (const name of attributes) {
+    const text = doc.document[name]
+    if (typeof text !== 'string') continue
+    const matched = perAttribute.get(name) ?? []
+    result[name] = matched.length > 0 ? wrapMatches(text, new Set(matched)) : text
+  }
+  return result
+}
+/**
+ * Surround selected tokens of `source` with `<mark>` tags.
+ *
+ * Tokens are the runs of letters and digits found in the original text, so
+ * casing and the characters between tokens come through untouched.
+ * `positionsToHighlight` holds the zero-based ordinals of the tokens to wrap.
+ */
+function wrapMatches(source: string, positionsToHighlight: Set<number>): string {
+  let ordinal = 0
+  return source.replace(/[\p{L}\p{N}]+/gu, (token) => {
+    const highlighted = positionsToHighlight.has(ordinal)
+    ordinal += 1
+    return highlighted ? `<mark>${token}</mark>` : token
+  })
+}
+/**
+ * Split `input` into lowercase search tokens.
+ *
+ * A token is a maximal run of Unicode letters and digits. Runs are located in
+ * the original string and lowercased afterwards, so the n-th token always
+ * corresponds to the n-th run that `/[\p{L}\p{N}]+/gu` finds in the untouched
+ * text — the correspondence `wrapMatches` relies on when it turns token
+ * positions back into highlights. Lowercasing first would break it for
+ * characters whose lowercase form contains a non-letter (`İ` becomes `i`
+ * followed by the combining mark U+0307), splitting one word into two tokens
+ * and shifting every later position.
+ *
+ * @param input Raw text; an empty string yields no tokens.
+ * @returns Lowercased tokens in order of appearance.
+ */
+function tokenize(input: string): string[] {
+  if (!input) return []
+  const runs = input.match(/[\p{L}\p{N}]+/gu)
+  // `match` with the `g` flag yields `null`, not `[]`, when nothing matches.
+  return runs === null ? [] : runs.map((run) => run.toLowerCase())
+}
+/**
+ * Report whether `document` satisfies every entry of `filter`.
+ *
+ * Values are compared with strict equality, so objects and arrays only match
+ * by identity. An empty filter matches every document.
+ */
+function matchesFilter(
+  document: Record<string, unknown>,
+  filter: Record<string, unknown>,
+): boolean {
+  return Object.keys(filter).every((field) => document[field] === filter[field])
+}

package/src/drivers/postgres/apply_search_migration.ts ADDED Viewed

@@ -0,0 +1,74 @@
+/**
+ * `applySearchMigration` — provision the schema + meta table the
+ * `PostgresFtsDriver` reads from.
+ *
+ * Apps that use the postgres-fts driver drop one call into a
+ * migration's `up()`:
+ *
+ * ```ts
+ * import { applySearchMigration } from '@strav/search'
+ *
+ * export const migration: Migration = {
+ *   name: '20260601000000_create_search_schema',
+ *   async up(db) {
+ *     await applySearchMigration(db)
+ *   },
+ *   async down(db) {
+ *     await db.execute(`DROP SCHEMA "strav_search" CASCADE`)
+ *   },
+ * }
+ * ```
+ *
+ * Per-index tables are NOT created here — the driver creates
+ * them lazily the first time `createIndex(name, …)` is called.
+ * This helper just ensures the namespace + `_meta` table exist
+ * so the driver can persist settings between processes.
+ *
+ * The default schema is `strav_search`. Apps that want a
+ * different schema name pass `{ schema: 'app_search' }` and use
+ * the same value in their `config.search` driver entry.
+ */
+import type { DatabaseExecutor } from '@strav/database'
+/** Optional settings accepted by `applySearchMigration`. */
+export interface ApplySearchMigrationOptions {
+  /** Name of the schema to provision; `'strav_search'` when omitted. */
+  schema?: string
+}
+/** Schema name `applySearchMigration` falls back to when no `schema` option is given. */
+export const DEFAULT_SEARCH_SCHEMA = 'strav_search'
+/**
+ * Create the search schema and its `_meta` table when they do not exist yet.
+ *
+ * The schema name is validated before it is interpolated into the DDL. Both
+ * statements use `IF NOT EXISTS`, and they run one after the other on `db`.
+ *
+ * @param db Executor the two DDL statements are sent to.
+ * @param options Optional schema override; defaults to `DEFAULT_SEARCH_SCHEMA`.
+ * @throws Error when the schema name is rejected by `validateIdentifier`.
+ */
+export async function applySearchMigration(
+  db: DatabaseExecutor,
+  options: ApplySearchMigrationOptions = {},
+): Promise<void> {
+  const requested = options.schema ?? DEFAULT_SEARCH_SCHEMA
+  const schema = validateIdentifier(requested, 'schema')
+  const createSchema = `CREATE SCHEMA IF NOT EXISTS "${schema}"`
+  const createMetaTable = `CREATE TABLE IF NOT EXISTS "${schema}"."_meta" (
+       "index_name" text PRIMARY KEY,
+       "settings" jsonb NOT NULL DEFAULT '{}'::jsonb,
+       "language" text NOT NULL DEFAULT 'english',
+       "created_at" timestamptz NOT NULL DEFAULT now(),
+       "updated_at" timestamptz NOT NULL DEFAULT now()
+     )`
+  // The table lives inside the schema, so the schema has to exist first.
+  await db.execute(createSchema)
+  await db.execute(createMetaTable)
+}
+/**
+ * Guard for names that end up inside SQL text (schema, index and attribute
+ * names).
+ *
+ * Only lowercase ASCII letters, digits and underscores are accepted, and the
+ * first character may not be a digit. Anything else is rejected, which keeps
+ * quoting concerns out of the SQL that embeds these names.
+ *
+ * @param identifier Candidate name.
+ * @param kind Label used in the error message, e.g. `'schema'`.
+ * @returns `identifier` unchanged when it is valid.
+ * @throws Error when `identifier` does not match `/^[a-z_][a-z0-9_]*$/`.
+ */
+export function validateIdentifier(identifier: string, kind: string): string {
+  const isSafe = /^[a-z_][a-z0-9_]*$/.test(identifier)
+  if (isSafe) return identifier
+  throw new Error(
+    `PostgresFtsDriver: invalid ${kind} ${JSON.stringify(identifier)} — must match /^[a-z_][a-z0-9_]*$/.`,
+  )
+}