@oomkapwn/enquire-mcp 1.11.0 → 2.0.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,110 @@
2
2
 
3
3
  All notable changes to this project will be documented here. The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
4
4
 
5
+ ## [2.0.0-alpha.0] — 2026-05-06
6
+
7
+ **Theme: ML-embedding retrieval.** v1.8 shipped TF-IDF cosine as the no-deps semantic-search floor. v2.0 raises the ceiling with real transformer embeddings — closer to Smart Connections quality, but free, offline-capable, multilingual, and (uniquely) chunk-aligned with the FTS5 BM25 index so the v2.0 beta hybrid RRF can score across both surfaces using the same identifier space.
8
+
9
+ ### Added — `obsidian_embeddings_search`
10
+
11
+ ML-embedding retrieval via [@huggingface/transformers](https://github.com/huggingface/transformers.js) + `paraphrase-multilingual-MiniLM-L12-v2` (50+ languages, 384-dim, runs on CPU). Persistent SQLite vector index next to the FTS5 db. Brute-force cosine top-K (sub-100ms on 50K chunks; HNSW ladder is v2.1 if real users hit that ceiling).
12
+
13
+ Higher-quality than `obsidian_semantic_search` for paraphrases, synonyms, and cross-language queries — but requires a one-time setup (see below). The TF-IDF path remains the no-deps default.
14
+
15
+ ### Added — `enquire-mcp install-model [alias]` subcommand
16
+
17
+ Pre-downloads an embedding model so the first MCP call doesn't block on a ~120MB HuggingFace download. Aliases:
18
+
19
+ - `multilingual` (default) — `Xenova/paraphrase-multilingual-MiniLM-L12-v2`, 384-dim, ~120MB, 50+ languages
20
+ - `bge` — `Xenova/bge-small-en-v1.5`, 384-dim, ~33MB, English-only (better recall on technical content)
21
+
22
+ Models are cached under `~/.cache/huggingface/transformers.js/` and reused across vaults. Subsequent `install-model` calls are no-ops if the cache is warm.
23
+
24
+ ### Added — `enquire-mcp build-embeddings --vault <path>` subcommand
25
+
26
+ Cold-build (or refresh) the persistent embedding index for a vault. Same paragraph-level chunking as the FTS5 index (`fts5.chunkContent`) so chunk identity matches across BM25 and embeddings — foundation for the v2.0 beta hybrid RRF.
27
+
28
+ Incremental rebuilds via `source_state` mtime tracking — only re-embeds notes whose mtime changed since the last `build-embeddings`. ~5-30ms per chunk on M1 CPU.
29
+
30
+ Supports `--embedding-model <alias>`, `--exclude-glob`, `--read-paths`, `--embed-file <path>`.
31
+
32
+ ### Added — `enquire-mcp clear-embeddings --vault <path>` subcommand
33
+
34
+ Removes the `.embed.db` + WAL/SHM sidecars. Mirrors `clear-cache` and `clear-index`.
35
+
36
+ ### Added — `@huggingface/transformers ^4.2.0` as `optionalDependencies`
37
+
38
+ Mirrors the `better-sqlite3` pattern: the heavy ONNX runtime + tokenizer transitive deps install only if the user's npm policy allows optional deps (default). Read-only / TF-IDF / FTS5 paths stay zero-cost — no model load, no runtime cost. Tarball stays under 200KB.
39
+
40
+ If optional deps are skipped (`npm install --omit=optional`), the embedding tools and subcommands surface a clean error pointing the user at `npm install @huggingface/transformers` rather than an opaque module-not-found.
41
+
42
+ ### Architecture decisions (locked for v2.0)
43
+
44
+ - **Default model = multilingual.** The user's dogfood vault is bilingual Russian + English; v2.0 covers >80% of real Obsidian users (most personal vaults are not pure English).
45
+ - **Models download on subcommand, not on first MCP call.** Predictable for CI; air-gap-friendly; explicit consent for networked operations. Pattern follows Stripe / Cloudflare CLI conventions.
46
+ - **Hardcoded RRF in v2.0 beta.** No `--rrf-weights` flag (yet). Sensible defaults work in 80% of cases per Cormack et al. Add the flag in v2.1 if real issues come in.
47
+ - **CJK is v2.0 backlog.** The Unicode tokenizer in v1.11.1 catches Cyrillic / Greek / Hebrew / Arabic. Chinese / Japanese / Thai need an `Intl.Segmenter` pass first; out-of-scope for alpha.
48
+ - **Brute-force cosine, not HNSW.** ~50ms top-10 on 50K × 384 floats — fine for >99% of personal vaults. HNSW ladder when the ceiling is hit.
49
+
50
+ ### Tests
51
+
52
+ 364 unit tests pass (was 341, +23). New: `tests/embed-db.test.ts` (synthetic-vector schema + upsert/delete/search semantics, cross-vault contamination guard, dim mismatch, folder filter, minScore threshold). `tests/embeddings.test.ts` (catalog + cosine math, no model load).
53
+
54
+ End-to-end ML smoke is out-of-band — CI doesn't download the model. Manual verification:
55
+ ```bash
56
+ enquire-mcp install-model multilingual
57
+ enquire-mcp build-embeddings --vault ~/Documents/Obsidian\ Vault
58
+ # then via MCP: obsidian_embeddings_search { query: "OAuth flows" }
59
+ ```
60
+
61
+ ### Migration from v1.x
62
+
63
+ **No breaking changes for read-only / TF-IDF / FTS5 users.** All v1.x tools and CLI flags continue to work exactly as before. Embedding features are pure additions, gated behind explicit subcommand invocations.
64
+
65
+ The next prerelease (v2.0.0-beta.0) will add hybrid RRF scoring (`obsidian_search` umbrella tool over BM25 + TF-IDF + embeddings) — additive, not breaking.
66
+
67
+ ### Excluded from this alpha (deferred to v2.0 beta / RC)
68
+
69
+ - Hybrid RRF tool (`obsidian_search`) — needs alpha shipping first to validate the embedding plumbing in real vaults
70
+ - HNSW vector index — only matters past 50K chunks, which no current user has
71
+ - `--persistent-embeddings` server flag (auto-build on serve startup) — pulls model load into hot path; alpha users prefer explicit subcommand
72
+ - CJK segmenter — needs `Intl.Segmenter` v18+ feature gating; v2.1 backlog
73
+
74
+ ## [1.11.1] — 2026-05-05
75
+
76
+ Audit-driven patch. Five-agent audit of the v1.10 → v1.11 surface flagged two real P1 code bugs and one CI/process gap; this release fixes all three plus the doc drift the audit found.
77
+
78
+ ### Fixed — `obsidian_semantic_search` now indexes non-Latin content
79
+
80
+ The TF-IDF tokenizer used `/[a-z0-9][a-z0-9_-]*/g` — ASCII-only. Russian / Greek / Hebrew / Arabic notes were silently dropped from the index AND non-Latin queries returned zero hits.
81
+
82
+ Replaced with `/[\p{L}\p{N}][\p{L}\p{N}_-]*/gu` (Unicode-aware). Cyrillic / Greek / Hebrew / Arabic / Devanagari now work end-to-end. CJK languages (Chinese / Japanese / Thai) still need a segmenter pass first — tracked as v2.0 backlog (the regex matches them, but unsegmented sentences become single >40-char tokens which the length filter drops).
83
+
84
+ Regression tests: `tests/semantic.test.ts` now seeds Cyrillic + Greek vaults and asserts top-hit ranking.
85
+
86
+ ### Fixed — periodic-alias resolver respects `--read-paths` / `--exclude-glob` consistently
87
+
88
+ `resolveTarget()` had two codepaths: path-based lookup (which preserved exclusion errors and re-threw them via `lastErr`) and periodic-alias lookup (which had a bare `catch {}` that silently swallowed exclusion errors). When a user requested `title: "today"` and the configured Daily Notes folder was excluded, the periodic-alias path fell through to the legacy basename matcher — which could surface a different (visible) note with a colliding basename.
89
+
90
+ Both codepaths now surface exclusion errors uniformly. The agent gets a clear `"Path is excluded by --read-paths allowlist"` or `"--exclude-glob denylist"` error instead of a wrong-note return.
91
+
92
+ Regression test: `tests/security.test.ts` adds two cases — one for `--exclude-glob`, one for `--read-paths`.
93
+
94
+ ### Fixed — synthetic vault now exercises the v1.10 plugin-aware periodic resolver
95
+
96
+ `scripts/synthetic-vault.mjs` (CI smoke) didn't write `.obsidian/daily-notes.json`, so smoke fell back to the v0.11 hard-coded defaults — leaving `loadPeriodicConfig()` + `formatMoment()` unexercised, so CI stayed green even when the actual code broke.
97
+
98
+ Added a 3-line config (`folder: "99_Daily"`, `format: "YYYY-MM-DD"`) so `obsidian_resolve_periodic_alias today` now exercises the lazy-load → cache → format codepath in every CI run.
99
+
100
+ ### Docs
101
+
102
+ - README: write-tools quick-start now lists all five (`obsidian_create_note`, `_append_to_note`, `_rename_note`, `_replace_in_notes`, `_archive_note`); FAQ updated to "five write tools"; test-count badge bumped 294 → 341.
103
+ - SECURITY.md: new sections for the v1.10 periodic-config disk-read posture and the `--enabled-tools` / `--disabled-tools` per-tool gating posture.
104
+
105
+ ### Tests
106
+
107
+ 341 unit tests pass (was 337). Four regression tests added: 2× Unicode tokenizer (Cyrillic + Greek), 2× periodic-alias exclusion (`--exclude-glob` + `--read-paths`).
108
+
5
109
  ## [1.11.0] — 2026-05-06
6
110
 
7
111
  Two more small wins, both completing surfaces from earlier releases:
package/README.md CHANGED
@@ -86,9 +86,10 @@ There are several Obsidian-MCP servers out there. enquire differentiates on thre
86
86
  | **Strict path allowlist** (`--read-paths '01_Projects/**'` — only paths matching one of these globs are visible; complement to `--exclude-glob` denylist) | ❌ | ✅ |
87
87
  | **Canvas (`.canvas`) read tools** (`obsidian_list_canvases` + `obsidian_read_canvas` — typed nodes + edges, broken-ref detection) | ❌ rare / partial | ✅ first-class |
88
88
  | **Semantic search** (`obsidian_semantic_search` — TF-IDF cosine, free / offline / no model download) | ❌ usually paywalled (Smart Connections) | ✅ in-tree |
89
- | TypeScript strict + Biome lint + 294+ unit tests | varies | ✅ |
89
+ | **ML embeddings search** (`obsidian_embeddings_search` — paraphrase-multilingual-MiniLM-L12-v2, 50+ languages, persistent SQLite vector index) | usually paywalled (Smart Connections) | ✅ free + offline-capable (v2.0 alpha) |
90
+ | TypeScript strict + Biome lint + 364+ unit tests | varies | ✅ |
90
91
 
91
- That's the gap. enquire closes it in ~3000 lines of TypeScript with five mandatory runtime dependencies (`@modelcontextprotocol/sdk`, `chokidar`, `commander`, `gray-matter`, `zod`) plus one optional (`better-sqlite3`, only loaded when `--persistent-index` is passed).
92
+ That's the gap. enquire closes it in ~3500 lines of TypeScript with five mandatory runtime dependencies (`@modelcontextprotocol/sdk`, `chokidar`, `commander`, `gray-matter`, `zod`) plus two optional (`better-sqlite3` for `--persistent-index` and the `build-embeddings` subcommand; `@huggingface/transformers` for ML embeddings — both are no-ops when not invoked).
92
93
 
93
94
  > **Not affiliated with Obsidian.md.** Obsidian and the Obsidian logo are trademarks of Dynalist Inc. enquire-mcp is an independent open-source project that reads Obsidian-format vaults. The name «enquire» is a tribute to Tim Berners-Lee's 1980 hypertext system, not a trademark claim against any party.
94
95
 
@@ -128,7 +129,7 @@ That's the gap. enquire closes it in ~3000 lines of TypeScript with five mandato
128
129
  | **Codex / Codex CLI** | per-project `.mcp.json` or environment-specific config |
129
130
  | **Devin / any other MCP client** | wherever your client expects MCP server JSON |
130
131
 
131
- To enable write tools (`obsidian_create_note`, `obsidian_append_to_note`, `obsidian_rename_note`), add `"--enable-write"` to the `args` array.
132
+ To enable write tools (`obsidian_create_note`, `obsidian_append_to_note`, `obsidian_rename_note`, `obsidian_replace_in_notes`, `obsidian_archive_note`), add `"--enable-write"` to the `args` array.
132
133
 
133
134
  <details>
134
135
  <summary><b>Alternative: global npm install</b></summary>
@@ -186,6 +187,7 @@ Restart your client. The server logs `enquire <version> ready (read-only, vault=
186
187
  | `obsidian_list_canvases` | Lists `.canvas` files (Obsidian's whiteboard / mind-map format) with each canvas's node and edge counts. Honors `--exclude-glob` and `--read-paths`. |
187
188
  | `obsidian_read_canvas` | Parses one `.canvas` file into typed nodes (text / file / link / group / unknown) + edges (with from/to node IDs, sides, labels, colors). Each `file` node carries a `file_resolved` field (vault-relative path or `null` if broken). Returns a node-kind summary + `broken_file_refs` array. |
188
189
  | `obsidian_semantic_search` | **TF-IDF cosine retrieval.** Free / offline / no model download. Tokenizes + TF-IDFs + L2-normalizes every note's body once per session, then ranks notes by cosine similarity to the query. Catches synonym + related-term matches that `obsidian_search_text` (substring) and `obsidian_full_text_search` (BM25) miss. |
190
+ | `obsidian_embeddings_search` | _Opt-in via `enquire-mcp install-model` + `enquire-mcp build-embeddings --vault <path>`._ **ML-embedding retrieval** via @huggingface/transformers + paraphrase-multilingual-MiniLM-L12-v2 (50+ languages, 384-dim, runs on CPU). Higher-quality than `obsidian_semantic_search` for paraphrases / synonyms / cross-language queries. Persistent SQLite vector index next to the FTS5 db. Chunks match the FTS5 chunker so v2.0 beta can do hybrid RRF over both. |
189
191
  | `obsidian_full_text_search` | _Opt-in via `--persistent-index`._ BM25-ranked full-text search backed by SQLite FTS5 inverted index. Sub-100ms on multi-thousand-note vaults. Hyphenated tokens (`claude-telegram`) auto-quoted. Returns chunk-level hits with `«…»`-bracketed snippets. |
190
192
 
191
193
  ### 5 write tools (opt-in via `--enable-write`)
@@ -362,7 +364,7 @@ Found a security issue? See [SECURITY.md](./SECURITY.md).
362
364
  No. Obsidian's wikilink semantics, frontmatter conventions, and folder structure are baked in. Other tools are out of scope.
363
365
 
364
366
  **Will it modify my vault?**
365
- Not unless you start it with `--enable-write`. By default the server is strictly read-only. With write enabled, the four write tools refuse to overwrite existing notes by default (`obsidian_create_note` and `obsidian_rename_note` both require `overwrite=true` to clobber; `obsidian_replace_in_notes` refuses identical search/replace + empty search), and all writes refuse to land outside the vault even if a parent dir is symlinked away.
367
+ Not unless you start it with `--enable-write`. By default the server is strictly read-only. With write enabled, the five write tools refuse to overwrite existing notes by default (`obsidian_create_note` and `obsidian_rename_note` both require `overwrite=true` to clobber; `obsidian_replace_in_notes` refuses identical search/replace + empty search; `obsidian_archive_note` is a thin rename-into-archive wrapper that inherits the same guards), and all writes refuse to land outside the vault even if a parent dir is symlinked away.
366
368
 
367
369
  **Does it work over the network?**
368
370
  No. It's a local stdio MCP server, designed for one client process per vault. There's no HTTP transport, no auth, no rate limiting — and that's intentional.
package/SECURITY.md CHANGED
@@ -106,6 +106,56 @@ Out of scope:
106
106
  - Timing-side-channel: `--exclude-glob` filtering happens AFTER chokidar's stat call, so an external observer with read access to system call timing could in principle infer that *some* event fired even for excluded paths. Acceptable — anyone with that level of system access already controls the vault.
107
107
  - Watcher event ordering: chokidar coalesces but doesn't strictly serialize events. If the server's own write tools (`create_note`, `append_to_note`, `rename_note`) fire and the watcher reacts before the tool's own cache invalidation, the watcher may do redundant work but never produces inconsistent state — every read goes back to the disk.
108
108
 
109
+ ## Periodic-Notes plugin config: disk-read posture
110
+
111
+ `obsidian_resolve_periodic_alias` and the periodic-alias resolver inside `obsidian_read_note` / `obsidian_append_to_note` etc. (added v1.10.0) lazily read **two files** under the vault's `.obsidian/` directory at first use:
112
+
113
+ 1. `.obsidian/daily-notes.json` — the core Daily Notes plugin's settings.
114
+ 2. `.obsidian/plugins/periodic-notes/data.json` — the community Periodic Notes plugin's settings.
115
+
116
+ Posture:
117
+
118
+ - **Reads only.** Both files are opened with `fs.readFile` and parsed via `JSON.parse`; the resolver never writes back. A malformed file logs to stderr and falls through to the v0.11 hard-coded defaults — never throws.
119
+ - **Inside the vault root.** Both paths live under the vault root the user already exposed. No new filesystem surface is introduced.
120
+ - **No `.obsidian/` listing.** The walker's `SKIP_DIRS` set (which includes `.obsidian`) still hides everything else under that folder; only those two specific files are read by-name.
121
+ - **Cached for the process lifetime.** The first call populates `Vault.periodicConfig` and subsequent calls return that snapshot — restart the server after editing the plugin config.
122
+ - **No string interpolation.** The `format` string from the plugin config feeds a fixed Moment.js token table (`YYYY`, `MMM`, `Do`, …) and bracket-escaped literals; there's no `eval` or template path that could turn user-provided format text into code execution.
123
+ - **`--read-paths` allowlist now consistent.** v1.11.1 surfaces "excluded by --read-paths / --exclude-glob" errors from the periodic-alias path lookup the same way as the path-based lookup. Pre-1.11.1, exclusion errors were silently caught and the resolver fell through to the legacy basename matcher — which could surface a different (visible) note with a colliding basename. v1.11.1 re-throws exclusion errors, so the agent gets a clear refusal instead.
124
+
125
+ ## `--enabled-tools` / `--disabled-tools`: per-tool gating posture
126
+
127
+ `--disabled-tools` (added v1.10.0) and `--enabled-tools` (added v1.11.0) both gate which MCP tools the server registers, via a monkey-patched `server.registerTool()`:
128
+
129
+ - **`--disabled-tools` is a denylist.** Comma-separated list of tool names; matching tools are skipped at registration time. Useful for surface-area reduction without forking.
130
+ - **`--enabled-tools` is an allowlist.** Comma-separated list; ONLY listed tools are registered. Combined with `--disabled-tools`, both predicates apply (a tool must be in the allowlist AND not in the denylist).
131
+ - **Names are validated against the canonical tool list.** Unknown names log a stderr warning and are otherwise ignored — so a typo never silently results in nothing being disabled.
132
+ - **Write-tool gating composes with `--enable-write`.** Disabling `obsidian_create_note` while leaving `obsidian_replace_in_notes` enabled is a valid configuration; the gate is independent of the global write flag.
133
+ - **Posture is "fail closed".** Tools blocked at registration time never appear in `tools/list` and a `tools/call` against a gated name returns a clean MCP-protocol error from the SDK — there's no codepath where a disabled tool can still execute.
134
+
135
+ ## ML embeddings (v2.0 alpha): networked-download + cache posture
136
+
137
+ The `obsidian_embeddings_search` tool plus the `install-model` and `build-embeddings` subcommands (added v2.0.0-alpha.0) introduce two new surfaces with networked / on-disk implications:
138
+
139
+ ### Model download (`install-model`)
140
+
141
+ - **Explicit, opt-in.** The `enquire-mcp install-model [alias]` subcommand is the ONLY codepath that hits the network. Serving / read-only / TF-IDF / FTS5 paths never make outbound HTTP. Air-gap-safe by default.
142
+ - **Source: HuggingFace Hub.** Model weights ship as ONNX from the `Xenova/*` org. `@huggingface/transformers` handles the download, hash verification, and caching to `~/.cache/huggingface/transformers.js/`.
143
+ - **Reusable across vaults.** The cache is per-machine, not per-vault. Multiple `enquire-mcp` instances on different vaults share the same model files.
144
+ - **Manual purge.** Delete `~/.cache/huggingface/transformers.js/` to remove cached models.
145
+
146
+ ### Persistent embedding index (`build-embeddings`)
147
+
148
+ - **0600 chmod** on `<vault-hash>.embed.db` + WAL + SHM sidecar files, parent directory mode 0700 — same as the FTS5 index posture.
149
+ - **Cross-vault contamination guard.** `meta` table stores `vault_root`, `model_alias`, `dim`, and `schema_version`; if any change between runs, the embedding tables are dropped and rebuilt with a stderr warning. Prevents a stale index from leaking content into a different vault.
150
+ - **Caveat — embedding values can leak content via cosine.** Float32 vectors stored in the index are reversible-ish: with the same model loaded, an attacker with read access to the .embed.db file can run nearest-neighbor searches against arbitrary queries to recover note content topics. Treat the .embed.db as having the same sensitivity as the .fts5.db (which already stores raw chunk content). If your threat model includes other local users on the same machine, do not use `--persistent-cache` / `--persistent-index` / build-embeddings.
151
+ - **Manual purge.** `enquire-mcp clear-embeddings --vault <path>` removes the `.embed.db`, `.embed.db-wal`, and `.embed.db-shm` files.
152
+ - **`--exclude-glob` / `--read-paths` honored.** The `build-embeddings` subcommand accepts both flags — excluded notes are never embedded, never appear in results.
153
+
154
+ ### Optional-dep failure mode
155
+
156
+ - If `@huggingface/transformers` failed to install (e.g., user ran `npm install --omit=optional`, or the platform lacks ONNX runtime binaries), the embedding tools and subcommands surface a clean error message pointing the user at `npm install @huggingface/transformers` — never a cryptic module-not-found stack trace.
157
+ - Read-only / TF-IDF / FTS5 surfaces are unaffected. The server starts and serves all v1.x tools normally.
158
+
109
159
  ## Persistent FTS5 index: privacy posture
110
160
 
111
161
  When `--persistent-index` is enabled, the search-index file at `<vault-hash>.fts5.db` (alongside the parse cache) stores **chunked note content** (paragraph-level, ~4 KB each), the **comma-serialized tag list** of each note, and the **list of wikilink targets** as part of the FTS5 enrichment for recall.
@@ -0,0 +1,73 @@
1
/** One ranked chunk returned by `EmbedDb.search`. */
export interface EmbedSearchHit {
    /** Path of the note that owns the chunk (presumably vault-relative — confirm against the indexer). */
    rel_path: string;
    /** Position of the chunk within its note. */
    chunk_index: number;
    /** First line of the chunk in the source note. */
    line_start: number;
    /** Last line of the chunk in the source note. */
    line_end: number;
    /** Raw chunk text, returned so the caller can render its own snippet. */
    text_preview: string;
    /** Cosine similarity to the query; equals the dot product because stored vectors are L2-normalized. */
    score: number;
}
11
/**
 * Counters summarising one embedding-index sync.
 * NOTE(review): the code that fills this report is not visible here; the
 * per-field notes below are inferred from the field names — confirm.
 */
export interface EmbedSyncReport {
    /** Notes embedded for the first time (presumed). */
    added: number;
    /** Notes re-embedded because they changed (presumed). */
    updated: number;
    /** Notes whose embeddings were removed (presumed). */
    deleted: number;
    /** Notes left untouched (presumed). */
    unchanged: number;
    /** Chunk count after the sync (presumed). */
    total_chunks: number;
}
18
/** Row shape returned by `EmbedDb.getSourceStates()` (columns of the `source_state` table). */
interface SourceStateRow {
    /** Note path used as the table's primary key. */
    rel_path: string;
    /** Modification time recorded when the note was last embedded, in milliseconds. */
    mtime_ms: number;
}
22
/** Construction parameters for `EmbedDb`. */
export interface EmbedDbOptions {
    /** Absolute path of the `.embed.db` file to open or create. */
    file: string;
    /** Vault root; persisted in the meta table for the cross-vault contamination guard. */
    vaultRoot: string;
    /** Alias of the model the index is built with (e.g. "multilingual"). */
    modelAlias: string;
    /** Vector dimensionality; must match what the model produces. */
    dim: number;
}
32
/**
 * SQLite-backed store of per-chunk embedding vectors with brute-force
 * cosine top-K retrieval.
 */
export declare class EmbedDb {
    private db;
    private readonly file;
    private readonly vaultRoot;
    private readonly modelAlias;
    private readonly dim;
    constructor(opts: EmbedDbOptions);
    /** Open (or create) the database and bootstrap its schema. A no-op when already open. */
    open(): Promise<void>;
    /** Close the handle, then delete the embed db and its WAL/SHM sidecars. Idempotent. */
    clearOnDisk(): Promise<boolean>;
    /** Close the database handle if one is open. */
    close(): void;
    private bootstrapSchema;
    private readMeta;
    private writeMeta;
    private requireDb;
    /** Replace every stored embedding of one note. The caller supplies the vectors. */
    upsertNote(relPath: string, mtimeMs: number, chunks: readonly {
        chunkIndex: number;
        lineStart: number;
        lineEnd: number;
        textPreview: string;
        vector: Float32Array;
    }[]): void;
    /** Remove a note's embeddings entirely (used when the file was deleted). */
    deleteNote(relPath: string): void;
    /** Return the source-state rows; the caller compares mtimes to decide what
     *  to re-embed. */
    getSourceStates(): SourceStateRow[];
    /** Brute-force cosine top-K. Vectors are L2-normalized at insert time, so
     *  cosine equals the dot product. Acceptable up to ~50K chunks; v2.1 will
     *  swap to HNSW if real vaults hit that ceiling. */
    search(queryVec: Float32Array, k: number, opts?: {
        folder?: string;
        minScore?: number;
    }): EmbedSearchHit[];
    /** Number of embedded chunks — used by stats / UI. */
    totalChunks(): number;
}
70
/**
 * Default path of the embed db, which sits alongside the FTS5 db and the
 * parse cache.
 *
 * @param vaultHashPrefix Hash prefix identifying the vault (presumably the
 *   same prefix used for the sibling cache files — confirm against the caller).
 * @returns Path of the `.embed.db` file.
 */
export declare function defaultEmbedDbFile(vaultHashPrefix: string): string;
72
+ export {};
73
+ //# sourceMappingURL=embed-db.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"embed-db.d.ts","sourceRoot":"","sources":["../src/embed-db.ts"],"names":[],"mappings":"AAmBA,MAAM,WAAW,cAAc;IAC7B,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,mDAAmD;IACnD,YAAY,EAAE,MAAM,CAAC;IACrB,0EAA0E;IAC1E,KAAK,EAAE,MAAM,CAAC;CACf;AAED,MAAM,WAAW,eAAe;IAC9B,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,EAAE,MAAM,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED,UAAU,cAAc;IACtB,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAoCD,MAAM,WAAW,cAAc;IAC7B,2CAA2C;IAC3C,IAAI,EAAE,MAAM,CAAC;IACb,sDAAsD;IACtD,SAAS,EAAE,MAAM,CAAC;IAClB,wEAAwE;IACxE,UAAU,EAAE,MAAM,CAAC;IACnB,oDAAoD;IACpD,GAAG,EAAE,MAAM,CAAC;CACb;AAED,qBAAa,OAAO;IAClB,OAAO,CAAC,EAAE,CAAmB;IAC7B,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAS;IAC9B,OAAO,CAAC,QAAQ,CAAC,SAAS,CAAS;IACnC,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAS;IACpC,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAS;gBAEjB,IAAI,EAAE,cAAc;IAO1B,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC;IAc3B,0DAA0D;IACpD,WAAW,IAAI,OAAO,CAAC,OAAO,CAAC;IAcrC,KAAK,IAAI,IAAI;IAOb,OAAO,CAAC,eAAe;IAqDvB,OAAO,CAAC,QAAQ;IAQhB,OAAO,CAAC,SAAS;IAMjB,OAAO,CAAC,SAAS;IAKjB,yEAAyE;IACzE,UAAU,CACR,OAAO,EAAE,MAAM,EACf,OAAO,EAAE,MAAM,EACf,MAAM,EAAE,aAAa,CAAC;QACpB,UAAU,EAAE,MAAM,CAAC;QACnB,SAAS,EAAE,MAAM,CAAC;QAClB,OAAO,EAAE,MAAM,CAAC;QAChB,WAAW,EAAE,MAAM,CAAC;QACpB,MAAM,EAAE,YAAY,CAAC;KACtB,CAAC,GACD,IAAI;IA+BP,iEAAiE;IACjE,UAAU,CAAC,OAAO,EAAE,MAAM,GAAG,IAAI;IAMjC;oBACgB;IAChB,eAAe,IAAI,cAAc,EAAE;IAKnC;;gDAE4C;IAC5C,MAAM,CAAC,QAAQ,EAAE,YAAY,EAAE,CAAC,EAAE,MAAM,EAAE,IAAI,GAAE;QAAE,MAAM,CAAC,EAAE,MAAM,CAAC;QAAC,QAAQ,CAAC,EAAE,MAAM,CAAA;KAAO,GAAG,cAAc,EAAE;IA6C9G,kDAAkD;IAClD,WAAW,IAAI,MAAM;CAKtB;AAED,8EAA8E;AAC9E,wBAAgB,kBAAkB,CAAC,eAAe,EAAE,MAAM,GAAG,MAAM,CAIlE"}
@@ -0,0 +1,229 @@
1
+ // Persistent embedding store (v2.0 alpha). SQLite-backed Float32 vectors,
2
+ // brute-force cosine top-K retrieval. Same chunking as FTS5 (paragraph-level
3
+ // via fts5.chunkContent) so chunk identity matches across BM25 and embeddings —
4
+ // foundation for the v2.0 beta hybrid RRF scorer.
5
+ //
6
+ // Architecture mirrors fts5.ts:
7
+ // - Lazy-loaded better-sqlite3 (optional dep)
8
+ // - 0600 chmod on db + WAL/SHM sidecars
9
+ // - meta-table cross-vault contamination guard (vault_root, model alias, dim)
10
+ // - source_state mtime tracking for incremental rebuilds
11
+ //
12
+ // Brute-force cosine is fast enough for vaults up to ~50K chunks (~50ms top-10
13
+ // on 50K × 384 floats). HNSW comes in v2.1 if real users hit that ceiling.
14
+ import { promises as fs } from "node:fs";
15
+ import * as path from "node:path";
16
+ const SCHEMA_VERSION = 1;
17
+ let BetterSqliteCtor = null;
18
/**
 * Lazily import the optional `better-sqlite3` dependency and memoise its
 * constructor in the module-level `BetterSqliteCtor`.
 *
 * @returns The `better-sqlite3` default export (the database constructor).
 * @throws Error with an install hint when the module cannot be imported or
 *   exposes no default export.
 */
async function loadBetterSqlite() {
    if (!BetterSqliteCtor) {
        try {
            const { default: ctor } = (await import("better-sqlite3"));
            if (!ctor) {
                throw new Error("better-sqlite3 has no default export");
            }
            BetterSqliteCtor = ctor;
        }
        catch (err) {
            // Re-wrap so the user sees an actionable message, not a bare module-not-found.
            const detail = err instanceof Error ? err.message : String(err);
            throw new Error(`Persistent embeddings require the optional 'better-sqlite3' dependency; install failed or the binding could not be loaded. ${detail}`);
        }
    }
    return BetterSqliteCtor;
}
33
+ export class EmbedDb {
34
+ db = null;
35
+ file;
36
+ vaultRoot;
37
+ modelAlias;
38
+ dim;
39
+ constructor(opts) {
40
+ this.file = opts.file;
41
+ this.vaultRoot = opts.vaultRoot;
42
+ this.modelAlias = opts.modelAlias;
43
+ this.dim = opts.dim;
44
+ }
45
+ async open() {
46
+ if (this.db)
47
+ return;
48
+ const Ctor = await loadBetterSqlite();
49
+ await fs.mkdir(path.dirname(this.file), { recursive: true, mode: 0o700 });
50
+ await fs.chmod(path.dirname(this.file), 0o700).catch(() => { });
51
+ this.db = new Ctor(this.file);
52
+ this.db.pragma("journal_mode = WAL");
53
+ this.db.pragma("synchronous = NORMAL");
54
+ this.bootstrapSchema();
55
+ await Promise.all([this.file, `${this.file}-wal`, `${this.file}-shm`].map((p) => fs.chmod(p, 0o600).catch(() => { })));
56
+ }
57
+ /** Remove the embed db + WAL/SHM sidecars. Idempotent. */
58
+ async clearOnDisk() {
59
+ this.close();
60
+ let removed = false;
61
+ for (const p of [this.file, `${this.file}-wal`, `${this.file}-shm`]) {
62
+ try {
63
+ await fs.unlink(p);
64
+ removed = true;
65
+ }
66
+ catch {
67
+ // missing is fine
68
+ }
69
+ }
70
+ return removed;
71
+ }
72
+ close() {
73
+ if (this.db) {
74
+ this.db.close();
75
+ this.db = null;
76
+ }
77
+ }
78
+ bootstrapSchema() {
79
+ const db = this.requireDb();
80
+ db.exec(`
81
+ CREATE TABLE IF NOT EXISTS meta (
82
+ key TEXT PRIMARY KEY,
83
+ value TEXT NOT NULL
84
+ );
85
+ `);
86
+ const meta = this.readMeta();
87
+ const versionMatch = meta.schema_version === undefined || meta.schema_version === String(SCHEMA_VERSION);
88
+ const rootMatch = meta.vault_root === undefined || meta.vault_root === this.vaultRoot;
89
+ const modelMatch = meta.model_alias === undefined || meta.model_alias === this.modelAlias;
90
+ const dimMatch = meta.dim === undefined || meta.dim === String(this.dim);
91
+ if (!versionMatch || !rootMatch || !modelMatch || !dimMatch) {
92
+ const reason = [];
93
+ if (!versionMatch)
94
+ reason.push(`schema_version ${meta.schema_version} → ${SCHEMA_VERSION}`);
95
+ if (!rootMatch)
96
+ reason.push(`vault_root ${meta.vault_root} → ${this.vaultRoot}`);
97
+ if (!modelMatch)
98
+ reason.push(`model ${meta.model_alias} → ${this.modelAlias}`);
99
+ if (!dimMatch)
100
+ reason.push(`dim ${meta.dim} → ${this.dim}`);
101
+ process.stderr.write(`enquire: rebuilding embed index (${reason.join("; ")})\n`);
102
+ db.exec("DROP TABLE IF EXISTS embeddings; DROP TABLE IF EXISTS source_state;");
103
+ }
104
+ db.exec(`
105
+ CREATE TABLE IF NOT EXISTS embeddings (
106
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
107
+ rel_path TEXT NOT NULL,
108
+ chunk_index INTEGER NOT NULL,
109
+ line_start INTEGER NOT NULL,
110
+ line_end INTEGER NOT NULL,
111
+ text_preview TEXT NOT NULL,
112
+ vector BLOB NOT NULL,
113
+ UNIQUE(rel_path, chunk_index)
114
+ );
115
+ CREATE INDEX IF NOT EXISTS embeddings_rel_path ON embeddings(rel_path);
116
+ CREATE TABLE IF NOT EXISTS source_state (
117
+ rel_path TEXT PRIMARY KEY,
118
+ mtime_ms INTEGER NOT NULL,
119
+ n_chunks INTEGER NOT NULL,
120
+ indexed_at TEXT NOT NULL
121
+ );
122
+ `);
123
+ this.writeMeta({
124
+ schema_version: String(SCHEMA_VERSION),
125
+ vault_root: this.vaultRoot,
126
+ model_alias: this.modelAlias,
127
+ dim: String(this.dim)
128
+ });
129
+ }
130
+ readMeta() {
131
+ const db = this.requireDb();
132
+ const rows = db.prepare("SELECT key, value FROM meta").all();
133
+ const out = {};
134
+ for (const r of rows)
135
+ out[r.key] = r.value;
136
+ return out;
137
+ }
138
+ writeMeta(kv) {
139
+ const db = this.requireDb();
140
+ const stmt = db.prepare("INSERT OR REPLACE INTO meta (key, value) VALUES (?, ?)");
141
+ for (const [k, v] of Object.entries(kv))
142
+ stmt.run(k, v);
143
+ }
144
+ requireDb() {
145
+ if (!this.db)
146
+ throw new Error("EmbedDb is not open — call .open() first");
147
+ return this.db;
148
+ }
149
+ /** Replace all embeddings for a single note. Caller computes vectors. */
150
+ upsertNote(relPath, mtimeMs, chunks) {
151
+ const db = this.requireDb();
152
+ const tx = db.transaction((rows) => {
153
+ db.prepare("DELETE FROM embeddings WHERE rel_path = ?").run(relPath);
154
+ const insert = db.prepare(`INSERT INTO embeddings (rel_path, chunk_index, line_start, line_end, text_preview, vector)
155
+ VALUES (?, ?, ?, ?, ?, ?)`);
156
+ for (const c of rows) {
157
+ if (c.vector.length !== this.dim) {
158
+ throw new Error(`vector dim mismatch for ${relPath} chunk ${c.chunkIndex}: got ${c.vector.length}, expected ${this.dim}`);
159
+ }
160
+ insert.run(relPath, c.chunkIndex, c.lineStart, c.lineEnd, c.textPreview, Buffer.from(c.vector.buffer, c.vector.byteOffset, c.vector.byteLength));
161
+ }
162
+ db.prepare(`INSERT OR REPLACE INTO source_state (rel_path, mtime_ms, n_chunks, indexed_at)
163
+ VALUES (?, ?, ?, datetime('now'))`).run(relPath, mtimeMs, rows.length);
164
+ });
165
+ tx(chunks);
166
+ }
167
+ /** Drop a note's embeddings entirely (used on file deletion). */
168
+ deleteNote(relPath) {
169
+ const db = this.requireDb();
170
+ db.prepare("DELETE FROM embeddings WHERE rel_path = ?").run(relPath);
171
+ db.prepare("DELETE FROM source_state WHERE rel_path = ?").run(relPath);
172
+ }
173
+ /** Read the source-state table — caller compares mtimes to decide what to
174
+ * re-embed. */
175
+ getSourceStates() {
176
+ const db = this.requireDb();
177
+ return db.prepare("SELECT rel_path, mtime_ms FROM source_state").all();
178
+ }
179
+ /** Brute-force cosine top-K. Vectors are L2-normalized at insert time so
180
+ * cosine == dot product. Acceptable up to ~50K chunks; v2.1 will swap to
181
+ * HNSW if real vaults hit that ceiling. */
182
+ search(queryVec, k, opts = {}) {
183
+ const db = this.requireDb();
184
+ if (queryVec.length !== this.dim) {
185
+ throw new Error(`query vector dim mismatch: got ${queryVec.length}, expected ${this.dim}`);
186
+ }
187
+ const minScore = opts.minScore ?? -Infinity;
188
+ const folderPrefix = opts.folder ? `${opts.folder.replace(/\/+$/, "")}/` : null;
189
+ const rows = db
190
+ .prepare(folderPrefix
191
+ ? `SELECT rel_path, chunk_index, line_start, line_end, text_preview, vector
192
+ FROM embeddings WHERE rel_path LIKE ? || '%'`
193
+ : `SELECT rel_path, chunk_index, line_start, line_end, text_preview, vector FROM embeddings`)
194
+ .all(...(folderPrefix ? [folderPrefix] : []));
195
+ const heap = [];
196
+ for (const r of rows) {
197
+ const vec = new Float32Array(r.vector.buffer, r.vector.byteOffset, this.dim);
198
+ let score = 0;
199
+ for (let i = 0; i < this.dim; i++) {
200
+ score += (queryVec[i] ?? 0) * (vec[i] ?? 0);
201
+ }
202
+ if (score < minScore)
203
+ continue;
204
+ heap.push({
205
+ rel_path: r.rel_path,
206
+ chunk_index: r.chunk_index,
207
+ line_start: r.line_start,
208
+ line_end: r.line_end,
209
+ text_preview: r.text_preview,
210
+ score
211
+ });
212
+ }
213
+ heap.sort((a, b) => b.score - a.score);
214
+ return heap.slice(0, k);
215
+ }
216
+ /** Total embedded chunks — used by stats / UI. */
217
+ totalChunks() {
218
+ const db = this.requireDb();
219
+ const row = db.prepare("SELECT COUNT(*) AS n FROM embeddings").get();
220
+ return row?.n ?? 0;
221
+ }
222
+ }
223
/**
 * Default file name for the embed db, placed next to the FTS5 db and the
 * parse cache.
 */
export function defaultEmbedDbFile(vaultHashPrefix) {
    // The caller is expected to have composed the `~/.cache/enquire/<hash>`
    // prefix already; only the extension is added here, mirroring the
    // `.fts5.db` naming.
    const suffix = ".embed.db";
    return `${vaultHashPrefix}${suffix}`;
}
229
+ //# sourceMappingURL=embed-db.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"embed-db.js","sourceRoot":"","sources":["../src/embed-db.ts"],"names":[],"mappings":"AAAA,0EAA0E;AAC1E,6EAA6E;AAC7E,gFAAgF;AAChF,kDAAkD;AAClD,EAAE;AACF,gCAAgC;AAChC,gDAAgD;AAChD,0CAA0C;AAC1C,gFAAgF;AAChF,2DAA2D;AAC3D,EAAE;AACF,+EAA+E;AAC/E,2EAA2E;AAE3E,OAAO,EAAE,QAAQ,IAAI,EAAE,EAAE,MAAM,SAAS,CAAC;AACzC,OAAO,KAAK,IAAI,MAAM,WAAW,CAAC;AAElC,MAAM,cAAc,GAAG,CAAC,CAAC;AA0BzB,IAAI,gBAAgB,GAA2C,IAAI,CAAC;AACpE,KAAK,UAAU,gBAAgB;IAC7B,IAAI,gBAAgB;QAAE,OAAO,gBAAgB,CAAC;IAC9C,IAAI,CAAC;QACH,MAAM,GAAG,GAAG,CAAC,MAAM,MAAM,CAAC,gBAAgB,CAAC,CAAgD,CAAC;QAC5F,MAAM,IAAI,GAAG,GAAG,CAAC,OAAO,CAAC;QACzB,IAAI,CAAC,IAAI;YAAE,MAAM,IAAI,KAAK,CAAC,sCAAsC,CAAC,CAAC;QACnE,gBAAgB,GAAG,IAAI,CAAC;QACxB,OAAO,IAAI,CAAC;IACd,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,MAAM,IAAI,KAAK,CACb,8HACE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CACjD,EAAE,CACH,CAAC;IACJ,CAAC;AACH,CAAC;AA6BD,MAAM,OAAO,OAAO;IACV,EAAE,GAAc,IAAI,CAAC;IACZ,IAAI,CAAS;IACb,SAAS,CAAS;IAClB,UAAU,CAAS;IACnB,GAAG,CAAS;IAE7B,YAAY,IAAoB;QAC9B,IAAI,CAAC,IAAI,GAAG,IAAI,CAAC,IAAI,CAAC;QACtB,IAAI,CAAC,SAAS,GAAG,IAAI,CAAC,SAAS,CAAC;QAChC,IAAI,CAAC,UAAU,GAAG,IAAI,CAAC,UAAU,CAAC;QAClC,IAAI,CAAC,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC;IACtB,CAAC;IAED,KAAK,CAAC,IAAI;QACR,IAAI,IAAI,CAAC,EAAE;YAAE,OAAO;QACpB,MAAM,IAAI,GAAG,MAAM,gBAAgB,EAAE,CAAC;QACtC,MAAM,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;QAC1E,MAAM,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,KAAK,CAAC,CAAC,KAAK,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;QAC/D,IAAI,CAAC,EAAE,GAAG,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,CAAO,CAAC;QACpC,IAAI,CAAC,EAAE,CAAC,MAAM,CAAC,oBAAoB,CAAC,CAAC;QACrC,IAAI,CAAC,EAAE,CAAC,MAAM,CAAC,sBAAsB,CAAC,CAAC;QACvC,IAAI,CAAC,eAAe,EAAE,CAAC;QACvB,MAAM,OAAO,CAAC,GAAG,CACf,CAAC,IAAI,CAAC,IAAI,EAAE,GAAG,IAAI,CAAC,IAAI,MAAM,EAAE,GAAG,IAAI,CAAC,IAAI,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC,CAAC,KAAK,CAAC,GAAG,E
AAE,GAAE,CAAC,CAAC,CAAC,CACnG,CAAC;IACJ,CAAC;IAED,0DAA0D;IAC1D,KAAK,CAAC,WAAW;QACf,IAAI,CAAC,KAAK,EAAE,CAAC;QACb,IAAI,OAAO,GAAG,KAAK,CAAC;QACpB,KAAK,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,GAAG,IAAI,CAAC,IAAI,MAAM,EAAE,GAAG,IAAI,CAAC,IAAI,MAAM,CAAC,EAAE,CAAC;YACpE,IAAI,CAAC;gBACH,MAAM,EAAE,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;gBACnB,OAAO,GAAG,IAAI,CAAC;YACjB,CAAC;YAAC,MAAM,CAAC;gBACP,kBAAkB;YACpB,CAAC;QACH,CAAC;QACD,OAAO,OAAO,CAAC;IACjB,CAAC;IAED,KAAK;QACH,IAAI,IAAI,CAAC,EAAE,EAAE,CAAC;YACZ,IAAI,CAAC,EAAE,CAAC,KAAK,EAAE,CAAC;YAChB,IAAI,CAAC,EAAE,GAAG,IAAI,CAAC;QACjB,CAAC;IACH,CAAC;IAEO,eAAe;QACrB,MAAM,EAAE,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAE5B,EAAE,CAAC,IAAI,CAAC;;;;;KAKP,CAAC,CAAC;QAEH,MAAM,IAAI,GAAG,IAAI,CAAC,QAAQ,EAAE,CAAC;QAC7B,MAAM,YAAY,GAAG,IAAI,CAAC,cAAc,KAAK,SAAS,IAAI,IAAI,CAAC,cAAc,KAAK,MAAM,CAAC,cAAc,CAAC,CAAC;QACzG,MAAM,SAAS,GAAG,IAAI,CAAC,UAAU,KAAK,SAAS,IAAI,IAAI,CAAC,UAAU,KAAK,IAAI,CAAC,SAAS,CAAC;QACtF,MAAM,UAAU,GAAG,IAAI,CAAC,WAAW,KAAK,SAAS,IAAI,IAAI,CAAC,WAAW,KAAK,IAAI,CAAC,UAAU,CAAC;QAC1F,MAAM,QAAQ,GAAG,IAAI,CAAC,GAAG,KAAK,SAAS,IAAI,IAAI,CAAC,GAAG,KAAK,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QACzE,IAAI,CAAC,YAAY,IAAI,CAAC,SAAS,IAAI,CAAC,UAAU,IAAI,CAAC,QAAQ,EAAE,CAAC;YAC5D,MAAM,MAAM,GAAa,EAAE,CAAC;YAC5B,IAAI,CAAC,YAAY;gBAAE,MAAM,CAAC,IAAI,CAAC,kBAAkB,IAAI,CAAC,cAAc,MAAM,cAAc,EAAE,CAAC,CAAC;YAC5F,IAAI,CAAC,SAAS;gBAAE,MAAM,CAAC,IAAI,CAAC,cAAc,IAAI,CAAC,UAAU,MAAM,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC;YACjF,IAAI,CAAC,UAAU;gBAAE,MAAM,CAAC,IAAI,CAAC,SAAS,IAAI,CAAC,WAAW,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC,CAAC;YAC/E,IAAI,CAAC,QAAQ;gBAAE,MAAM,CAAC,IAAI,CAAC,OAAO,IAAI,CAAC,GAAG,MAAM,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;YAC5D,OAAO,CAAC,MAAM,CAAC,KAAK,CAAC,oCAAoC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACjF,EAAE,CAAC,IAAI,CAAC,qEAAqE,CAAC,CAAC;QACjF,CAAC;QAED,EAAE,CAAC,IAAI,CAAC;;;;;;;;;;;;;;;;;;KAkBP,CAAC,CAAC;QAEH,IAAI,CAAC,SAAS,CAAC;YACb,cAAc,EAAE,MAAM,CAAC,cAAc,CAAC;YACtC,UAAU,EAAE,IAAI,CAAC,SAAS;YAC1B,WAAW,EAAE,IAAI,CAAC,UAAU;YAC5B,GAAG,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC;SACtB,CAAC,CAA
C;IACL,CAAC;IAEO,QAAQ;QACd,MAAM,EAAE,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAC5B,MAAM,IAAI,GAAG,EAAE,CAAC,OAAO,CAAC,6BAA6B,CAAC,CAAC,GAAG,EAAkC,CAAC;QAC7F,MAAM,GAAG,GAA2B,EAAE,CAAC;QACvC,KAAK,MAAM,CAAC,IAAI,IAAI;YAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC;QAC3C,OAAO,GAAG,CAAC;IACb,CAAC;IAEO,SAAS,CAAC,EAA0B;QAC1C,MAAM,EAAE,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAC5B,MAAM,IAAI,GAAG,EAAE,CAAC,OAAO,CAAC,wDAAwD,CAAC,CAAC;QAClF,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC;YAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAC1D,CAAC;IAEO,SAAS;QACf,IAAI,CAAC,IAAI,CAAC,EAAE;YAAE,MAAM,IAAI,KAAK,CAAC,0CAA0C,CAAC,CAAC;QAC1E,OAAO,IAAI,CAAC,EAAE,CAAC;IACjB,CAAC;IAED,yEAAyE;IACzE,UAAU,CACR,OAAe,EACf,OAAe,EACf,MAME;QAEF,MAAM,EAAE,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAC5B,MAAM,EAAE,GAAG,EAAE,CAAC,WAAW,CAAC,CAAC,IAAmB,EAAE,EAAE;YAChD,EAAE,CAAC,OAAO,CAAC,2CAA2C,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;YACrE,MAAM,MAAM,GAAG,EAAE,CAAC,OAAO,CACvB;mCAC2B,CAC5B,CAAC;YACF,KAAK,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;gBACrB,IAAI,CAAC,CAAC,MAAM,CAAC,MAAM,KAAK,IAAI,CAAC,GAAG,EAAE,CAAC;oBACjC,MAAM,IAAI,KAAK,CACb,2BAA2B,OAAO,UAAU,CAAC,CAAC,UAAU,SAAS,CAAC,CAAC,MAAM,CAAC,MAAM,cAAc,IAAI,CAAC,GAAG,EAAE,CACzG,CAAC;gBACJ,CAAC;gBACD,MAAM,CAAC,GAAG,CACR,OAAO,EACP,CAAC,CAAC,UAAU,EACZ,CAAC,CAAC,SAAS,EACX,CAAC,CAAC,OAAO,EACT,CAAC,CAAC,WAAW,EACb,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,UAAU,EAAE,CAAC,CAAC,MAAM,CAAC,UAAU,CAAC,CACvE,CAAC;YACJ,CAAC;YACD,EAAE,CAAC,OAAO,CACR;2CACmC,CACpC,CAAC,GAAG,CAAC,OAAO,EAAE,OAAO,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;QACvC,CAAC,CAAC,CAAC;QACH,EAAE,CAAC,MAAM,CAAC,CAAC;IACb,CAAC;IAED,iEAAiE;IACjE,UAAU,CAAC,OAAe;QACxB,MAAM,EAAE,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAC5B,EAAE,CAAC,OAAO,CAAC,2CAA2C,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;QACrE,EAAE,CAAC,OAAO,CAAC,6CAA6C,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC;IACzE,CAAC;IAED;oBACgB;IAChB,eAAe;QACb,MAAM,EAAE,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAC5B,OAAO,EAAE,CAAC,OAAO,CAAC,6CAA6C,CAAC,CAAC,GAAG,EAAkB,CAAC;IACzF,CAAC;IAED;;gDAE4C;
IAC5C,MAAM,CAAC,QAAsB,EAAE,CAAS,EAAE,OAA+C,EAAE;QACzF,MAAM,EAAE,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAC5B,IAAI,QAAQ,CAAC,MAAM,KAAK,IAAI,CAAC,GAAG,EAAE,CAAC;YACjC,MAAM,IAAI,KAAK,CAAC,kCAAkC,QAAQ,CAAC,MAAM,cAAc,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;QAC7F,CAAC;QACD,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,IAAI,CAAC,QAAQ,CAAC;QAC5C,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC;QAEhF,MAAM,IAAI,GAAG,EAAE;aACZ,OAAO,CACN,YAAY;YACV,CAAC,CAAC;0DAC8C;YAChD,CAAC,CAAC,0FAA0F,CAC/F;aACA,GAAG,CAOD,GAAG,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;QAE9C,MAAM,IAAI,GAAqB,EAAE,CAAC;QAClC,KAAK,MAAM,CAAC,IAAI,IAAI,EAAE,CAAC;YACrB,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,UAAU,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC;YAC7E,IAAI,KAAK,GAAG,CAAC,CAAC;YACd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;gBAClC,KAAK,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC;YAC9C,CAAC;YACD,IAAI,KAAK,GAAG,QAAQ;gBAAE,SAAS;YAC/B,IAAI,CAAC,IAAI,CAAC;gBACR,QAAQ,EAAE,CAAC,CAAC,QAAQ;gBACpB,WAAW,EAAE,CAAC,CAAC,WAAW;gBAC1B,UAAU,EAAE,CAAC,CAAC,UAAU;gBACxB,QAAQ,EAAE,CAAC,CAAC,QAAQ;gBACpB,YAAY,EAAE,CAAC,CAAC,YAAY;gBAC5B,KAAK;aACN,CAAC,CAAC;QACL,CAAC;QACD,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC;QACvC,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAC1B,CAAC;IAED,kDAAkD;IAClD,WAAW;QACT,MAAM,EAAE,GAAG,IAAI,CAAC,SAAS,EAAE,CAAC;QAC5B,MAAM,GAAG,GAAG,EAAE,CAAC,OAAO,CAAC,sCAAsC,CAAC,CAAC,GAAG,EAAiB,CAAC;QACpF,OAAO,GAAG,EAAE,CAAC,IAAI,CAAC,CAAC;IACrB,CAAC;CACF;AAED,8EAA8E;AAC9E,MAAM,UAAU,kBAAkB,CAAC,eAAuB;IACxD,4EAA4E;IAC5E,wEAAwE;IACxE,OAAO,GAAG,eAAe,WAAW,CAAC;AACvC,CAAC"}
@@ -0,0 +1,38 @@
1
/**
 * One entry in enquire's catalog of supported embedding models. A new model
 * is added by pinning its Xenova-converted ONNX model id, its vector size,
 * and a friendly alias that users pass on the CLI.
 */
export interface EmbeddingModel {
    /** Friendly alias supplied on the CLI through `--embedding-model <alias>`. */
    alias: string;
    /** HuggingFace id of the model (the Xenova ONNX conversion). */
    hfId: string;
    /** Number of components in each output vector (384 for the MiniLM family). */
    dim: number;
    /** Rough on-disk size in MB once downloaded; shown in progress messages. */
    approxSizeMB: number;
    /** Whether the model was trained on multilingual data. */
    multilingual: boolean;
    /** Input length, in tokens, beyond which transformers.js truncates. */
    maxTokens: number;
}
18
/** Catalog of supported embedding models, declared as a read-only record with string keys. */
export declare const EMBEDDING_MODELS: Readonly<Record<string, EmbeddingModel>>;
/** Alias used when the user does not pass `--embedding-model`. */
export declare const DEFAULT_MODEL_ALIAS = "multilingual";
/**
 * Resolve a model alias to its catalog entry.
 * NOTE(review): presumably falls back to `DEFAULT_MODEL_ALIAS` when `alias`
 * is undefined and rejects unknown aliases — confirm against the implementation.
 */
export declare function resolveModel(alias: string | undefined): EmbeddingModel;
22
/** Opaque handle to a loaded embedding model. Obtain one through `loadEmbedder()`. */
export interface Embedder {
    /** Catalog entry of the model behind this embedder. */
    readonly model: EmbeddingModel;
    /**
     * Embed a batch of texts. The result holds one Float32Array per input
     * text, each of length `model.dim`, and each output vector is
     * L2-normalized.
     */
    embed(texts: readonly string[]): Promise<Float32Array[]>;
}
29
/**
 * Load an embedder for a model alias. The first call may block while the
 * model is downloaded from HuggingFace (~120MB for multilingual); later
 * calls reuse the weights cached under `~/.cache/huggingface/`.
 *
 * @param alias - Model alias from EMBEDDING_MODELS (default: "multilingual").
 * @returns A promise for the loaded embedder.
 */
export declare function loadEmbedder(alias?: string): Promise<Embedder>;
36
/**
 * Cosine similarity of two vectors that are already L2-normalized, for
 * which the cosine equals the plain dot product.
 */
export declare function cosineSim(a: Float32Array, b: Float32Array): number;
38
+ //# sourceMappingURL=embeddings.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"embeddings.d.ts","sourceRoot":"","sources":["../src/embeddings.ts"],"names":[],"mappings":"AAcA;;mCAEmC;AACnC,MAAM,WAAW,cAAc;IAC7B,iEAAiE;IACjE,KAAK,EAAE,MAAM,CAAC;IACd,uDAAuD;IACvD,IAAI,EAAE,MAAM,CAAC;IACb,4DAA4D;IAC5D,GAAG,EAAE,MAAM,CAAC;IACZ,8EAA8E;IAC9E,YAAY,EAAE,MAAM,CAAC;IACrB,gEAAgE;IAChE,YAAY,EAAE,OAAO,CAAC;IACtB,6DAA6D;IAC7D,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,eAAO,MAAM,gBAAgB,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,cAAc,CAAC,CAiBpE,CAAC;AAEH,0EAA0E;AAC1E,eAAO,MAAM,mBAAmB,iBAAiB,CAAC;AAElD,wBAAgB,YAAY,CAAC,KAAK,EAAE,MAAM,GAAG,SAAS,GAAG,cAAc,CAQtE;AAED,6EAA6E;AAC7E,MAAM,WAAW,QAAQ;IACvB,QAAQ,CAAC,KAAK,EAAE,cAAc,CAAC;IAC/B;wDACoD;IACpD,KAAK,CAAC,KAAK,EAAE,SAAS,MAAM,EAAE,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC,CAAC;CAC1D;AA0BD;;;;;GAKG;AACH,wBAAsB,YAAY,CAAC,KAAK,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC,QAAQ,CAAC,CAgCpE;AAED,2EAA2E;AAC3E,wBAAgB,SAAS,CAAC,CAAC,EAAE,YAAY,EAAE,CAAC,EAAE,YAAY,GAAG,MAAM,CASlE"}