@jcode.labs/mimir 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/CHANGELOG.md +22 -3
  2. package/CONTRIBUTING.md +28 -0
  3. package/README.md +262 -36
  4. package/SECURITY-HARDENING.md +58 -20
  5. package/dist/chunking.d.ts.map +1 -1
  6. package/dist/chunking.js +6 -3
  7. package/dist/chunking.js.map +1 -1
  8. package/dist/cli.js +103 -9
  9. package/dist/cli.js.map +1 -1
  10. package/dist/config.d.ts.map +1 -1
  11. package/dist/config.js +50 -36
  12. package/dist/config.js.map +1 -1
  13. package/dist/defaults.d.ts +11 -0
  14. package/dist/defaults.d.ts.map +1 -0
  15. package/dist/defaults.js +31 -0
  16. package/dist/defaults.js.map +1 -0
  17. package/dist/embeddings.d.ts.map +1 -1
  18. package/dist/embeddings.js +85 -11
  19. package/dist/embeddings.js.map +1 -1
  20. package/dist/files.d.ts +2 -1
  21. package/dist/files.d.ts.map +1 -1
  22. package/dist/files.js +39 -2
  23. package/dist/files.js.map +1 -1
  24. package/dist/gitignore.d.ts +1 -1
  25. package/dist/gitignore.d.ts.map +1 -1
  26. package/dist/gitignore.js +8 -7
  27. package/dist/gitignore.js.map +1 -1
  28. package/dist/ingest.d.ts.map +1 -1
  29. package/dist/ingest.js +2 -1
  30. package/dist/ingest.js.map +1 -1
  31. package/dist/init.d.ts.map +1 -1
  32. package/dist/init.js +4 -24
  33. package/dist/init.js.map +1 -1
  34. package/dist/mcp.d.ts.map +1 -1
  35. package/dist/mcp.js +14 -13
  36. package/dist/mcp.js.map +1 -1
  37. package/dist/parsing.d.ts.map +1 -1
  38. package/dist/parsing.js +138 -0
  39. package/dist/parsing.js.map +1 -1
  40. package/dist/query.d.ts.map +1 -1
  41. package/dist/query.js +14 -22
  42. package/dist/query.js.map +1 -1
  43. package/dist/security.js +16 -18
  44. package/dist/security.js.map +1 -1
  45. package/dist/skill.d.ts +2 -1
  46. package/dist/skill.d.ts.map +1 -1
  47. package/dist/skill.js +24 -9
  48. package/dist/skill.js.map +1 -1
  49. package/dist/store.d.ts.map +1 -1
  50. package/dist/store.js +2 -1
  51. package/dist/store.js.map +1 -1
  52. package/dist/types.d.ts +12 -14
  53. package/dist/types.d.ts.map +1 -1
  54. package/dist/version.d.ts +1 -1
  55. package/dist/version.js +1 -1
  56. package/examples/sovereign-rag-demo/.kb/config.json +22 -0
  57. package/examples/sovereign-rag-demo/.kb/sources.txt +2 -0
  58. package/examples/sovereign-rag-demo/README.md +80 -0
  59. package/examples/sovereign-rag-demo/raw/dataset-inventory.csv +5 -0
  60. package/examples/sovereign-rag-demo/raw/incident-timeline.jsonl +4 -0
  61. package/examples/sovereign-rag-demo/raw/operations-brief.md +16 -0
  62. package/examples/sovereign-rag-demo/raw/review-notes.evidence +11 -0
  63. package/examples/sovereign-rag-demo/raw/security-policy.yaml +14 -0
  64. package/package.json +24 -25
  65. package/skills/mimir/SKILL.md +66 -5
  66. package/skills/mimir-audio-summary/SKILL.md +134 -0
  67. package/skills/mimir-audio-summary/forge-voice.sh +153 -0
  68. package/skills/mimir-audio-summary/split-lines.py +13 -0
  69. package/skills/mimir-audio-summary/xtts-voice.py +46 -0
  70. package/dist/network.d.ts +0 -4
  71. package/dist/network.d.ts.map +0 -1
  72. package/dist/network.js +0 -59
  73. package/dist/network.js.map +0 -1
package/CHANGELOG.md CHANGED
@@ -1,10 +1,29 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.4.0 - 2026-06-28
4
+
5
+ - Reposition Mimir as sovereign local RAG for confidential datasets and AI agents.
6
+ - Expand default ingestion to common text, Office/OpenDocument, data, config, log, and source-code
7
+ file types.
8
+ - Add `includeExtensions` / `KB_INCLUDE_EXTENSIONS` for custom UTF-8 text file extensions.
9
+ - Add the optional `mimir-audio-summary` bundled skill for confidential audio summaries.
10
+ - Install both the main Mimir skill and optional audio-summary skill with `kb install-skill`.
11
+ - Improve agent guidance for deep multi-query retrieval before synthesis.
12
+ - Make Mimir core retrieval-only: `kb ask` now returns cited context for external agents or LLMs
13
+ instead of generating answers internally.
14
+ - Add optional Transformers.js semantic embeddings through `embeddingProvider: "transformers"`.
15
+ - Remove Ollama providers and keep `embeddingProvider: "local-hash"` as the no-model default.
16
+ - Move the repository to a simple pnpm workspace monorepo without adding Turbo.
17
+ - Move the core `@jcode.labs/mimir` package into `packages/mimir`.
18
+ - Add `@jcode.labs/mimir-tts` for plug-and-play JS/ONNX WAV rendering without Python or ffmpeg.
19
+ - Add `kb audio` and update the audio-summary skill to use Mimir TTS before advanced fallback
20
+ engines.
21
+
3
22
  ## 0.3.0 - 2026-06-28
4
23
 
5
- - Add confidentiality hardening defaults: local-only Ollama network policy, built-in
6
- redaction before indexing, metadata-only access logs, and bounded MCP retrieval.
7
- - Add `kb security-audit` for zero-telemetry, network, redaction, gitignore, storage, and
24
+ - Add confidentiality hardening defaults: built-in redaction before indexing, metadata-only access
25
+ logs, and bounded MCP retrieval.
26
+ - Add `kb security-audit` for zero-telemetry, provider, redaction, gitignore, storage, and
8
27
  MCP posture checks.
9
28
  - Add `kb destroy-index --yes` to remove generated vector indexes.
10
29
  - Add release verification artifacts: npm tarball, SHA256 checksums, SBOM, and manifest.
@@ -0,0 +1,28 @@
1
+ # Contributing
2
+
3
+ Mimir is an open-source project under the MIT License. Issues and pull requests are welcome.
4
+
5
+ ## Development
6
+
7
+ Use Node.js 20+ and pnpm:
8
+
9
+ ```bash
10
+ pnpm install
11
+ pnpm validate
12
+ ```
13
+
14
+ `pnpm validate` runs Biome, TypeScript, Vitest, the production CLI/MCP smoke test, and npm
15
+ package metadata checks.
16
+
17
+ ## Pull Requests
18
+
19
+ - Open pull requests against `main`.
20
+ - Keep changes focused and include tests or smoke coverage for behavior changes.
21
+ - Do not commit private documents, generated vector stores, environment files, tokens, or
22
+ credentials.
23
+ - Use conventional commit messages such as `feat: add source parser` or
24
+ `fix: handle empty index`.
25
+
26
+ ## Security
27
+
28
+ Do not report vulnerabilities through public issues. Follow [`SECURITY.md`](./SECURITY.md).
package/README.md CHANGED
@@ -5,11 +5,18 @@
5
5
  [![npm](https://img.shields.io/npm/v/@jcode.labs/mimir)](https://www.npmjs.com/package/@jcode.labs/mimir)
6
6
  [![license: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](./LICENSE)
7
7
 
8
- Open-source, local-first memory and retrieval for private project knowledge.
8
+ Open-source, sovereign local RAG for confidential datasets and AI agents.
9
9
 
10
- Mimir provides a TypeScript CLI and library that can be installed in any Node.js
11
- repository. It indexes files from the target repository, stores vectors locally with LanceDB,
12
- and uses Ollama for local embeddings and answers.
10
+ Mimir provides a TypeScript CLI, library, MCP server, and portable agent skills that can be
11
+ installed in any Node.js repository. It indexes local files from the target repository, stores
12
+ vectors locally with LanceDB, and can use either built-in local-hash retrieval or optional
13
+ Transformers.js semantic embeddings. Mimir core returns cited retrieval context; answer synthesis
14
+ belongs to the AI agent, LLM, or local model runtime you choose around it.
15
+
16
+ The intended use case is simple: put confidential company, institutional, legal, operational, or
17
+ research documents in a private local folder, index them locally, then let any compatible AI agent or
18
+ LLM workflow retrieve grounded context for summaries, briefs, audits, and decision support without
19
+ shipping the dataset to a hosted RAG service.
13
20
 
14
21
  Created by Jean-Baptiste Thery and published under the JCode Labs npm scope.
15
22
 
@@ -41,22 +48,50 @@ Suggested GitHub Sponsors tiers:
41
48
 
42
49
  Early public package. APIs may evolve before `1.0.0`.
43
50
 
51
+ ## What Mimir Is For
52
+
53
+ - Build a local RAG knowledge base inside any repository.
54
+ - Analyze confidential datasets while keeping raw files and generated indexes local.
55
+ - Give Claude, Codex, Cursor, internal assistants, or other MCP-compatible tools the same private
56
+ retrieval layer.
57
+ - Retrieve grounded local evidence through CLI, library calls, MCP tools, or the bundled agent
58
+ skills so your chosen AI agent can produce cited summaries.
59
+ - Optionally create listenable WAV summaries with `kb audio`, `@jcode.labs/mimir-tts`, and the
60
+ bundled `mimir-audio-summary` skill.
61
+
62
+ Mimir is not a hosted SaaS, not a remote vector database, and not a certified high-assurance system.
63
+ For regulated or state-grade environments, pair it with encrypted disks, controlled machines, release
64
+ verification, and an external security review.
65
+
66
+ ## Use Cases
67
+
68
+ Mimir is useful whenever the source material should stay local but an AI agent still needs grounded
69
+ context.
70
+
71
+ | Use case | Example questions |
72
+ | --- | --- |
73
+ | Understand a code repository | "Where is authentication implemented?", "What depends on this module?", "Summarize the payment flow." |
74
+ | Understand architecture | "What services exist?", "What are the data boundaries?", "Which components are risky to change?" |
75
+ | Analyze specifications | "What does the technical spec require?", "Which requirements are still unclear?", "Generate an implementation checklist." |
76
+ | Work through a request for proposal or tender | "What are the mandatory constraints?", "Which documents prove compliance?", "What risks should be clarified?" |
77
+ | Study courses and training material | "Summarize chapter three.", "Create revision questions.", "Compare these two concepts." |
78
+ | Analyze a book or long report | "Extract the main thesis.", "Find recurring arguments.", "Create a chapter-by-chapter brief." |
79
+ | Build an internal knowledge base | "What is the policy for incident review?", "Who owns this process?", "Which source says that?" |
80
+ | Prepare meetings or decisions | "Give me a one-page briefing.", "What is missing before deciding?", "List action items and evidence." |
81
+ | Ask questions over offline documents | "Which files mention local-only operation?", "What evidence supports this claim?" |
82
+ | Generate audio briefings | "Create a listenable summary of the current dossier using offline TTS." |
83
+
44
84
  ## Requirements
45
85
 
46
86
  - Node.js 20+
47
87
  - pnpm, npm, yarn or bun
48
- - Ollama running locally
49
- - Embedding model installed once:
50
-
51
- ```bash
52
- ollama pull nomic-embed-text
53
- ```
54
-
55
- Optional answer model:
56
-
57
- ```bash
58
- ollama pull gemma4
59
- ```
88
+ - No model runtime is required for the default `embeddingProvider: "local-hash"` mode.
89
+ - Optional semantic embeddings use Transformers.js with local model files under `.mimir/models` by
90
+ default.
91
+ - Generated answers are intentionally outside Mimir core. Use Claude, Codex, OpenAI, a local model
92
+ MCP server, or another trusted model runtime to synthesize from Mimir's cited context.
93
+ - Optional audio summaries use the separate `@jcode.labs/mimir-tts` workspace package. It renders
94
+ WAV files with Transformers.js and does not require Python, ffmpeg, Piper, XTTS, or a local server.
60
95
 
61
96
  ## Install From npm
62
97
 
@@ -76,23 +111,26 @@ npm install --save-dev @jcode.labs/mimir
76
111
 
77
112
  Maintainer tokens are only needed to publish new versions.
78
113
 
79
- ## Install From Git
114
+ ## Install From Source Checkout
80
115
 
81
116
  ```bash
82
- pnpm add -D git+ssh://git@github.com/jcode-works/jcode-mimir.git
117
+ git clone git@github.com:jcode-works/jcode-mimir.git
118
+ cd jcode-mimir
119
+ pnpm install
120
+ pnpm build
83
121
  ```
84
122
 
85
123
  For local development:
86
124
 
87
125
  ```bash
88
- pnpm add -D file:../jcode-mimir
126
+ pnpm add -D file:../jcode-mimir/packages/mimir
89
127
  ```
90
128
 
91
129
  Before creating an npm tarball, run:
92
130
 
93
131
  ```bash
94
132
  pnpm build
95
- pnpm pack
133
+ pnpm --dir packages/mimir pack
96
134
  ```
97
135
 
98
136
  ## Use In Any Repository
@@ -126,9 +164,158 @@ npx kb security-audit
126
164
  npx kb status
127
165
  ```
128
166
 
129
- ## Agent Skill And MCP
167
+ ## Choose A Retrieval Mode
168
+
169
+ Mimir has two embedding modes.
170
+
171
+ ### Default Local Hash Retrieval
172
+
173
+ Use this when you want a fully local, no-model smoke test or a dependency-light setup. Retrieval is
174
+ lexical/hash-based, not semantic.
175
+
176
+ `.kb/config.json`:
177
+
178
+ ```json
179
+ {
180
+ "embeddingProvider": "local-hash"
181
+ }
182
+ ```
183
+
184
+ Commands:
185
+
186
+ ```bash
187
+ pnpm exec kb ingest
188
+ pnpm exec kb search "offline retrieval approval"
189
+ pnpm exec kb ask "What evidence supports offline operation?"
190
+ ```
191
+
192
+ `kb ask` always returns cited retrieved passages instead of a generated synthesis. You can pass those
193
+ passages to any LLM or agent you trust.
194
+
195
+ ### Optional Semantic Embeddings With Transformers.js
130
196
 
131
- Mimir ships with a portable agent skill and a standard MCP server.
197
+ Use this when you want better semantic retrieval while keeping Mimir core free of an LLM server.
198
+
199
+ `.kb/config.json`:
200
+
201
+ ```json
202
+ {
203
+ "embeddingProvider": "transformers",
204
+ "embeddingModel": "mixedbread-ai/mxbai-embed-xsmall-v1",
205
+ "embeddingModelPath": ".mimir/models",
206
+ "transformersAllowRemoteModels": false
207
+ }
208
+ ```
209
+
210
+ Commands:
211
+
212
+ ```bash
213
+ pnpm exec kb ingest
214
+ pnpm exec kb ask "Which passages support offline operation?"
215
+ ```
216
+
217
+ Keep `transformersAllowRemoteModels` false for confidential or air-gapped work and preload model
218
+ files into `embeddingModelPath`. Set it to true only when you explicitly allow Transformers.js to
219
+ download model files from Hugging Face.
220
+
221
+ ## Dependency Footprint
222
+
223
+ Mimir can run retrieval without a model runtime. Some runtime dependencies remain because they own
224
+ core features:
225
+
226
+ | Dependency | Why it remains |
227
+ | --- | --- |
228
+ | @huggingface/transformers | optional local semantic embeddings |
229
+ | LanceDB | local vector storage and nearest-neighbor retrieval |
230
+ | MCP SDK | MCP server for compatible agents |
231
+ | fast-glob | safe source-file discovery |
232
+ | unpdf, html-to-text, yaml, fflate | document parsing for PDF, HTML, YAML, Office/OpenDocument ZIP files |
233
+ | commander, zod, picocolors | CLI, config validation, readable terminal output |
234
+
235
+ Removing more dependencies is possible only by dropping features or replacing them with smaller
236
+ internal implementations. The current low-friction path is dependency-light at runtime for users who
237
+ choose `local-hash`, while preserving richer parsing, MCP support, and optional semantic embeddings.
238
+
239
+ ## Example Test Workspace
240
+
241
+ This repository includes a synthetic example under
242
+ [`examples/sovereign-rag-demo`](./examples/sovereign-rag-demo). It can be used to test ingestion,
243
+ retrieval, `security-audit`, and custom text extensions without using private documents.
244
+
245
+ From a local checkout:
246
+
247
+ ```bash
248
+ pnpm build
249
+ cd examples/sovereign-rag-demo
250
+ node ../../dist/cli.js security-audit
251
+ node ../../dist/cli.js ingest
252
+ node ../../dist/cli.js search "offline retrieval approval"
253
+ node ../../dist/cli.js audit
254
+ ```
255
+
256
+ The example uses the default local-hash retrieval mode, so it can run without downloading an
257
+ embedding or chat model.
258
+
259
+ ## Typical Workflows
260
+
261
+ ### Understand A Codebase
262
+
263
+ ```bash
264
+ pnpm exec kb init
265
+ printf "src\nREADME.md\ndocs\n" >> .kb/sources.txt
266
+ pnpm exec kb ingest
267
+ pnpm exec kb search "authentication flow"
268
+ pnpm exec kb ask "Explain the architecture and cite the relevant files."
269
+ ```
270
+
271
+ ### Analyze Specifications Or A Course
272
+
273
+ ```bash
274
+ pnpm exec kb ingest
275
+ pnpm exec kb ask "Summarize the requirements and list open questions."
276
+ pnpm exec kb ask "Create revision questions from the indexed course material."
277
+ ```
278
+
279
+ ### Work Offline
280
+
281
+ ```bash
282
+ pnpm exec kb security-audit --strict
283
+ pnpm exec kb ingest
284
+ pnpm exec kb search "incident review policy"
285
+ pnpm exec kb ask "What does the local evidence prove?"
286
+ ```
287
+
288
+ Use `embeddingProvider: "local-hash"` for a no-model offline workflow. Use
289
+ `embeddingProvider: "transformers"` with preloaded model files for semantic offline retrieval.
290
+ Generated answers should come from a trusted external agent or model runtime.
291
+
292
+ ### Generate A Local Audio Briefing
293
+
294
+ Mimir includes a plug-and-play JS text-to-speech path for listenable summaries:
295
+
296
+ ```bash
297
+ pnpm exec kb audio --doctor
298
+ pnpm exec kb audio /tmp/MIMIR-SUMMARY-project.txt --out .mimir/audio/project-summary.wav
299
+ ```
300
+
301
+ The command writes WAV output locally and does not require Python or ffmpeg. The first render can
302
+ download a public Transformers.js-compatible model into `.mimir/models/tts`; the narration text is
303
+ processed locally. For confidential air-gapped work, preload model files and run:
304
+
305
+ ```bash
306
+ pnpm exec kb audio /tmp/MIMIR-SUMMARY-project.txt --out .mimir/audio/project-summary.wav --offline
307
+ ```
308
+
309
+ The standalone package can also be installed directly:
310
+
311
+ ```bash
312
+ pnpm add -D @jcode.labs/mimir-tts
313
+ pnpm exec mimir-tts render /tmp/MIMIR-SUMMARY-project.txt --out .mimir/audio/project-summary.wav
314
+ ```
315
+
316
+ ## Agent Skills And MCP
317
+
318
+ Mimir ships with portable agent skills and a standard MCP server.
132
319
 
133
320
  Install the agent kit into a repository:
134
321
 
@@ -140,12 +327,14 @@ This creates:
140
327
 
141
328
  ```plain text
142
329
  .mimir/skills/mimir/SKILL.md
330
+ .mimir/skills/mimir-audio-summary/SKILL.md
143
331
  .mimir/mcp.json
144
332
  .mimir/README.md
145
333
  ```
146
334
 
147
- Agents that support skill folders can load `.mimir/skills/mimir/`. Other agents can read the
148
- generated `.mimir/README.md` and use the MCP config snippet.
335
+ Agents that support skill folders can load `.mimir/skills/mimir/` for deep local RAG usage.
336
+ Load `.mimir/skills/mimir-audio-summary/` only when an optional spoken summary is needed.
337
+ Other agents can read the generated `.mimir/README.md` and use the MCP config snippet.
149
338
 
150
339
  Start the MCP server from the repository root:
151
340
 
@@ -161,6 +350,10 @@ MCP tools exposed:
161
350
  - `mimir_audit`
162
351
  - `mimir_security_audit`
163
352
 
353
+ This MCP layer is the recommended way to let any compatible LLM or agent query the same local
354
+ knowledge base. The LLM does not need to know about LanceDB or the raw file layout; it asks Mimir for
355
+ ranked passages or cited context and uses the returned citations.
356
+
164
357
  Print the bundled skill path from the installed package:
165
358
 
166
359
  ```bash
@@ -190,7 +383,9 @@ state.
190
383
  Mimir is designed for private repositories and sensitive local evidence.
191
384
 
192
385
  - Zero telemetry: no analytics or document content is sent to JCode Labs.
193
- - Local-only network policy: Ollama must be on loopback by default.
386
+ - No LLM generation in core: Mimir returns cited context for the agent/runtime you choose.
387
+ - Local-hash by default: no model runtime is required for the default retrieval path.
388
+ - Transformers.js remote model loading is disabled by default.
194
389
  - Redaction before indexing: common secrets and identifiers are redacted before chunks are
195
390
  embedded and stored.
196
391
  - Metadata-only access logs: query hashes and action metadata are logged, not raw queries.
@@ -214,6 +409,8 @@ read [`SECURITY-HARDENING.md`](./SECURITY-HARDENING.md).
214
409
 
215
410
  ## Supported Files
216
411
 
412
+ Mimir supports common text, document, data, config, log, and source-code files out of the box:
413
+
217
414
  - Markdown: `.md`, `.mdx`
218
415
  - Text: `.txt`, `.text`
219
416
  - JSON: `.json`
@@ -221,6 +418,32 @@ read [`SECURITY-HARDENING.md`](./SECURITY-HARDENING.md).
221
418
  - CSV/TSV: `.csv`, `.tsv`
222
419
  - HTML: `.html`, `.htm`
223
420
  - PDF: `.pdf`
421
+ - Office/OpenDocument: `.docx`, `.pptx`, `.xlsx`, `.odt`, `.ods`, `.odp`
422
+ - Rich text: `.rtf`
423
+ - Line data and logs: `.jsonl`, `.ndjson`, `.log`
424
+ - XML feeds and documents: `.xml`, `.rss`, `.atom`
425
+ - Config and data files: `.toml`, `.ini`, `.conf`, `.cfg`, `.properties`, `.sql`
426
+ - Source code: `.ts`, `.tsx`, `.js`, `.jsx`, `.py`, `.go`, `.rs`, `.java`, `.rb`, `.php`,
427
+ `.cs`, `.c`, `.cpp`, `.h`, `.css`
428
+
429
+ Custom UTF-8 text extensions can be enabled without changing code:
430
+
431
+ ```json
432
+ {
433
+ "includeExtensions": [".transcript", ".evidence"]
434
+ }
435
+ ```
436
+
437
+ Or through:
438
+
439
+ ```bash
440
+ KB_INCLUDE_EXTENSIONS=".transcript,.evidence" pnpm exec kb ingest
441
+ ```
442
+
443
+ Images, scans, audio/video files, old proprietary Office binaries such as `.doc`, and other formats
444
+ that are not listed should be OCRed, transcribed, converted, or exported to text/PDF/HTML first.
445
+ Mimir intentionally avoids pretending that every binary format can be indexed safely without
446
+ extraction logic.
224
447
 
225
448
  ## Config
226
449
 
@@ -232,11 +455,11 @@ read [`SECURITY-HARDENING.md`](./SECURITY-HARDENING.md).
232
455
  "storageDir": ".kb/storage",
233
456
  "sourcesFile": ".kb/sources.txt",
234
457
  "accessLogPath": ".kb/access.log",
458
+ "embeddingModelPath": ".mimir/models",
235
459
  "tableName": "chunks",
236
- "ollamaHost": "http://localhost:11434",
237
- "networkPolicy": "local-only",
238
- "embedModel": "nomic-embed-text",
239
- "llmModel": "gemma4:latest",
460
+ "embeddingProvider": "local-hash",
461
+ "embeddingModel": "mixedbread-ai/mxbai-embed-xsmall-v1",
462
+ "transformersAllowRemoteModels": false,
240
463
  "redaction": {
241
464
  "enabled": true,
242
465
  "builtIn": true,
@@ -246,7 +469,8 @@ read [`SECURITY-HARDENING.md`](./SECURITY-HARDENING.md).
246
469
  "mcpMaxTopK": 10,
247
470
  "topK": 5,
248
471
  "chunkSize": 1200,
249
- "chunkOverlap": 150
472
+ "chunkOverlap": 150,
473
+ "includeExtensions": []
250
474
  }
251
475
  ```
252
476
 
@@ -256,10 +480,10 @@ Environment overrides:
256
480
  - `KB_STORAGE_DIR`
257
481
  - `KB_SOURCES_FILE`
258
482
  - `KB_ACCESS_LOG_PATH`
259
- - `KB_OLLAMA_HOST`
260
- - `KB_NETWORK_POLICY`
261
- - `KB_EMBED_MODEL`
262
- - `KB_LLM_MODEL`
483
+ - `KB_EMBEDDING_PROVIDER`
484
+ - `KB_EMBEDDING_MODEL`
485
+ - `KB_EMBEDDING_MODEL_PATH`
486
+ - `KB_TRANSFORMERS_ALLOW_REMOTE_MODELS`
263
487
  - `KB_REDACTION_ENABLED`
264
488
  - `KB_REDACTION_BUILT_IN`
265
489
  - `KB_ACCESS_LOG`
@@ -267,6 +491,7 @@ Environment overrides:
267
491
  - `KB_TOP_K`
268
492
  - `KB_CHUNK_SIZE`
269
493
  - `KB_CHUNK_OVERLAP`
494
+ - `KB_INCLUDE_EXTENSIONS`
270
495
 
271
496
  ## Library API
272
497
 
@@ -280,8 +505,9 @@ const answer = await ask("What documents support the project timeline?")
280
505
 
281
506
  ## Privacy
282
507
 
283
- - Embeddings and answers use local Ollama by default.
284
- - Remote Ollama hosts are blocked unless `networkPolicy` explicitly allows them.
508
+ - Mimir core does not generate answers or call a chat model.
509
+ - `local-hash` can run ingestion, search, and cited retrieval without a model runtime.
510
+ - Transformers.js remote model loading is disabled by default.
285
511
  - Built-in redaction runs before indexing by default.
286
512
  - Access logs store query hashes, not raw queries.
287
513
  - The vector index is stored locally.
@@ -1,13 +1,16 @@
1
1
  # Mimir Security Hardening
2
2
 
3
- Mimir is a local-first knowledge base for private project documents. It is built to minimize
4
- data movement, but it is not a certified high-assurance system.
3
+ Mimir is a sovereign local RAG knowledge base for confidential project documents and datasets. It is
4
+ built to minimize data movement, but it is not a certified high-assurance system.
5
5
 
6
6
  ## Current Guarantees
7
7
 
8
8
  - Zero telemetry: Mimir does not send usage analytics or document content to JCode Labs.
9
- - Local-only network policy by default: document text can only be sent to loopback Ollama hosts
10
- unless the repository explicitly opts in to broader network access.
9
+ - Retrieval-only core: Mimir does not call a chat model or generate LLM answers.
10
+ - No-model retrieval mode: `embeddingProvider: "local-hash"` can ingest, search, and return cited
11
+ passages without a model server.
12
+ - Optional semantic embeddings: `embeddingProvider: "transformers"` uses Transformers.js, with
13
+ remote model loading disabled by default through `transformersAllowRemoteModels: false`.
11
14
  - Redaction before indexing: built-in DLP patterns redact common secrets and identifiers before
12
15
  chunks are embedded and stored.
13
16
  - Metadata-only access logs: access logs contain action metadata and query hashes, not raw
@@ -16,13 +19,15 @@ data movement, but it is not a certified high-assurance system.
16
19
  default.
17
20
  - MCP is read-focused: destructive tools are not exposed over MCP, and MCP retrieval is capped by
18
21
  `mcpMaxTopK`.
22
+ - Optional audio summaries use `kb audio` / `@jcode.labs/mimir-tts` for local WAV rendering with
23
+ Transformers.js. They do not require Python, ffmpeg, Piper, XTTS, or a local TTS server.
19
24
  - npm releases are published with provenance from the protected GitHub Actions workflow.
20
25
  - Release artifacts include a package tarball, SHA256 checksums, SBOM, and manifest.
21
26
 
22
27
  ## Threat Model
23
28
 
24
- Mimir protects against accidental repository leaks, accidental remote LLM usage, accidental secret
25
- indexing, and weak release traceability.
29
+ Mimir protects against accidental repository leaks, accidental built-in LLM usage, accidental online
30
+ TTS usage for generated summaries, accidental secret indexing, and weak release traceability.
26
31
 
27
32
  Mimir does not protect against a compromised local machine, malicious dependencies already present
28
33
  in the runtime, a user with filesystem access to the same checkout, or forensic recovery from an
@@ -50,32 +55,45 @@ pnpm build
50
55
  pnpm release:artifacts
51
56
  ```
52
57
 
53
- Move the generated tarball from `release-artifacts/` into the offline environment and install it:
58
+ Move the generated tarballs from `release-artifacts/` into the offline environment and install them:
54
59
 
55
60
  ```bash
56
- pnpm add -D ./jcode.labs-mimir-<version>.tgz
61
+ pnpm add -D ./jcode.labs-mimir-tts-<version>.tgz ./jcode.labs-mimir-<version>.tgz
57
62
  pnpm exec kb init
58
63
  pnpm exec kb ingest
59
64
  ```
60
65
 
61
- Ollama and the required models must also be preloaded inside the offline environment.
66
+ For semantic embeddings, preload the Transformers.js-compatible embedding model files inside the
67
+ offline environment under the configured `embeddingModelPath`. For audio, preload the TTS model
68
+ files under `.mimir/models/tts` and render with `pnpm exec kb audio <text-file> --offline`.
62
69
 
63
70
  ## Zero Network Posture
64
71
 
65
- Default config:
72
+ Default no-model config:
66
73
 
67
74
  ```json
68
75
  {
69
- "ollamaHost": "http://localhost:11434",
70
- "networkPolicy": "local-only"
76
+ "embeddingProvider": "local-hash"
71
77
  }
72
78
  ```
73
79
 
74
- Allowed policies:
80
+ Optional semantic config:
75
81
 
76
- - `local-only`: only loopback hosts such as `localhost` and `127.0.0.1`.
77
- - `allow-private`: loopback and private LAN hosts.
78
- - `allow-any`: any host. Use only when the remote endpoint is explicitly trusted.
82
+ ```json
83
+ {
84
+ "embeddingProvider": "transformers",
85
+ "embeddingModel": "mixedbread-ai/mxbai-embed-xsmall-v1",
86
+ "embeddingModelPath": ".mimir/models",
87
+ "transformersAllowRemoteModels": false
88
+ }
89
+ ```
90
+
91
+ The local-hash mode performs lexical/hash retrieval only. It is useful for smoke tests,
92
+ dependency-light offline workflows, and handing cited passages to another trusted LLM. It is not
93
+ equivalent to model semantic retrieval.
94
+
95
+ Keep `transformersAllowRemoteModels` false for confidential or air-gapped work. If it is true,
96
+ Transformers.js may download model files from Hugging Face during model loading.
79
97
 
80
98
  Run:
81
99
 
@@ -108,6 +126,25 @@ Custom patterns can be added in `.kb/config.json`:
108
126
 
109
127
  Redaction changes the indexed text, not the raw files under `private/`.
110
128
 
129
+ ## Optional Audio Summaries
130
+
131
+ `kb install-skill` installs an optional `mimir-audio-summary` skill. It is designed for listenable
132
+ briefings from a local Mimir index. The default renderer is `kb audio`, backed by
133
+ `@jcode.labs/mimir-tts` and Transformers.js.
134
+
135
+ Confidentiality defaults:
136
+
137
+ - narration text is written to a temp file outside the repository;
138
+ - generated WAV audio should be written under `.mimir/audio/`;
139
+ - `.mimir/` is ignored by Git;
140
+ - Python, ffmpeg, Piper, XTTS, and local TTS servers are not required for the default path;
141
+ - the first online-enabled render may download public model weights into `.mimir/models/tts`, but
142
+ the narration text is processed locally;
143
+ - `--offline` disables remote model loading and requires preloaded model files.
144
+
145
+ Generated audio can still contain sensitive information. Treat it like a derived confidential
146
+ document.
147
+
111
148
  ## MCP Hardening
112
149
 
113
150
  MCP gives an agent access to retrieved private context. Use it only for agents running under the
@@ -124,16 +161,17 @@ For team use, prefer one checkout per user or per role. Mimir does not implement
124
161
 
125
162
  ## Release Verification
126
163
 
127
- The protected npm workflow runs validation, generates release artifacts, and publishes with
128
- provenance:
164
+ The protected npm workflow runs validation, generates release artifacts, and publishes both
165
+ workspace packages with provenance:
129
166
 
130
167
  ```bash
131
- npm publish --access public --provenance
168
+ pnpm --dir packages/mimir-tts publish --access public --provenance --no-git-checks
169
+ pnpm --dir packages/mimir publish --access public --provenance --no-git-checks
132
170
  ```
133
171
 
134
172
  Release artifacts include:
135
173
 
136
- - npm tarball;
174
+ - npm tarballs for `@jcode.labs/mimir-tts` and `@jcode.labs/mimir`;
137
175
  - `SHA256SUMS`;
138
176
  - CycloneDX SBOM;
139
177
  - `release-manifest.json`.
@@ -1 +1 @@
1
- {"version":3,"file":"chunking.d.ts","sourceRoot":"","sources":["../src/chunking.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,YAAY,CAAA;AAE3D,wBAAgB,aAAa,CAC3B,QAAQ,EAAE,cAAc,EACxB,SAAS,EAAE,MAAM,EACjB,YAAY,EAAE,MAAM,GACnB,SAAS,EAAE,CAqCb"}
1
+ {"version":3,"file":"chunking.d.ts","sourceRoot":"","sources":["../src/chunking.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,YAAY,CAAA;AAM3D,wBAAgB,aAAa,CAC3B,QAAQ,EAAE,cAAc,EACxB,SAAS,EAAE,MAAM,EACjB,YAAY,EAAE,MAAM,GACnB,SAAS,EAAE,CAqCb"}
package/dist/chunking.js CHANGED
@@ -1,4 +1,7 @@
1
1
  import { createHash } from "node:crypto";
2
+ const PARAGRAPH_BREAK_MIN_RATIO = 0.45;
3
+ const SENTENCE_BREAK_MIN_RATIO = 0.55;
4
+ const WHITESPACE_BREAK_MIN_RATIO = 0.75;
2
5
  export function chunkDocument(document, chunkSize, chunkOverlap) {
3
6
  if (!document.text) {
4
7
  return [];
@@ -39,15 +42,15 @@ function chooseChunkEnd(text, cursor, chunkSize) {
39
42
  }
40
43
  const window = text.slice(cursor, hardEnd);
41
44
  const paragraphBreak = window.lastIndexOf("\n\n");
42
- if (paragraphBreak > chunkSize * 0.45) {
45
+ if (paragraphBreak > chunkSize * PARAGRAPH_BREAK_MIN_RATIO) {
43
46
  return cursor + paragraphBreak;
44
47
  }
45
48
  const sentenceBreak = Math.max(window.lastIndexOf(". "), window.lastIndexOf("? "), window.lastIndexOf("! "));
46
- if (sentenceBreak > chunkSize * 0.55) {
49
+ if (sentenceBreak > chunkSize * SENTENCE_BREAK_MIN_RATIO) {
47
50
  return cursor + sentenceBreak + 1;
48
51
  }
49
52
  const whitespace = window.lastIndexOf(" ");
50
- if (whitespace > chunkSize * 0.75) {
53
+ if (whitespace > chunkSize * WHITESPACE_BREAK_MIN_RATIO) {
51
54
  return cursor + whitespace;
52
55
  }
53
56
  return hardEnd;
@@ -1 +1 @@
1
- {"version":3,"file":"chunking.js","sourceRoot":"","sources":["../src/chunking.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAA;AAGxC,MAAM,UAAU,aAAa,CAC3B,QAAwB,EACxB,SAAiB,EACjB,YAAoB;IAEpB,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnB,OAAO,EAAE,CAAA;IACX,CAAC;IAED,MAAM,MAAM,GAAgB,EAAE,CAAA;IAC9B,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,IAAI,UAAU,GAAG,CAAC,CAAA;IAElB,OAAO,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,cAAc,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,CAAC,CAAA;QAC5D,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;QAEpD,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,EAAE,GAAG,UAAU,CAAC,QAAQ,CAAC;iBAC5B,MAAM,CAAC,GAAG,QAAQ,CAAC,IAAI,CAAC,YAAY,IAAI,UAAU,IAAI,IAAI,EAAE,CAAC;iBAC7D,MAAM,CAAC,KAAK,CAAC,CAAA;YAChB,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE;gBACF,MAAM,EAAE,QAAQ,CAAC,IAAI,CAAC,MAAM;gBAC5B,YAAY,EAAE,QAAQ,CAAC,IAAI,CAAC,YAAY;gBACxC,UAAU;gBACV,IAAI;gBACJ,QAAQ,EAAE,QAAQ,CAAC,IAAI,CAAC,QAAQ;gBAChC,KAAK,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK;gBAC1B,OAAO,EAAE,QAAQ,CAAC,IAAI,CAAC,OAAO;aAC/B,CAAC,CAAA;YACF,UAAU,IAAI,CAAC,CAAA;QACjB,CAAC;QAED,IAAI,GAAG,IAAI,QAAQ,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;YAChC,MAAK;QACP,CAAC;QACD,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,YAAY,EAAE,MAAM,GAAG,CAAC,CAAC,CAAA;IACnD,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED,SAAS,cAAc,CAAC,IAAY,EAAE,MAAc,EAAE,SAAiB;IACrE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,CAAA;IACzD,IAAI,OAAO,KAAK,IAAI,CAAC,MAAM,EAAE,CAAC;QAC5B,OAAO,OAAO,CAAA;IAChB,CAAC;IAED,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;IAC1C,MAAM,cAAc,GAAG,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,CAAA;IACjD,IAAI,cAAc,GAAG,SAAS,GAAG,IAAI,EAAE,CAAC;QACtC,OAAO,MAAM,GAAG,cAAc,CAAA;IAChC,CAAC;IAED,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAC5B,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,EACxB,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,EACxB,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,CACzB,CAAA;IACD,IAAI,aAAa,GAAG,SAAS,GAAG,IAAI,EAAE,CAAC;QACrC,OAAO,MAAM,GAAG,aAAa,GAAG,CAAC,CAAA;IACnC,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,CAAC,WAAW,CAAC,GAAG,C
AAC,CAAA;IAC1C,IAAI,UAAU,GAAG,SAAS,GAAG,IAAI,EAAE,CAAC;QAClC,OAAO,MAAM,GAAG,UAAU,CAAA;IAC5B,CAAC;IAED,OAAO,OAAO,CAAA;AAChB,CAAC"}
1
+ {"version":3,"file":"chunking.js","sourceRoot":"","sources":["../src/chunking.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAA;AAGxC,MAAM,yBAAyB,GAAG,IAAI,CAAA;AACtC,MAAM,wBAAwB,GAAG,IAAI,CAAA;AACrC,MAAM,0BAA0B,GAAG,IAAI,CAAA;AAEvC,MAAM,UAAU,aAAa,CAC3B,QAAwB,EACxB,SAAiB,EACjB,YAAoB;IAEpB,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnB,OAAO,EAAE,CAAA;IACX,CAAC;IAED,MAAM,MAAM,GAAgB,EAAE,CAAA;IAC9B,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,IAAI,UAAU,GAAG,CAAC,CAAA;IAElB,OAAO,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,cAAc,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,CAAC,CAAA;QAC5D,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;QAEpD,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,EAAE,GAAG,UAAU,CAAC,QAAQ,CAAC;iBAC5B,MAAM,CAAC,GAAG,QAAQ,CAAC,IAAI,CAAC,YAAY,IAAI,UAAU,IAAI,IAAI,EAAE,CAAC;iBAC7D,MAAM,CAAC,KAAK,CAAC,CAAA;YAChB,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE;gBACF,MAAM,EAAE,QAAQ,CAAC,IAAI,CAAC,MAAM;gBAC5B,YAAY,EAAE,QAAQ,CAAC,IAAI,CAAC,YAAY;gBACxC,UAAU;gBACV,IAAI;gBACJ,QAAQ,EAAE,QAAQ,CAAC,IAAI,CAAC,QAAQ;gBAChC,KAAK,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK;gBAC1B,OAAO,EAAE,QAAQ,CAAC,IAAI,CAAC,OAAO;aAC/B,CAAC,CAAA;YACF,UAAU,IAAI,CAAC,CAAA;QACjB,CAAC;QAED,IAAI,GAAG,IAAI,QAAQ,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;YAChC,MAAK;QACP,CAAC;QACD,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,YAAY,EAAE,MAAM,GAAG,CAAC,CAAC,CAAA;IACnD,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED,SAAS,cAAc,CAAC,IAAY,EAAE,MAAc,EAAE,SAAiB;IACrE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,CAAA;IACzD,IAAI,OAAO,KAAK,IAAI,CAAC,MAAM,EAAE,CAAC;QAC5B,OAAO,OAAO,CAAA;IAChB,CAAC;IAED,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;IAC1C,MAAM,cAAc,GAAG,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,CAAA;IACjD,IAAI,cAAc,GAAG,SAAS,GAAG,yBAAyB,EAAE,CAAC;QAC3D,OAAO,MAAM,GAAG,cAAc,CAAA;IAChC,CAAC;IAED,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAC5B,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,EACxB,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,EACxB,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,CACzB,CAAA;IACD,IAAI,aAAa,GAAG,SAAS,GAAG,wBAAwB,EAAE,
CAAC;QACzD,OAAO,MAAM,GAAG,aAAa,GAAG,CAAC,CAAA;IACnC,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,CAAC,WAAW,CAAC,GAAG,CAAC,CAAA;IAC1C,IAAI,UAAU,GAAG,SAAS,GAAG,0BAA0B,EAAE,CAAC;QACxD,OAAO,MAAM,GAAG,UAAU,CAAA;IAC5B,CAAC;IAED,OAAO,OAAO,CAAA;AAChB,CAAC"}