@jcode.labs/mimir 0.2.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. package/CHANGELOG.md +47 -0
  2. package/CONTRIBUTING.md +28 -0
  3. package/README.md +307 -32
  4. package/SECURITY-HARDENING.md +194 -0
  5. package/SECURITY.md +21 -0
  6. package/dist/access-log.d.ts +10 -0
  7. package/dist/access-log.d.ts.map +1 -0
  8. package/dist/access-log.js +29 -0
  9. package/dist/access-log.js.map +1 -0
  10. package/dist/chunking.d.ts.map +1 -1
  11. package/dist/chunking.js +6 -3
  12. package/dist/chunking.js.map +1 -1
  13. package/dist/cli.js +151 -5
  14. package/dist/cli.js.map +1 -1
  15. package/dist/config.d.ts.map +1 -1
  16. package/dist/config.js +83 -20
  17. package/dist/config.js.map +1 -1
  18. package/dist/defaults.d.ts +11 -0
  19. package/dist/defaults.d.ts.map +1 -0
  20. package/dist/defaults.js +31 -0
  21. package/dist/defaults.js.map +1 -0
  22. package/dist/destroy.d.ts +3 -0
  23. package/dist/destroy.d.ts.map +1 -0
  24. package/dist/destroy.js +16 -0
  25. package/dist/destroy.js.map +1 -0
  26. package/dist/embeddings.d.ts.map +1 -1
  27. package/dist/embeddings.js +85 -9
  28. package/dist/embeddings.js.map +1 -1
  29. package/dist/files.d.ts +2 -1
  30. package/dist/files.d.ts.map +1 -1
  31. package/dist/files.js +40 -3
  32. package/dist/files.js.map +1 -1
  33. package/dist/gitignore.d.ts +1 -1
  34. package/dist/gitignore.d.ts.map +1 -1
  35. package/dist/gitignore.js +8 -7
  36. package/dist/gitignore.js.map +1 -1
  37. package/dist/index.d.ts +4 -1
  38. package/dist/index.d.ts.map +1 -1
  39. package/dist/index.js +3 -0
  40. package/dist/index.js.map +1 -1
  41. package/dist/ingest.d.ts.map +1 -1
  42. package/dist/ingest.js +14 -2
  43. package/dist/ingest.js.map +1 -1
  44. package/dist/init.d.ts.map +1 -1
  45. package/dist/init.js +4 -15
  46. package/dist/init.js.map +1 -1
  47. package/dist/mcp.d.ts.map +1 -1
  48. package/dist/mcp.js +27 -15
  49. package/dist/mcp.js.map +1 -1
  50. package/dist/parsing.d.ts.map +1 -1
  51. package/dist/parsing.js +138 -0
  52. package/dist/parsing.js.map +1 -1
  53. package/dist/query.d.ts.map +1 -1
  54. package/dist/query.js +28 -20
  55. package/dist/query.js.map +1 -1
  56. package/dist/redaction.d.ts +7 -0
  57. package/dist/redaction.d.ts.map +1 -0
  58. package/dist/redaction.js +63 -0
  59. package/dist/redaction.js.map +1 -0
  60. package/dist/security.d.ts +3 -0
  61. package/dist/security.d.ts.map +1 -0
  62. package/dist/security.js +84 -0
  63. package/dist/security.js.map +1 -0
  64. package/dist/skill.d.ts +2 -1
  65. package/dist/skill.d.ts.map +1 -1
  66. package/dist/skill.js +24 -9
  67. package/dist/skill.js.map +1 -1
  68. package/dist/store.d.ts.map +1 -1
  69. package/dist/store.js +2 -1
  70. package/dist/store.js.map +1 -1
  71. package/dist/types.d.ts +68 -3
  72. package/dist/types.d.ts.map +1 -1
  73. package/dist/version.d.ts +1 -1
  74. package/dist/version.js +1 -1
  75. package/examples/sovereign-rag-demo/.kb/config.json +22 -0
  76. package/examples/sovereign-rag-demo/.kb/sources.txt +2 -0
  77. package/examples/sovereign-rag-demo/README.md +80 -0
  78. package/examples/sovereign-rag-demo/raw/dataset-inventory.csv +5 -0
  79. package/examples/sovereign-rag-demo/raw/incident-timeline.jsonl +4 -0
  80. package/examples/sovereign-rag-demo/raw/operations-brief.md +16 -0
  81. package/examples/sovereign-rag-demo/raw/review-notes.evidence +11 -0
  82. package/examples/sovereign-rag-demo/raw/security-policy.yaml +14 -0
  83. package/package.json +28 -25
  84. package/skills/mimir/SKILL.md +77 -6
  85. package/skills/mimir-audio-summary/SKILL.md +134 -0
  86. package/skills/mimir-audio-summary/forge-voice.sh +153 -0
  87. package/skills/mimir-audio-summary/split-lines.py +13 -0
  88. package/skills/mimir-audio-summary/xtts-voice.py +46 -0
package/CHANGELOG.md ADDED
@@ -0,0 +1,47 @@
1
+ # Changelog
2
+
3
+ ## 0.4.0 - 2026-06-28
4
+
5
+ - Reposition Mimir as sovereign local RAG for confidential datasets and AI agents.
6
+ - Expand default ingestion to common text, Office/OpenDocument, data, config, log, and source-code
7
+ file types.
8
+ - Add `includeExtensions` / `KB_INCLUDE_EXTENSIONS` for custom UTF-8 text file extensions.
9
+ - Add the optional `mimir-audio-summary` bundled skill for confidential audio summaries.
10
+ - Install both the main Mimir skill and optional audio-summary skill with `kb install-skill`.
11
+ - Improve agent guidance for deep multi-query retrieval before synthesis.
12
+ - Make Mimir core retrieval-only: `kb ask` now returns cited context for external agents or LLMs
13
+ instead of generating answers internally.
14
+ - Add optional Transformers.js semantic embeddings through `embeddingProvider: "transformers"`.
15
+ - Remove Ollama providers and keep `embeddingProvider: "local-hash"` as the no-model default.
16
+ - Move the repository to a simple pnpm workspace monorepo without adding Turbo.
17
+ - Move the core `@jcode.labs/mimir` package into `packages/mimir`.
18
+ - Add `@jcode.labs/mimir-tts` for plug-and-play JS/ONNX WAV rendering without Python or ffmpeg.
19
+ - Add `kb audio` and update the audio-summary skill to use Mimir TTS before advanced fallback
20
+ engines.
21
+
22
+ ## 0.3.0 - 2026-06-28
23
+
24
+ - Add confidentiality hardening defaults: built-in redaction before indexing, metadata-only access
25
+ logs, and bounded MCP retrieval.
26
+ - Add `kb security-audit` for zero-telemetry, provider, redaction, gitignore, storage, and
27
+ MCP posture checks.
28
+ - Add `kb destroy-index --yes` to remove generated vector indexes.
29
+ - Add release verification artifacts: npm tarball, SHA256 checksums, SBOM, and manifest.
30
+ - Document air-gapped operation, threat model, MCP hardening, and secure deletion limits.
31
+
32
+ ## 0.2.1 - 2026-06-28
33
+
34
+ - Add GitHub Sponsors funding metadata and document suggested sponsor tiers.
35
+ - Add maintainer positioning for Jean-Baptiste Thery and JCode Labs in the README.
36
+ - Make `kb init` and `kb install-skill` automatically keep `.kb/` and `.mimir/`
37
+ ignored by Git.
38
+
39
+ ## 0.2.0 - 2026-06-28
40
+
41
+ - Rename public product branding to Mimir while keeping the JCode Labs npm scope.
42
+ - Add the bundled portable `mimir` agent skill.
43
+ - Add the MCP stdio server with `mimir_status`, `mimir_search`, `mimir_ask`, and
44
+ `mimir_audit`.
45
+ - Add production smoke coverage for the built CLI and MCP server.
46
+ - Add Biome, commitlint, publint, CodeQL, Dependabot grouping, protected npm publishing,
47
+ and open-source contribution/security documentation.
@@ -0,0 +1,28 @@
1
+ # Contributing
2
+
3
+ Mimir is an open-source project under the MIT License. Issues and pull requests are welcome.
4
+
5
+ ## Development
6
+
7
+ Use Node.js 20+ and pnpm:
8
+
9
+ ```bash
10
+ pnpm install
11
+ pnpm validate
12
+ ```
13
+
14
+ `pnpm validate` runs Biome, TypeScript, Vitest, the production CLI/MCP smoke test, and npm
15
+ package metadata checks.
16
+
17
+ ## Pull Requests
18
+
19
+ - Open pull requests against `main`.
20
+ - Keep changes focused and include tests or smoke coverage for behavior changes.
21
+ - Do not commit private documents, generated vector stores, environment files, tokens, or
22
+ credentials.
23
+ - Use conventional commit messages such as `feat: add source parser` or
24
+ `fix: handle empty index`.
25
+
26
+ ## Security
27
+
28
+ Do not report vulnerabilities through public issues. Follow [`SECURITY.md`](./SECURITY.md).
package/README.md CHANGED
@@ -5,11 +5,18 @@
5
5
  [![npm](https://img.shields.io/npm/v/@jcode.labs/mimir)](https://www.npmjs.com/package/@jcode.labs/mimir)
6
6
  [![license: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](./LICENSE)
7
7
 
8
- Open-source, local-first memory and retrieval for private project knowledge.
8
+ Open-source, sovereign local RAG for confidential datasets and AI agents.
9
9
 
10
- Mimir provides a TypeScript CLI and library that can be installed in any Node.js
11
- repository. It indexes files from the target repository, stores vectors locally with LanceDB,
12
- and uses Ollama for local embeddings and answers.
10
+ Mimir provides a TypeScript CLI, library, MCP server, and portable agent skills that can be
11
+ installed in any Node.js repository. It indexes local files from the target repository, stores
12
+ vectors locally with LanceDB, and can use either built-in local-hash retrieval or optional
13
+ Transformers.js semantic embeddings. Mimir core returns cited retrieval context; answer synthesis
14
+ belongs to the AI agent, LLM, or local model runtime you choose around it.
15
+
16
+ The intended use case is simple: put confidential company, institutional, legal, operational, or
17
+ research documents in a private local folder, index them locally, then let any compatible AI agent or
18
+ LLM workflow retrieve grounded context for summaries, briefs, audits, and decision support without
19
+ shipping the dataset to a hosted RAG service.
13
20
 
14
21
  Created by Jean-Baptiste Thery and published under the JCode Labs npm scope.
15
22
 
@@ -41,22 +48,50 @@ Suggested GitHub Sponsors tiers:
41
48
 
42
49
  Early public package. APIs may evolve before `1.0.0`.
43
50
 
51
+ ## What Mimir Is For
52
+
53
+ - Build a local RAG knowledge base inside any repository.
54
+ - Analyze confidential datasets while keeping raw files and generated indexes local.
55
+ - Give Claude, Codex, Cursor, internal assistants, or other MCP-compatible tools the same private
56
+ retrieval layer.
57
+ - Retrieve grounded local evidence through CLI, library calls, MCP tools, or the bundled agent
58
+ skills so your chosen AI agent can produce cited summaries.
59
+ - Optionally create listenable WAV summaries with `kb audio`, `@jcode.labs/mimir-tts`, and the
60
+ bundled `mimir-audio-summary` skill.
61
+
62
+ Mimir is not a hosted SaaS, not a remote vector database, and not a certified high-assurance system.
63
+ For regulated or state-grade environments, pair it with encrypted disks, controlled machines, release
64
+ verification, and an external security review.
65
+
66
+ ## Use Cases
67
+
68
+ Mimir is useful whenever the source material should stay local but an AI agent still needs grounded
69
+ context.
70
+
71
+ | Use case | Example questions |
72
+ | --- | --- |
73
+ | Understand a code repository | "Where is authentication implemented?", "What depends on this module?", "Summarize the payment flow." |
74
+ | Understand architecture | "What services exist?", "What are the data boundaries?", "Which components are risky to change?" |
75
+ | Analyze specifications | "What does the technical spec require?", "Which requirements are still unclear?", "Generate an implementation checklist." |
76
+ | Work through a request for proposal or tender | "What are the mandatory constraints?", "Which documents prove compliance?", "What risks should be clarified?" |
77
+ | Study courses and training material | "Summarize chapter three.", "Create revision questions.", "Compare these two concepts." |
78
+ | Analyze a book or long report | "Extract the main thesis.", "Find recurring arguments.", "Create a chapter-by-chapter brief." |
79
+ | Build an internal knowledge base | "What is the policy for incident review?", "Who owns this process?", "Which source says that?" |
80
+ | Prepare meetings or decisions | "Give me a one-page briefing.", "What is missing before deciding?", "List action items and evidence." |
81
+ | Ask questions over offline documents | "Which files mention local-only operation?", "What evidence supports this claim?" |
82
+ | Generate audio briefings | "Create a listenable summary of the current dossier using offline TTS." |
83
+
44
84
  ## Requirements
45
85
 
46
86
  - Node.js 20+
47
87
  - pnpm, npm, yarn, or bun
48
- - Ollama running locally
49
- - Embedding model installed once:
50
-
51
- ```bash
52
- ollama pull nomic-embed-text
53
- ```
54
-
55
- Optional answer model:
56
-
57
- ```bash
58
- ollama pull gemma4
59
- ```
88
+ - No model runtime is required for the default `embeddingProvider: "local-hash"` mode.
89
+ - Optional semantic embeddings use Transformers.js with local model files under `.mimir/models` by
90
+ default.
91
+ - Generated answers are intentionally outside Mimir core. Use Claude, Codex, OpenAI, a local model
92
+ MCP server, or another trusted model runtime to synthesize from Mimir's cited context.
93
+ - Optional audio summaries use the separate `@jcode.labs/mimir-tts` workspace package. It renders
94
+ WAV files with Transformers.js and does not require Python, ffmpeg, Piper, XTTS, or a local server.
60
95
 
61
96
  ## Install From npm
62
97
 
@@ -76,23 +111,26 @@ npm install --save-dev @jcode.labs/mimir
76
111
 
77
112
  Maintainer tokens are only needed to publish new versions.
78
113
 
79
- ## Install From Git
114
+ ## Install From Source Checkout
80
115
 
81
116
  ```bash
82
- pnpm add -D git+ssh://git@github.com/jcode-works/jcode-mimir.git
117
+ git clone git@github.com:jcode-works/jcode-mimir.git
118
+ cd jcode-mimir
119
+ pnpm install
120
+ pnpm build
83
121
  ```
84
122
 
85
123
  For local development:
86
124
 
87
125
  ```bash
88
- pnpm add -D file:../jcode-mimir
126
+ pnpm add -D file:../jcode-mimir/packages/mimir
89
127
  ```
90
128
 
91
129
  Before creating an npm tarball, run:
92
130
 
93
131
  ```bash
94
132
  pnpm build
95
- pnpm pack
133
+ pnpm --dir packages/mimir pack
96
134
  ```
97
135
 
98
136
  ## Use In Any Repository
@@ -110,6 +148,7 @@ pnpm exec kb ingest
110
148
  pnpm exec kb search "vendor invoice status"
111
149
  pnpm exec kb ask "What do the documents prove?"
112
150
  pnpm exec kb audit
151
+ pnpm exec kb security-audit
113
152
  pnpm exec kb status
114
153
  ```
115
154
 
@@ -121,12 +160,162 @@ npx kb ingest
121
160
  npx kb search "vendor invoice status"
122
161
  npx kb ask "What do the documents prove?"
123
162
  npx kb audit
163
+ npx kb security-audit
124
164
  npx kb status
125
165
  ```
126
166
 
127
- ## Agent Skill And MCP
167
+ ## Choose A Retrieval Mode
168
+
169
+ Mimir has two embedding modes.
170
+
171
+ ### Default Local Hash Retrieval
172
+
173
+ Use this when you want a fully local, no-model smoke test or a dependency-light setup. Retrieval is
174
+ lexical/hash-based, not semantic.
175
+
176
+ `.kb/config.json`:
177
+
178
+ ```json
179
+ {
180
+ "embeddingProvider": "local-hash"
181
+ }
182
+ ```
183
+
184
+ Commands:
185
+
186
+ ```bash
187
+ pnpm exec kb ingest
188
+ pnpm exec kb search "offline retrieval approval"
189
+ pnpm exec kb ask "What evidence supports offline operation?"
190
+ ```
191
+
192
+ `kb ask` always returns cited retrieved passages instead of a generated synthesis. You can pass those
193
+ passages to any LLM or agent you trust.
194
+
195
+ ### Optional Semantic Embeddings With Transformers.js
196
+
197
+ Use this when you want better semantic retrieval while keeping Mimir core free of an LLM server.
198
+
199
+ `.kb/config.json`:
200
+
201
+ ```json
202
+ {
203
+ "embeddingProvider": "transformers",
204
+ "embeddingModel": "mixedbread-ai/mxbai-embed-xsmall-v1",
205
+ "embeddingModelPath": ".mimir/models",
206
+ "transformersAllowRemoteModels": false
207
+ }
208
+ ```
209
+
210
+ Commands:
211
+
212
+ ```bash
213
+ pnpm exec kb ingest
214
+ pnpm exec kb ask "Which passages support offline operation?"
215
+ ```
216
+
217
+ Keep `transformersAllowRemoteModels` false for confidential or air-gapped work and preload model
218
+ files into `embeddingModelPath`. Set it to true only when you explicitly allow Transformers.js to
219
+ download model files from Hugging Face.
220
+
221
+ ## Dependency Footprint
222
+
223
+ Mimir can run retrieval without a model runtime. Some runtime dependencies remain because they provide
224
+ core features:
225
+
226
+ | Dependency | Why it remains |
227
+ | --- | --- |
228
+ | @huggingface/transformers | optional local semantic embeddings |
229
+ | LanceDB | local vector storage and nearest-neighbor retrieval |
230
+ | MCP SDK | MCP server for compatible agents |
231
+ | fast-glob | safe source-file discovery |
232
+ | unpdf, html-to-text, yaml, fflate | document parsing for PDF, HTML, YAML, Office/OpenDocument ZIP files |
233
+ | commander, zod, picocolors | CLI, config validation, readable terminal output |
234
+
235
+ Removing more dependencies is possible only by dropping features or replacing them with smaller
236
+ internal implementations. The current low-friction path is dependency-light at runtime for users who
237
+ choose `local-hash`, while preserving richer parsing, MCP support, and optional semantic embeddings.
238
+
239
+ ## Example Test Workspace
240
+
241
+ This repository includes a synthetic example under
242
+ [`examples/sovereign-rag-demo`](./examples/sovereign-rag-demo). It can be used to test ingestion,
243
+ retrieval, `security-audit`, and custom text extensions without using private documents.
244
+
245
+ From a local checkout:
246
+
247
+ ```bash
248
+ pnpm build
249
+ cd examples/sovereign-rag-demo
250
+ node ../../dist/cli.js security-audit
251
+ node ../../dist/cli.js ingest
252
+ node ../../dist/cli.js search "offline retrieval approval"
253
+ node ../../dist/cli.js audit
254
+ ```
255
+
256
+ The example uses the default local-hash retrieval mode, so it can run without downloading an
257
+ embedding or chat model.
258
+
259
+ ## Typical Workflows
260
+
261
+ ### Understand A Codebase
262
+
263
+ ```bash
264
+ pnpm exec kb init
265
+ printf "src\nREADME.md\ndocs\n" >> .kb/sources.txt
266
+ pnpm exec kb ingest
267
+ pnpm exec kb search "authentication flow"
268
+ pnpm exec kb ask "Explain the architecture and cite the relevant files."
269
+ ```
270
+
271
+ ### Analyze Specifications Or A Course
272
+
273
+ ```bash
274
+ pnpm exec kb ingest
275
+ pnpm exec kb ask "Summarize the requirements and list open questions."
276
+ pnpm exec kb ask "Create revision questions from the indexed course material."
277
+ ```
278
+
279
+ ### Work Offline
280
+
281
+ ```bash
282
+ pnpm exec kb security-audit --strict
283
+ pnpm exec kb ingest
284
+ pnpm exec kb search "incident review policy"
285
+ pnpm exec kb ask "What does the local evidence prove?"
286
+ ```
287
+
288
+ Use `embeddingProvider: "local-hash"` for a no-model offline workflow. Use
289
+ `embeddingProvider: "transformers"` with preloaded model files for semantic offline retrieval.
290
+ Generated answers should come from a trusted external agent or model runtime.
291
+
292
+ ### Generate A Local Audio Briefing
293
+
294
+ Mimir includes a plug-and-play JS text-to-speech path for listenable summaries:
295
+
296
+ ```bash
297
+ pnpm exec kb audio --doctor
298
+ pnpm exec kb audio /tmp/MIMIR-SUMMARY-project.txt --out .mimir/audio/project-summary.wav
299
+ ```
300
+
301
+ The command writes WAV output locally and does not require Python or ffmpeg. The first render can
302
+ download a public Transformers.js-compatible model into `.mimir/models/tts`; the narration text is
303
+ processed locally. For confidential air-gapped work, preload model files and run:
304
+
305
+ ```bash
306
+ pnpm exec kb audio /tmp/MIMIR-SUMMARY-project.txt --out .mimir/audio/project-summary.wav --offline
307
+ ```
128
308
 
129
- Mimir ships with a portable agent skill and a standard MCP server.
309
+ The standalone package can also be installed directly:
310
+
311
+ ```bash
312
+ pnpm add -D @jcode.labs/mimir-tts
313
+ pnpm exec mimir-tts render /tmp/MIMIR-SUMMARY-project.txt --out .mimir/audio/project-summary.wav
314
+ ```
315
+
316
+ ## Agent Skills And MCP
317
+
318
+ Mimir ships with portable agent skills and a standard MCP server.
130
319
 
131
320
  Install the agent kit into a repository:
132
321
 
@@ -138,12 +327,14 @@ This creates:
138
327
 
139
328
  ```plain text
140
329
  .mimir/skills/mimir/SKILL.md
330
+ .mimir/skills/mimir-audio-summary/SKILL.md
141
331
  .mimir/mcp.json
142
332
  .mimir/README.md
143
333
  ```
144
334
 
145
- Agents that support skill folders can load `.mimir/skills/mimir/`. Other agents can read the
146
- generated `.mimir/README.md` and use the MCP config snippet.
335
+ Agents that support skill folders can load `.mimir/skills/mimir/` for deep local RAG usage.
336
+ Load `.mimir/skills/mimir-audio-summary/` only when an optional spoken summary is needed.
337
+ Other agents can read the generated `.mimir/README.md` and use the MCP config snippet.
147
338
 
148
339
  Start the MCP server from the repository root:
149
340
 
@@ -157,6 +348,11 @@ MCP tools exposed:
157
348
  - `mimir_search`
158
349
  - `mimir_ask`
159
350
  - `mimir_audit`
351
+ - `mimir_security_audit`
352
+
353
+ This MCP layer is the recommended way to let any compatible LLM or agent query the same local
354
+ knowledge base. The LLM does not need to know about LanceDB or the raw file layout; it asks Mimir for
355
+ ranked passages or cited context and uses the returned citations.
160
356
 
161
357
  Print the bundled skill path from the installed package:
162
358
 
@@ -175,14 +371,46 @@ your-project/
175
371
  .kb/config.json # local config
176
372
  .kb/sources.txt # optional extra source paths
177
373
  .kb/storage/ # generated LanceDB index
374
+ .kb/access.log # metadata-only access log
178
375
  ```
179
376
 
180
377
  The package never ships project documents. `kb init` adds gitignore entries for `.kb/`
181
378
  and `private/**`, and `kb install-skill` keeps `.mimir/` ignored as generated local agent
182
379
  state.
183
380
 
381
+ ## Confidentiality Defaults
382
+
383
+ Mimir is designed for private repositories and sensitive local evidence.
384
+
385
+ - Zero telemetry: no analytics or document content is sent to JCode Labs.
386
+ - No LLM generation in core: Mimir returns cited context for the agent/runtime you choose.
387
+ - Local-hash by default: no model runtime is required for the default retrieval path.
388
+ - Transformers.js remote model loading is disabled by default.
389
+ - Redaction before indexing: common secrets and identifiers are redacted before chunks are
390
+ embedded and stored.
391
+ - Metadata-only access logs: query hashes and action metadata are logged, not raw queries.
392
+ - MCP is read-focused and bounded by `mcpMaxTopK`.
393
+ - Generated local state is ignored by Git.
394
+
395
+ Run:
396
+
397
+ ```bash
398
+ pnpm exec kb security-audit --strict
399
+ ```
400
+
401
+ Remove the generated vector index:
402
+
403
+ ```bash
404
+ pnpm exec kb destroy-index --yes
405
+ ```
406
+
407
+ For air-gapped operation, release verification, secure deletion limits, and threat model details,
408
+ read [`SECURITY-HARDENING.md`](./SECURITY-HARDENING.md).
409
+
184
410
  ## Supported Files
185
411
 
412
+ Mimir supports common text, document, data, config, log, and source-code files out of the box:
413
+
186
414
  - Markdown: `.md`, `.mdx`
187
415
  - Text: `.txt`, `.text`
188
416
  - JSON: `.json`
@@ -190,6 +418,32 @@ state.
190
418
  - CSV/TSV: `.csv`, `.tsv`
191
419
  - HTML: `.html`, `.htm`
192
420
  - PDF: `.pdf`
421
+ - Office/OpenDocument: `.docx`, `.pptx`, `.xlsx`, `.odt`, `.ods`, `.odp`
422
+ - Rich text: `.rtf`
423
+ - Line data and logs: `.jsonl`, `.ndjson`, `.log`
424
+ - XML feeds and documents: `.xml`, `.rss`, `.atom`
425
+ - Config and data files: `.toml`, `.ini`, `.conf`, `.cfg`, `.properties`, `.sql`
426
+ - Source code: `.ts`, `.tsx`, `.js`, `.jsx`, `.py`, `.go`, `.rs`, `.java`, `.rb`, `.php`,
427
+ `.cs`, `.c`, `.cpp`, `.h`, `.css`
428
+
429
+ Custom UTF-8 text extensions can be enabled without changing code:
430
+
431
+ ```json
432
+ {
433
+ "includeExtensions": [".transcript", ".evidence"]
434
+ }
435
+ ```
436
+
437
+ Or through:
438
+
439
+ ```bash
440
+ KB_INCLUDE_EXTENSIONS=".transcript,.evidence" pnpm exec kb ingest
441
+ ```
442
+
443
+ Images, scans, audio/video files, old proprietary Office binaries such as `.doc`, and other formats
444
+ that are not listed should be OCRed, transcribed, converted, or exported to text/PDF/HTML first.
445
+ Mimir intentionally avoids pretending that every binary format can be indexed safely without
446
+ extraction logic.
193
447
 
194
448
  ## Config
195
449
 
@@ -200,13 +454,23 @@ state.
200
454
  "rawDir": "private",
201
455
  "storageDir": ".kb/storage",
202
456
  "sourcesFile": ".kb/sources.txt",
457
+ "accessLogPath": ".kb/access.log",
458
+ "embeddingModelPath": ".mimir/models",
203
459
  "tableName": "chunks",
204
- "ollamaHost": "http://localhost:11434",
205
- "embedModel": "nomic-embed-text",
206
- "llmModel": "gemma4:latest",
460
+ "embeddingProvider": "local-hash",
461
+ "embeddingModel": "mixedbread-ai/mxbai-embed-xsmall-v1",
462
+ "transformersAllowRemoteModels": false,
463
+ "redaction": {
464
+ "enabled": true,
465
+ "builtIn": true,
466
+ "patterns": []
467
+ },
468
+ "accessLog": true,
469
+ "mcpMaxTopK": 10,
207
470
  "topK": 5,
208
471
  "chunkSize": 1200,
209
- "chunkOverlap": 150
472
+ "chunkOverlap": 150,
473
+ "includeExtensions": []
210
474
  }
211
475
  ```
212
476
 
@@ -215,12 +479,19 @@ Environment overrides:
215
479
  - `KB_RAW_DIR`
216
480
  - `KB_STORAGE_DIR`
217
481
  - `KB_SOURCES_FILE`
218
- - `KB_OLLAMA_HOST`
219
- - `KB_EMBED_MODEL`
220
- - `KB_LLM_MODEL`
482
+ - `KB_ACCESS_LOG_PATH`
483
+ - `KB_EMBEDDING_PROVIDER`
484
+ - `KB_EMBEDDING_MODEL`
485
+ - `KB_EMBEDDING_MODEL_PATH`
486
+ - `KB_TRANSFORMERS_ALLOW_REMOTE_MODELS`
487
+ - `KB_REDACTION_ENABLED`
488
+ - `KB_REDACTION_BUILT_IN`
489
+ - `KB_ACCESS_LOG`
490
+ - `KB_MCP_MAX_TOP_K`
221
491
  - `KB_TOP_K`
222
492
  - `KB_CHUNK_SIZE`
223
493
  - `KB_CHUNK_OVERLAP`
494
+ - `KB_INCLUDE_EXTENSIONS`
224
495
 
225
496
  ## Library API
226
497
 
@@ -234,7 +505,11 @@ const answer = await ask("What documents support the project timeline?")
234
505
 
235
506
  ## Privacy
236
507
 
237
- - Embeddings and answers use local Ollama by default.
508
+ - Mimir core does not generate answers or call a chat model.
509
+ - `local-hash` can run ingestion, search, and cited retrieval without a model runtime.
510
+ - Transformers.js remote model loading is disabled by default.
511
+ - Built-in redaction runs before indexing by default.
512
+ - Access logs store query hashes, not raw queries.
238
513
  - The vector index is stored locally.
239
514
  - Raw private documents should stay in the target repository's ignored `private/` folder.
240
515
  - Do not put secrets or scans inside this package repository.