@jcode.labs/mimir 0.3.0 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/README.md +284 -40
  2. package/dist/chunking.d.ts.map +1 -1
  3. package/dist/chunking.js +6 -3
  4. package/dist/chunking.js.map +1 -1
  5. package/dist/cli.js +121 -9
  6. package/dist/cli.js.map +1 -1
  7. package/dist/config.d.ts.map +1 -1
  8. package/dist/config.js +50 -36
  9. package/dist/config.js.map +1 -1
  10. package/dist/defaults.d.ts +11 -0
  11. package/dist/defaults.d.ts.map +1 -0
  12. package/dist/defaults.js +31 -0
  13. package/dist/defaults.js.map +1 -0
  14. package/dist/embeddings.d.ts.map +1 -1
  15. package/dist/embeddings.js +85 -11
  16. package/dist/embeddings.js.map +1 -1
  17. package/dist/files.d.ts +2 -1
  18. package/dist/files.d.ts.map +1 -1
  19. package/dist/files.js +39 -2
  20. package/dist/files.js.map +1 -1
  21. package/dist/gitignore.d.ts +1 -1
  22. package/dist/gitignore.d.ts.map +1 -1
  23. package/dist/gitignore.js +8 -7
  24. package/dist/gitignore.js.map +1 -1
  25. package/dist/ingest.d.ts.map +1 -1
  26. package/dist/ingest.js +2 -1
  27. package/dist/ingest.js.map +1 -1
  28. package/dist/init.d.ts.map +1 -1
  29. package/dist/init.js +4 -24
  30. package/dist/init.js.map +1 -1
  31. package/dist/mcp.d.ts.map +1 -1
  32. package/dist/mcp.js +14 -13
  33. package/dist/mcp.js.map +1 -1
  34. package/dist/parsing.d.ts.map +1 -1
  35. package/dist/parsing.js +138 -0
  36. package/dist/parsing.js.map +1 -1
  37. package/dist/query.d.ts.map +1 -1
  38. package/dist/query.js +14 -22
  39. package/dist/query.js.map +1 -1
  40. package/dist/security.js +16 -18
  41. package/dist/security.js.map +1 -1
  42. package/dist/skill.d.ts +2 -1
  43. package/dist/skill.d.ts.map +1 -1
  44. package/dist/skill.js +24 -9
  45. package/dist/skill.js.map +1 -1
  46. package/dist/store.d.ts.map +1 -1
  47. package/dist/store.js +2 -1
  48. package/dist/store.js.map +1 -1
  49. package/dist/types.d.ts +12 -14
  50. package/dist/types.d.ts.map +1 -1
  51. package/dist/version.d.ts +1 -1
  52. package/dist/version.js +1 -1
  53. package/examples/sovereign-rag-demo/.kb/config.json +22 -0
  54. package/examples/sovereign-rag-demo/.kb/sources.txt +2 -0
  55. package/examples/sovereign-rag-demo/README.md +80 -0
  56. package/examples/sovereign-rag-demo/raw/dataset-inventory.csv +5 -0
  57. package/examples/sovereign-rag-demo/raw/incident-timeline.jsonl +4 -0
  58. package/examples/sovereign-rag-demo/raw/operations-brief.md +16 -0
  59. package/examples/sovereign-rag-demo/raw/review-notes.evidence +11 -0
  60. package/examples/sovereign-rag-demo/raw/security-policy.yaml +14 -0
  61. package/package.json +23 -29
  62. package/skills/mimir/SKILL.md +66 -5
  63. package/skills/mimir-audio-summary/SKILL.md +140 -0
  64. package/skills/mimir-audio-summary/forge-voice.sh +150 -0
  65. package/skills/mimir-audio-summary/split-lines.py +13 -0
  66. package/skills/mimir-audio-summary/xtts-voice.py +46 -0
  67. package/CHANGELOG.md +0 -28
  68. package/SECURITY-HARDENING.md +0 -156
  69. package/SECURITY.md +0 -21
  70. package/dist/network.d.ts +0 -4
  71. package/dist/network.d.ts.map +0 -1
  72. package/dist/network.js +0 -59
  73. package/dist/network.js.map +0 -1
package/README.md CHANGED
@@ -3,13 +3,20 @@
3
3
  [![CI](https://github.com/jcode-works/jcode-mimir/actions/workflows/ci.yml/badge.svg)](https://github.com/jcode-works/jcode-mimir/actions/workflows/ci.yml)
4
4
  [![CodeQL](https://github.com/jcode-works/jcode-mimir/actions/workflows/codeql.yml/badge.svg)](https://github.com/jcode-works/jcode-mimir/actions/workflows/codeql.yml)
5
5
  [![npm](https://img.shields.io/npm/v/@jcode.labs/mimir)](https://www.npmjs.com/package/@jcode.labs/mimir)
6
- [![license: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](./LICENSE)
6
+ [![license: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/jcode-works/jcode-mimir/blob/main/LICENSE)
7
7
 
8
- Open-source, local-first memory and retrieval for private project knowledge.
8
+ Open-source, sovereign local RAG for confidential datasets and AI agents.
9
9
 
10
- Mimir provides a TypeScript CLI and library that can be installed in any Node.js
11
- repository. It indexes files from the target repository, stores vectors locally with LanceDB,
12
- and uses Ollama for local embeddings and answers.
10
+ Mimir provides a TypeScript CLI, library, MCP server, and portable agent skills that can be
11
+ installed in any Node.js repository. It indexes local files from the target repository, stores
12
+ vectors locally with LanceDB, and can use either built-in local-hash retrieval or optional
13
+ Transformers.js semantic embeddings. Mimir core returns cited retrieval context; answer synthesis
14
+ belongs to the AI agent, LLM, or local model runtime you choose around it.
15
+
16
+ The intended use case is simple: put confidential company, institutional, legal, operational, or
17
+ research documents in a private local folder, index them locally, then let any compatible AI agent or
18
+ LLM workflow retrieve grounded context for summaries, briefs, audits, and decision support without
19
+ shipping the dataset to a hosted RAG service.
13
20
 
14
21
  Created by Jean-Baptiste Thery and published under the JCode Labs npm scope.
15
22
 
@@ -20,8 +27,10 @@ Built by Jean-Baptiste Thery, freelance full-stack/AI tooling engineer at JCode
20
27
  Mimir is a public open-source project under the MIT License. It is designed to be
21
28
  inspectable, forkable, and usable without a JCode Labs account.
22
29
 
23
- Contributions are welcome through pull requests. Start with [`CONTRIBUTING.md`](./CONTRIBUTING.md).
24
- Security reports should stay private and follow the policy in [`SECURITY.md`](./SECURITY.md).
30
+ Contributions are welcome through pull requests. Start with
31
+ [`CONTRIBUTING.md`](https://github.com/jcode-works/jcode-mimir/blob/main/CONTRIBUTING.md).
32
+ Security reports should stay private and follow the policy in
33
+ [`SECURITY.md`](https://github.com/jcode-works/jcode-mimir/blob/main/SECURITY.md).
25
34
 
26
35
  ## Sponsors
27
36
 
@@ -41,22 +50,53 @@ Suggested GitHub Sponsors tiers:
41
50
 
42
51
  Early public package. APIs may evolve before `1.0.0`.
43
52
 
53
+ ## What Mimir Is For
54
+
55
+ - Build a local RAG knowledge base inside any repository.
56
+ - Analyze confidential datasets while keeping raw files and generated indexes local.
57
+ - Give Claude, Codex, Cursor, internal assistants, or other MCP-compatible tools the same private
58
+ retrieval layer.
59
+ - Retrieve grounded local evidence through CLI, library calls, MCP tools, or the bundled agent
60
+ skills so your chosen AI agent can produce cited summaries.
61
+ - Optionally create listenable MP3 or WAV summaries with `kb audio`, `@jcode.labs/mimir-tts`, and
62
+ the bundled `mimir-audio-summary` skill.
63
+
64
+ Mimir is not a hosted SaaS, not a remote vector database, and not a certified high-assurance system.
65
+ For regulated or state-grade environments, pair it with encrypted disks, controlled machines, release
66
+ verification, and an external security review.
67
+
68
+ ## Use Cases
69
+
70
+ Mimir is useful whenever the source material should stay local but an AI agent still needs grounded
71
+ context.
72
+
73
+ | Use case | Example questions |
74
+ | --- | --- |
75
+ | Understand a code repository | "Where is authentication implemented?", "What depends on this module?", "Summarize the payment flow." |
76
+ | Understand architecture | "What services exist?", "What are the data boundaries?", "Which components are risky to change?" |
77
+ | Analyze specifications | "What does the technical spec require?", "Which requirements are still unclear?", "Generate an implementation checklist." |
78
+ | Work through a request for proposal or tender | "What are the mandatory constraints?", "Which documents prove compliance?", "What risks should be clarified?" |
79
+ | Study courses and training material | "Summarize chapter three.", "Create revision questions.", "Compare these two concepts." |
80
+ | Analyze a book or long report | "Extract the main thesis.", "Find recurring arguments.", "Create a chapter-by-chapter brief." |
81
+ | Build an internal knowledge base | "What is the policy for incident review?", "Who owns this process?", "Which source says that?" |
82
+ | Prepare meetings or decisions | "Give me a one-page briefing.", "What is missing before deciding?", "List action items and evidence." |
83
+ | Ask questions over offline documents | "Which files mention local-only operation?", "What evidence supports this claim?" |
84
+ | Generate audio briefings | "Create a listenable summary of the current dossier, in high-quality or offline mode." |
85
+
44
86
  ## Requirements
45
87
 
46
88
  - Node.js 20+
47
89
  - pnpm, npm, yarn, or bun
48
- - Ollama running locally
49
- - Embedding model installed once:
50
-
51
- ```bash
52
- ollama pull nomic-embed-text
53
- ```
54
-
55
- Optional answer model:
56
-
57
- ```bash
58
- ollama pull gemma4
59
- ```
90
+ - No model runtime is required for the default `embeddingProvider: "local-hash"` mode.
91
+ - Optional semantic embeddings use Transformers.js with local model files under `.mimir/models` by
92
+ default.
93
+ - Generated answers are intentionally outside Mimir core. Use Claude, Codex, OpenAI, a local model
94
+ MCP server, or another trusted model runtime to synthesize from Mimir's cited context.
95
+ - Optional audio summaries use the separate `@jcode.labs/mimir-tts` workspace package. For the
96
+ highest quality, install the external `edge-tts` CLI and render Edge MP3 output with
97
+ `fr-FR-DeniseNeural`. For confidential or air-gapped content, use the Transformers.js WAV path
98
+ with `--engine transformers --offline`; it does not require Python, ffmpeg, Piper, XTTS, or a
99
+ local server.
60
100
 
61
101
  ## Install From npm
62
102
 
@@ -76,23 +116,26 @@ npm install --save-dev @jcode.labs/mimir
76
116
 
77
117
  Maintainer tokens are only needed to publish new versions.
78
118
 
79
- ## Install From Git
119
+ ## Install From Source Checkout
80
120
 
81
121
  ```bash
82
- pnpm add -D git+ssh://git@github.com/jcode-works/jcode-mimir.git
122
+ git clone git@github.com:jcode-works/jcode-mimir.git
123
+ cd jcode-mimir
124
+ pnpm install
125
+ pnpm build
83
126
  ```
84
127
 
85
128
  For local development:
86
129
 
87
130
  ```bash
88
- pnpm add -D file:../jcode-mimir
131
+ pnpm add -D file:../jcode-mimir/packages/mimir
89
132
  ```
90
133
 
91
134
  Before creating an npm tarball, run:
92
135
 
93
136
  ```bash
94
137
  pnpm build
95
- pnpm pack
138
+ pnpm --dir packages/mimir pack
96
139
  ```
97
140
 
98
141
  ## Use In Any Repository
@@ -126,9 +169,170 @@ npx kb security-audit
126
169
  npx kb status
127
170
  ```
128
171
 
129
- ## Agent Skill And MCP
172
+ ## Choose A Retrieval Mode
173
+
174
+ Mimir has two embedding modes.
175
+
176
+ ### Default Local Hash Retrieval
177
+
178
+ Use this when you want a fully local, no-model smoke test or a dependency-light setup. Retrieval is
179
+ lexical/hash-based, not semantic.
180
+
181
+ `.kb/config.json`:
182
+
183
+ ```json
184
+ {
185
+ "embeddingProvider": "local-hash"
186
+ }
187
+ ```
188
+
189
+ Commands:
190
+
191
+ ```bash
192
+ pnpm exec kb ingest
193
+ pnpm exec kb search "offline retrieval approval"
194
+ pnpm exec kb ask "What evidence supports offline operation?"
195
+ ```
196
+
197
+ `kb ask` always returns cited retrieved passages instead of a generated synthesis. You can pass those
198
+ passages to any LLM or agent you trust.
199
+
200
+ ### Optional Semantic Embeddings With Transformers.js
201
+
202
+ Use this when you want better semantic retrieval while keeping Mimir core free of an LLM server.
203
+
204
+ `.kb/config.json`:
205
+
206
+ ```json
207
+ {
208
+ "embeddingProvider": "transformers",
209
+ "embeddingModel": "mixedbread-ai/mxbai-embed-xsmall-v1",
210
+ "embeddingModelPath": ".mimir/models",
211
+ "transformersAllowRemoteModels": false
212
+ }
213
+ ```
214
+
215
+ Commands:
216
+
217
+ ```bash
218
+ pnpm exec kb ingest
219
+ pnpm exec kb ask "Which passages support offline operation?"
220
+ ```
221
+
222
+ Keep `transformersAllowRemoteModels` false for confidential or air-gapped work and preload model
223
+ files into `embeddingModelPath`. Set it to true only when you explicitly allow Transformers.js to
224
+ download model files from Hugging Face.
225
+
226
+ ## Dependency Footprint
227
+
228
+ Mimir can run retrieval without a model runtime. Some runtime dependencies remain because they provide
229
+ core features:
230
+
231
+ | Dependency | Why it remains |
232
+ | --- | --- |
233
+ | @huggingface/transformers | optional local semantic embeddings |
234
+ | LanceDB | local vector storage and nearest-neighbor retrieval |
235
+ | MCP SDK | MCP server for compatible agents |
236
+ | fast-glob | safe source-file discovery |
237
+ | unpdf, html-to-text, yaml, fflate | document parsing for PDF, HTML, YAML, Office/OpenDocument ZIP files |
238
+ | commander, zod, picocolors | CLI, config validation, readable terminal output |
239
+
240
+ Removing more dependencies is possible only by dropping features or replacing them with smaller
241
+ internal implementations. The current low-friction path is dependency-light at runtime for users who
242
+ choose `local-hash`, while preserving richer parsing, MCP support, and optional semantic embeddings.
243
+
244
+ ## Example Test Workspace
245
+
246
+ This repository includes a synthetic example under
247
+ [`examples/sovereign-rag-demo`](./examples/sovereign-rag-demo). It can be used to test ingestion,
248
+ retrieval, `security-audit`, and custom text extensions without using private documents.
249
+
250
+ From a local checkout:
251
+
252
+ ```bash
253
+ pnpm build
254
+ cd examples/sovereign-rag-demo
255
+ node ../../dist/cli.js security-audit
256
+ node ../../dist/cli.js ingest
257
+ node ../../dist/cli.js search "offline retrieval approval"
258
+ node ../../dist/cli.js audit
259
+ ```
260
+
261
+ The example uses the default local-hash retrieval mode, so it can run without downloading an
262
+ embedding or chat model.
263
+
264
+ ## Typical Workflows
265
+
266
+ ### Understand A Codebase
267
+
268
+ ```bash
269
+ pnpm exec kb init
270
+ printf "src\nREADME.md\ndocs\n" >> .kb/sources.txt
271
+ pnpm exec kb ingest
272
+ pnpm exec kb search "authentication flow"
273
+ pnpm exec kb ask "Explain the architecture and cite the relevant files."
274
+ ```
275
+
276
+ ### Analyze Specifications Or A Course
277
+
278
+ ```bash
279
+ pnpm exec kb ingest
280
+ pnpm exec kb ask "Summarize the requirements and list open questions."
281
+ pnpm exec kb ask "Create revision questions from the indexed course material."
282
+ ```
283
+
284
+ ### Work Offline
285
+
286
+ ```bash
287
+ pnpm exec kb security-audit --strict
288
+ pnpm exec kb ingest
289
+ pnpm exec kb search "incident review policy"
290
+ pnpm exec kb ask "What does the local evidence prove?"
291
+ ```
292
+
293
+ Use `embeddingProvider: "local-hash"` for a no-model offline workflow. Use
294
+ `embeddingProvider: "transformers"` with preloaded model files for semantic offline retrieval.
295
+ Generated answers should come from a trusted external agent or model runtime.
296
+
297
+ ### Generate An Audio Briefing
298
+
299
+ Mimir includes a plug-and-play text-to-speech path for listenable summaries. For the same quality
300
+ path as the global Voice Forge skill, install `edge-tts` and render MP3:
301
+
302
+ ```bash
303
+ pnpm exec kb audio --doctor
304
+ pipx install edge-tts
305
+ pnpm exec kb audio /tmp/MIMIR-SUMMARY-project.txt \
306
+ --engine edge \
307
+ --out .mimir/audio/project-summary.mp3
308
+ ```
309
+
310
+ The Edge path uses the online Microsoft Edge TTS service through the `edge-tts` CLI. Use it only
311
+ when sending the narration text to that service is acceptable.
312
+
313
+ For confidential or air-gapped work, preload Transformers.js-compatible model files and render WAV
314
+ offline:
315
+
316
+ ```bash
317
+ pnpm exec kb audio /tmp/MIMIR-SUMMARY-project.txt \
318
+ --engine transformers \
319
+ --offline \
320
+ --model-path .mimir/models/tts \
321
+ --out .mimir/audio/project-summary.wav
322
+ ```
130
323
 
131
- Mimir ships with a portable agent skill and a standard MCP server.
324
+ The standalone package can also be installed directly:
325
+
326
+ ```bash
327
+ pnpm add -D @jcode.labs/mimir-tts
328
+ pnpm exec mimir-tts render /tmp/MIMIR-SUMMARY-project.txt \
329
+ --engine edge \
330
+ --out .mimir/audio/project-summary.mp3
331
+ ```
332
+
333
+ ## Agent Skills And MCP
334
+
335
+ Mimir ships with portable agent skills and a standard MCP server.
132
336
 
133
337
  Install the agent kit into a repository:
134
338
 
@@ -140,12 +344,14 @@ This creates:
140
344
 
141
345
  ```plain text
142
346
  .mimir/skills/mimir/SKILL.md
347
+ .mimir/skills/mimir-audio-summary/SKILL.md
143
348
  .mimir/mcp.json
144
349
  .mimir/README.md
145
350
  ```
146
351
 
147
- Agents that support skill folders can load `.mimir/skills/mimir/`. Other agents can read the
148
- generated `.mimir/README.md` and use the MCP config snippet.
352
+ Agents that support skill folders can load `.mimir/skills/mimir/` for deep local RAG usage.
353
+ Load `.mimir/skills/mimir-audio-summary/` only when an optional spoken summary is needed.
354
+ Other agents can read the generated `.mimir/README.md` and use the MCP config snippet.
149
355
 
150
356
  Start the MCP server from the repository root:
151
357
 
@@ -161,6 +367,10 @@ MCP tools exposed:
161
367
  - `mimir_audit`
162
368
  - `mimir_security_audit`
163
369
 
370
+ This MCP layer is the recommended way to let any compatible LLM or agent query the same local
371
+ knowledge base. The LLM does not need to know about LanceDB or the raw file layout; it asks Mimir for
372
+ ranked passages or cited context and uses the returned citations.
373
+
164
374
  Print the bundled skill path from the installed package:
165
375
 
166
376
  ```bash
@@ -190,7 +400,9 @@ state.
190
400
  Mimir is designed for private repositories and sensitive local evidence.
191
401
 
192
402
  - Zero telemetry: no analytics or document content is sent to JCode Labs.
193
- - Local-only network policy: Ollama must be on loopback by default.
403
+ - No LLM generation in core: Mimir returns cited context for the agent/runtime you choose.
404
+ - Local-hash by default: no model runtime is required for the default retrieval path.
405
+ - Transformers.js remote model loading is disabled by default.
194
406
  - Redaction before indexing: common secrets and identifiers are redacted before chunks are
195
407
  embedded and stored.
196
408
  - Metadata-only access logs: query hashes and action metadata are logged, not raw queries.
@@ -210,10 +422,13 @@ pnpm exec kb destroy-index --yes
210
422
  ```
211
423
 
212
424
  For air-gapped operation, release verification, secure deletion limits, and threat model details,
213
- read [`SECURITY-HARDENING.md`](./SECURITY-HARDENING.md).
425
+ read
426
+ [`SECURITY-HARDENING.md`](https://github.com/jcode-works/jcode-mimir/blob/main/SECURITY-HARDENING.md).
214
427
 
215
428
  ## Supported Files
216
429
 
430
+ Mimir supports common text, document, data, config, log, and source-code files out of the box:
431
+
217
432
  - Markdown: `.md`, `.mdx`
218
433
  - Text: `.txt`, `.text`
219
434
  - JSON: `.json`
@@ -221,6 +436,32 @@ read [`SECURITY-HARDENING.md`](./SECURITY-HARDENING.md).
221
436
  - CSV/TSV: `.csv`, `.tsv`
222
437
  - HTML: `.html`, `.htm`
223
438
  - PDF: `.pdf`
439
+ - Office/OpenDocument: `.docx`, `.pptx`, `.xlsx`, `.odt`, `.ods`, `.odp`
440
+ - Rich text: `.rtf`
441
+ - Line data and logs: `.jsonl`, `.ndjson`, `.log`
442
+ - XML feeds and documents: `.xml`, `.rss`, `.atom`
443
+ - Config and data files: `.toml`, `.ini`, `.conf`, `.cfg`, `.properties`, `.sql`
444
+ - Source code: `.ts`, `.tsx`, `.js`, `.jsx`, `.py`, `.go`, `.rs`, `.java`, `.rb`, `.php`,
445
+ `.cs`, `.c`, `.cpp`, `.h`, `.css`
446
+
447
+ Custom UTF-8 text extensions can be enabled without changing code:
448
+
449
+ ```json
450
+ {
451
+ "includeExtensions": [".transcript", ".evidence"]
452
+ }
453
+ ```
454
+
455
+ Or through:
456
+
457
+ ```bash
458
+ KB_INCLUDE_EXTENSIONS=".transcript,.evidence" pnpm exec kb ingest
459
+ ```
460
+
461
+ Images, scans, audio/video files, old proprietary Office binaries such as `.doc`, and other formats
462
+ that are not listed should be OCRed, transcribed, converted, or exported to text/PDF/HTML first.
463
+ Mimir intentionally avoids pretending that every binary format can be indexed safely without
464
+ extraction logic.
224
465
 
225
466
  ## Config
226
467
 
@@ -232,11 +473,11 @@ read [`SECURITY-HARDENING.md`](./SECURITY-HARDENING.md).
232
473
  "storageDir": ".kb/storage",
233
474
  "sourcesFile": ".kb/sources.txt",
234
475
  "accessLogPath": ".kb/access.log",
476
+ "embeddingModelPath": ".mimir/models",
235
477
  "tableName": "chunks",
236
- "ollamaHost": "http://localhost:11434",
237
- "networkPolicy": "local-only",
238
- "embedModel": "nomic-embed-text",
239
- "llmModel": "gemma4:latest",
478
+ "embeddingProvider": "local-hash",
479
+ "embeddingModel": "mixedbread-ai/mxbai-embed-xsmall-v1",
480
+ "transformersAllowRemoteModels": false,
240
481
  "redaction": {
241
482
  "enabled": true,
242
483
  "builtIn": true,
@@ -246,7 +487,8 @@ read [`SECURITY-HARDENING.md`](./SECURITY-HARDENING.md).
246
487
  "mcpMaxTopK": 10,
247
488
  "topK": 5,
248
489
  "chunkSize": 1200,
249
- "chunkOverlap": 150
490
+ "chunkOverlap": 150,
491
+ "includeExtensions": []
250
492
  }
251
493
  ```
252
494
 
@@ -256,10 +498,10 @@ Environment overrides:
256
498
  - `KB_STORAGE_DIR`
257
499
  - `KB_SOURCES_FILE`
258
500
  - `KB_ACCESS_LOG_PATH`
259
- - `KB_OLLAMA_HOST`
260
- - `KB_NETWORK_POLICY`
261
- - `KB_EMBED_MODEL`
262
- - `KB_LLM_MODEL`
501
+ - `KB_EMBEDDING_PROVIDER`
502
+ - `KB_EMBEDDING_MODEL`
503
+ - `KB_EMBEDDING_MODEL_PATH`
504
+ - `KB_TRANSFORMERS_ALLOW_REMOTE_MODELS`
263
505
  - `KB_REDACTION_ENABLED`
264
506
  - `KB_REDACTION_BUILT_IN`
265
507
  - `KB_ACCESS_LOG`
@@ -267,6 +509,7 @@ Environment overrides:
267
509
  - `KB_TOP_K`
268
510
  - `KB_CHUNK_SIZE`
269
511
  - `KB_CHUNK_OVERLAP`
512
+ - `KB_INCLUDE_EXTENSIONS`
270
513
 
271
514
  ## Library API
272
515
 
@@ -280,8 +523,9 @@ const answer = await ask("What documents support the project timeline?")
280
523
 
281
524
  ## Privacy
282
525
 
283
- - Embeddings and answers use local Ollama by default.
284
- - Remote Ollama hosts are blocked unless `networkPolicy` explicitly allows them.
526
+ - Mimir core does not generate answers or call a chat model.
527
+ - `local-hash` can run ingestion, search, and cited retrieval without a model runtime.
528
+ - Transformers.js remote model loading is disabled by default.
285
529
  - Built-in redaction runs before indexing by default.
286
530
  - Access logs store query hashes, not raw queries.
287
531
  - The vector index is stored locally.
@@ -1 +1 @@
1
- {"version":3,"file":"chunking.d.ts","sourceRoot":"","sources":["../src/chunking.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,YAAY,CAAA;AAE3D,wBAAgB,aAAa,CAC3B,QAAQ,EAAE,cAAc,EACxB,SAAS,EAAE,MAAM,EACjB,YAAY,EAAE,MAAM,GACnB,SAAS,EAAE,CAqCb"}
1
+ {"version":3,"file":"chunking.d.ts","sourceRoot":"","sources":["../src/chunking.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,YAAY,CAAA;AAM3D,wBAAgB,aAAa,CAC3B,QAAQ,EAAE,cAAc,EACxB,SAAS,EAAE,MAAM,EACjB,YAAY,EAAE,MAAM,GACnB,SAAS,EAAE,CAqCb"}
package/dist/chunking.js CHANGED
@@ -1,4 +1,7 @@
1
1
  import { createHash } from "node:crypto";
2
+ const PARAGRAPH_BREAK_MIN_RATIO = 0.45;
3
+ const SENTENCE_BREAK_MIN_RATIO = 0.55;
4
+ const WHITESPACE_BREAK_MIN_RATIO = 0.75;
2
5
  export function chunkDocument(document, chunkSize, chunkOverlap) {
3
6
  if (!document.text) {
4
7
  return [];
@@ -39,15 +42,15 @@ function chooseChunkEnd(text, cursor, chunkSize) {
39
42
  }
40
43
  const window = text.slice(cursor, hardEnd);
41
44
  const paragraphBreak = window.lastIndexOf("\n\n");
42
- if (paragraphBreak > chunkSize * 0.45) {
45
+ if (paragraphBreak > chunkSize * PARAGRAPH_BREAK_MIN_RATIO) {
43
46
  return cursor + paragraphBreak;
44
47
  }
45
48
  const sentenceBreak = Math.max(window.lastIndexOf(". "), window.lastIndexOf("? "), window.lastIndexOf("! "));
46
- if (sentenceBreak > chunkSize * 0.55) {
49
+ if (sentenceBreak > chunkSize * SENTENCE_BREAK_MIN_RATIO) {
47
50
  return cursor + sentenceBreak + 1;
48
51
  }
49
52
  const whitespace = window.lastIndexOf(" ");
50
- if (whitespace > chunkSize * 0.75) {
53
+ if (whitespace > chunkSize * WHITESPACE_BREAK_MIN_RATIO) {
51
54
  return cursor + whitespace;
52
55
  }
53
56
  return hardEnd;
@@ -1 +1 @@
1
- {"version":3,"file":"chunking.js","sourceRoot":"","sources":["../src/chunking.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAA;AAGxC,MAAM,UAAU,aAAa,CAC3B,QAAwB,EACxB,SAAiB,EACjB,YAAoB;IAEpB,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnB,OAAO,EAAE,CAAA;IACX,CAAC;IAED,MAAM,MAAM,GAAgB,EAAE,CAAA;IAC9B,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,IAAI,UAAU,GAAG,CAAC,CAAA;IAElB,OAAO,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,cAAc,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,CAAC,CAAA;QAC5D,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;QAEpD,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,EAAE,GAAG,UAAU,CAAC,QAAQ,CAAC;iBAC5B,MAAM,CAAC,GAAG,QAAQ,CAAC,IAAI,CAAC,YAAY,IAAI,UAAU,IAAI,IAAI,EAAE,CAAC;iBAC7D,MAAM,CAAC,KAAK,CAAC,CAAA;YAChB,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE;gBACF,MAAM,EAAE,QAAQ,CAAC,IAAI,CAAC,MAAM;gBAC5B,YAAY,EAAE,QAAQ,CAAC,IAAI,CAAC,YAAY;gBACxC,UAAU;gBACV,IAAI;gBACJ,QAAQ,EAAE,QAAQ,CAAC,IAAI,CAAC,QAAQ;gBAChC,KAAK,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK;gBAC1B,OAAO,EAAE,QAAQ,CAAC,IAAI,CAAC,OAAO;aAC/B,CAAC,CAAA;YACF,UAAU,IAAI,CAAC,CAAA;QACjB,CAAC;QAED,IAAI,GAAG,IAAI,QAAQ,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;YAChC,MAAK;QACP,CAAC;QACD,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,YAAY,EAAE,MAAM,GAAG,CAAC,CAAC,CAAA;IACnD,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED,SAAS,cAAc,CAAC,IAAY,EAAE,MAAc,EAAE,SAAiB;IACrE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,CAAA;IACzD,IAAI,OAAO,KAAK,IAAI,CAAC,MAAM,EAAE,CAAC;QAC5B,OAAO,OAAO,CAAA;IAChB,CAAC;IAED,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;IAC1C,MAAM,cAAc,GAAG,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,CAAA;IACjD,IAAI,cAAc,GAAG,SAAS,GAAG,IAAI,EAAE,CAAC;QACtC,OAAO,MAAM,GAAG,cAAc,CAAA;IAChC,CAAC;IAED,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAC5B,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,EACxB,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,EACxB,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,CACzB,CAAA;IACD,IAAI,aAAa,GAAG,SAAS,GAAG,IAAI,EAAE,CAAC;QACrC,OAAO,MAAM,GAAG,aAAa,GAAG,CAAC,CAAA;IACnC,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,CAAC,WAAW,CAAC,GAAG,C
AAC,CAAA;IAC1C,IAAI,UAAU,GAAG,SAAS,GAAG,IAAI,EAAE,CAAC;QAClC,OAAO,MAAM,GAAG,UAAU,CAAA;IAC5B,CAAC;IAED,OAAO,OAAO,CAAA;AAChB,CAAC"}
1
+ {"version":3,"file":"chunking.js","sourceRoot":"","sources":["../src/chunking.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAA;AAGxC,MAAM,yBAAyB,GAAG,IAAI,CAAA;AACtC,MAAM,wBAAwB,GAAG,IAAI,CAAA;AACrC,MAAM,0BAA0B,GAAG,IAAI,CAAA;AAEvC,MAAM,UAAU,aAAa,CAC3B,QAAwB,EACxB,SAAiB,EACjB,YAAoB;IAEpB,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnB,OAAO,EAAE,CAAA;IACX,CAAC;IAED,MAAM,MAAM,GAAgB,EAAE,CAAA;IAC9B,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,IAAI,UAAU,GAAG,CAAC,CAAA;IAElB,OAAO,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,cAAc,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,CAAC,CAAA;QAC5D,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;QAEpD,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,EAAE,GAAG,UAAU,CAAC,QAAQ,CAAC;iBAC5B,MAAM,CAAC,GAAG,QAAQ,CAAC,IAAI,CAAC,YAAY,IAAI,UAAU,IAAI,IAAI,EAAE,CAAC;iBAC7D,MAAM,CAAC,KAAK,CAAC,CAAA;YAChB,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE;gBACF,MAAM,EAAE,QAAQ,CAAC,IAAI,CAAC,MAAM;gBAC5B,YAAY,EAAE,QAAQ,CAAC,IAAI,CAAC,YAAY;gBACxC,UAAU;gBACV,IAAI;gBACJ,QAAQ,EAAE,QAAQ,CAAC,IAAI,CAAC,QAAQ;gBAChC,KAAK,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK;gBAC1B,OAAO,EAAE,QAAQ,CAAC,IAAI,CAAC,OAAO;aAC/B,CAAC,CAAA;YACF,UAAU,IAAI,CAAC,CAAA;QACjB,CAAC;QAED,IAAI,GAAG,IAAI,QAAQ,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;YAChC,MAAK;QACP,CAAC;QACD,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,YAAY,EAAE,MAAM,GAAG,CAAC,CAAC,CAAA;IACnD,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED,SAAS,cAAc,CAAC,IAAY,EAAE,MAAc,EAAE,SAAiB;IACrE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,CAAA;IACzD,IAAI,OAAO,KAAK,IAAI,CAAC,MAAM,EAAE,CAAC;QAC5B,OAAO,OAAO,CAAA;IAChB,CAAC;IAED,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;IAC1C,MAAM,cAAc,GAAG,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,CAAA;IACjD,IAAI,cAAc,GAAG,SAAS,GAAG,yBAAyB,EAAE,CAAC;QAC3D,OAAO,MAAM,GAAG,cAAc,CAAA;IAChC,CAAC;IAED,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAC5B,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,EACxB,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,EACxB,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,CACzB,CAAA;IACD,IAAI,aAAa,GAAG,SAAS,GAAG,wBAAwB,EAAE,
CAAC;QACzD,OAAO,MAAM,GAAG,aAAa,GAAG,CAAC,CAAA;IACnC,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,CAAC,WAAW,CAAC,GAAG,CAAC,CAAA;IAC1C,IAAI,UAAU,GAAG,SAAS,GAAG,0BAA0B,EAAE,CAAC;QACxD,OAAO,MAAM,GAAG,UAAU,CAAA;IAC5B,CAAC;IAED,OAAO,OAAO,CAAA;AAChB,CAAC"}