@jcode.labs/mimir 0.3.0 → 0.4.1
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- package/README.md +284 -40
- package/dist/chunking.d.ts.map +1 -1
- package/dist/chunking.js +6 -3
- package/dist/chunking.js.map +1 -1
- package/dist/cli.js +121 -9
- package/dist/cli.js.map +1 -1
- package/dist/config.d.ts.map +1 -1
- package/dist/config.js +50 -36
- package/dist/config.js.map +1 -1
- package/dist/defaults.d.ts +11 -0
- package/dist/defaults.d.ts.map +1 -0
- package/dist/defaults.js +31 -0
- package/dist/defaults.js.map +1 -0
- package/dist/embeddings.d.ts.map +1 -1
- package/dist/embeddings.js +85 -11
- package/dist/embeddings.js.map +1 -1
- package/dist/files.d.ts +2 -1
- package/dist/files.d.ts.map +1 -1
- package/dist/files.js +39 -2
- package/dist/files.js.map +1 -1
- package/dist/gitignore.d.ts +1 -1
- package/dist/gitignore.d.ts.map +1 -1
- package/dist/gitignore.js +8 -7
- package/dist/gitignore.js.map +1 -1
- package/dist/ingest.d.ts.map +1 -1
- package/dist/ingest.js +2 -1
- package/dist/ingest.js.map +1 -1
- package/dist/init.d.ts.map +1 -1
- package/dist/init.js +4 -24
- package/dist/init.js.map +1 -1
- package/dist/mcp.d.ts.map +1 -1
- package/dist/mcp.js +14 -13
- package/dist/mcp.js.map +1 -1
- package/dist/parsing.d.ts.map +1 -1
- package/dist/parsing.js +138 -0
- package/dist/parsing.js.map +1 -1
- package/dist/query.d.ts.map +1 -1
- package/dist/query.js +14 -22
- package/dist/query.js.map +1 -1
- package/dist/security.js +16 -18
- package/dist/security.js.map +1 -1
- package/dist/skill.d.ts +2 -1
- package/dist/skill.d.ts.map +1 -1
- package/dist/skill.js +24 -9
- package/dist/skill.js.map +1 -1
- package/dist/store.d.ts.map +1 -1
- package/dist/store.js +2 -1
- package/dist/store.js.map +1 -1
- package/dist/types.d.ts +12 -14
- package/dist/types.d.ts.map +1 -1
- package/dist/version.d.ts +1 -1
- package/dist/version.js +1 -1
- package/examples/sovereign-rag-demo/.kb/config.json +22 -0
- package/examples/sovereign-rag-demo/.kb/sources.txt +2 -0
- package/examples/sovereign-rag-demo/README.md +80 -0
- package/examples/sovereign-rag-demo/raw/dataset-inventory.csv +5 -0
- package/examples/sovereign-rag-demo/raw/incident-timeline.jsonl +4 -0
- package/examples/sovereign-rag-demo/raw/operations-brief.md +16 -0
- package/examples/sovereign-rag-demo/raw/review-notes.evidence +11 -0
- package/examples/sovereign-rag-demo/raw/security-policy.yaml +14 -0
- package/package.json +23 -29
- package/skills/mimir/SKILL.md +66 -5
- package/skills/mimir-audio-summary/SKILL.md +140 -0
- package/skills/mimir-audio-summary/forge-voice.sh +150 -0
- package/skills/mimir-audio-summary/split-lines.py +13 -0
- package/skills/mimir-audio-summary/xtts-voice.py +46 -0
- package/CHANGELOG.md +0 -28
- package/SECURITY-HARDENING.md +0 -156
- package/SECURITY.md +0 -21
- package/dist/network.d.ts +0 -4
- package/dist/network.d.ts.map +0 -1
- package/dist/network.js +0 -59
- package/dist/network.js.map +0 -1
package/README.md
CHANGED
|
@@ -3,13 +3,20 @@
|
|
|
3
3
|
[CI](https://github.com/jcode-works/jcode-mimir/actions/workflows/ci.yml)
|
|
4
4
|
[CodeQL](https://github.com/jcode-works/jcode-mimir/actions/workflows/codeql.yml)
|
|
5
5
|
[npm package](https://www.npmjs.com/package/@jcode.labs/mimir)
|
|
6
|
-
[](
|
|
6
|
+
[License](https://github.com/jcode-works/jcode-mimir/blob/main/LICENSE)
|
|
7
7
|
|
|
8
|
-
Open-source, local
|
|
8
|
+
Open-source, sovereign local RAG for confidential datasets and AI agents.
|
|
9
9
|
|
|
10
|
-
Mimir provides a TypeScript CLI and
|
|
11
|
-
repository. It indexes files from the target repository, stores
|
|
12
|
-
and
|
|
10
|
+
Mimir provides a TypeScript CLI, library, MCP server, and portable agent skills that can be
|
|
11
|
+
installed in any Node.js repository. It indexes local files from the target repository, stores
|
|
12
|
+
vectors locally with LanceDB, and can use either built-in local-hash retrieval or optional
|
|
13
|
+
Transformers.js semantic embeddings. Mimir core returns cited retrieval context; answer synthesis
|
|
14
|
+
belongs to the AI agent, LLM, or local model runtime you choose around it.
|
|
15
|
+
|
|
16
|
+
The intended use case is simple: put confidential company, institutional, legal, operational, or
|
|
17
|
+
research documents in a private local folder, index them locally, then let any compatible AI agent or
|
|
18
|
+
LLM workflow retrieve grounded context for summaries, briefs, audits, and decision support without
|
|
19
|
+
shipping the dataset to a hosted RAG service.
|
|
13
20
|
|
|
14
21
|
Created by Jean-Baptiste Thery and published under the JCode Labs npm scope.
|
|
15
22
|
|
|
@@ -20,8 +27,10 @@ Built by Jean-Baptiste Thery, freelance full-stack/AI tooling engineer at JCode
|
|
|
20
27
|
Mimir is a public open-source project under the MIT License. It is designed to be
|
|
21
28
|
inspectable, forkable, and usable without a JCode Labs account.
|
|
22
29
|
|
|
23
|
-
Contributions are welcome through pull requests. Start with
|
|
24
|
-
|
|
30
|
+
Contributions are welcome through pull requests. Start with
|
|
31
|
+
[`CONTRIBUTING.md`](https://github.com/jcode-works/jcode-mimir/blob/main/CONTRIBUTING.md).
|
|
32
|
+
Security reports should stay private and follow the policy in
|
|
33
|
+
[`SECURITY.md`](https://github.com/jcode-works/jcode-mimir/blob/main/SECURITY.md).
|
|
25
34
|
|
|
26
35
|
## Sponsors
|
|
27
36
|
|
|
@@ -41,22 +50,53 @@ Suggested GitHub Sponsors tiers:
|
|
|
41
50
|
|
|
42
51
|
Early public package. APIs may evolve before `1.0.0`.
|
|
43
52
|
|
|
53
|
+
## What Mimir Is For
|
|
54
|
+
|
|
55
|
+
- Build a local RAG knowledge base inside any repository.
|
|
56
|
+
- Analyze confidential datasets while keeping raw files and generated indexes local.
|
|
57
|
+
- Give Claude, Codex, Cursor, internal assistants, or other MCP-compatible tools the same private
|
|
58
|
+
retrieval layer.
|
|
59
|
+
- Retrieve grounded local evidence through CLI, library calls, MCP tools, or the bundled agent
|
|
60
|
+
skills so your chosen AI agent can produce cited summaries.
|
|
61
|
+
- Optionally create listenable MP3 or WAV summaries with `kb audio`, `@jcode.labs/mimir-tts`, and
|
|
62
|
+
the bundled `mimir-audio-summary` skill.
|
|
63
|
+
|
|
64
|
+
Mimir is not a hosted SaaS, not a remote vector database, and not a certified high-assurance system.
|
|
65
|
+
For regulated or state-grade environments, pair it with encrypted disks, controlled machines, release
|
|
66
|
+
verification, and an external security review.
|
|
67
|
+
|
|
68
|
+
## Use Cases
|
|
69
|
+
|
|
70
|
+
Mimir is useful whenever the source material should stay local but an AI agent still needs grounded
|
|
71
|
+
context.
|
|
72
|
+
|
|
73
|
+
| Use case | Example questions |
|
|
74
|
+
| --- | --- |
|
|
75
|
+
| Understand a code repository | "Where is authentication implemented?", "What depends on this module?", "Summarize the payment flow." |
|
|
76
|
+
| Understand architecture | "What services exist?", "What are the data boundaries?", "Which components are risky to change?" |
|
|
77
|
+
| Analyze specifications | "What does the technical spec require?", "Which requirements are still unclear?", "Generate an implementation checklist." |
|
|
78
|
+
| Work through a request for proposal or tender | "What are the mandatory constraints?", "Which documents prove compliance?", "What risks should be clarified?" |
|
|
79
|
+
| Study courses and training material | "Summarize chapter three.", "Create revision questions.", "Compare these two concepts." |
|
|
80
|
+
| Analyze a book or long report | "Extract the main thesis.", "Find recurring arguments.", "Create a chapter-by-chapter brief." |
|
|
81
|
+
| Build an internal knowledge base | "What is the policy for incident review?", "Who owns this process?", "Which source says that?" |
|
|
82
|
+
| Prepare meetings or decisions | "Give me a one-page briefing.", "What is missing before deciding?", "List action items and evidence." |
|
|
83
|
+
| Ask questions over offline documents | "Which files mention local-only operation?", "What evidence supports this claim?" |
|
|
84
|
+
| Generate audio briefings | "Create a listenable high-quality or offline summary of the current dossier." |
|
|
85
|
+
|
|
44
86
|
## Requirements
|
|
45
87
|
|
|
46
88
|
- Node.js 20+
|
|
47
89
|
- pnpm, npm, yarn or bun
|
|
48
|
-
-
|
|
49
|
-
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
ollama pull gemma4
|
|
59
|
-
```
|
|
90
|
+
- No model runtime is required for the default `embeddingProvider: "local-hash"` mode.
|
|
91
|
+
- Optional semantic embeddings use Transformers.js with local model files under `.mimir/models` by
|
|
92
|
+
default.
|
|
93
|
+
- Generated answers are intentionally outside Mimir core. Use Claude, Codex, OpenAI, a local model
|
|
94
|
+
MCP server, or another trusted model runtime to synthesize from Mimir's cited context.
|
|
95
|
+
- Optional audio summaries use the separate `@jcode.labs/mimir-tts` workspace package. For the
|
|
96
|
+
highest quality, install the external `edge-tts` CLI and render Edge MP3 output with
|
|
97
|
+
`fr-FR-DeniseNeural`. For confidential or air-gapped content, use the Transformers.js WAV path
|
|
98
|
+
with `--engine transformers --offline`; it does not require Python, ffmpeg, Piper, XTTS, or a
|
|
99
|
+
local server.
|
|
60
100
|
|
|
61
101
|
## Install From npm
|
|
62
102
|
|
|
@@ -76,23 +116,26 @@ npm install --save-dev @jcode.labs/mimir
|
|
|
76
116
|
|
|
77
117
|
Maintainer tokens are only needed to publish new versions.
|
|
78
118
|
|
|
79
|
-
## Install From
|
|
119
|
+
## Install From Source Checkout
|
|
80
120
|
|
|
81
121
|
```bash
|
|
82
|
-
|
|
122
|
+
git clone git@github.com:jcode-works/jcode-mimir.git
|
|
123
|
+
cd jcode-mimir
|
|
124
|
+
pnpm install
|
|
125
|
+
pnpm build
|
|
83
126
|
```
|
|
84
127
|
|
|
85
128
|
For local development:
|
|
86
129
|
|
|
87
130
|
```bash
|
|
88
|
-
pnpm add -D file:../jcode-mimir
|
|
131
|
+
pnpm add -D file:../jcode-mimir/packages/mimir
|
|
89
132
|
```
|
|
90
133
|
|
|
91
134
|
Before creating an npm tarball later, run:
|
|
92
135
|
|
|
93
136
|
```bash
|
|
94
137
|
pnpm build
|
|
95
|
-
pnpm pack
|
|
138
|
+
pnpm --dir packages/mimir pack
|
|
96
139
|
```
|
|
97
140
|
|
|
98
141
|
## Use In Any Repository
|
|
@@ -126,9 +169,170 @@ npx kb security-audit
|
|
|
126
169
|
npx kb status
|
|
127
170
|
```
|
|
128
171
|
|
|
129
|
-
##
|
|
172
|
+
## Choose A Retrieval Mode
|
|
173
|
+
|
|
174
|
+
Mimir has two embedding modes.
|
|
175
|
+
|
|
176
|
+
### Default Local Hash Retrieval
|
|
177
|
+
|
|
178
|
+
Use this when you want a fully local, no-model smoke test or a dependency-light setup. Retrieval is
|
|
179
|
+
lexical/hash-based, not semantic.
|
|
180
|
+
|
|
181
|
+
`.kb/config.json`:
|
|
182
|
+
|
|
183
|
+
```json
|
|
184
|
+
{
|
|
185
|
+
"embeddingProvider": "local-hash"
|
|
186
|
+
}
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
Commands:
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
pnpm exec kb ingest
|
|
193
|
+
pnpm exec kb search "offline retrieval approval"
|
|
194
|
+
pnpm exec kb ask "What evidence supports offline operation?"
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
`kb ask` always returns cited retrieved passages instead of a generated synthesis. You can pass those
|
|
198
|
+
passages to any LLM or agent you trust.
|
|
199
|
+
|
|
200
|
+
### Optional Semantic Embeddings With Transformers.js
|
|
201
|
+
|
|
202
|
+
Use this when you want better semantic retrieval while keeping Mimir core free of an LLM server.
|
|
203
|
+
|
|
204
|
+
`.kb/config.json`:
|
|
205
|
+
|
|
206
|
+
```json
|
|
207
|
+
{
|
|
208
|
+
"embeddingProvider": "transformers",
|
|
209
|
+
"embeddingModel": "mixedbread-ai/mxbai-embed-xsmall-v1",
|
|
210
|
+
"embeddingModelPath": ".mimir/models",
|
|
211
|
+
"transformersAllowRemoteModels": false
|
|
212
|
+
}
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
Commands:
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
pnpm exec kb ingest
|
|
219
|
+
pnpm exec kb ask "Which passages support offline operation?"
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
Keep `transformersAllowRemoteModels` false for confidential or air-gapped work and preload model
|
|
223
|
+
files into `embeddingModelPath`. Set it to true only when you explicitly allow Transformers.js to
|
|
224
|
+
download model files from Hugging Face.
|
|
225
|
+
|
|
226
|
+
## Dependency Footprint
|
|
227
|
+
|
|
228
|
+
Mimir can run retrieval without a model runtime. Some runtime dependencies remain because they own
|
|
229
|
+
core features:
|
|
230
|
+
|
|
231
|
+
| Dependency | Why it remains |
|
|
232
|
+
| --- | --- |
|
|
233
|
+
| @huggingface/transformers | optional local semantic embeddings |
|
|
234
|
+
| LanceDB | local vector storage and nearest-neighbor retrieval |
|
|
235
|
+
| MCP SDK | MCP server for compatible agents |
|
|
236
|
+
| fast-glob | safe source-file discovery |
|
|
237
|
+
| unpdf, html-to-text, yaml, fflate | document parsing for PDF, HTML, YAML, Office/OpenDocument ZIP files |
|
|
238
|
+
| commander, zod, picocolors | CLI, config validation, readable terminal output |
|
|
239
|
+
|
|
240
|
+
Removing more dependencies is possible only by dropping features or replacing them with smaller
|
|
241
|
+
internal implementations. The current low-friction path is dependency-light at runtime for users who
|
|
242
|
+
choose `local-hash`, while preserving richer parsing, MCP support, and optional semantic embeddings.
|
|
243
|
+
|
|
244
|
+
## Example Test Workspace
|
|
245
|
+
|
|
246
|
+
This repository includes a synthetic example under
|
|
247
|
+
[`examples/sovereign-rag-demo`](./examples/sovereign-rag-demo). It can be used to test ingestion,
|
|
248
|
+
retrieval, `security-audit`, and custom text extensions without using private documents.
|
|
249
|
+
|
|
250
|
+
From a local checkout:
|
|
251
|
+
|
|
252
|
+
```bash
|
|
253
|
+
pnpm build
|
|
254
|
+
cd examples/sovereign-rag-demo
|
|
255
|
+
node ../../dist/cli.js security-audit
|
|
256
|
+
node ../../dist/cli.js ingest
|
|
257
|
+
node ../../dist/cli.js search "offline retrieval approval"
|
|
258
|
+
node ../../dist/cli.js audit
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
The example uses the default local-hash retrieval mode, so it can run without downloading an
|
|
262
|
+
embedding or chat model.
|
|
263
|
+
|
|
264
|
+
## Typical Workflows
|
|
265
|
+
|
|
266
|
+
### Understand A Codebase
|
|
267
|
+
|
|
268
|
+
```bash
|
|
269
|
+
pnpm exec kb init
|
|
270
|
+
printf "src\nREADME.md\ndocs\n" >> .kb/sources.txt
|
|
271
|
+
pnpm exec kb ingest
|
|
272
|
+
pnpm exec kb search "authentication flow"
|
|
273
|
+
pnpm exec kb ask "Explain the architecture and cite the relevant files."
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
### Analyze Specifications Or A Course
|
|
277
|
+
|
|
278
|
+
```bash
|
|
279
|
+
pnpm exec kb ingest
|
|
280
|
+
pnpm exec kb ask "Summarize the requirements and list open questions."
|
|
281
|
+
pnpm exec kb ask "Create revision questions from the indexed course material."
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
### Work Offline
|
|
285
|
+
|
|
286
|
+
```bash
|
|
287
|
+
pnpm exec kb security-audit --strict
|
|
288
|
+
pnpm exec kb ingest
|
|
289
|
+
pnpm exec kb search "incident review policy"
|
|
290
|
+
pnpm exec kb ask "What does the local evidence prove?"
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
Use `embeddingProvider: "local-hash"` for a no-model offline workflow. Use
|
|
294
|
+
`embeddingProvider: "transformers"` with preloaded model files for semantic offline retrieval.
|
|
295
|
+
Generated answers should come from a trusted external agent or model runtime.
|
|
296
|
+
|
|
297
|
+
### Generate An Audio Briefing
|
|
298
|
+
|
|
299
|
+
Mimir includes a plug-and-play text-to-speech path for listenable summaries. For the same quality
|
|
300
|
+
path as the global Voice Forge skill, install `edge-tts` and render MP3:
|
|
301
|
+
|
|
302
|
+
```bash
|
|
303
|
+
pnpm exec kb audio --doctor
|
|
304
|
+
pipx install edge-tts
|
|
305
|
+
pnpm exec kb audio /tmp/MIMIR-SUMMARY-project.txt \
|
|
306
|
+
--engine edge \
|
|
307
|
+
--out .mimir/audio/project-summary.mp3
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
The Edge path uses the online Microsoft Edge TTS service through the `edge-tts` CLI. Use it only
|
|
311
|
+
when sending the narration text to that service is acceptable.
|
|
312
|
+
|
|
313
|
+
For confidential or air-gapped work, preload Transformers.js-compatible model files and render WAV
|
|
314
|
+
offline:
|
|
315
|
+
|
|
316
|
+
```bash
|
|
317
|
+
pnpm exec kb audio /tmp/MIMIR-SUMMARY-project.txt \
|
|
318
|
+
--engine transformers \
|
|
319
|
+
--offline \
|
|
320
|
+
--model-path .mimir/models/tts \
|
|
321
|
+
--out .mimir/audio/project-summary.wav
|
|
322
|
+
```
|
|
130
323
|
|
|
131
|
-
|
|
324
|
+
The standalone package can also be installed directly:
|
|
325
|
+
|
|
326
|
+
```bash
|
|
327
|
+
pnpm add -D @jcode.labs/mimir-tts
|
|
328
|
+
pnpm exec mimir-tts render /tmp/MIMIR-SUMMARY-project.txt \
|
|
329
|
+
--engine edge \
|
|
330
|
+
--out .mimir/audio/project-summary.mp3
|
|
331
|
+
```
|
|
332
|
+
|
|
333
|
+
## Agent Skills And MCP
|
|
334
|
+
|
|
335
|
+
Mimir ships with portable agent skills and a standard MCP server.
|
|
132
336
|
|
|
133
337
|
Install the agent kit into a repository:
|
|
134
338
|
|
|
@@ -140,12 +344,14 @@ This creates:
|
|
|
140
344
|
|
|
141
345
|
```plain text
|
|
142
346
|
.mimir/skills/mimir/SKILL.md
|
|
347
|
+
.mimir/skills/mimir-audio-summary/SKILL.md
|
|
143
348
|
.mimir/mcp.json
|
|
144
349
|
.mimir/README.md
|
|
145
350
|
```
|
|
146
351
|
|
|
147
|
-
Agents that support skill folders can load `.mimir/skills/mimir
|
|
148
|
-
|
|
352
|
+
Agents that support skill folders can load `.mimir/skills/mimir/` for deep local RAG usage.
|
|
353
|
+
Load `.mimir/skills/mimir-audio-summary/` only when an optional spoken summary is needed.
|
|
354
|
+
Other agents can read the generated `.mimir/README.md` and use the MCP config snippet.
|
|
149
355
|
|
|
150
356
|
Start the MCP server from the repository root:
|
|
151
357
|
|
|
@@ -161,6 +367,10 @@ MCP tools exposed:
|
|
|
161
367
|
- `mimir_audit`
|
|
162
368
|
- `mimir_security_audit`
|
|
163
369
|
|
|
370
|
+
This MCP layer is the recommended way to let any compatible LLM or agent query the same local
|
|
371
|
+
knowledge base. The LLM does not need to know about LanceDB or the raw file layout; it asks Mimir for
|
|
372
|
+
ranked passages or cited context and uses the returned citations.
|
|
373
|
+
|
|
164
374
|
Print the bundled skill path from the installed package:
|
|
165
375
|
|
|
166
376
|
```bash
|
|
@@ -190,7 +400,9 @@ state.
|
|
|
190
400
|
Mimir is designed for private repositories and sensitive local evidence.
|
|
191
401
|
|
|
192
402
|
- Zero telemetry: no analytics or document content is sent to JCode Labs.
|
|
193
|
-
-
|
|
403
|
+
- No LLM generation in core: Mimir returns cited context for the agent/runtime you choose.
|
|
404
|
+
- Local-hash by default: no model runtime is required for the default retrieval path.
|
|
405
|
+
- Transformers.js remote model loading is disabled by default.
|
|
194
406
|
- Redaction before indexing: common secrets and identifiers are redacted before chunks are
|
|
195
407
|
embedded and stored.
|
|
196
408
|
- Metadata-only access logs: query hashes and action metadata are logged, not raw queries.
|
|
@@ -210,10 +422,13 @@ pnpm exec kb destroy-index --yes
|
|
|
210
422
|
```
|
|
211
423
|
|
|
212
424
|
For air-gapped operation, release verification, secure deletion limits, and threat model details,
|
|
213
|
-
read
|
|
425
|
+
read
|
|
426
|
+
[`SECURITY-HARDENING.md`](https://github.com/jcode-works/jcode-mimir/blob/main/SECURITY-HARDENING.md).
|
|
214
427
|
|
|
215
428
|
## Supported Files
|
|
216
429
|
|
|
430
|
+
Mimir supports common text, document, data, config, log, and source-code files out of the box:
|
|
431
|
+
|
|
217
432
|
- Markdown: `.md`, `.mdx`
|
|
218
433
|
- Text: `.txt`, `.text`
|
|
219
434
|
- JSON: `.json`
|
|
@@ -221,6 +436,32 @@ read [`SECURITY-HARDENING.md`](./SECURITY-HARDENING.md).
|
|
|
221
436
|
- CSV/TSV: `.csv`, `.tsv`
|
|
222
437
|
- HTML: `.html`, `.htm`
|
|
223
438
|
- PDF: `.pdf`
|
|
439
|
+
- Office/OpenDocument: `.docx`, `.pptx`, `.xlsx`, `.odt`, `.ods`, `.odp`
|
|
440
|
+
- Rich text: `.rtf`
|
|
441
|
+
- Line data and logs: `.jsonl`, `.ndjson`, `.log`
|
|
442
|
+
- XML feeds and documents: `.xml`, `.rss`, `.atom`
|
|
443
|
+
- Config and data files: `.toml`, `.ini`, `.conf`, `.cfg`, `.properties`, `.sql`
|
|
444
|
+
- Source code: `.ts`, `.tsx`, `.js`, `.jsx`, `.py`, `.go`, `.rs`, `.java`, `.rb`, `.php`,
|
|
445
|
+
`.cs`, `.c`, `.cpp`, `.h`, `.css`
|
|
446
|
+
|
|
447
|
+
Custom UTF-8 text extensions can be enabled without changing code:
|
|
448
|
+
|
|
449
|
+
```json
|
|
450
|
+
{
|
|
451
|
+
"includeExtensions": [".transcript", ".evidence"]
|
|
452
|
+
}
|
|
453
|
+
```
|
|
454
|
+
|
|
455
|
+
Or through:
|
|
456
|
+
|
|
457
|
+
```bash
|
|
458
|
+
KB_INCLUDE_EXTENSIONS=".transcript,.evidence" pnpm exec kb ingest
|
|
459
|
+
```
|
|
460
|
+
|
|
461
|
+
Images, scans, audio/video files, old proprietary Office binaries such as `.doc`, and other formats
|
|
462
|
+
that are not listed should be OCRed, transcribed, converted, or exported to text/PDF/HTML first.
|
|
463
|
+
Mimir intentionally avoids pretending that every binary format can be indexed safely without
|
|
464
|
+
extraction logic.
|
|
224
465
|
|
|
225
466
|
## Config
|
|
226
467
|
|
|
@@ -232,11 +473,11 @@ read [`SECURITY-HARDENING.md`](./SECURITY-HARDENING.md).
|
|
|
232
473
|
"storageDir": ".kb/storage",
|
|
233
474
|
"sourcesFile": ".kb/sources.txt",
|
|
234
475
|
"accessLogPath": ".kb/access.log",
|
|
476
|
+
"embeddingModelPath": ".mimir/models",
|
|
235
477
|
"tableName": "chunks",
|
|
236
|
-
"
|
|
237
|
-
"
|
|
238
|
-
"
|
|
239
|
-
"llmModel": "gemma4:latest",
|
|
478
|
+
"embeddingProvider": "local-hash",
|
|
479
|
+
"embeddingModel": "mixedbread-ai/mxbai-embed-xsmall-v1",
|
|
480
|
+
"transformersAllowRemoteModels": false,
|
|
240
481
|
"redaction": {
|
|
241
482
|
"enabled": true,
|
|
242
483
|
"builtIn": true,
|
|
@@ -246,7 +487,8 @@ read [`SECURITY-HARDENING.md`](./SECURITY-HARDENING.md).
|
|
|
246
487
|
"mcpMaxTopK": 10,
|
|
247
488
|
"topK": 5,
|
|
248
489
|
"chunkSize": 1200,
|
|
249
|
-
"chunkOverlap": 150
|
|
490
|
+
"chunkOverlap": 150,
|
|
491
|
+
"includeExtensions": []
|
|
250
492
|
}
|
|
251
493
|
```
|
|
252
494
|
|
|
@@ -256,10 +498,10 @@ Environment overrides:
|
|
|
256
498
|
- `KB_STORAGE_DIR`
|
|
257
499
|
- `KB_SOURCES_FILE`
|
|
258
500
|
- `KB_ACCESS_LOG_PATH`
|
|
259
|
-
- `
|
|
260
|
-
- `
|
|
261
|
-
- `
|
|
262
|
-
- `
|
|
501
|
+
- `KB_EMBEDDING_PROVIDER`
|
|
502
|
+
- `KB_EMBEDDING_MODEL`
|
|
503
|
+
- `KB_EMBEDDING_MODEL_PATH`
|
|
504
|
+
- `KB_TRANSFORMERS_ALLOW_REMOTE_MODELS`
|
|
263
505
|
- `KB_REDACTION_ENABLED`
|
|
264
506
|
- `KB_REDACTION_BUILT_IN`
|
|
265
507
|
- `KB_ACCESS_LOG`
|
|
@@ -267,6 +509,7 @@ Environment overrides:
|
|
|
267
509
|
- `KB_TOP_K`
|
|
268
510
|
- `KB_CHUNK_SIZE`
|
|
269
511
|
- `KB_CHUNK_OVERLAP`
|
|
512
|
+
- `KB_INCLUDE_EXTENSIONS`
|
|
270
513
|
|
|
271
514
|
## Library API
|
|
272
515
|
|
|
@@ -280,8 +523,9 @@ const answer = await ask("What documents support the project timeline?")
|
|
|
280
523
|
|
|
281
524
|
## Privacy
|
|
282
525
|
|
|
283
|
-
-
|
|
284
|
-
-
|
|
526
|
+
- Mimir core does not generate answers or call a chat model.
|
|
527
|
+
- `local-hash` can run ingestion, search, and cited retrieval without a model runtime.
|
|
528
|
+
- Transformers.js remote model loading is disabled by default.
|
|
285
529
|
- Built-in redaction runs before indexing by default.
|
|
286
530
|
- Access logs store query hashes, not raw queries.
|
|
287
531
|
- The vector index is stored locally.
|
package/dist/chunking.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"chunking.d.ts","sourceRoot":"","sources":["../src/chunking.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,YAAY,CAAA;
|
|
1
|
+
{"version":3,"file":"chunking.d.ts","sourceRoot":"","sources":["../src/chunking.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,YAAY,CAAA;AAM3D,wBAAgB,aAAa,CAC3B,QAAQ,EAAE,cAAc,EACxB,SAAS,EAAE,MAAM,EACjB,YAAY,EAAE,MAAM,GACnB,SAAS,EAAE,CAqCb"}
|
package/dist/chunking.js
CHANGED
|
@@ -1,4 +1,7 @@
|
|
|
1
1
|
import { createHash } from "node:crypto";
|
|
2
|
+
const PARAGRAPH_BREAK_MIN_RATIO = 0.45;
|
|
3
|
+
const SENTENCE_BREAK_MIN_RATIO = 0.55;
|
|
4
|
+
const WHITESPACE_BREAK_MIN_RATIO = 0.75;
|
|
2
5
|
export function chunkDocument(document, chunkSize, chunkOverlap) {
|
|
3
6
|
if (!document.text) {
|
|
4
7
|
return [];
|
|
@@ -39,15 +42,15 @@ function chooseChunkEnd(text, cursor, chunkSize) {
|
|
|
39
42
|
}
|
|
40
43
|
const window = text.slice(cursor, hardEnd);
|
|
41
44
|
const paragraphBreak = window.lastIndexOf("\n\n");
|
|
42
|
-
if (paragraphBreak > chunkSize *
|
|
45
|
+
if (paragraphBreak > chunkSize * PARAGRAPH_BREAK_MIN_RATIO) {
|
|
43
46
|
return cursor + paragraphBreak;
|
|
44
47
|
}
|
|
45
48
|
const sentenceBreak = Math.max(window.lastIndexOf(". "), window.lastIndexOf("? "), window.lastIndexOf("! "));
|
|
46
|
-
if (sentenceBreak > chunkSize *
|
|
49
|
+
if (sentenceBreak > chunkSize * SENTENCE_BREAK_MIN_RATIO) {
|
|
47
50
|
return cursor + sentenceBreak + 1;
|
|
48
51
|
}
|
|
49
52
|
const whitespace = window.lastIndexOf(" ");
|
|
50
|
-
if (whitespace > chunkSize *
|
|
53
|
+
if (whitespace > chunkSize * WHITESPACE_BREAK_MIN_RATIO) {
|
|
51
54
|
return cursor + whitespace;
|
|
52
55
|
}
|
|
53
56
|
return hardEnd;
|
package/dist/chunking.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"chunking.js","sourceRoot":"","sources":["../src/chunking.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAA;AAGxC,MAAM,UAAU,aAAa,CAC3B,QAAwB,EACxB,SAAiB,EACjB,YAAoB;IAEpB,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnB,OAAO,EAAE,CAAA;IACX,CAAC;IAED,MAAM,MAAM,GAAgB,EAAE,CAAA;IAC9B,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,IAAI,UAAU,GAAG,CAAC,CAAA;IAElB,OAAO,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,cAAc,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,CAAC,CAAA;QAC5D,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;QAEpD,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,EAAE,GAAG,UAAU,CAAC,QAAQ,CAAC;iBAC5B,MAAM,CAAC,GAAG,QAAQ,CAAC,IAAI,CAAC,YAAY,IAAI,UAAU,IAAI,IAAI,EAAE,CAAC;iBAC7D,MAAM,CAAC,KAAK,CAAC,CAAA;YAChB,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE;gBACF,MAAM,EAAE,QAAQ,CAAC,IAAI,CAAC,MAAM;gBAC5B,YAAY,EAAE,QAAQ,CAAC,IAAI,CAAC,YAAY;gBACxC,UAAU;gBACV,IAAI;gBACJ,QAAQ,EAAE,QAAQ,CAAC,IAAI,CAAC,QAAQ;gBAChC,KAAK,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK;gBAC1B,OAAO,EAAE,QAAQ,CAAC,IAAI,CAAC,OAAO;aAC/B,CAAC,CAAA;YACF,UAAU,IAAI,CAAC,CAAA;QACjB,CAAC;QAED,IAAI,GAAG,IAAI,QAAQ,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;YAChC,MAAK;QACP,CAAC;QACD,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,YAAY,EAAE,MAAM,GAAG,CAAC,CAAC,CAAA;IACnD,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED,SAAS,cAAc,CAAC,IAAY,EAAE,MAAc,EAAE,SAAiB;IACrE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,CAAA;IACzD,IAAI,OAAO,KAAK,IAAI,CAAC,MAAM,EAAE,CAAC;QAC5B,OAAO,OAAO,CAAA;IAChB,CAAC;IAED,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;IAC1C,MAAM,cAAc,GAAG,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,CAAA;IACjD,IAAI,cAAc,GAAG,SAAS,GAAG,
|
|
1
|
+
{"version":3,"file":"chunking.js","sourceRoot":"","sources":["../src/chunking.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAA;AAGxC,MAAM,yBAAyB,GAAG,IAAI,CAAA;AACtC,MAAM,wBAAwB,GAAG,IAAI,CAAA;AACrC,MAAM,0BAA0B,GAAG,IAAI,CAAA;AAEvC,MAAM,UAAU,aAAa,CAC3B,QAAwB,EACxB,SAAiB,EACjB,YAAoB;IAEpB,IAAI,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;QACnB,OAAO,EAAE,CAAA;IACX,CAAC;IAED,MAAM,MAAM,GAAgB,EAAE,CAAA;IAC9B,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,IAAI,UAAU,GAAG,CAAC,CAAA;IAElB,OAAO,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,cAAc,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,CAAC,CAAA;QAC5D,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE,CAAA;QAEpD,IAAI,IAAI,EAAE,CAAC;YACT,MAAM,EAAE,GAAG,UAAU,CAAC,QAAQ,CAAC;iBAC5B,MAAM,CAAC,GAAG,QAAQ,CAAC,IAAI,CAAC,YAAY,IAAI,UAAU,IAAI,IAAI,EAAE,CAAC;iBAC7D,MAAM,CAAC,KAAK,CAAC,CAAA;YAChB,MAAM,CAAC,IAAI,CAAC;gBACV,EAAE;gBACF,MAAM,EAAE,QAAQ,CAAC,IAAI,CAAC,MAAM;gBAC5B,YAAY,EAAE,QAAQ,CAAC,IAAI,CAAC,YAAY;gBACxC,UAAU;gBACV,IAAI;gBACJ,QAAQ,EAAE,QAAQ,CAAC,IAAI,CAAC,QAAQ;gBAChC,KAAK,EAAE,QAAQ,CAAC,IAAI,CAAC,KAAK;gBAC1B,OAAO,EAAE,QAAQ,CAAC,IAAI,CAAC,OAAO;aAC/B,CAAC,CAAA;YACF,UAAU,IAAI,CAAC,CAAA;QACjB,CAAC;QAED,IAAI,GAAG,IAAI,QAAQ,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC;YAChC,MAAK;QACP,CAAC;QACD,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,YAAY,EAAE,MAAM,GAAG,CAAC,CAAC,CAAA;IACnD,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED,SAAS,cAAc,CAAC,IAAY,EAAE,MAAc,EAAE,SAAiB;IACrE,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,GAAG,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,CAAA;IACzD,IAAI,OAAO,KAAK,IAAI,CAAC,MAAM,EAAE,CAAC;QAC5B,OAAO,OAAO,CAAA;IAChB,CAAC;IAED,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;IAC1C,MAAM,cAAc,GAAG,MAAM,CAAC,WAAW,CAAC,MAAM,CAAC,CAAA;IACjD,IAAI,cAAc,GAAG,SAAS,GAAG,yBAAyB,EAAE,CAAC;QAC3D,OAAO,MAAM,GAAG,cAAc,CAAA;IAChC,CAAC;IAED,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAC5B,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,EACxB,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,EACxB,MAAM,CAAC,WAAW,CAAC,IAAI,CAAC,CACzB,CAAA;IACD,IAAI,aAAa,GAAG,SAAS,GAAG,wBAAwB,EAAE,CA
AC;QACzD,OAAO,MAAM,GAAG,aAAa,GAAG,CAAC,CAAA;IACnC,CAAC;IAED,MAAM,UAAU,GAAG,MAAM,CAAC,WAAW,CAAC,GAAG,CAAC,CAAA;IAC1C,IAAI,UAAU,GAAG,SAAS,GAAG,0BAA0B,EAAE,CAAC;QACxD,OAAO,MAAM,GAAG,UAAU,CAAA;IAC5B,CAAC;IAED,OAAO,OAAO,CAAA;AAChB,CAAC"}
|