@spark-agents/engram 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +337 -0
- package/dist/chunker.d.ts +3 -0
- package/dist/chunker.js +337 -0
- package/dist/config.d.ts +19 -0
- package/dist/config.js +38 -0
- package/dist/embedding.d.ts +8 -0
- package/dist/embedding.js +186 -0
- package/dist/index.d.ts +17 -0
- package/dist/index.js +524 -0
- package/dist/manager.d.ts +76 -0
- package/dist/manager.js +103 -0
- package/dist/reranker.d.ts +15 -0
- package/dist/reranker.js +104 -0
- package/dist/search.d.ts +33 -0
- package/dist/search.js +203 -0
- package/dist/store.d.ts +6 -0
- package/dist/store.js +272 -0
- package/dist/sync.d.ts +31 -0
- package/dist/sync.js +516 -0
- package/dist/types.d.ts +111 -0
- package/dist/types.js +28 -0
- package/openclaw.plugin.json +58 -0
- package/package.json +39 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Spark Agents Workforce
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,337 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<h1 align="center">💎 Engram</h1>
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<p align="center">
|
|
6
|
+
<strong>Memory search for OpenClaw that actually finds what you meant.</strong><br>
|
|
7
|
+
<sub>Powered by Gemini Embedding-2 — text, images, and audio in one search space.<br>One config line. Zero migration. Switch back in 10 seconds.</sub>
|
|
8
|
+
</p>
|
|
9
|
+
|
|
10
|
+
<p align="center">
|
|
11
|
+
<a href="https://github.com/Spark-Agents-Workforce/engram/blob/main/LICENSE"><img src="https://img.shields.io/badge/License-MIT-yellow.svg" alt="License: MIT"></a>
|
|
12
|
+
<img src="https://img.shields.io/badge/OpenClaw-Plugin-FF6B35" alt="OpenClaw Plugin">
|
|
13
|
+
<img src="https://img.shields.io/badge/Gemini-Embedding--2-4285F4?logo=google&logoColor=white" alt="Gemini Embedding-2">
|
|
14
|
+
<img src="https://img.shields.io/badge/tests-98%20passing-brightgreen" alt="Tests passing">
|
|
15
|
+
</p>
|
|
16
|
+
|
|
17
|
+
<p align="center">
|
|
18
|
+
<a href="#why">Why</a> •
|
|
19
|
+
<a href="#install">Install</a> •
|
|
20
|
+
<a href="#how-it-works">How It Works</a> •
|
|
21
|
+
<a href="#comparison">Comparison</a> •
|
|
22
|
+
<a href="#faq">FAQ</a>
|
|
23
|
+
</p>
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
<a id="why"></a>
|
|
28
|
+
|
|
29
|
+
## Why Engram Exists
|
|
30
|
+
|
|
31
|
+
OpenClaw's builtin memory search uses a simple weighted average of keyword and vector scores. It works for easy queries. But it falls apart when:
|
|
32
|
+
|
|
33
|
+
- **Your wording changes.** You wrote "database error" — you search "that SQLite bug" — no match.
|
|
34
|
+
- **Old and new get mixed together.** You ask "what did we decide?" and get a brainstorm from two months ago instead of last week's decision.
|
|
35
|
+
- **You saved a screenshot, not text.** The builtin can't search images at all.
|
|
36
|
+
|
|
37
|
+
Engram replaces the retrieval pipeline with one built on **Google's Gemini Embedding-2** — the first embedding model that natively understands text, images, and audio in a single vector space. Not stitched together with separate models. One model, one space, one search.
|
|
38
|
+
|
|
39
|
+
```text
|
|
40
|
+
You: "Find that error we fixed last Tuesday"
|
|
41
|
+
Builtin: 🤷 random notes about errors from 3 months ago
|
|
42
|
+
Engram: ✅ memory/2026-03-12.md — "Fixed ENOENT crash in the chunker module"
|
|
43
|
+
|
|
44
|
+
You: "that architecture diagram from the planning session"
|
|
45
|
+
Builtin: ❌ can't search images
|
|
46
|
+
Engram: ✅ memory/architecture-sketch.png — matched by visual content
|
|
47
|
+
|
|
48
|
+
You: "what did the client say about pricing"
|
|
49
|
+
Builtin: ❌ buries last week's notes under older matches
|
|
50
|
+
Engram: ✅ memory/2026-03-10.md — recent results ranked first
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### What makes it actually better
|
|
54
|
+
|
|
55
|
+
🔍 **Smarter retrieval**
|
|
56
|
+
|
|
57
|
+
Keyword search and semantic search run in parallel, then get fused with Reciprocal Rank Fusion — a ranking method that's more robust than a simple weighted average. The top candidates then pass through a cross-encoder reranker (running locally, no API call) that reads each query-document pair together and re-scores for precision.
|
|
58
|
+
|
|
59
|
+
🕐 **Time-aware**
|
|
60
|
+
|
|
61
|
+
Recently-indexed notes score higher than old ones. Exponential decay with a configurable half-life (30 days default). Files you actively edit stay fresh. Old notes you never touch gradually fade. Your agent gets what's current, not what's ancient.
|
|
62
|
+
|
|
63
|
+
🖼️ **Multimodal**
|
|
64
|
+
|
|
65
|
+
Gemini Embedding-2 puts text, images, and audio in the same vector space. A text query can find a screenshot. A description can surface a voice memo. No OCR, no transcription — the model understands the content natively. Supported: `.jpg`, `.png`, `.webp`, `.gif`, `.mp3`, `.wav`, `.ogg`, `.opus`, `.m4a`, `.aac`, `.flac`. Multimodal is on by default — images and audio in your workspace get indexed automatically alongside your markdown.
|
|
66
|
+
|
|
67
|
+
📦 **Self-contained**
|
|
68
|
+
|
|
69
|
+
One SQLite file per agent. No Python. No sidecar process. No external database. No external API for reranking. Install it, configure one line, restart.
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
<a id="install"></a>
|
|
74
|
+
|
|
75
|
+
## Install
|
|
76
|
+
|
|
77
|
+
```bash
|
|
78
|
+
cd ~/.openclaw/extensions && npm install @spark-agents/engram
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
```yaml
|
|
82
|
+
plugins:
|
|
83
|
+
slots:
|
|
84
|
+
memory: "engram"
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
openclaw gateway restart
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
That's it. If you already use Gemini in OpenClaw, the API key is auto-detected. If not, get a free key at [aistudio.google.com/apikey](https://aistudio.google.com/apikey) (60 seconds) and set `GEMINI_API_KEY` in your environment.
|
|
92
|
+
|
|
93
|
+
> **Switching from QMD?** Also set `memory.backend: "builtin"` in your config (QMD overrides the plugin slot). Your QMD index stays on disk untouched — switch back anytime.
|
|
94
|
+
|
|
95
|
+
<details>
|
|
96
|
+
<summary>Other API key options</summary>
|
|
97
|
+
|
|
98
|
+
- Set `GOOGLE_API_KEY` in your environment
|
|
99
|
+
- Add `geminiApiKey: "${GEMINI_API_KEY}"` to the plugin config under `plugins.entries.engram.config`
|
|
100
|
+
- Run `openclaw onboard` and add Google as a provider
|
|
101
|
+
- If you use a restrictive `plugins.allow` list, add `"engram"` to it
|
|
102
|
+
|
|
103
|
+
</details>
|
|
104
|
+
|
|
105
|
+
### Switching back
|
|
106
|
+
|
|
107
|
+
Your files don't move. Nothing gets deleted. QMD's index stays untouched. Both indexes coexist.
|
|
108
|
+
|
|
109
|
+
```yaml
|
|
110
|
+
# Using Engram
|
|
111
|
+
plugins:
|
|
112
|
+
slots:
|
|
113
|
+
memory: "engram"
|
|
114
|
+
|
|
115
|
+
# Switch back — remove the lines above and restart
|
|
116
|
+
# Builtin or QMD picks up exactly where it left off
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
---
|
|
120
|
+
|
|
121
|
+
<a id="how-it-works"></a>
|
|
122
|
+
|
|
123
|
+
## How It Works
|
|
124
|
+
|
|
125
|
+
### Search pipeline
|
|
126
|
+
|
|
127
|
+
```text
|
|
128
|
+
┌──────────────────────────────────────────────────────────────┐
|
|
129
|
+
│ │
|
|
130
|
+
│ Query │
|
|
131
|
+
│ │ │
|
|
132
|
+
│ ▼ │
|
|
133
|
+
│ Embed query (Gemini Embedding-2, RETRIEVAL_QUERY) │
|
|
134
|
+
│ │ │
|
|
135
|
+
│ ├───────────────────┬──────────────────┐ │
|
|
136
|
+
│ ▼ ▼ │ │
|
|
137
|
+
│ BM25 keyword Vector KNN │ │
|
|
138
|
+
│ search (FTS5) search (sqlite-vec) │ │
|
|
139
|
+
│ │ │ │ │
|
|
140
|
+
│ └─────────┬─────────┘ │ │
|
|
141
|
+
│ ▼ │ │
|
|
142
|
+
│ Reciprocal Rank Fusion (k=60) │ │
|
|
143
|
+
│ weight: 0.7 vector / 0.3 keyword │ │
|
|
144
|
+
│ │ │ │
|
|
145
|
+
│ ▼ │ │
|
|
146
|
+
│ Cross-encoder reranking (top 20) │ │
|
|
147
|
+
│ ms-marco-MiniLM-L-12-v2 — local ONNX │ │
|
|
148
|
+
│ │ │ │
|
|
149
|
+
│ ▼ │ │
|
|
150
|
+
│ Time decay + source balancing │ │
|
|
151
|
+
│ │ │ │
|
|
152
|
+
│ ▼ │ │
|
|
153
|
+
│ Results │ │
|
|
154
|
+
│ │
|
|
155
|
+
└──────────────────────────────────────────────────────────────┘
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Indexing
|
|
159
|
+
|
|
160
|
+
1. Discovers `MEMORY.md`, `memory/*.md`, session transcripts (JSONL), and optionally image/audio files
|
|
161
|
+
2. Chunks markdown at 1024 tokens with 15% overlap — respects headings, code blocks, frontmatter
|
|
162
|
+
3. Embeds with Gemini Embedding-2 using `RETRIEVAL_DOCUMENT` task type (768 dimensions default, L2-normalized)
|
|
163
|
+
4. Stores in one SQLite file: FTS5 for keywords, sqlite-vec for vectors, metadata for freshness tracking
|
|
164
|
+
5. Incremental: content-hash tracking means only changed files get re-embedded
|
|
165
|
+
6. Live: chokidar watches workspace files and session transcript directory for real-time updates
|
|
166
|
+
|
|
167
|
+
### Performance
|
|
168
|
+
|
|
169
|
+
Measured on Apple Silicon, real Gemini API calls:
|
|
170
|
+
|
|
171
|
+
| Operation | Time |
|
|
172
|
+
|---|---|
|
|
173
|
+
| Search (full pipeline) | ~350ms |
|
|
174
|
+
| Index one file | ~400ms |
|
|
175
|
+
| First sync (small workspace) | ~1-2s |
|
|
176
|
+
| Local pipeline (FTS5 + vector + RRF + rerank) | <10ms |
|
|
177
|
+
|
|
178
|
+
Search latency is dominated by the Gemini API round-trip. Everything after the embedding call runs locally in under 10ms.
|
|
179
|
+
|
|
180
|
+
### Storage
|
|
181
|
+
|
|
182
|
+
One SQLite file per agent at `~/.openclaw/agents/{id}/engram/index.sqlite`.
|
|
183
|
+
|
|
184
|
+
| Workspace size | Chunks | Index |
|
|
185
|
+
|---|---|---|
|
|
186
|
+
| 100 files | ~2K | ~10 MB |
|
|
187
|
+
| 500 files | ~10K | ~50 MB |
|
|
188
|
+
| 2000 files | ~50K | ~250 MB |
|
|
189
|
+
|
|
190
|
+
---
|
|
191
|
+
|
|
192
|
+
<a id="comparison"></a>
|
|
193
|
+
|
|
194
|
+
## Comparison
|
|
195
|
+
|
|
196
|
+
| | Builtin | QMD | LanceDB Pro | **Engram** |
|
|
197
|
+
|---|---|---|---|---|
|
|
198
|
+
| **Multimodal** (image + audio search) | ❌ | ❌ | ❌ | **✅** |
|
|
199
|
+
| **Hybrid search** (keywords + vectors) | Weak | ✅ | ✅ | ✅ |
|
|
200
|
+
| **Reranking** | ❌ | ✅ local GGUF | ✅ external API | ✅ local ONNX |
|
|
201
|
+
| **Task-aware embeddings** | ❌ | ❌ | ❌ | **✅** |
|
|
202
|
+
| **Reciprocal Rank Fusion** | ❌ | ✅ | ❌ | ✅ |
|
|
203
|
+
| **Time decay** | ❌ | ❌ | ✅ | ✅ |
|
|
204
|
+
| **Session transcript search** | ✅ | ✅ | ❌ | ✅ |
|
|
205
|
+
| **Single file storage** | ✅ | ✅ | ❌ | ✅ |
|
|
206
|
+
| **No sidecar / external binary** | ✅ | ❌ | ✅ | ✅ |
|
|
207
|
+
| **No external API for reranking** | N/A | ✅ | ❌ | ✅ |
|
|
208
|
+
| **Zero-config startup** | ✅ | ⚠️ | ❌ | ✅ |
|
|
209
|
+
|
|
210
|
+
Engram is the only OpenClaw memory plugin that combines multimodal search, hybrid retrieval with RRF, local cross-encoder reranking, and single-file storage.
|
|
211
|
+
|
|
212
|
+
---
|
|
213
|
+
|
|
214
|
+
### Configuration
|
|
215
|
+
|
|
216
|
+
Everything below is optional. Defaults are tuned and work out of the box.
|
|
217
|
+
|
|
218
|
+
```yaml
|
|
219
|
+
plugins:
|
|
220
|
+
slots:
|
|
221
|
+
memory: "engram"
|
|
222
|
+
entries:
|
|
223
|
+
engram:
|
|
224
|
+
config:
|
|
225
|
+
# geminiApiKey: "${GEMINI_API_KEY}" # only if Gemini isn't already configured
|
|
226
|
+
# dimensions: 768 # 768 (default) | 1536 | 3072
|
|
227
|
+
# chunkTokens: 1024 # 512-8192 (default: 1024)
|
|
228
|
+
# chunkOverlap: 0.15 # 0-0.5 (default: 0.15)
|
|
229
|
+
# reranking: true # cross-encoder reranking (default: on)
|
|
230
|
+
# timeDecay:
|
|
231
|
+
# enabled: true # recency boost (default: on)
|
|
232
|
+
# halfLifeDays: 30 # score halves every 30 days
|
|
233
|
+
# maxSessionShare: 0.4 # cap session results at 40%
|
|
234
|
+
# multimodal:
|
|
235
|
+
# enabled: true # index images and audio (default: on)
|
|
236
|
+
# modalities: ["image", "audio"]
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
### CLI
|
|
240
|
+
|
|
241
|
+
```bash
|
|
242
|
+
openclaw engram status # index stats for all agents
|
|
243
|
+
openclaw engram status --agent main # single agent
|
|
244
|
+
openclaw engram status --agent main --deep # probe embedding + vector availability
|
|
245
|
+
openclaw engram index --force # reindex all agents
|
|
246
|
+
openclaw engram index --agent main # reindex single agent
|
|
247
|
+
openclaw engram search "query" # search (default agent)
|
|
248
|
+
openclaw engram search "query" --agent main # search specific agent
|
|
249
|
+
openclaw engram search "query" --json # machine-readable output
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
---
|
|
253
|
+
|
|
254
|
+
<a id="faq"></a>
|
|
255
|
+
|
|
256
|
+
## FAQ
|
|
257
|
+
|
|
258
|
+
<details>
|
|
259
|
+
<summary><strong>What happens to my existing memory files?</strong></summary>
|
|
260
|
+
|
|
261
|
+
Nothing. Your markdown files (`MEMORY.md`, `memory/*.md`) stay exactly where they are. Engram reads them — it doesn't move, copy, or modify them. Same files, better search.
|
|
262
|
+
|
|
263
|
+
</details>
|
|
264
|
+
|
|
265
|
+
<details>
|
|
266
|
+
<summary><strong>What about my QMD index?</strong></summary>
|
|
267
|
+
|
|
268
|
+
QMD's index is untouched. Engram builds a separate index in its own SQLite file. Both coexist on disk. Switching between them is one config line plus a restart.
|
|
269
|
+
|
|
270
|
+
</details>
|
|
271
|
+
|
|
272
|
+
<details>
|
|
273
|
+
<summary><strong>How long does the first sync take?</strong></summary>
|
|
274
|
+
|
|
275
|
+
For a typical workspace (20-50 markdown files), about 10-30 seconds. Engram chunks every file and sends the text to the Gemini API for embedding. After that, only changed files get re-embedded.
|
|
276
|
+
|
|
277
|
+
</details>
|
|
278
|
+
|
|
279
|
+
<details>
|
|
280
|
+
<summary><strong>Does this cost money?</strong></summary>
|
|
281
|
+
|
|
282
|
+
Gemini's embedding API has a free tier (~1,500 requests/day). Most personal setups will never hit the limit. If you do, text embeddings cost $0.20 per million tokens — a typical workspace costs less than a penny to index.
|
|
283
|
+
|
|
284
|
+
</details>
|
|
285
|
+
|
|
286
|
+
<details>
|
|
287
|
+
<summary><strong>What if I don't have a Gemini API key?</strong></summary>
|
|
288
|
+
|
|
289
|
+
Get a free one at [aistudio.google.com/apikey](https://aistudio.google.com/apikey). Takes about 60 seconds.
|
|
290
|
+
|
|
291
|
+
</details>
|
|
292
|
+
|
|
293
|
+
<details>
|
|
294
|
+
<summary><strong>Is it slower than QMD?</strong></summary>
|
|
295
|
+
|
|
296
|
+
Yes. QMD runs 100% locally (~120ms). Engram makes one API call per search (~350ms). The tradeoff: Gemini Embedding-2 is a significantly stronger model than QMD's 300M-parameter local embeddings, and Engram supports multimodal. If latency matters more than accuracy, stick with QMD.
|
|
297
|
+
|
|
298
|
+
</details>
|
|
299
|
+
|
|
300
|
+
<details>
|
|
301
|
+
<summary><strong>Will this break my agents?</strong></summary>
|
|
302
|
+
|
|
303
|
+
No. Engram registers the same `memory_search` and `memory_get` tools your agents already use. From an agent's perspective, nothing changes except better results. If anything goes wrong, `memory_search` returns empty results gracefully instead of crashing.
|
|
304
|
+
|
|
305
|
+
</details>
|
|
306
|
+
|
|
307
|
+
<details>
|
|
308
|
+
<summary><strong>Can I use it for just one agent?</strong></summary>
|
|
309
|
+
|
|
310
|
+
Not currently. The memory plugin slot is global — applies to all agents on the gateway. That's an OpenClaw limitation, not Engram's. Per-agent overrides may come in a future OpenClaw version.
|
|
311
|
+
|
|
312
|
+
</details>
|
|
313
|
+
|
|
314
|
+
<details>
|
|
315
|
+
<summary><strong>What data gets sent to Google?</strong></summary>
|
|
316
|
+
|
|
317
|
+
Text content of your memory files and session transcripts is sent to Google's Gemini embedding API. If multimodal is enabled, image and audio bytes are sent too. Google's paid tier does not use your data for model training. Embeddings are one-way — original content can't be reconstructed from vectors.
|
|
318
|
+
|
|
319
|
+
</details>
|
|
320
|
+
|
|
321
|
+
<details>
|
|
322
|
+
<summary><strong>What about PDF search?</strong></summary>
|
|
323
|
+
|
|
324
|
+
Gemini Embedding-2 supports PDF embedding (up to 6 pages per request). Engram doesn't index PDFs yet but the model path is there — it's on the roadmap.
|
|
325
|
+
|
|
326
|
+
</details>
|
|
327
|
+
|
|
328
|
+
---
|
|
329
|
+
|
|
330
|
+
<p align="center">
|
|
331
|
+
<strong>Ready?</strong> <code>npm install</code> → one config line → <code>openclaw gateway restart</code><br>
|
|
332
|
+
<a href="#install">↑ Install now</a>
|
|
333
|
+
</p>
|
|
334
|
+
|
|
335
|
+
<p align="center">
|
|
336
|
+
<sub>Built by <a href="https://sparkagents.com">Spark Agents</a> for the <a href="https://github.com/openclaw/openclaw">OpenClaw</a> community.</sub>
|
|
337
|
+
</p>
|
package/dist/chunker.js
ADDED
|
@@ -0,0 +1,337 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
/**
 * Rough token estimate using the common ~4 characters-per-token heuristic.
 * @param {string} text
 * @returns {number} estimated token count (0 for the empty string)
 */
export function estimateTokens(text) {
    const CHARS_PER_TOKEN = 4;
    return Math.ceil(text.length / CHARS_PER_TOKEN);
}
|
|
5
|
+
// Split `text` into physical lines, keeping each line's terminator.
// Each entry records the absolute [start, end) character offsets and the
// 1-based line number in the original document (offset by `startLine`,
// so callers can account for stripped frontmatter).
function splitLines(text, startLine) {
    const result = [];
    let offset = 0;
    let lineNumber = startLine;
    while (offset < text.length) {
        const nl = text.indexOf("\n", offset);
        const end = nl === -1 ? text.length : nl + 1;
        result.push({
            text: text.slice(offset, end),
            start: offset,
            end,
            originalLine: lineNumber,
        });
        if (nl === -1) {
            // Final line has no terminator; nothing left to scan.
            break;
        }
        offset = end;
        lineNumber += 1;
    }
    return result;
}
|
|
35
|
+
// Remove a single trailing "\n" or "\r\n" from a line; a bare trailing
// "\r" (old Mac style) is deliberately left intact.
function stripLineTerminator(line) {
    if (line.endsWith("\r\n")) {
        return line.slice(0, -2);
    }
    if (line.endsWith("\n")) {
        return line.slice(0, -1);
    }
    return line;
}
|
|
38
|
+
// Strip YAML frontmatter: a leading "---" line closed by a later "---"
// line. Returns the body after the closing fence and the 1-based line
// number that body starts on; when no well-formed frontmatter is found,
// the content is returned untouched starting at line 1.
function stripFrontmatter(content) {
    if (!content.startsWith("---\n") && !content.startsWith("---\r\n")) {
        return { body: content, startLine: 1 };
    }
    const lines = splitLines(content, 1);
    if (lines.length === 0 || stripLineTerminator(lines[0].text) !== "---") {
        return { body: content, startLine: 1 };
    }
    const closing = lines
        .slice(1)
        .find((line) => stripLineTerminator(line.text) === "---");
    if (!closing) {
        // Opening fence without a closing one: treat as ordinary content.
        return { body: content, startLine: 1 };
    }
    return {
        body: content.slice(closing.end),
        startLine: closing.originalLine + 1,
    };
}
|
|
56
|
+
// Compact content fingerprint: the first 16 hex chars (64 bits) of the
// SHA-256 digest — enough to detect changed chunks, short enough to store.
function hashText(text) {
    const digest = createHash("sha256").update(text).digest("hex");
    return digest.slice(0, 16);
}
|
|
59
|
+
// Largest boundary value that is <= maxInclusive AND > minExclusive, or
// undefined when no such boundary exists. `boundaries` must be sorted in
// non-decreasing order (all call sites push offsets in ascending order).
//
// Fix: the original implementation walked `candidate` backwards looking
// for a value > minExclusive — a degenerate O(n) loop, since in a sorted
// array every earlier value is <= boundaries[candidate], so only the
// first iteration could ever succeed. A single comparison is equivalent.
function boundaryAtOrBefore(boundaries, minExclusive, maxInclusive) {
    // Binary search for the largest index with value <= maxInclusive.
    let low = 0;
    let high = boundaries.length - 1;
    let candidate = -1;
    while (low <= high) {
        const mid = Math.floor((low + high) / 2);
        if (boundaries[mid] <= maxInclusive) {
            candidate = mid;
            low = mid + 1;
        }
        else {
            high = mid - 1;
        }
    }
    if (candidate === -1) {
        return undefined;
    }
    const value = boundaries[candidate];
    return value > minExclusive ? value : undefined;
}
|
|
82
|
+
// Smallest boundary value that is >= minInclusive AND <= maxInclusive,
// or undefined when none qualifies. `boundaries` must be sorted ascending.
function boundaryAtOrAfter(boundaries, minInclusive, maxInclusive) {
    // Half-open lower-bound binary search: find the first index whose
    // value is >= minInclusive.
    let lo = 0;
    let hi = boundaries.length;
    while (lo < hi) {
        const mid = (lo + hi) >> 1;
        if (boundaries[mid] >= minInclusive) {
            hi = mid;
        }
        else {
            lo = mid + 1;
        }
    }
    if (lo === boundaries.length) {
        return undefined;
    }
    const found = boundaries[lo];
    return found <= maxInclusive ? found : undefined;
}
|
|
102
|
+
// Find the range whose half-open span [start, end) contains `position`.
// `ranges` are sorted and non-overlapping, so we can stop scanning as
// soon as a range starts past the position.
function rangeContaining(ranges, position) {
    for (const range of ranges) {
        if (position < range.start) {
            return undefined;
        }
        if (position < range.end) {
            // position >= range.start is guaranteed by the guard above.
            return range;
        }
    }
    return undefined;
}
|
|
113
|
+
// Adjust a proposed chunk end so we never split inside a fenced code block.
// - If `start` already sits inside a code range, the chunk must extend to
//   that range's end regardless of the proposed limit.
// - Otherwise, if the proposed cut lands mid-range, snap it to the range's
//   start (when that still makes forward progress) or to the range's end.
function normalizeLimitForCode(ranges, start, proposedLimit) {
    const startRange = rangeContaining(ranges, start);
    if (startRange) {
        return startRange.end;
    }
    for (const range of ranges) {
        if (range.start >= proposedLimit) {
            break;
        }
        const cutsIntoRange = proposedLimit > range.start && proposedLimit < range.end;
        if (cutsIntoRange) {
            return range.start > start ? range.start : range.end;
        }
    }
    return proposedLimit;
}
|
|
131
|
+
// Map a character offset in the body to its 1-based original line number.
// Binary search for the last line starting at or before `offset`; an
// offset at or past that line's end belongs to the following line when
// one exists (e.g. an offset just past the final newline).
function lineForOffset(lines, offset) {
    if (lines.length === 0) {
        return 1;
    }
    let lo = 0;
    let hi = lines.length - 1;
    let found = 0;
    while (lo <= hi) {
        const mid = (lo + hi) >> 1;
        if (lines[mid].start <= offset) {
            found = mid;
            lo = mid + 1;
        }
        else {
            hi = mid - 1;
        }
    }
    const matched = lines[found];
    if (offset >= matched.end && found < lines.length - 1) {
        return lines[found + 1].originalLine;
    }
    return matched.originalLine;
}
|
|
153
|
+
// Breadcrumb of the headings in effect at `offset`, e.g. "# A > ## B".
// `headings` are {index, level, label} events in document order; a heading
// at level N clears any deeper entries before taking slot N.
//
// Fix: when heading levels are skipped (e.g. "#" directly followed by
// "###"), the level-keyed stack contains `undefined` holes, which
// Array.prototype.join rendered as empty segments ("# A >  > ### B").
// Filter the holes out before joining.
function headingContextAt(headings, offset) {
    const stack = [];
    for (const heading of headings) {
        if (heading.index > offset) {
            break;
        }
        // Truncate deeper levels, then occupy this heading's slot.
        stack.length = heading.level - 1;
        stack[heading.level - 1] = heading.label;
    }
    const labels = stack.filter((label) => label !== undefined);
    return labels.length > 0 ? labels.join(" > ") : undefined;
}
|
|
164
|
+
// Boundary within [minInclusive, maxInclusive] closest to `target`,
// preferring the later candidate on ties. Returns undefined when the
// window is empty or contains no boundary.
function nearestBoundary(boundaries, minInclusive, maxInclusive, target) {
    if (minInclusive > maxInclusive) {
        return undefined;
    }
    const before = boundaryAtOrBefore(boundaries, minInclusive - 1, target);
    const after = boundaryAtOrAfter(boundaries, target, maxInclusive);
    if (before === undefined) {
        return after;
    }
    if (after === undefined) {
        return before;
    }
    // before <= target <= after by construction, so the distances are
    // simply (after - target) and (target - before).
    const preferAfter = after - target <= target - before;
    return preferAfter ? after : before;
}
|
|
178
|
+
// Split markdown `content` into overlapping chunks of at most
// `options.maxTokens` (estimated) tokens, with `options.overlapRatio`
// (0-0.5) of each chunk repeated at the start of the next.
//
// Guarantees visible in the logic below:
// - YAML frontmatter is stripped and never chunked.
// - Fenced code blocks are never split across chunks.
// - Splits prefer, in order: heading starts, blank lines, sentence ends,
//   word boundaries, and finally a hard character cut.
// - Each chunk carries 1-based start/end line numbers (relative to the
//   original document, accounting for stripped frontmatter), a 16-hex
//   content hash, and the heading breadcrumb in effect at its start.
//
// Token estimation uses the ~4 chars/token heuristic from estimateTokens,
// so maxChars/overlapChars are character budgets, not exact token counts.
export function chunkMarkdown(content, options) {
    if (content.length === 0) {
        return [];
    }
    const maxTokens = Math.max(1, options.maxTokens);
    const maxChars = maxTokens * 4;
    const overlapTokens = Math.max(0, Math.floor(maxTokens * options.overlapRatio));
    const overlapChars = overlapTokens * 4;
    const stripped = stripFrontmatter(content);
    const body = stripped.body;
    if (body.length === 0 || body.trim().length === 0) {
        return [];
    }
    const lines = splitLines(body, stripped.startLine);
    if (lines.length === 0) {
        return [];
    }
    // --- Pass 1: per-line scan for structural boundaries and code fences.
    const headingBoundaries = [];
    const blankLineBoundaries = [];
    const lineBoundaries = [0];
    const headingEvents = [];
    const codeRanges = [];
    let inCodeBlock = false;
    let codeStart = 0;
    for (const line of lines) {
        lineBoundaries.push(line.start);
        const withoutTerminator = stripLineTerminator(line.text);
        const isFence = /^\s*```/.test(withoutTerminator);
        if (isFence) {
            if (!inCodeBlock) {
                inCodeBlock = true;
                codeStart = line.start;
            }
            else {
                inCodeBlock = false;
                codeRanges.push({ start: codeStart, end: line.end });
            }
            continue;
        }
        // Lines inside a fence are neither headings nor blank-line splits.
        if (inCodeBlock) {
            continue;
        }
        const headingMatch = withoutTerminator.match(/^(#{1,6})\s+(.+?)\s*$/);
        if (headingMatch) {
            const level = headingMatch[1].length;
            headingBoundaries.push(line.start);
            headingEvents.push({
                index: line.start,
                level,
                label: `${"#".repeat(level)} ${headingMatch[2].trim()}`,
            });
        }
        if (withoutTerminator.trim().length === 0) {
            blankLineBoundaries.push(line.end);
        }
    }
    // An unterminated fence runs to end of body.
    if (inCodeBlock) {
        codeRanges.push({ start: codeStart, end: body.length });
    }
    // --- Pass 2: per-character scan (skipping code) for sentence/word
    // boundaries. `codeIndex` advances monotonically with `i`.
    const sentenceBoundaries = [];
    const wordBoundaries = [];
    let codeIndex = 0;
    for (let i = 0; i < body.length; i += 1) {
        while (codeIndex < codeRanges.length && i >= codeRanges[codeIndex].end) {
            codeIndex += 1;
        }
        const inCode = codeIndex < codeRanges.length && i >= codeRanges[codeIndex].start && i < codeRanges[codeIndex].end;
        if (inCode) {
            continue;
        }
        const char = body[i];
        if (/\s/.test(char)) {
            wordBoundaries.push(i + 1);
        }
        // Sentence end = terminal punctuation followed by whitespace or EOF.
        if (char === "." || char === "?" || char === "!") {
            const next = body[i + 1];
            if (next === undefined || /\s/.test(next)) {
                sentenceBoundaries.push(i + 1);
            }
        }
    }
    // --- Pass 3: emit chunks.
    const chunks = [];
    let start = 0;
    while (start < body.length) {
        const remaining = body.slice(start);
        // Everything left fits in one chunk — emit it and stop.
        if (estimateTokens(remaining) <= maxTokens) {
            const chunkText = remaining;
            const startLine = lineForOffset(lines, start);
            const endLine = lineForOffset(lines, body.length - 1);
            chunks.push({
                text: chunkText,
                startLine,
                endLine,
                hash: hashText(chunkText),
                headingContext: headingContextAt(headingEvents, start),
            });
            break;
        }
        // Choose a split point at or before the character budget, snapped
        // so it never lands inside a code fence, preferring the most
        // structural boundary available.
        const rawLimit = Math.min(body.length, start + maxChars);
        const normalizedLimit = normalizeLimitForCode(codeRanges, start, rawLimit);
        let split = boundaryAtOrBefore(headingBoundaries, start, normalizedLimit) ??
            boundaryAtOrBefore(blankLineBoundaries, start, normalizedLimit) ??
            boundaryAtOrBefore(sentenceBoundaries, start, normalizedLimit) ??
            boundaryAtOrBefore(wordBoundaries, start, normalizedLimit) ??
            normalizedLimit;
        // Forward-progress guard: if no boundary advanced us, swallow the
        // enclosing code block whole, or fall back to a hard cut. A chunk
        // may exceed maxChars here — code blocks are never split.
        if (split <= start) {
            const containingCode = rangeContaining(codeRanges, start);
            split = containingCode ? containingCode.end : Math.min(body.length, start + maxChars);
            if (split <= start) {
                split = Math.min(body.length, start + 1);
            }
        }
        const chunkText = body.slice(start, split);
        const startLine = lineForOffset(lines, start);
        const endLine = lineForOffset(lines, split - 1);
        chunks.push({
            text: chunkText,
            startLine,
            endLine,
            hash: hashText(chunkText),
            headingContext: headingContextAt(headingEvents, start),
        });
        if (split >= body.length) {
            break;
        }
        if (overlapChars <= 0) {
            start = split;
            continue;
        }
        // Overlap: back the next start up by ~overlapChars, snapped to a
        // sentence or line boundary, while staying strictly inside
        // (start, split) so progress is guaranteed.
        const minStart = start + 1;
        const maxStart = split - 1;
        if (minStart > maxStart) {
            start = split;
            continue;
        }
        const target = Math.max(minStart, Math.min(maxStart, split - overlapChars));
        const overlapCode = rangeContaining(codeRanges, target);
        let nextStart;
        if (overlapCode) {
            // Don't start a chunk mid-code: begin at the block's start.
            nextStart = overlapCode.start;
        }
        else {
            nextStart =
                nearestBoundary(sentenceBoundaries, minStart, maxStart, target) ??
                nearestBoundary(lineBoundaries, minStart, maxStart, target) ??
                target;
        }
        // If the snapped start still lands inside code, move to that
        // block's start, or skip the overlap entirely when the block
        // began at or before the current chunk.
        const nextStartCode = rangeContaining(codeRanges, nextStart);
        if (nextStartCode) {
            nextStart = nextStartCode.start > start ? nextStartCode.start : split;
        }
        if (nextStart <= start || nextStart >= split) {
            start = split;
        }
        else {
            start = nextStart;
        }
    }
    return chunks;
}
|