memory-lancedb-pro 1.0.20 → 1.0.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,6 +1,33 @@
1
1
  # Changelog
2
2
 
3
3
 
4
+ ## 1.0.22
5
+
6
+ **Storage Path Validation & Better Error Messages**
7
+
8
+ - **Fix**: Validate `dbPath` at startup — resolve symlinks, auto-create missing directories, check write permissions (#26, #27)
9
+ - **Fix**: Write/connection failures now include `errno`, resolved path, and actionable fix suggestions instead of generic errors (#28)
10
+ - **New**: Exported `validateStoragePath()` utility for external tooling and diagnostics
11
+
12
+ Breaking changes: None. Backward compatible.
13
+
14
+ ---
15
+
16
+ ## 1.0.21
17
+
18
+ **Long Context Chunking**
19
+
20
+ - **Feats**: Added automatic chunking for documents exceeding embedding context limits
21
+ - **Feats**: Smart semantic-aware chunking at sentence boundaries with configurable overlap
22
+ - **Feats**: Chunking adapts to different embedding model context limits (Jina, OpenAI, Gemini, etc.)
23
+ - **Feats**: Parallel chunk embedding with averaged result for better semantic preservation
24
+ - **Fixes**: Handles "Input length exceeds context length" errors gracefully
25
+ - **Docs**: Added comprehensive documentation in docs/long-context-chunking.md
26
+
27
+ Breaking changes: None. Backward compatible with existing configurations.
28
+
29
+ ---
30
+
4
31
  ## 1.0.20
5
32
 
6
33
  - Fix: reduce auto-capture noise by skipping memory-management prompts (delete/forget/cleanup memory entries).
package/README.md CHANGED
@@ -720,6 +720,40 @@ upgrade to **memory-lancedb-pro >= 1.0.14**. This plugin now coerces these value
720
720
 
721
721
  ---
722
722
 
723
+ ## ⭐ Star History
724
+
725
+ <a href="https://star-history.com/#win4r/memory-lancedb-pro&Date">
726
+ <picture>
727
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=win4r/memory-lancedb-pro&type=Date&theme=dark&transparent=true" />
728
+ <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=win4r/memory-lancedb-pro&type=Date&transparent=true" />
729
+ <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=win4r/memory-lancedb-pro&type=Date&transparent=true" />
730
+ </picture>
731
+ </a>
732
+
733
+ ## Contributors
734
+
735
+ Top contributors (from GitHub’s contributors list, sorted by commit contributions; bots excluded):
736
+
737
+ <p>
738
+ <a href="https://github.com/win4r"><img src="https://avatars.githubusercontent.com/u/42172631?v=4" width="48" height="48" alt="@win4r" /></a>
739
+ <a href="https://github.com/kctony"><img src="https://avatars.githubusercontent.com/u/1731141?v=4" width="48" height="48" alt="@kctony" /></a>
740
+ <a href="https://github.com/Akatsuki-Ryu"><img src="https://avatars.githubusercontent.com/u/8062209?v=4" width="48" height="48" alt="@Akatsuki-Ryu" /></a>
741
+ <a href="https://github.com/JasonSuz"><img src="https://avatars.githubusercontent.com/u/612256?v=4" width="48" height="48" alt="@JasonSuz" /></a>
742
+ <a href="https://github.com/Minidoracat"><img src="https://avatars.githubusercontent.com/u/11269639?v=4" width="48" height="48" alt="@Minidoracat" /></a>
743
+ <a href="https://github.com/furedericca-lab"><img src="https://avatars.githubusercontent.com/u/263020793?v=4" width="48" height="48" alt="@furedericca-lab" /></a>
744
+ <a href="https://github.com/joe2643"><img src="https://avatars.githubusercontent.com/u/19421931?v=4" width="48" height="48" alt="@joe2643" /></a>
745
+ </p>
746
+
747
+ - [@win4r](https://github.com/win4r) (3 commits)
748
+ - [@kctony](https://github.com/kctony) (2 commits)
749
+ - [@Akatsuki-Ryu](https://github.com/Akatsuki-Ryu) (1 commit)
750
+ - [@JasonSuz](https://github.com/JasonSuz) (1 commit)
751
+ - [@Minidoracat](https://github.com/Minidoracat) (1 commit)
752
+ - [@furedericca-lab](https://github.com/furedericca-lab) (1 commit)
753
+ - [@joe2643](https://github.com/joe2643) (1 commit)
754
+
755
+ Full list: https://github.com/win4r/memory-lancedb-pro/graphs/contributors
756
+
723
757
  ## License
724
758
 
725
759
  MIT
package/README_CN.md CHANGED
@@ -594,6 +594,40 @@ LanceDB 表 `memories`:
594
594
 
595
595
  ---
596
596
 
597
+ ## ⭐ Star 趋势
598
+
599
+ <a href="https://star-history.com/#win4r/memory-lancedb-pro&Date">
600
+ <picture>
601
+ <source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=win4r/memory-lancedb-pro&type=Date&theme=dark&transparent=true" />
602
+ <source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=win4r/memory-lancedb-pro&type=Date&transparent=true" />
603
+ <img alt="Star History Chart" src="https://api.star-history.com/svg?repos=win4r/memory-lancedb-pro&type=Date&transparent=true" />
604
+ </picture>
605
+ </a>
606
+
607
+ ## 主要贡献者
608
+
609
+ 按 GitHub Contributors 列表自动生成(按 commit 贡献数排序,已排除 bot):
610
+
611
+ <p>
612
+ <a href="https://github.com/win4r"><img src="https://avatars.githubusercontent.com/u/42172631?v=4" width="48" height="48" alt="@win4r" /></a>
613
+ <a href="https://github.com/kctony"><img src="https://avatars.githubusercontent.com/u/1731141?v=4" width="48" height="48" alt="@kctony" /></a>
614
+ <a href="https://github.com/Akatsuki-Ryu"><img src="https://avatars.githubusercontent.com/u/8062209?v=4" width="48" height="48" alt="@Akatsuki-Ryu" /></a>
615
+ <a href="https://github.com/JasonSuz"><img src="https://avatars.githubusercontent.com/u/612256?v=4" width="48" height="48" alt="@JasonSuz" /></a>
616
+ <a href="https://github.com/Minidoracat"><img src="https://avatars.githubusercontent.com/u/11269639?v=4" width="48" height="48" alt="@Minidoracat" /></a>
617
+ <a href="https://github.com/furedericca-lab"><img src="https://avatars.githubusercontent.com/u/263020793?v=4" width="48" height="48" alt="@furedericca-lab" /></a>
618
+ <a href="https://github.com/joe2643"><img src="https://avatars.githubusercontent.com/u/19421931?v=4" width="48" height="48" alt="@joe2643" /></a>
619
+ </p>
620
+
621
+ - [@win4r](https://github.com/win4r)(3 次提交)
622
+ - [@kctony](https://github.com/kctony)(2 次提交)
623
+ - [@Akatsuki-Ryu](https://github.com/Akatsuki-Ryu)(1 次提交)
624
+ - [@JasonSuz](https://github.com/JasonSuz)(1 次提交)
625
+ - [@Minidoracat](https://github.com/Minidoracat)(1 次提交)
626
+ - [@furedericca-lab](https://github.com/furedericca-lab)(1 次提交)
627
+ - [@joe2643](https://github.com/joe2643)(1 次提交)
628
+
629
+ 完整列表:https://github.com/win4r/memory-lancedb-pro/graphs/contributors
630
+
597
631
  ## License
598
632
 
599
633
  MIT
@@ -0,0 +1,258 @@
1
+ # Long Context Chunking
2
+
3
+ ## Overview
4
+
5
+ The long context chunking system automatically handles documents that exceed embedding model context limits by splitting them into manageable chunks and computing averaged embeddings.
6
+
7
+ ## Problem Solved
8
+
9
+ When embedding very long documents or messages, you might encounter errors like:
10
+
11
+ ```
12
+ Input length exceeds context length: 12453 tokens. Maximum length: 8192 tokens.
13
+ ```
14
+
15
+ This plugin now handles such cases gracefully by:
16
+ 1. Detecting context length errors before they cause failures
17
+ 2. Automatically splitting the document into overlapping chunks
18
+ 3. Embedding each chunk separately
19
+ 4. Computing an averaged embedding that preserves semantic meaning
20
+
21
+ ## How It Works
22
+
23
+ ### Chunking Strategy
24
+
25
+ The chunker uses a **semantic-aware** approach:
26
+
27
+ - **Splits at sentence boundaries** when possible (better for preserving meaning)
28
+ - **Configurable overlap** (default: 200 characters) to maintain context across chunks
29
+ - **Adapts to model context limits** based on the embedding model
30
+ - **Forced splits** at hard limits if sentence boundaries are not found
31
+
32
+ ### Chunking Flow
33
+
34
+ ```
35
+ Long Document
36
+
37
+ ├── 8192+ characters ──┐
38
+
39
+
40
+ ┌─────────────────┐
41
+ │ Detect Overflow │
42
+ └────────┬────────┘
43
+
44
+
45
+ ┌─────────────────┐
46
+ │ Split into │
47
+ │ Overlapping │
48
+ │ Chunks │
49
+ └────────┬────────┘
50
+
51
+ ┌────────────────────┼────────────────────┐
52
+ │ │ │
53
+ ▼ ▼ ▼
54
+ ┌────────┐ ┌────────┐ ┌────────┐
55
+ │ Chunk 1│ │ Chunk 2│ │ Chunk 3│
56
+ │ [1-2k]│ │[1.8k-3.8k]│ │[3.6k-5.6k]│
57
+ └───┬────┘ └───┬────┘ └───┬────┘
58
+ │ │ │
59
+ ▼ ▼ ▼
60
+ Embedding Embedding Embedding
61
+ │ │ │
62
+ └──────────────────┼──────────────────┘
63
+
64
+
65
+ Compute Average
66
+
67
+
68
+ Final Embedding
69
+ ```
70
+
71
+ ## Configuration
72
+
73
+ ### Default Settings
74
+
75
+ The chunker automatically adapts to your embedding model:
76
+
77
+ - **maxChunkSize**: 70% of model context limit (e.g., 5734 for 8192-token model)
78
+ - **overlapSize**: 5% of model context limit
79
+ - **minChunkSize**: 10% of model context limit
80
+ - **semanticSplit**: true (prefer sentence boundaries)
81
+ - **maxLinesPerChunk**: 50 lines
82
+
83
+ ### Disabling Auto-Chunking
84
+
85
+ If you prefer to handle chunking manually or want the model to fail on long documents:
86
+
87
+ ```json
88
+ {
89
+ "plugins": {
90
+ "entries": {
91
+ "memory-lancedb-pro": {
92
+ "enabled": true,
93
+ "config": {
94
+ "embedding": {
95
+ "apiKey": "${JINA_API_KEY}",
96
+ "model": "jina-embeddings-v5-text-small",
97
+ "chunking": false // Disable auto-chunking
98
+ }
99
+ }
100
+ }
101
+ }
102
+ }
103
+ }
104
+ ```
105
+
106
+ ### Custom Chunking Parameters
107
+
108
+ For advanced users who want to tune chunking behavior:
109
+
110
+ ```json
111
+ {
112
+ "plugins": {
113
+ "entries": {
114
+ "memory-lancedb-pro": {
115
+ "enabled": true,
116
+ "config": {
117
+ "embedding": {
118
+ "autoChunk": {
119
+ "maxChunkSize": 2000, // Characters per chunk
120
+ "overlapSize": 500, // Overlap between chunks
121
+ "minChunkSize": 500, // Minimum acceptable chunk size
122
+ "semanticSplit": true, // Prefer sentence boundaries
123
+ "maxLinesPerChunk": 100 // Max lines before forced split
124
+ }
125
+ }
126
+ }
127
+ }
128
+ }
129
+ }
130
+ }
131
+ ```
132
+
133
+ ## Supported Models
134
+
135
+ The chunker automatically adapts to these embedding models:
136
+
137
+ | Model | Context Limit | Chunk Size | Overlap |
138
+ |-------|---------------|------------|----------|
139
+ | Jina jina-embeddings-v5-text-small | 8192 | 5734 | 409 |
140
+ | OpenAI text-embedding-3-small | 8192 | 5734 | 409 |
141
+ | OpenAI text-embedding-3-large | 8192 | 5734 | 409 |
142
+ | Gemini gemini-embedding-001 | 2048 | 1433 | 102 |
143
+
144
+ ## Performance Considerations
145
+
146
+ ### Token Savings
147
+
148
+ - **Without chunking**: 1 failed embedding (retries required)
149
+ - **With chunking**: 3-4 chunk embeddings (1 avg result)
150
+ - **Net cost increase**: ~3x for long documents (>8k tokens)
151
+ - **Trade-off**: ~3x embedding cost on long documents in exchange for graceful handling instead of hard failures
152
+
153
+ ### Caching
154
+
155
+ Chunked embeddings are cached by their original document hash, so:
156
+ - Subsequent requests for the same document get the cached averaged embedding
157
+ - Cache hit rate improves as long documents are processed repeatedly
158
+
159
+ ### Processing Time
160
+
161
+ - **Small documents (<4k chars)**: No chunking, same as before
162
+ - **Medium documents (4k-8k chars)**: No chunking, same as before
163
+ - **Long documents (>8k chars)**: ~100-200ms additional chunking overhead
164
+
165
+ ## Logging & Debugging
166
+
167
+ ### Enable Debug Logging
168
+
169
+ To see chunking in action, you can check the logs:
170
+
171
+ ```
172
+ Document exceeded context limit (...), attempting chunking...
173
+ Split document into 3 chunks for embedding
174
+ Successfully embedded long document as 3 averaged chunks
175
+ ```
176
+
177
+ ### Common Scenarios
178
+
179
+ **Scenario 1: Long memory text**
180
+ - When a user's message or system prompt is very long
181
+ - Automatically chunked before embedding
182
+ - No error thrown, memory is still stored and retrievable
183
+
184
+ **Scenario 2: Batch embedding long documents**
185
+ - If some documents in a batch exceed limits
186
+ - Only the long ones are chunked
187
+ - Successful documents processed normally
188
+
189
+ ## Troubleshooting
190
+
191
+ ### Chunking Still Fails
192
+
193
+ If you still see context length errors:
194
+
195
+ 1. **Verify model**: Check which embedding model you're using
196
+ 2. **Increase minChunkSize**: May need smaller chunks for some models
197
+ 3. **Disable autoChunk**: Handle chunking manually with explicit split
198
+
199
+ ### Too Many Small Chunks
200
+
201
+ If chunking creates many tiny fragments:
202
+
203
+ 1. **Increase minChunkSize**: Larger minimum chunk size
204
+ 2. **Reduce overlap**: Less overlap between chunks means more efficient chunks
205
+
206
+ ### Embedding Quality Degradation
207
+
208
+ If chunked embeddings seem less accurate:
209
+
210
+ 1. **Increase overlap**: More context between chunks preserves relationships
211
+ 2. **Use smaller maxChunkSize**: Split into more, smaller overlapping pieces
212
+ 3. **Consider hierarchical approach**: Use a two-pass retrieval (chunk → document → full text)
213
+
214
+ ## Future Enhancements
215
+
216
+ Planned improvements:
217
+
218
+ - [ ] **Hierarchical chunking**: Chunk → document-level embedding
219
+ - [ ] **Sliding window**: Different overlap strategies per document complexity
220
+ - [ ] **Smart summarization**: Summarize chunks before averaging for better quality
221
+ - [ ] **Context-aware overlap**: Dynamic overlap based on document complexity
222
+ - [ ] **Async chunking**: Process chunks in parallel for batch operations
223
+
224
+ ## Technical Details
225
+
226
+ ### Algorithm
227
+
228
+ 1. **Detect overflow**: Check if document exceeds maxChunkSize
229
+ 2. **Split semantically**: Find sentence boundaries within target range
230
+ 3. **Create overlap**: Include overlap with previous chunk's end
231
+ 4. **Embed in parallel**: Process all chunks simultaneously
232
+ 5. **Average the result**: Compute mean embedding across all chunks
233
+
234
+ ### Complexity
235
+
236
+ - **Time**: O(n × k) where n = number of chunks, k = average chunk processing time
237
+ - **Space**: O(n × d) where d = embedding dimension
238
+
239
+ ### Edge Cases
240
+
241
+ | Case | Handling |
242
+ |------|----------|
243
+ | Empty document | Returns empty embedding immediately |
244
+ | Very small documents | No chunking, normal processing |
245
+ | Perfect boundaries | Split at sentence ends, no truncation |
246
+ | No boundaries found | Hard split at max position |
247
+ | Single oversized chunk | Process as-is, let provider error |
248
+ | All chunks too small | Last chunk takes remaining text |
249
+
250
+ ## References
251
+
252
+ - [LanceDB Documentation](https://lancedb.com)
253
+ - [OpenAI Embedding Context Limits](https://platform.openai.com/docs/guides/embeddings)
254
+ - [Semantic Chunking Research](https://arxiv.org/abs/2310.05970)
255
+
256
+ ---
257
+
258
+ *This feature was added to handle long-context documents gracefully without losing memory quality.*
package/index.ts CHANGED
@@ -10,7 +10,7 @@ import { readFile, readdir, writeFile, mkdir } from "node:fs/promises";
10
10
  import { readFileSync } from "node:fs";
11
11
 
12
12
  // Import core components
13
- import { MemoryStore } from "./src/store.js";
13
+ import { MemoryStore, validateStoragePath } from "./src/store.js";
14
14
  import { createEmbedder, getVectorDimensions } from "./src/embedder.js";
15
15
  import { createRetriever, DEFAULT_RETRIEVAL_CONFIG } from "./src/retriever.js";
16
16
  import { createScopeManager } from "./src/scopes.js";
@@ -315,6 +315,18 @@ const memoryLanceDBProPlugin = {
315
315
  const config = parsePluginConfig(api.pluginConfig);
316
316
 
317
317
  const resolvedDbPath = api.resolvePath(config.dbPath || getDefaultDbPath());
318
+
319
+ // Pre-flight: validate storage path (symlink resolution, mkdir, write check).
320
+ // Runs synchronously and logs warnings; does NOT block gateway startup.
321
+ try {
322
+ validateStoragePath(resolvedDbPath);
323
+ } catch (err) {
324
+ api.logger.warn(
325
+ `memory-lancedb-pro: storage path issue — ${String(err)}\n` +
326
+ ` The plugin will still attempt to start, but writes may fail.`
327
+ );
328
+ }
329
+
318
330
  const vectorDim = getVectorDimensions(
319
331
  config.embedding.model || "text-embedding-3-small",
320
332
  config.embedding.dimensions
@@ -1,8 +1,8 @@
1
1
  {
2
2
  "id": "memory-lancedb-pro",
3
3
  "name": "Memory (LanceDB Pro)",
4
- "description": "Enhanced LanceDB-backed long-term memory with hybrid retrieval, multi-scope isolation, and management CLI",
5
- "version": "1.0.20",
4
+ "description": "Enhanced LanceDB-backed long-term memory with hybrid retrieval, multi-scope isolation, long-context chunking, and management CLI",
5
+ "version": "1.0.22",
6
6
  "kind": "memory",
7
7
  "configSchema": {
8
8
  "type": "object",
@@ -40,6 +40,11 @@
40
40
  "normalized": {
41
41
  "type": "boolean",
42
42
  "description": "Request normalized embeddings when supported by the provider (e.g. Jina v5)"
43
+ },
44
+ "chunking": {
45
+ "type": "boolean",
46
+ "default": true,
47
+ "description": "Enable automatic chunking for documents exceeding embedding context limits"
43
48
  }
44
49
  },
45
50
  "required": [
@@ -260,6 +265,28 @@
260
265
  "help": "Override vector dimensions for custom models not in the built-in lookup table",
261
266
  "advanced": true
262
267
  },
268
+ "embedding.taskQuery": {
269
+ "label": "Query Task",
270
+ "placeholder": "retrieval.query",
271
+ "help": "Optional task selector for query embeddings (Jina: retrieval.query). If unset, no task field is sent.",
272
+ "advanced": true
273
+ },
274
+ "embedding.taskPassage": {
275
+ "label": "Passage Task",
276
+ "placeholder": "retrieval.passage",
277
+ "help": "Optional task selector for passage/document embeddings (Jina: retrieval.passage). If unset, no task field is sent.",
278
+ "advanced": true
279
+ },
280
+ "embedding.normalized": {
281
+ "label": "Normalized Embeddings",
282
+ "help": "Request normalized embeddings when the provider supports it (Jina v5). If unset, the field is not sent.",
283
+ "advanced": true
284
+ },
285
+ "embedding.chunking": {
286
+ "label": "Auto-Chunk Documents",
287
+ "help": "Automatically split long documents into chunks when they exceed embedding context limits. Set to false to disable and let the model fail on long documents.",
288
+ "advanced": true
289
+ },
263
290
  "dbPath": {
264
291
  "label": "Database Path",
265
292
  "placeholder": "~/.openclaw/memory/lancedb-pro",
@@ -338,6 +365,21 @@
338
365
  "help": "Number of candidates to fetch before fusion and reranking",
339
366
  "advanced": true
340
367
  },
368
+ "retrieval.lengthNormAnchor": {
369
+ "label": "Length Normalization Anchor",
370
+ "help": "Entries longer than this (chars) get score penalized to prevent long entries dominating. 0 = disabled.",
371
+ "advanced": true
372
+ },
373
+ "retrieval.hardMinScore": {
374
+ "label": "Hard Minimum Score",
375
+ "help": "Discard results below this score after all scoring stages. Higher = fewer but more relevant results.",
376
+ "advanced": true
377
+ },
378
+ "retrieval.timeDecayHalfLifeDays": {
379
+ "label": "Time Decay Half-Life",
380
+ "help": "Old entries lose score over this many days. Floor at 0.5x. 0 = disabled.",
381
+ "advanced": true
382
+ },
341
383
  "sessionMemory.enabled": {
342
384
  "label": "Session Memory",
343
385
  "help": "Store session summaries to LanceDB when /new is triggered (replaces built-in session-memory hook)"
@@ -366,38 +408,6 @@
366
408
  "label": "Management Tools",
367
409
  "help": "Enable memory_list and memory_stats tools for debugging and auditing",
368
410
  "advanced": true
369
- },
370
- "retrieval.lengthNormAnchor": {
371
- "label": "Length Normalization Anchor",
372
- "help": "Entries longer than this (chars) get score penalized to prevent long entries dominating. 0 = disabled.",
373
- "advanced": true
374
- },
375
- "retrieval.hardMinScore": {
376
- "label": "Hard Minimum Score",
377
- "help": "Discard results below this score after all scoring stages. Higher = fewer but more relevant results.",
378
- "advanced": true
379
- },
380
- "retrieval.timeDecayHalfLifeDays": {
381
- "label": "Time Decay Half-Life",
382
- "help": "Old entries lose score over this many days. Floor at 0.5x. 0 = disabled.",
383
- "advanced": true
384
- },
385
- "embedding.taskQuery": {
386
- "label": "Query Task",
387
- "placeholder": "retrieval.query",
388
- "help": "Optional task selector for query embeddings (Jina: retrieval.query). If unset, no task field is sent.",
389
- "advanced": true
390
- },
391
- "embedding.taskPassage": {
392
- "label": "Passage Task",
393
- "placeholder": "retrieval.passage",
394
- "help": "Optional task selector for passage/document embeddings (Jina: retrieval.passage). If unset, no task field is sent.",
395
- "advanced": true
396
- },
397
- "embedding.normalized": {
398
- "label": "Normalized Embeddings",
399
- "help": "Request normalized embeddings when the provider supports it (Jina v5). If unset, the field is not sent.",
400
- "advanced": true
401
411
  }
402
412
  }
403
413
  }
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "memory-lancedb-pro",
3
- "version": "1.0.20",
4
- "description": "OpenClaw enhanced LanceDB memory plugin with hybrid retrieval (Vector + BM25), cross-encoder rerank, multi-scope isolation, and management CLI",
3
+ "version": "1.0.22",
4
+ "description": "OpenClaw enhanced LanceDB memory plugin with hybrid retrieval (Vector + BM25), cross-encoder rerank, multi-scope isolation, long-context chunking, and management CLI",
5
5
  "type": "module",
6
6
  "main": "index.ts",
7
7
  "keywords": [
@@ -14,7 +14,9 @@
14
14
  "hybrid-retrieval",
15
15
  "rerank",
16
16
  "ai-memory",
17
- "long-term-memory"
17
+ "long-term-memory",
18
+ "chunking",
19
+ "long-context"
18
20
  ],
19
21
  "repository": {
20
22
  "type": "git",
package/src/chunker.ts ADDED
@@ -0,0 +1,253 @@
1
/**
 * Long Context Chunking System
 *
 * Goal: split documents that exceed embedding model context limits into smaller,
 * semantically coherent chunks with overlap.
 *
 * Notes:
 * - We use *character counts* as a conservative proxy for tokens.
 * - The embedder triggers this only after a provider throws a context-length error.
 */

// ============================================================================
// Types & Constants
// ============================================================================

/** Location of one chunk within the original document (character offsets). */
export interface ChunkMetadata {
  // Absolute offset of the chunk's first (post-trim) character in the original text.
  startIndex: number;
  // Absolute offset one past the chunk's last character (exclusive end).
  endIndex: number;
  // Length of the whitespace-trimmed chunk text in characters.
  length: number;
}

/** Result of chunking one document: parallel arrays of chunk text and positions. */
export interface ChunkResult {
  chunks: string[];
  // metadatas[i] describes where chunks[i] came from in the source text.
  metadatas: ChunkMetadata[];
  // Character length of the original input (0 for empty or whitespace-only input).
  totalOriginalLength: number;
  chunkCount: number;
}

export interface ChunkerConfig {
  /** Maximum characters per chunk. */
  maxChunkSize: number;
  /** Overlap between chunks in characters. */
  overlapSize: number;
  /** Minimum chunk size (except the final chunk). */
  minChunkSize: number;
  /** Attempt to split on sentence boundaries for better semantic coherence. */
  semanticSplit: boolean;
  /** Max lines per chunk before we try to split earlier on a line boundary. */
  maxLinesPerChunk: number;
}

// Common embedding context limits (provider/model specific). These are typically
// token limits, but we treat them as inputs to a conservative char-based heuristic.
// NOTE(review): for the two sentence-transformers models below, 512 looks like a
// max *sequence length*, not an embedding dimension — confirm against the model cards.
export const EMBEDDING_CONTEXT_LIMITS: Record<string, number> = {
  // Jina v5
  "jina-embeddings-v5-text-small": 8192,
  "jina-embeddings-v5-text-nano": 8192,

  // OpenAI
  "text-embedding-3-small": 8192,
  "text-embedding-3-large": 8192,

  // Google
  "text-embedding-004": 8192,
  "gemini-embedding-001": 2048,

  // Local/common
  "nomic-embed-text": 8192,
  "all-MiniLM-L6-v2": 512,
  "all-mpnet-base-v2": 512,
};

// Fallback configuration used by chunkDocument when no config is supplied.
export const DEFAULT_CHUNKER_CONFIG: ChunkerConfig = {
  maxChunkSize: 4000,
  overlapSize: 200,
  minChunkSize: 200,
  semanticSplit: true,
  maxLinesPerChunk: 50,
};

// Sentence ending patterns (English + CJK-ish punctuation)
const SENTENCE_ENDING = /[.!?。!?]/;
73
+
74
+ // ============================================================================
75
+ // Helpers
76
+ // ============================================================================
77
+
78
+ function clamp(n: number, lo: number, hi: number): number {
79
+ return Math.max(lo, Math.min(hi, n));
80
+ }
81
+
82
+ function countLines(s: string): number {
83
+ // Count \n (treat CRLF as one line break)
84
+ return s.split(/\r\n|\n|\r/).length;
85
+ }
86
+
87
+ function findLastIndexWithin(text: string, re: RegExp, start: number, end: number): number {
88
+ // Find last match start index for regex within [start, end).
89
+ // NOTE: `re` must NOT be global; we will scan manually.
90
+ let last = -1;
91
+ for (let i = end - 1; i >= start; i--) {
92
+ if (re.test(text[i])) return i;
93
+ }
94
+ return last;
95
+ }
96
+
97
/**
 * Choose the end offset for the chunk starting at `start`.
 *
 * Preference order:
 *   1. Forced split right after the maxLinesPerChunk-th newline (when the
 *      candidate slice has too many lines).
 *   2. Sentence-ending punctuation (when semanticSplit is on), including any
 *      trailing whitespace after the punctuation.
 *   3. Newline boundary (when semanticSplit is on).
 *   4. Last whitespace character in range.
 *   5. Hard cut at safeMaxEnd.
 *
 * Returns an offset in (start, text.length]; the chunk is text.slice(start, end).
 */
function findSplitEnd(text: string, start: number, maxEnd: number, minEnd: number, config: ChunkerConfig): number {
  // Normalize bounds: the chunk must be non-empty and cannot run past the text.
  const safeMinEnd = clamp(minEnd, start + 1, maxEnd);
  const safeMaxEnd = clamp(maxEnd, safeMinEnd, text.length);

  // Respect line limit: if we exceed maxLinesPerChunk, force earlier split at a line break.
  // NOTE(review): countLines treats a lone \r as a break, but the scan below only
  // looks for "\n" — \r-only (classic Mac) text will not trigger an early split; confirm intended.
  if (config.maxLinesPerChunk > 0) {
    const candidate = text.slice(start, safeMaxEnd);
    if (countLines(candidate) > config.maxLinesPerChunk) {
      // Find the position of the Nth line break.
      let breaks = 0;
      for (let i = start; i < safeMaxEnd; i++) {
        const ch = text[i];
        if (ch === "\n") {
          breaks++;
          if (breaks >= config.maxLinesPerChunk) {
            // Split right after this newline (but never below the minimum chunk size).
            return Math.max(i + 1, safeMinEnd);
          }
        }
      }
    }
  }

  if (config.semanticSplit) {
    // Prefer a sentence boundary near the end.
    // Scan backward from safeMaxEnd to safeMinEnd.
    for (let i = safeMaxEnd - 1; i >= safeMinEnd; i--) {
      if (SENTENCE_ENDING.test(text[i])) {
        // Include trailing whitespace after punctuation so the next chunk
        // starts on a non-space character.
        let j = i + 1;
        while (j < safeMaxEnd && /\s/.test(text[j])) j++;
        return j;
      }
    }

    // Next best: newline boundary.
    for (let i = safeMaxEnd - 1; i >= safeMinEnd; i--) {
      if (text[i] === "\n") return i + 1;
    }
  }

  // Fallback: last whitespace boundary.
  for (let i = safeMaxEnd - 1; i >= safeMinEnd; i--) {
    if (/\s/.test(text[i])) return i;
  }

  // No acceptable boundary found: hard cut at the maximum.
  return safeMaxEnd;
}
145
+
146
+ function sliceTrimWithIndices(text: string, start: number, end: number): { chunk: string; meta: ChunkMetadata } {
147
+ const raw = text.slice(start, end);
148
+ const leading = raw.match(/^\s*/)?.[0]?.length ?? 0;
149
+ const trailing = raw.match(/\s*$/)?.[0]?.length ?? 0;
150
+ const chunk = raw.trim();
151
+
152
+ const trimmedStart = start + leading;
153
+ const trimmedEnd = end - trailing;
154
+
155
+ return {
156
+ chunk,
157
+ meta: {
158
+ startIndex: trimmedStart,
159
+ endIndex: Math.max(trimmedStart, trimmedEnd),
160
+ length: chunk.length,
161
+ },
162
+ };
163
+ }
164
+
165
+ // ============================================================================
166
+ // Chunking Core
167
+ // ============================================================================
168
+
169
/**
 * Split `text` into chunks of at most `config.maxChunkSize` characters with
 * roughly `config.overlapSize` characters of overlap between consecutive
 * chunks, preferring sentence/newline boundaries when `config.semanticSplit`
 * is on.
 *
 * Each returned chunk is whitespace-trimmed; `metadatas[i]` records the
 * trimmed chunk's absolute character span in the original text.
 *
 * NOTE(review): empty or whitespace-only input reports totalOriginalLength 0
 * even when text.length > 0 — confirm callers don't rely on the raw length.
 */
export function chunkDocument(text: string, config: ChunkerConfig = DEFAULT_CHUNKER_CONFIG): ChunkResult {
  if (!text || text.trim().length === 0) {
    return { chunks: [], metadatas: [], totalOriginalLength: 0, chunkCount: 0 };
  }

  const totalOriginalLength = text.length;
  const chunks: string[] = [];
  const metadatas: ChunkMetadata[] = [];

  let pos = 0;
  // Iteration cap: expected chunk count (text length over net advance per
  // chunk) plus slack. Guards against a failure to advance; if it ever trips,
  // any remaining tail of the text is silently dropped.
  const maxGuard = Math.max(4, Math.ceil(text.length / Math.max(1, config.maxChunkSize - config.overlapSize)) + 5);
  let guard = 0;

  while (pos < text.length && guard < maxGuard) {
    guard++;

    // Everything that's left fits in one chunk: emit it and stop.
    const remaining = text.length - pos;
    if (remaining <= config.maxChunkSize) {
      const { chunk, meta } = sliceTrimWithIndices(text, pos, text.length);
      if (chunk.length > 0) {
        chunks.push(chunk);
        metadatas.push(meta);
      }
      break;
    }

    const maxEnd = Math.min(pos + config.maxChunkSize, text.length);
    const minEnd = Math.min(pos + config.minChunkSize, maxEnd);

    // Pick a boundary-aware end, then trim the slice.
    const end = findSplitEnd(text, pos, maxEnd, minEnd, config);
    const { chunk, meta } = sliceTrimWithIndices(text, pos, end);

    // If trimming made it too small, fall back to a hard split.
    if (chunk.length < config.minChunkSize) {
      const hardEnd = Math.min(pos + config.maxChunkSize, text.length);
      const hard = sliceTrimWithIndices(text, pos, hardEnd);
      if (hard.chunk.length > 0) {
        chunks.push(hard.chunk);
        metadatas.push(hard.meta);
      }
      if (hardEnd >= text.length) break;
      // Back up by the overlap, but always advance at least one character.
      pos = Math.max(hardEnd - config.overlapSize, pos + 1);
      continue;
    }

    chunks.push(chunk);
    metadatas.push(meta);

    if (end >= text.length) break;

    // Move forward with overlap.
    const nextPos = Math.max(end - config.overlapSize, pos + 1);
    pos = nextPos;
  }

  return {
    chunks,
    metadatas,
    totalOriginalLength,
    chunkCount: chunks.length,
  };
}
231
+
232
+ /**
233
+ * Smart chunker that adapts to model context limits.
234
+ *
235
+ * We intentionally pick conservative char limits (70% of the reported limit)
236
+ * since token/char ratios vary.
237
+ */
238
+ export function smartChunk(text: string, embedderModel?: string): ChunkResult {
239
+ const limit = embedderModel ? EMBEDDING_CONTEXT_LIMITS[embedderModel] : undefined;
240
+ const base = limit ?? 8192;
241
+
242
+ const config: ChunkerConfig = {
243
+ maxChunkSize: Math.max(1000, Math.floor(base * 0.7)),
244
+ overlapSize: Math.max(0, Math.floor(base * 0.05)),
245
+ minChunkSize: Math.max(100, Math.floor(base * 0.1)),
246
+ semanticSplit: true,
247
+ maxLinesPerChunk: 50,
248
+ };
249
+
250
+ return chunkDocument(text, config);
251
+ }
252
+
253
+ export default chunkDocument;
package/src/embedder.ts CHANGED
@@ -1,6 +1,7 @@
1
1
  /**
2
2
  * Embedding Abstraction Layer
3
3
  * OpenAI-compatible API for various embedding providers.
4
+ * Supports automatic chunking for documents exceeding embedding context limits.
4
5
  *
5
6
  * Note: Some providers (e.g. Jina) support extra parameters like `task` and
6
7
  * `normalized` on the embeddings endpoint. The OpenAI SDK types do not include
@@ -9,6 +10,7 @@
9
10
 
10
11
  import OpenAI from "openai";
11
12
  import { createHash } from "node:crypto";
13
+ import { smartChunk } from "./chunker.js";
12
14
 
13
15
  // ============================================================================
14
16
  // Embedding Cache (LRU with TTL)
@@ -94,6 +96,8 @@ export interface EmbeddingConfig {
94
96
  taskPassage?: string;
95
97
  /** Optional flag to request normalized embeddings (provider-dependent, e.g. Jina v5) */
96
98
  normalized?: boolean;
99
+ /** Enable automatic chunking for documents exceeding context limits (default: true) */
100
+ chunking?: boolean;
97
101
  }
98
102
 
99
103
  // Known embedding model dimensions
@@ -106,7 +110,7 @@ const EMBEDDING_DIMENSIONS: Record<string, number> = {
106
110
  "mxbai-embed-large": 1024,
107
111
  "BAAI/bge-m3": 1024,
108
112
  "all-MiniLM-L6-v2": 384,
109
- "all-mpnet-base-v2": 768,
113
+ "all-mpnet-base-v2": 512,
110
114
 
111
115
  // Jina v5
112
116
  "jina-embeddings-v5-text-small": 1024,
@@ -158,8 +162,10 @@ export class Embedder {
158
162
 
159
163
  /** Optional requested dimensions to pass through to the embedding provider (OpenAI-compatible). */
160
164
  private readonly _requestDimensions?: number;
165
+ /** Enable automatic chunking for long documents (default: true) */
166
+ private readonly _autoChunk: boolean;
161
167
 
162
- constructor(config: EmbeddingConfig) {
168
+ constructor(config: EmbeddingConfig & { chunking?: boolean }) {
163
169
  // Resolve environment variables in API key
164
170
  const resolvedApiKey = resolveEnvVars(config.apiKey);
165
171
 
@@ -168,6 +174,8 @@ export class Embedder {
168
174
  this._taskPassage = config.taskPassage;
169
175
  this._normalized = config.normalized;
170
176
  this._requestDimensions = config.dimensions;
177
+ // Enable auto-chunking by default for better handling of long documents
178
+ this._autoChunk = config.chunking !== false;
171
179
 
172
180
  this.client = new OpenAI({
173
181
  apiKey: resolvedApiKey,
@@ -273,6 +281,58 @@ export class Embedder {
273
281
  this._cache.set(text, task, embedding);
274
282
  return embedding;
275
283
  } catch (error) {
284
+ // Check if this is a context length exceeded error and try chunking
285
+ const errorMsg = error instanceof Error ? error.message : String(error);
286
+ const isContextError = /context|too long|exceed|length/i.test(errorMsg);
287
+
288
+ if (isContextError && this._autoChunk) {
289
+ try {
290
+ console.log(`Document exceeded context limit (${errorMsg}), attempting chunking...`);
291
+ const chunkResult = smartChunk(text, this._model);
292
+
293
+ if (chunkResult.chunks.length === 0) {
294
+ throw new Error(`Failed to chunk document: ${errorMsg}`);
295
+ }
296
+
297
+ // Embed all chunks in parallel
298
+ console.log(`Split document into ${chunkResult.chunkCount} chunks for embedding`);
299
+ const chunkEmbeddings = await Promise.all(
300
+ chunkResult.chunks.map(async (chunk, idx) => {
301
+ try {
302
+ const embedding = await this.embedSingle(chunk, task);
303
+ return { embedding };
304
+ } catch (chunkError) {
305
+ console.warn(`Failed to embed chunk ${idx}:`, chunkError);
306
+ throw chunkError;
307
+ }
308
+ })
309
+ );
310
+
311
+ // Compute average embedding across chunks
312
+ const avgEmbedding = chunkEmbeddings.reduce(
313
+ (sum, { embedding }) => {
314
+ for (let i = 0; i < embedding.length; i++) {
315
+ sum[i] += embedding[i];
316
+ }
317
+ return sum;
318
+ },
319
+ new Array(this.dimensions).fill(0)
320
+ );
321
+
322
+ const finalEmbedding = avgEmbedding.map(v => v / chunkEmbeddings.length);
323
+
324
+ // Cache the result for the original text (using its hash)
325
+ this._cache.set(text, task, finalEmbedding);
326
+ console.log(`Successfully embedded long document as ${chunkEmbeddings.length} averaged chunks`);
327
+
328
+ return finalEmbedding;
329
+ } catch (chunkError) {
330
+ // If chunking fails, throw the original error
331
+ console.warn(`Chunking failed, using original error:`, chunkError);
332
+ throw new Error(`Failed to generate embedding: ${errorMsg}`, { cause: error });
333
+ }
334
+ }
335
+
276
336
  if (error instanceof Error) {
277
337
  throw new Error(`Failed to generate embedding: ${error.message}`, { cause: error });
278
338
  }
@@ -326,6 +386,71 @@ export class Embedder {
326
386
 
327
387
  return results;
328
388
  } catch (error) {
389
+ // Check if this is a context length exceeded error and try chunking each text
390
+ const errorMsg = error instanceof Error ? error.message : String(error);
391
+ const isContextError = /context|too long|exceed|length/i.test(errorMsg);
392
+
393
+ if (isContextError && this._autoChunk) {
394
+ try {
395
+ console.log(`Batch embedding failed with context error, attempting chunking...`);
396
+
397
+ const chunkResults = await Promise.all(
398
+ validTexts.map(async (text, idx) => {
399
+ const chunkResult = smartChunk(text, this._model);
400
+ if (chunkResult.chunks.length === 0) {
401
+ throw new Error("Chunker produced no chunks");
402
+ }
403
+
404
+ // Embed all chunks in parallel, then average.
405
+ const embeddings = await Promise.all(
406
+ chunkResult.chunks.map((chunk) => this.embedSingle(chunk, task))
407
+ );
408
+
409
+ const avgEmbedding = embeddings.reduce(
410
+ (sum, emb) => {
411
+ for (let i = 0; i < emb.length; i++) {
412
+ sum[i] += emb[i];
413
+ }
414
+ return sum;
415
+ },
416
+ new Array(this.dimensions).fill(0)
417
+ );
418
+
419
+ const finalEmbedding = avgEmbedding.map((v) => v / embeddings.length);
420
+
421
+ // Cache the averaged embedding for the original (long) text.
422
+ this._cache.set(text, task, finalEmbedding);
423
+
424
+ return { embedding: finalEmbedding, index: validIndices[idx] };
425
+ })
426
+ );
427
+
428
+ console.log(`Successfully chunked and embedded ${chunkResults.length} long documents`);
429
+
430
+ // Build results array
431
+ const results: number[][] = new Array(texts.length);
432
+ chunkResults.forEach(({ embedding, index }) => {
433
+ if (embedding.length > 0) {
434
+ this.validateEmbedding(embedding);
435
+ results[index] = embedding;
436
+ } else {
437
+ results[index] = [];
438
+ }
439
+ });
440
+
441
+ // Fill empty arrays for invalid texts
442
+ for (let i = 0; i < texts.length; i++) {
443
+ if (!results[i]) {
444
+ results[i] = [];
445
+ }
446
+ }
447
+
448
+ return results;
449
+ } catch (chunkError) {
450
+ throw new Error(`Failed to embed documents after chunking attempt: ${errorMsg}`);
451
+ }
452
+ }
453
+
329
454
  if (error instanceof Error) {
330
455
  throw new Error(`Failed to generate batch embeddings: ${error.message}`, { cause: error });
331
456
  }
package/src/store.ts CHANGED
@@ -4,6 +4,8 @@
4
4
 
5
5
  import type * as LanceDB from "@lancedb/lancedb";
6
6
  import { randomUUID } from "node:crypto";
7
+ import { existsSync, accessSync, constants, mkdirSync, realpathSync, lstatSync } from "node:fs";
8
+ import { dirname } from "node:path";
7
9
 
8
10
  // ============================================================================
9
11
  // Types
@@ -60,6 +62,72 @@ function escapeSqlLiteral(value: string): string {
60
62
  return value.replace(/'/g, "''");
61
63
  }
62
64
 
65
+ // ============================================================================
66
+ // Storage Path Validation
67
+ // ============================================================================
68
+
69
+ /**
70
+ * Validate and prepare the storage directory before LanceDB connection.
71
+ * Resolves symlinks, creates missing directories, and checks write permissions.
72
+ * Returns the resolved absolute path on success, or throws a descriptive error.
73
+ */
74
+ export function validateStoragePath(dbPath: string): string {
75
+ let resolvedPath = dbPath;
76
+
77
+ // Resolve symlinks (including dangling symlinks)
78
+ try {
79
+ const stats = lstatSync(dbPath);
80
+ if (stats.isSymbolicLink()) {
81
+ try {
82
+ resolvedPath = realpathSync(dbPath);
83
+ } catch (err: any) {
84
+ throw new Error(
85
+ `dbPath "${dbPath}" is a symlink whose target does not exist.\n` +
86
+ ` Fix: Create the target directory, or update the symlink to point to a valid path.\n` +
87
+ ` Details: ${err.code || ""} ${err.message}`
88
+ );
89
+ }
90
+ }
91
+ } catch (err: any) {
92
+ // Missing path is OK (it will be created below)
93
+ if (err?.code === "ENOENT") {
94
+ // no-op
95
+ } else if (typeof err?.message === "string" && err.message.includes("symlink whose target does not exist")) {
96
+ throw err;
97
+ } else {
98
+ // Other lstat failures — continue with original path
99
+ }
100
+ }
101
+
102
+ // Create directory if it doesn't exist
103
+ if (!existsSync(resolvedPath)) {
104
+ try {
105
+ mkdirSync(resolvedPath, { recursive: true });
106
+ } catch (err: any) {
107
+ throw new Error(
108
+ `Failed to create dbPath directory "${resolvedPath}".\n` +
109
+ ` Fix: Ensure the parent directory "${dirname(resolvedPath)}" exists and is writable,\n` +
110
+ ` or create it manually: mkdir -p "${resolvedPath}"\n` +
111
+ ` Details: ${err.code || ""} ${err.message}`
112
+ );
113
+ }
114
+ }
115
+
116
+ // Check write permissions
117
+ try {
118
+ accessSync(resolvedPath, constants.W_OK);
119
+ } catch (err: any) {
120
+ throw new Error(
121
+ `dbPath directory "${resolvedPath}" is not writable.\n` +
122
+ ` Fix: Check permissions with: ls -la "${dirname(resolvedPath)}"\n` +
123
+ ` Or grant write access: chmod u+w "${resolvedPath}"\n` +
124
+ ` Details: ${err.code || ""} ${err.message}`
125
+ );
126
+ }
127
+
128
+ return resolvedPath;
129
+ }
130
+
63
131
  // ============================================================================
64
132
  // Memory Store
65
133
  // ============================================================================
@@ -95,7 +163,19 @@ export class MemoryStore {
95
163
 
96
164
  private async doInitialize(): Promise<void> {
97
165
  const lancedb = await loadLanceDB();
98
- const db = await lancedb.connect(this.config.dbPath);
166
+
167
+ let db: LanceDB.Connection;
168
+ try {
169
+ db = await lancedb.connect(this.config.dbPath);
170
+ } catch (err: any) {
171
+ const code = err.code || "";
172
+ const message = err.message || String(err);
173
+ throw new Error(
174
+ `Failed to open LanceDB at "${this.config.dbPath}": ${code} ${message}\n` +
175
+ ` Fix: Verify the path exists and is writable. Check parent directory permissions.`
176
+ );
177
+ }
178
+
99
179
  let table: LanceDB.Table;
100
180
 
101
181
  // Idempotent table init: try openTable first, create only if missing,
@@ -196,7 +276,15 @@ export class MemoryStore {
196
276
  metadata: entry.metadata || "{}",
197
277
  };
198
278
 
199
- await this.table!.add([fullEntry]);
279
+ try {
280
+ await this.table!.add([fullEntry]);
281
+ } catch (err: any) {
282
+ const code = err.code || "";
283
+ const message = err.message || String(err);
284
+ throw new Error(
285
+ `Failed to store memory in "${this.config.dbPath}": ${code} ${message}`
286
+ );
287
+ }
200
288
  return fullEntry;
201
289
  }
202
290