clawmem 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +660 -0
- package/CLAUDE.md +660 -0
- package/LICENSE +21 -0
- package/README.md +993 -0
- package/SKILL.md +717 -0
- package/bin/clawmem +75 -0
- package/package.json +72 -0
- package/src/amem.ts +797 -0
- package/src/beads.ts +263 -0
- package/src/clawmem.ts +1849 -0
- package/src/collections.ts +405 -0
- package/src/config.ts +178 -0
- package/src/consolidation.ts +123 -0
- package/src/directory-context.ts +248 -0
- package/src/errors.ts +41 -0
- package/src/formatter.ts +427 -0
- package/src/graph-traversal.ts +247 -0
- package/src/hooks/context-surfacing.ts +317 -0
- package/src/hooks/curator-nudge.ts +89 -0
- package/src/hooks/decision-extractor.ts +639 -0
- package/src/hooks/feedback-loop.ts +214 -0
- package/src/hooks/handoff-generator.ts +345 -0
- package/src/hooks/postcompact-inject.ts +226 -0
- package/src/hooks/precompact-extract.ts +314 -0
- package/src/hooks/pretool-inject.ts +79 -0
- package/src/hooks/session-bootstrap.ts +324 -0
- package/src/hooks/staleness-check.ts +130 -0
- package/src/hooks.ts +367 -0
- package/src/indexer.ts +327 -0
- package/src/intent.ts +294 -0
- package/src/limits.ts +26 -0
- package/src/llm.ts +1175 -0
- package/src/mcp.ts +2138 -0
- package/src/memory.ts +336 -0
- package/src/mmr.ts +93 -0
- package/src/observer.ts +269 -0
- package/src/openclaw/engine.ts +283 -0
- package/src/openclaw/index.ts +221 -0
- package/src/openclaw/plugin.json +83 -0
- package/src/openclaw/shell.ts +207 -0
- package/src/openclaw/tools.ts +304 -0
- package/src/profile.ts +346 -0
- package/src/promptguard.ts +218 -0
- package/src/retrieval-gate.ts +106 -0
- package/src/search-utils.ts +127 -0
- package/src/server.ts +783 -0
- package/src/splitter.ts +325 -0
- package/src/store.ts +4062 -0
- package/src/validation.ts +67 -0
- package/src/watcher.ts +58 -0
package/src/splitter.ts
ADDED
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* ClawMem Document Splitter — Granular Fragment Indexing
|
|
3
|
+
*
|
|
4
|
+
* Splits markdown documents into semantic fragments (sections, bullet lists,
|
|
5
|
+
* code blocks, frontmatter facts) for per-fragment embedding. Each fragment
|
|
6
|
+
* gets its own vector, dramatically improving recall for specific facts
|
|
7
|
+
* buried in larger documents.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
// =============================================================================
|
|
11
|
+
// Types
|
|
12
|
+
// =============================================================================
|
|
13
|
+
|
|
14
|
+
export interface Fragment { // One embeddable piece of a document, as produced by the splitters in this file.
|
|
15
|
+
type: 'full' | 'section' | 'list' | 'code' | 'frontmatter' | 'fact' | 'narrative'; // Which kind of content the fragment holds.
|
|
16
|
+
label: string | null; // Heading text for 'section', fence language (if any) for 'code', key for 'frontmatter'; otherwise null.
|
|
17
|
+
content: string; // Text of the fragment.
|
|
18
|
+
startLine: number; // 1-based line of the split body where the fragment starts; 0 for 'frontmatter', 'fact' and 'narrative'.
|
|
19
|
+
}
|
|
20
|
+
|
|
21
|
+
// =============================================================================
|
|
22
|
+
// Config
|
|
23
|
+
// =============================================================================
|
|
24
|
+
|
|
25
|
+
import { MAX_FRAGMENTS_PER_DOC, MAX_SPLITTER_INPUT_CHARS } from "./limits.ts";
|
|
26
|
+
|
|
27
|
+
const MIN_FRAGMENT_CHARS = 50; // Sections, lists, code blocks, facts and narratives shorter than this are not emitted.
|
|
28
|
+
const MAX_FRAGMENT_CHARS = 2000; // Upper bound maybeSplitLarge() truncates fragment content to.
|
|
29
|
+
const MIN_DOC_CHARS_FOR_SPLIT = 200; // Bodies shorter than this yield only the 'full' fragment in splitDocument().
|
|
30
|
+
|
|
31
|
+
// =============================================================================
|
|
32
|
+
// Main Splitter
|
|
33
|
+
// =============================================================================
|
|
34
|
+
|
|
35
|
+
/**
 * Split a markdown document into semantic fragments for embedding.
 *
 * The first fragment is always a 'full' fragment holding the size-bounded
 * body. Bodies shorter than MIN_DOC_CHARS_FOR_SPLIT yield only that fragment.
 * Otherwise sections, lists, code blocks and frontmatter facts are appended,
 * in that order, until MAX_FRAGMENTS_PER_DOC fragments have been collected.
 *
 * @param body - Markdown body; only the first MAX_SPLITTER_INPUT_CHARS
 *   characters are considered.
 * @param frontmatter - Optional parsed frontmatter whose fields become
 *   'frontmatter' fragments.
 * @returns The fragments, with the 'full' fragment first.
 */
export function splitDocument(
  body: string,
  frontmatter?: Record<string, any>
): Fragment[] {
  // Bound input size to prevent memory blowup
  const boundedBody = body.length > MAX_SPLITTER_INPUT_CHARS
    ? body.slice(0, MAX_SPLITTER_INPUT_CHARS)
    : body;

  // Always include full document as first fragment
  const fragments: Fragment[] = [
    { type: 'full', label: null, content: boundedBody, startLine: 1 },
  ];

  // Skip splitting for very short documents
  if (boundedBody.length < MIN_DOC_CHARS_FOR_SPLIT) return fragments;

  const lines = boundedBody.split('\n');

  // Clamp at zero: a negative count handed to slice() is interpreted relative
  // to the end of the array and would let fragments through despite the cap.
  const remaining = (): number =>
    Math.max(0, MAX_FRAGMENTS_PER_DOC - fragments.length);

  // Extractors run lazily, in priority order, so nothing is computed once the
  // fragment budget is exhausted.
  const extractors: Array<() => Fragment[]> = [
    () => extractSections(lines),
    () => extractLists(lines),
    () => extractCodeBlocks(lines),
    () => (frontmatter ? extractFrontmatter(frontmatter) : []),
  ];

  for (const extract of extractors) {
    const budget = remaining();
    if (budget === 0) break;
    fragments.push(...extract().slice(0, budget));
  }

  return fragments;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Split observer-generated observations into fact and narrative fragments.
 * Used for documents that have structured `facts` and `narrative` fields.
 *
 * The first fragment is always a 'full' fragment holding the size-bounded
 * body. Fact and narrative fragments carry startLine 0 and are only emitted
 * when they are at least MIN_FRAGMENT_CHARS long and the total stays within
 * MAX_FRAGMENTS_PER_DOC.
 *
 * @param body - Observation body; only the first MAX_SPLITTER_INPUT_CHARS
 *   characters are kept.
 * @param meta - `facts` is a JSON-encoded array of strings; `narrative` is
 *   free text. Both are optional.
 * @returns The fragments, with the 'full' fragment first.
 */
export function splitObservation(
  body: string,
  meta: { facts?: string; narrative?: string }
): Fragment[] {
  // Bound input size
  const boundedBody = body.length > MAX_SPLITTER_INPUT_CHARS
    ? body.slice(0, MAX_SPLITTER_INPUT_CHARS)
    : body;

  // Full document
  const fragments: Fragment[] = [
    { type: 'full', label: null, content: boundedBody, startLine: 1 },
  ];

  // Individual facts
  if (meta.facts && fragments.length < MAX_FRAGMENTS_PER_DOC) {
    let parsed: unknown;
    try {
      parsed = JSON.parse(meta.facts);
    } catch {
      // Invalid JSON: best effort, emit no fact fragments.
      parsed = undefined;
    }

    if (Array.isArray(parsed)) {
      for (const fact of parsed) {
        if (fragments.length >= MAX_FRAGMENTS_PER_DOC) break;
        // Skip non-string entries individually so that one malformed element
        // (null, number, nested array, ...) cannot discard the valid facts
        // that follow it or end up as non-string fragment content.
        if (typeof fact !== 'string') continue;
        if (fact.length >= MIN_FRAGMENT_CHARS) {
          fragments.push({ type: 'fact', label: null, content: fact, startLine: 0 });
        }
      }
    }
  }

  // Narrative
  const narrative = meta.narrative;
  if (
    narrative &&
    narrative.length >= MIN_FRAGMENT_CHARS &&
    fragments.length < MAX_FRAGMENTS_PER_DOC
  ) {
    fragments.push({ type: 'narrative', label: null, content: narrative, startLine: 0 });
  }

  return fragments;
}
|
|
123
|
+
|
|
124
|
+
// =============================================================================
|
|
125
|
+
// Section Extraction
|
|
126
|
+
// =============================================================================
|
|
127
|
+
|
|
128
|
+
/**
 * Collect one 'section' fragment per markdown heading of level 1-3.
 *
 * A section runs from its heading line up to (not including) the next
 * heading. Text before the first heading is not emitted. Lines inside a
 * fenced code block (delimited by lines starting with three backticks) are
 * never treated as headings, so a `# comment` in a shell snippet stays part
 * of the surrounding section.
 *
 * @param lines - Document body split on '\n'.
 * @returns Sections of at least MIN_FRAGMENT_CHARS characters, each
 *   truncated by maybeSplitLarge(); startLine is the 1-based heading line.
 */
function extractSections(lines: string[]): Fragment[] {
  const headingPattern = /^(#{1,3})\s+(.+)/;
  const sections: Fragment[] = [];
  let heading: string | null = null;
  let buffer: string[] = [];
  let startLine = 1;
  let inFence = false;

  // Emit the section collected so far, if there is one and it is long enough.
  const flush = (): void => {
    if (heading === null || buffer.length === 0) return;
    const content = buffer.join('\n').trim();
    if (content.length >= MIN_FRAGMENT_CHARS) {
      sections.push({
        type: 'section',
        label: heading,
        content: maybeSplitLarge(content),
        startLine,
      });
    }
  };

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i]!;

    // Fence delimiters toggle code mode and belong to the current section.
    if (line.startsWith('```')) {
      inFence = !inFence;
      buffer.push(line);
      continue;
    }

    const headingMatch = inFence ? null : line.match(headingPattern);
    if (!headingMatch) {
      buffer.push(line);
      continue;
    }

    // A new heading closes the previous section and opens the next one.
    flush();
    heading = headingMatch[2]!.trim();
    buffer = [line];
    startLine = i + 1;
  }

  // Flush last section
  flush();

  return sections;
}
|
|
175
|
+
|
|
176
|
+
// =============================================================================
|
|
177
|
+
// List Extraction
|
|
178
|
+
// =============================================================================
|
|
179
|
+
|
|
180
|
+
/**
 * Collect runs of consecutive list lines as 'list' fragments.
 *
 * A list line is a bullet (`-`, `*`, `+`), a numbered item (`1.`), or a
 * non-blank line indented by at least two whitespace characters that follows
 * a list line. Any other line ends the run. Runs need at least two lines and
 * MIN_FRAGMENT_CHARS characters to be emitted. Lines inside fenced code
 * blocks (delimited by lines starting with three backticks) are ignored, so
 * YAML or diff snippets are not mistaken for lists.
 *
 * @param lines - Document body split on '\n'.
 * @returns List fragments truncated by maybeSplitLarge(); startLine is the
 *   1-based line of the first list line in the run.
 */
function extractLists(lines: string[]): Fragment[] {
  const bulletPattern = /^\s*[-*+]\s/;
  const numberedPattern = /^\s*\d+\.\s/;
  const indentedPattern = /^\s{2,}/;

  const lists: Fragment[] = [];
  let run: string[] = [];
  let runStartLine = 0;
  let inFence = false;

  // Emit the current run if it qualifies, then start over.
  const flush = (): void => {
    if (run.length >= 2) {
      const content = run.join('\n').trim();
      if (content.length >= MIN_FRAGMENT_CHARS) {
        lists.push({
          type: 'list',
          label: null,
          content: maybeSplitLarge(content),
          startLine: runStartLine,
        });
      }
    }
    run = [];
  };

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i]!;

    // A fence delimiter ends any open run and toggles code mode.
    if (line.startsWith('```')) {
      inFence = !inFence;
      flush();
      continue;
    }
    // Code content is never list material.
    if (inFence) continue;

    const isBullet = bulletPattern.test(line) || numberedPattern.test(line);
    // Indented continuation of a list item
    const isContinuation =
      run.length > 0 && indentedPattern.test(line) && line.trim().length > 0;

    if (isBullet || isContinuation) {
      if (run.length === 0) runStartLine = i + 1;
      run.push(line);
    } else {
      flush();
    }
  }

  // Flush trailing list
  flush();

  return lists;
}
|
|
225
|
+
|
|
226
|
+
// =============================================================================
|
|
227
|
+
// Code Block Extraction
|
|
228
|
+
// =============================================================================
|
|
229
|
+
|
|
230
|
+
/**
 * Collect fenced code blocks as 'code' fragments.
 *
 * A block opens on a line starting with three backticks and closes on the
 * next line that starts with three backticks; both fence lines are part of
 * the fragment content. A block that is never closed is not emitted.
 *
 * The label is the language token of the opening fence, or null when there
 * is none. The token may contain word characters plus `+`, `#`, `.` and `-`,
 * so `c++`, `c#` and `objective-c` are kept whole, while trailing attributes
 * such as `js{1,3}` still yield `js`.
 *
 * @param lines - Document body split on '\n'.
 * @returns Code fragments of at least MIN_FRAGMENT_CHARS characters,
 *   truncated by maybeSplitLarge(); startLine is the 1-based opening fence line.
 */
function extractCodeBlocks(lines: string[]): Fragment[] {
  const fenceLanguage = /^`{3,}\s*([\w+#.-]+)/;

  const blocks: Fragment[] = [];
  // null while outside a block; otherwise the lines collected so far.
  let blockLines: string[] | null = null;
  let blockLang: string | null = null;
  let blockStartLine = 0;

  for (let i = 0; i < lines.length; i++) {
    const line = lines[i]!;
    const isFence = line.startsWith('```');

    if (blockLines === null) {
      if (isFence) {
        blockLines = [line];
        blockLang = fenceLanguage.exec(line)?.[1] ?? null;
        blockStartLine = i + 1;
      }
      continue;
    }

    blockLines.push(line);
    if (!isFence) continue;

    // Closing fence reached: emit the block if it is long enough.
    const content = blockLines.join('\n').trim();
    if (content.length >= MIN_FRAGMENT_CHARS) {
      blocks.push({
        type: 'code',
        label: blockLang,
        content: maybeSplitLarge(content),
        startLine: blockStartLine,
      });
    }
    blockLines = null;
    blockLang = null;
  }

  return blocks;
}
|
|
266
|
+
|
|
267
|
+
// =============================================================================
|
|
268
|
+
// Frontmatter Extraction
|
|
269
|
+
// =============================================================================
|
|
270
|
+
|
|
271
|
+
/**
 * Turn frontmatter fields into 'frontmatter' fragments of the form
 * `key: value`.
 *
 * - `content_type` and `tags` are skipped (metadata-only fields).
 * - Strings, numbers and booleans are rendered directly.
 * - Arrays are flattened and only their string/number/boolean entries are
 *   joined with ', '; objects and null entries are left out so the text never
 *   contains `[object Object]` or empty slots.
 * - Any other value type, and any value that renders to blank text, is skipped.
 * - The resulting `key: value` text must be at least 10 characters long.
 *
 * @param fm - Parsed frontmatter.
 * @returns One fragment per usable field, labelled with the key, startLine 0.
 */
function extractFrontmatter(fm: Record<string, any>): Fragment[] {
  const isScalar = (v: unknown): v is string | number | boolean =>
    typeof v === 'string' || typeof v === 'number' || typeof v === 'boolean';

  const fragments: Fragment[] = [];

  for (const [key, value] of Object.entries(fm)) {
    if (key === 'content_type' || key === 'tags') continue; // skip metadata-only fields

    let rendered: string;
    if (isScalar(value)) {
      rendered = String(value);
    } else if (Array.isArray(value)) {
      rendered = value.flat(Infinity).filter(isScalar).map(String).join(', ');
    } else {
      continue;
    }

    // A key with nothing after it carries no information worth embedding.
    if (rendered.trim().length === 0) continue;

    const text = `${key}: ${rendered}`;
    if (text.length >= 10) {
      fragments.push({
        type: 'frontmatter',
        label: key,
        content: text,
        startLine: 0,
      });
    }
  }

  return fragments;
}
|
|
300
|
+
|
|
301
|
+
// =============================================================================
|
|
302
|
+
// Helpers
|
|
303
|
+
// =============================================================================
|
|
304
|
+
|
|
305
|
+
/**
 * Truncate content that exceeds a character limit, preferring a clean break.
 *
 * Content within the limit is returned unchanged. Otherwise the cut is made,
 * in order of preference, at:
 *   1. the last blank-line (paragraph) break at or before the limit,
 *   2. the last line break at or before the limit,
 *   3. the limit itself.
 * A paragraph or line break is only used when it lies beyond half of the
 * limit, so the result never shrinks to a tiny leading piece. The hard cut
 * backs off by one code unit when it would otherwise end on the first half
 * of a UTF-16 surrogate pair, so the result never ends in a lone surrogate.
 *
 * @param content - Text to bound.
 * @param maxChars - Maximum length in UTF-16 code units; defaults to
 *   MAX_FRAGMENT_CHARS.
 * @returns The original content, or a leading portion no longer than maxChars.
 */
function maybeSplitLarge(content: string, maxChars: number = MAX_FRAGMENT_CHARS): string {
  if (content.length <= maxChars) return content;

  const minBreak = maxChars * 0.5;

  // Try to split at paragraph boundary
  const paragraphBreak = content.lastIndexOf('\n\n', maxChars);
  if (paragraphBreak > minBreak) {
    return content.slice(0, paragraphBreak);
  }

  // Fall back to line boundary
  const lineBreak = content.lastIndexOf('\n', maxChars);
  if (lineBreak > minBreak) {
    return content.slice(0, lineBreak);
  }

  // Hard truncate, without splitting a surrogate pair
  let cut = maxChars;
  const lastUnit = content.charCodeAt(cut - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    cut -= 1;
  }
  return content.slice(0, cut);
}
|