@mnemoai/core 1.1.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.d.ts +2 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +7 -0
- package/dist/cli.js.map +7 -0
- package/dist/index.d.ts +128 -0
- package/dist/index.d.ts.map +1 -0
- package/{index.ts → dist/index.js} +526 -1333
- package/dist/index.js.map +7 -0
- package/dist/src/access-tracker.d.ts +97 -0
- package/dist/src/access-tracker.d.ts.map +1 -0
- package/dist/src/access-tracker.js +184 -0
- package/dist/src/access-tracker.js.map +7 -0
- package/dist/src/adapters/chroma.d.ts +31 -0
- package/dist/src/adapters/chroma.d.ts.map +1 -0
- package/{src/adapters/chroma.ts → dist/src/adapters/chroma.js} +45 -107
- package/dist/src/adapters/chroma.js.map +7 -0
- package/dist/src/adapters/lancedb.d.ts +29 -0
- package/dist/src/adapters/lancedb.d.ts.map +1 -0
- package/{src/adapters/lancedb.ts → dist/src/adapters/lancedb.js} +41 -109
- package/dist/src/adapters/lancedb.js.map +7 -0
- package/dist/src/adapters/pgvector.d.ts +33 -0
- package/dist/src/adapters/pgvector.d.ts.map +1 -0
- package/{src/adapters/pgvector.ts → dist/src/adapters/pgvector.js} +42 -104
- package/dist/src/adapters/pgvector.js.map +7 -0
- package/dist/src/adapters/qdrant.d.ts +34 -0
- package/dist/src/adapters/qdrant.d.ts.map +1 -0
- package/dist/src/adapters/qdrant.js +132 -0
- package/dist/src/adapters/qdrant.js.map +7 -0
- package/dist/src/adaptive-retrieval.d.ts +14 -0
- package/dist/src/adaptive-retrieval.d.ts.map +1 -0
- package/dist/src/adaptive-retrieval.js +52 -0
- package/dist/src/adaptive-retrieval.js.map +7 -0
- package/dist/src/audit-log.d.ts +56 -0
- package/dist/src/audit-log.d.ts.map +1 -0
- package/dist/src/audit-log.js +139 -0
- package/dist/src/audit-log.js.map +7 -0
- package/dist/src/chunker.d.ts +45 -0
- package/dist/src/chunker.d.ts.map +1 -0
- package/dist/src/chunker.js +157 -0
- package/dist/src/chunker.js.map +7 -0
- package/dist/src/config.d.ts +70 -0
- package/dist/src/config.d.ts.map +1 -0
- package/dist/src/config.js +142 -0
- package/dist/src/config.js.map +7 -0
- package/dist/src/decay-engine.d.ts +73 -0
- package/dist/src/decay-engine.d.ts.map +1 -0
- package/dist/src/decay-engine.js +119 -0
- package/dist/src/decay-engine.js.map +7 -0
- package/dist/src/embedder.d.ts +94 -0
- package/dist/src/embedder.d.ts.map +1 -0
- package/{src/embedder.ts → dist/src/embedder.js} +119 -317
- package/dist/src/embedder.js.map +7 -0
- package/dist/src/extraction-prompts.d.ts +12 -0
- package/dist/src/extraction-prompts.d.ts.map +1 -0
- package/dist/src/extraction-prompts.js +311 -0
- package/dist/src/extraction-prompts.js.map +7 -0
- package/dist/src/license.d.ts +29 -0
- package/dist/src/license.d.ts.map +1 -0
- package/{src/license.ts → dist/src/license.js} +42 -113
- package/dist/src/license.js.map +7 -0
- package/dist/src/llm-client.d.ts +23 -0
- package/dist/src/llm-client.d.ts.map +1 -0
- package/{src/llm-client.ts → dist/src/llm-client.js} +22 -55
- package/dist/src/llm-client.js.map +7 -0
- package/dist/src/logger.d.ts +33 -0
- package/dist/src/logger.d.ts.map +1 -0
- package/dist/src/logger.js +35 -0
- package/dist/src/logger.js.map +7 -0
- package/dist/src/mcp-server.d.ts +16 -0
- package/dist/src/mcp-server.d.ts.map +1 -0
- package/{src/mcp-server.ts → dist/src/mcp-server.js} +81 -181
- package/dist/src/mcp-server.js.map +7 -0
- package/dist/src/memory-categories.d.ts +40 -0
- package/dist/src/memory-categories.d.ts.map +1 -0
- package/dist/src/memory-categories.js +33 -0
- package/dist/src/memory-categories.js.map +7 -0
- package/dist/src/memory-upgrader.d.ts +71 -0
- package/dist/src/memory-upgrader.d.ts.map +1 -0
- package/dist/src/memory-upgrader.js +238 -0
- package/dist/src/memory-upgrader.js.map +7 -0
- package/dist/src/migrate.d.ts +47 -0
- package/dist/src/migrate.d.ts.map +1 -0
- package/{src/migrate.ts → dist/src/migrate.js} +57 -165
- package/dist/src/migrate.js.map +7 -0
- package/dist/src/mnemo.d.ts +67 -0
- package/dist/src/mnemo.d.ts.map +1 -0
- package/dist/src/mnemo.js +66 -0
- package/dist/src/mnemo.js.map +7 -0
- package/dist/src/noise-filter.d.ts +23 -0
- package/dist/src/noise-filter.d.ts.map +1 -0
- package/dist/src/noise-filter.js +62 -0
- package/dist/src/noise-filter.js.map +7 -0
- package/dist/src/noise-prototypes.d.ts +40 -0
- package/dist/src/noise-prototypes.d.ts.map +1 -0
- package/dist/src/noise-prototypes.js +116 -0
- package/dist/src/noise-prototypes.js.map +7 -0
- package/dist/src/observability.d.ts +16 -0
- package/dist/src/observability.d.ts.map +1 -0
- package/dist/src/observability.js +53 -0
- package/dist/src/observability.js.map +7 -0
- package/dist/src/query-tracker.d.ts +27 -0
- package/dist/src/query-tracker.d.ts.map +1 -0
- package/dist/src/query-tracker.js +32 -0
- package/dist/src/query-tracker.js.map +7 -0
- package/dist/src/reflection-event-store.d.ts +44 -0
- package/dist/src/reflection-event-store.d.ts.map +1 -0
- package/dist/src/reflection-event-store.js +50 -0
- package/dist/src/reflection-event-store.js.map +7 -0
- package/dist/src/reflection-item-store.d.ts +58 -0
- package/dist/src/reflection-item-store.d.ts.map +1 -0
- package/dist/src/reflection-item-store.js +69 -0
- package/dist/src/reflection-item-store.js.map +7 -0
- package/dist/src/reflection-mapped-metadata.d.ts +47 -0
- package/dist/src/reflection-mapped-metadata.d.ts.map +1 -0
- package/dist/src/reflection-mapped-metadata.js +40 -0
- package/dist/src/reflection-mapped-metadata.js.map +7 -0
- package/dist/src/reflection-metadata.d.ts +11 -0
- package/dist/src/reflection-metadata.d.ts.map +1 -0
- package/dist/src/reflection-metadata.js +24 -0
- package/dist/src/reflection-metadata.js.map +7 -0
- package/dist/src/reflection-ranking.d.ts +13 -0
- package/dist/src/reflection-ranking.d.ts.map +1 -0
- package/{src/reflection-ranking.ts → dist/src/reflection-ranking.js} +12 -21
- package/dist/src/reflection-ranking.js.map +7 -0
- package/dist/src/reflection-retry.d.ts +30 -0
- package/dist/src/reflection-retry.d.ts.map +1 -0
- package/{src/reflection-retry.ts → dist/src/reflection-retry.js} +24 -64
- package/dist/src/reflection-retry.js.map +7 -0
- package/dist/src/reflection-slices.d.ts +42 -0
- package/dist/src/reflection-slices.d.ts.map +1 -0
- package/{src/reflection-slices.ts → dist/src/reflection-slices.js} +60 -136
- package/dist/src/reflection-slices.js.map +7 -0
- package/dist/src/reflection-store.d.ts +85 -0
- package/dist/src/reflection-store.d.ts.map +1 -0
- package/dist/src/reflection-store.js +407 -0
- package/dist/src/reflection-store.js.map +7 -0
- package/dist/src/resonance-state.d.ts +19 -0
- package/dist/src/resonance-state.d.ts.map +1 -0
- package/{src/resonance-state.ts → dist/src/resonance-state.js} +13 -42
- package/dist/src/resonance-state.js.map +7 -0
- package/dist/src/retriever.d.ts +228 -0
- package/dist/src/retriever.d.ts.map +1 -0
- package/dist/src/retriever.js +1006 -0
- package/dist/src/retriever.js.map +7 -0
- package/dist/src/scopes.d.ts +58 -0
- package/dist/src/scopes.d.ts.map +1 -0
- package/dist/src/scopes.js +252 -0
- package/dist/src/scopes.js.map +7 -0
- package/dist/src/self-improvement-files.d.ts +20 -0
- package/dist/src/self-improvement-files.d.ts.map +1 -0
- package/{src/self-improvement-files.ts → dist/src/self-improvement-files.js} +24 -49
- package/dist/src/self-improvement-files.js.map +7 -0
- package/dist/src/semantic-gate.d.ts +24 -0
- package/dist/src/semantic-gate.d.ts.map +1 -0
- package/dist/src/semantic-gate.js +86 -0
- package/dist/src/semantic-gate.js.map +7 -0
- package/dist/src/session-recovery.d.ts +9 -0
- package/dist/src/session-recovery.d.ts.map +1 -0
- package/{src/session-recovery.ts → dist/src/session-recovery.js} +40 -57
- package/dist/src/session-recovery.js.map +7 -0
- package/dist/src/smart-extractor.d.ts +107 -0
- package/dist/src/smart-extractor.d.ts.map +1 -0
- package/{src/smart-extractor.ts → dist/src/smart-extractor.js} +130 -383
- package/dist/src/smart-extractor.js.map +7 -0
- package/dist/src/smart-metadata.d.ts +103 -0
- package/dist/src/smart-metadata.d.ts.map +1 -0
- package/dist/src/smart-metadata.js +361 -0
- package/dist/src/smart-metadata.js.map +7 -0
- package/dist/src/storage-adapter.d.ts +102 -0
- package/dist/src/storage-adapter.d.ts.map +1 -0
- package/dist/src/storage-adapter.js +22 -0
- package/dist/src/storage-adapter.js.map +7 -0
- package/dist/src/store.d.ts +108 -0
- package/dist/src/store.d.ts.map +1 -0
- package/dist/src/store.js +939 -0
- package/dist/src/store.js.map +7 -0
- package/dist/src/tier-manager.d.ts +57 -0
- package/dist/src/tier-manager.d.ts.map +1 -0
- package/dist/src/tier-manager.js +80 -0
- package/dist/src/tier-manager.js.map +7 -0
- package/dist/src/tools.d.ts +43 -0
- package/dist/src/tools.d.ts.map +1 -0
- package/dist/src/tools.js +1075 -0
- package/dist/src/tools.js.map +7 -0
- package/dist/src/wal-recovery.d.ts +30 -0
- package/dist/src/wal-recovery.d.ts.map +1 -0
- package/{src/wal-recovery.ts → dist/src/wal-recovery.js} +26 -79
- package/dist/src/wal-recovery.js.map +7 -0
- package/package.json +21 -2
- package/openclaw.plugin.json +0 -815
- package/src/access-tracker.ts +0 -341
- package/src/adapters/README.md +0 -78
- package/src/adapters/qdrant.ts +0 -191
- package/src/adaptive-retrieval.ts +0 -90
- package/src/audit-log.ts +0 -238
- package/src/chunker.ts +0 -254
- package/src/config.ts +0 -271
- package/src/decay-engine.ts +0 -238
- package/src/extraction-prompts.ts +0 -339
- package/src/memory-categories.ts +0 -71
- package/src/memory-upgrader.ts +0 -388
- package/src/mnemo.ts +0 -142
- package/src/noise-filter.ts +0 -97
- package/src/noise-prototypes.ts +0 -164
- package/src/observability.ts +0 -81
- package/src/query-tracker.ts +0 -57
- package/src/reflection-event-store.ts +0 -98
- package/src/reflection-item-store.ts +0 -112
- package/src/reflection-mapped-metadata.ts +0 -84
- package/src/reflection-metadata.ts +0 -23
- package/src/reflection-store.ts +0 -602
- package/src/retriever.ts +0 -1510
- package/src/scopes.ts +0 -375
- package/src/semantic-gate.ts +0 -121
- package/src/smart-metadata.ts +0 -561
- package/src/storage-adapter.ts +0 -153
- package/src/store.ts +0 -1330
- package/src/tier-manager.ts +0 -189
- package/src/tools.ts +0 -1292
- package/test/core.test.mjs +0 -301
package/src/audit-log.ts
DELETED
|
@@ -1,238 +0,0 @@
|
|
|
1
|
-
// SPDX-License-Identifier: LicenseRef-Mnemo-Pro
|
|
2
|
-
/**
|
|
3
|
-
* Mnemo Audit Log — GDPR/EU AI Act compliance
|
|
4
|
-
*
|
|
5
|
-
* Records all memory CRUD operations with:
|
|
6
|
-
* - WHO: agent/user identity
|
|
7
|
-
* - WHAT: operation type + affected memory IDs
|
|
8
|
-
* - WHEN: ISO timestamp
|
|
9
|
-
* - WHY: source/trigger (auto-capture, manual, contradiction, etc.)
|
|
10
|
-
*
|
|
11
|
-
* Stored as append-only JSONL file. Supports retention policies.
|
|
12
|
-
*/
|
|
13
|
-
|
|
14
|
-
import { appendFile, mkdir, readFile, stat } from "node:fs/promises";
|
|
15
|
-
import { join } from "node:path";
|
|
16
|
-
import { homedir } from "node:os";
|
|
17
|
-
|
|
18
|
-
/** Directory that holds the JSONL audit files: `<home>/.mnemo/audit`. */
const AUDIT_DIR = join(homedir(), ".mnemo", "audit");

/** Size threshold (10 MiB) above which the current audit file is rotated. */
const MAX_FILE_SIZE = 10 * 1024 * 1024;
|
|
20
|
-
|
|
21
|
-
/** Kind of memory operation recorded in an audit entry. */
export type AuditAction =
  | "create"
  | "update"
  | "delete"
  | "bulk_delete"
  | "expire"
  | "merge"
  | "recall"
  | "export";
|
|
30
|
-
|
|
31
|
-
/** One record of the append-only audit log (serialised as a single JSONL line). */
export interface AuditEntry {
  /** Time of the operation; the convenience helpers fill it with an ISO-8601 string. */
  timestamp: string;
  /** What kind of operation was performed. */
  action: AuditAction;
  /** Agent ID, user ID, or "system". */
  actor: string;
  /** IDs of the memories affected by the operation. */
  memoryIds: string[];
  /** Memory scope the operation applied to, when known. */
  scope?: string;
  /** Trigger of the operation: "auto-capture", "contradiction", "user-request", "decay", etc. */
  reason?: string;
  /** Additional context (text preview, old→new value, etc.). */
  details?: string;
  /** Caller IP, for API-based access. */
  ip?: string;
}
|
|
41
|
-
|
|
42
|
-
/** Set once the audit directory has been created successfully. */
let _initialized = false;

/** Path of the file that audit entries are currently appended to. */
let _currentFile = "";

/** Master switch; cleared when directory creation fails or via `setAuditEnabled(false)`. */
let _enabled = true;
|
|
45
|
-
|
|
46
|
-
/**
 * Create the audit directory on first use and pick the initial log file.
 *
 * Subsequent calls are no-ops once initialisation has succeeded. If the
 * directory cannot be created, audit logging is switched off instead of
 * throwing.
 */
async function ensureDir(): Promise<void> {
  if (!_initialized) {
    try {
      await mkdir(AUDIT_DIR, { recursive: true });
      _currentFile = getLogFileName();
      _initialized = true;
    } catch {
      // Auditing is best-effort: disable it rather than surface the error.
      _enabled = false;
    }
  }
}
|
|
59
|
-
|
|
60
|
-
/**
 * Path of the audit file for the current UTC day.
 *
 * @returns `<AUDIT_DIR>/audit-YYYY-MM-DD.jsonl`, using the date part of `toISOString()`.
 */
function getLogFileName(): string {
  const isoNow = new Date().toISOString();
  const day = isoNow.substring(0, 10); // YYYY-MM-DD
  return join(AUDIT_DIR, `audit-${day}.jsonl`);
}
|
|
64
|
-
|
|
65
|
-
/**
 * Append an audit entry to the current JSONL file.
 *
 * Best-effort: every failure is swallowed so that auditing can never break
 * the caller's main flow. The returned promise always resolves.
 *
 * Rotation rules:
 * - a new file is started when the UTC day changes;
 * - when the current file grows beyond `MAX_FILE_SIZE`, writing continues in
 *   `audit-YYYY-MM-DD-<epoch ms>.jsonl`, and that file stays current until it
 *   too exceeds the limit or the day changes. (Previously the current file was
 *   reset to the daily name on every call, so each entry written after the
 *   daily file overflowed landed in its own freshly timestamped file.)
 *
 * @param entry Entry to record; an empty `timestamp` is replaced with the current time.
 */
export async function audit(entry: AuditEntry): Promise<void> {
  if (!_enabled) return;

  try {
    await ensureDir();

    const extension = ".jsonl";
    const dailyFile = getLogFileName();
    const dailyStem = dailyFile.slice(0, -extension.length);

    // Keep writing to today's file or to one of today's size-rotated files;
    // switch back to the plain daily name only when the day has changed.
    const isTodaysFile =
      _currentFile === dailyFile || _currentFile.startsWith(`${dailyStem}-`);
    if (!isTodaysFile) {
      _currentFile = dailyFile;
    }

    // Size-based rotation.
    try {
      const { size } = await stat(_currentFile);
      if (size > MAX_FILE_SIZE) {
        // Built from the stem rather than String.replace so that a ".jsonl"
        // occurring earlier in the directory path cannot be rewritten.
        _currentFile = `${dailyStem}-${Date.now()}${extension}`;
      }
    } catch {
      // File doesn't exist yet, that's fine.
    }

    const line =
      JSON.stringify({
        ...entry,
        // `||` on purpose: an empty-string timestamp is also replaced.
        timestamp: entry.timestamp || new Date().toISOString(),
      }) + "\n";

    await appendFile(_currentFile, line);
  } catch {
    // Audit log failure should never break the main flow.
  }
}
|
|
101
|
-
|
|
102
|
-
/**
 * Convenience wrapper: record the creation of a single memory.
 *
 * Fire-and-forget; the write is not awaited.
 *
 * @param memoryId ID of the created memory.
 * @param actor Agent ID, user ID, or "system".
 * @param scope Scope the memory was stored in.
 * @param reason Why the memory was created.
 * @param textPreview Optional memory text; only the first 200 characters are logged.
 */
export function auditCreate(
  memoryId: string,
  actor: string,
  scope: string,
  reason: string,
  textPreview?: string,
): void {
  const entry: AuditEntry = {
    timestamp: new Date().toISOString(),
    action: "create",
    actor,
    memoryIds: [memoryId],
    scope,
    reason,
    details: textPreview ? textPreview.substring(0, 200) : undefined,
  };
  audit(entry).catch(() => {});
}
|
|
122
|
-
|
|
123
|
-
/**
 * Convenience wrapper: record the deletion of one or more memories.
 *
 * Logged as "bulk_delete" when more than one ID is given, otherwise "delete".
 * Fire-and-forget; the write is not awaited.
 *
 * @param memoryIds IDs of the deleted memories.
 * @param actor Agent ID, user ID, or "system".
 * @param reason Why the memories were deleted.
 */
export function auditDelete(
  memoryIds: string[],
  actor: string,
  reason: string,
): void {
  const action: AuditAction = memoryIds.length > 1 ? "bulk_delete" : "delete";
  const entry: AuditEntry = {
    timestamp: new Date().toISOString(),
    action,
    actor,
    memoryIds,
    reason,
  };
  audit(entry).catch(() => {});
}
|
|
139
|
-
|
|
140
|
-
/**
 * Convenience wrapper: record an update to a single memory
 * (e.g. importance change, tier change).
 *
 * Fire-and-forget; the write is not awaited.
 *
 * @param memoryId ID of the updated memory.
 * @param actor Agent ID, user ID, or "system".
 * @param reason Why the memory was updated.
 * @param details Optional free-form context, logged as given.
 */
export function auditUpdate(
  memoryId: string,
  actor: string,
  reason: string,
  details?: string,
): void {
  const entry: AuditEntry = {
    timestamp: new Date().toISOString(),
    action: "update",
    actor,
    memoryIds: [memoryId],
    reason,
    details,
  };
  audit(entry).catch(() => {});
}
|
|
158
|
-
|
|
159
|
-
/**
 * Convenience wrapper: record the expiration of a single memory
 * (contradiction resolution).
 *
 * Fire-and-forget; the write is not awaited.
 *
 * @param memoryId ID of the expired memory.
 * @param actor Agent ID, user ID, or "system".
 * @param reason Why the memory was expired.
 * @param details Optional free-form context, logged as given.
 */
export function auditExpire(
  memoryId: string,
  actor: string,
  reason: string,
  details?: string,
): void {
  const entry: AuditEntry = {
    timestamp: new Date().toISOString(),
    action: "expire",
    actor,
    memoryIds: [memoryId],
    reason,
    details,
  };
  audit(entry).catch(() => {});
}
|
|
177
|
-
|
|
178
|
-
/**
 * Convenience wrapper: record that memories were recalled (access audit trail).
 *
 * The reason is always logged as "retrieval". Fire-and-forget; the write is
 * not awaited.
 *
 * @param memoryIds IDs of the memories that were returned.
 * @param actor Agent ID, user ID, or "system".
 * @param query Optional query text; only the first 200 characters are logged.
 */
export function auditRecall(
  memoryIds: string[],
  actor: string,
  query?: string,
): void {
  const entry: AuditEntry = {
    timestamp: new Date().toISOString(),
    action: "recall",
    actor,
    memoryIds,
    reason: "retrieval",
    details: query ? query.substring(0, 200) : undefined,
  };
  audit(entry).catch(() => {});
}
|
|
195
|
-
|
|
196
|
-
/**
 * Read audit log entries for an inclusive range of UTC days.
 * Useful for compliance exports.
 *
 * Log files are named after the UTC date, so the range is walked in whole UTC
 * days. (Stepping with the local-time `setDate` while naming files from
 * `toISOString()` could read one day twice and skip another around a
 * daylight-saving change, and dropped the last day when `endDate` carried an
 * earlier time of day than `startDate`.)
 *
 * NOTE(review): only the plain daily files `audit-YYYY-MM-DD.jsonl` are read;
 * size-rotated files (`audit-YYYY-MM-DD-<epoch ms>.jsonl`) are not picked up.
 *
 * @param startDate First day of the range; any string `Date.parse` accepts.
 * @param endDate Last day of the range (inclusive); any string `Date.parse` accepts.
 * @returns Entries in file order, oldest day first. Empty when either date is
 *   unparseable or the range is reversed. Missing files and malformed lines are skipped.
 */
export async function readAuditLog(
  startDate: string,
  endDate: string,
): Promise<AuditEntry[]> {
  await ensureDir();
  const entries: AuditEntry[] = [];

  const startMs = Date.parse(startDate);
  const endMs = Date.parse(endDate);
  if (Number.isNaN(startMs) || Number.isNaN(endMs)) return entries;

  // JS time has no leap seconds, so every UTC day is exactly this long.
  const dayMs = 24 * 60 * 60 * 1000;
  const firstDay = Math.floor(startMs / dayMs) * dayMs;
  const lastDay = Math.floor(endMs / dayMs) * dayMs;

  for (let day = firstDay; day <= lastDay; day += dayMs) {
    const dateStr = new Date(day).toISOString().slice(0, 10);
    const filePath = join(AUDIT_DIR, `audit-${dateStr}.jsonl`);

    let content: string;
    try {
      content = await readFile(filePath, "utf8");
    } catch {
      // File doesn't exist for this date, skip.
      continue;
    }

    for (const line of content.trim().split("\n").filter(Boolean)) {
      try {
        const parsed: unknown = JSON.parse(line);
        // Valid JSON that is not an object (e.g. `null`, `42`) is not an entry.
        if (typeof parsed === "object" && parsed !== null) {
          entries.push(parsed as AuditEntry);
        }
      } catch {
        /* skip malformed */
      }
    }
  }

  return entries;
}
|
|
232
|
-
|
|
233
|
-
/**
 * Turn audit logging on or off for this process.
 *
 * @param enabled `true` to record entries, `false` to make `audit()` a no-op.
 */
export function setAuditEnabled(enabled: boolean): void {
  _enabled = enabled;
}
|
package/src/chunker.ts
DELETED
|
@@ -1,254 +0,0 @@
|
|
|
1
|
-
// SPDX-License-Identifier: MIT
|
|
2
|
-
/**
|
|
3
|
-
* Long Context Chunking System
|
|
4
|
-
*
|
|
5
|
-
* Goal: split documents that exceed embedding model context limits into smaller,
|
|
6
|
-
* semantically coherent chunks with overlap.
|
|
7
|
-
*
|
|
8
|
-
* Notes:
|
|
9
|
-
* - We use *character counts* as a conservative proxy for tokens.
|
|
10
|
-
* - The embedder triggers this only after a provider throws a context-length error.
|
|
11
|
-
*/
|
|
12
|
-
|
|
13
|
-
// ============================================================================
|
|
14
|
-
// Types & Constants
|
|
15
|
-
// ============================================================================
|
|
16
|
-
|
|
17
|
-
/** Position of a trimmed chunk within the original document. */
export interface ChunkMetadata {
  /** Offset in the original text where the trimmed chunk begins. */
  startIndex: number;
  /** Offset in the original text just past the trimmed chunk. */
  endIndex: number;
  /** Length of the trimmed chunk in characters. */
  length: number;
}
|
|
22
|
-
|
|
23
|
-
/** Output of a chunking run. */
export interface ChunkResult {
  /** Trimmed chunk texts, in document order. */
  chunks: string[];
  /** Position metadata; `metadatas[i]` describes `chunks[i]`. */
  metadatas: ChunkMetadata[];
  /** Length of the input text in characters. */
  totalOriginalLength: number;
  /** Number of chunks produced (`chunks.length`). */
  chunkCount: number;
}
|
|
29
|
-
|
|
30
|
-
/** Tunables for the chunker. All sizes are measured in characters, not tokens. */
export interface ChunkerConfig {
  /** Upper bound on the size of a chunk. */
  maxChunkSize: number;
  /** How many characters consecutive chunks share. */
  overlapSize: number;
  /** Lower bound on the size of a chunk (the final chunk may be smaller). */
  minChunkSize: number;
  /** Prefer splitting at sentence boundaries for better semantic coherence. */
  semanticSplit: boolean;
  /** Line count above which a chunk is split earlier, on a line boundary. */
  maxLinesPerChunk: number;
}
|
|
42
|
-
|
|
43
|
-
/**
 * Context limits of common embedding models, keyed by model name.
 *
 * The figures are typically token limits; the chunker feeds them into a
 * conservative character-based heuristic instead of counting tokens.
 */
export const EMBEDDING_CONTEXT_LIMITS: Record<string, number> = {
  // --- Jina v5 ---
  "jina-embeddings-v5-text-small": 8192,
  "jina-embeddings-v5-text-nano": 8192,

  // --- OpenAI ---
  "text-embedding-3-small": 8192,
  "text-embedding-3-large": 8192,

  // --- Google ---
  "text-embedding-004": 8192,
  "gemini-embedding-001": 2048,

  // --- Local / common ---
  "nomic-embed-text": 8192,
  "all-MiniLM-L6-v2": 512,
  "all-mpnet-base-v2": 512,
};
|
|
63
|
-
|
|
64
|
-
/** Configuration used by `chunkDocument` when the caller does not pass one. */
export const DEFAULT_CHUNKER_CONFIG: ChunkerConfig = {
  maxChunkSize: 4000,
  overlapSize: 200,
  minChunkSize: 200,
  semanticSplit: true,
  maxLinesPerChunk: 50,
};
|
|
71
|
-
|
|
72
|
-
// Sentence-ending punctuation: ASCII `.`, `!`, `?` plus the CJK ideographic
// full stop (U+3002) and the fullwidth `!` (U+FF01) and `?` (U+FF1F).
// The CJK characters are written as escapes so that Unicode normalisation of
// the source cannot fold the fullwidth forms into their ASCII twins, which
// would leave the class with duplicate `!?` and no fullwidth coverage.
// Not global on purpose: it is used with `.test()` on single characters.
const SENTENCE_ENDING = /[.!?\u3002\uFF01\uFF1F]/;
|
|
74
|
-
|
|
75
|
-
// ============================================================================
|
|
76
|
-
// Helpers
|
|
77
|
-
// ============================================================================
|
|
78
|
-
|
|
79
|
-
/**
 * Restrict `n` to the range [`lo`, `hi`].
 *
 * The upper bound is applied first, so when `lo > hi` the result is `lo`.
 */
function clamp(n: number, lo: number, hi: number): number {
  const capped = Math.min(hi, n);
  return Math.max(lo, capped);
}
|
|
82
|
-
|
|
83
|
-
/**
 * Number of lines in `s`: one more than the number of line breaks, where a
 * line break is CRLF (counted once), a lone LF, or a lone CR.
 * An empty string counts as one line.
 */
function countLines(s: string): number {
  const breaks = s.match(/\r\n|\n|\r/g);
  return (breaks ? breaks.length : 0) + 1;
}
|
|
87
|
-
|
|
88
|
-
/**
 * Index of the last single character in `text[start, end)` that matches `re`,
 * or -1 when there is none.
 *
 * Each character is tested on its own, so `re` should be a one-character
 * pattern (typically a character class).
 *
 * The range is clipped to the bounds of `text`: an out-of-range index would
 * otherwise be tested as the string "undefined" and could match spuriously.
 * `re.lastIndex` is reset before every test, so a global or sticky regex with
 * leftover state still gives correct results.
 *
 * @param text Text to search.
 * @param re Pattern applied to individual characters.
 * @param start Inclusive lower bound of the search range.
 * @param end Exclusive upper bound of the search range.
 */
function findLastIndexWithin(text: string, re: RegExp, start: number, end: number): number {
  const lo = Math.max(0, start);
  const hi = Math.min(text.length, end);
  for (let i = hi - 1; i >= lo; i--) {
    re.lastIndex = 0;
    if (re.test(text.charAt(i))) return i;
  }
  return -1;
}
|
|
97
|
-
|
|
98
|
-
/**
 * Choose where the chunk that begins at `start` should end (exclusive index).
 *
 * Candidates are tried in this order:
 * 1. line limit — if the widest candidate has more than
 *    `config.maxLinesPerChunk` lines, cut right after the newline that
 *    completes that many lines (never before the minimum end);
 * 2. with `config.semanticSplit`: the last sentence-ending character in
 *    range, plus any whitespace that follows it;
 * 3. with `config.semanticSplit`: right after the last newline in range;
 * 4. at the last whitespace character in range (the chunk ends before it);
 * 5. otherwise the maximum end.
 *
 * @param text Full document text.
 * @param start Offset where the chunk begins.
 * @param maxEnd Largest allowed end offset.
 * @param minEnd Smallest preferred end offset.
 * @param config Chunker settings.
 * @returns The end offset to slice at.
 */
function findSplitEnd(text: string, start: number, maxEnd: number, minEnd: number, config: ChunkerConfig): number {
  const lowerBound = clamp(minEnd, start + 1, maxEnd);
  const upperBound = clamp(maxEnd, lowerBound, text.length);
  const lineCap = config.maxLinesPerChunk;
  const whitespace = /\s/;

  // 1. Line limit: force an earlier split at a line break.
  if (lineCap > 0 && countLines(text.slice(start, upperBound)) > lineCap) {
    let newlinesSeen = 0;
    let at = text.indexOf("\n", start);
    while (at !== -1 && at < upperBound) {
      newlinesSeen += 1;
      if (newlinesSeen >= lineCap) {
        // Split right after this newline.
        return Math.max(at + 1, lowerBound);
      }
      at = text.indexOf("\n", at + 1);
    }
  }

  if (config.semanticSplit) {
    // 2. Sentence boundary closest to the upper bound.
    for (let idx = upperBound - 1; idx >= lowerBound; idx--) {
      if (!SENTENCE_ENDING.test(text[idx])) continue;
      // Include trailing whitespace after the punctuation.
      let after = idx + 1;
      while (after < upperBound && whitespace.test(text[after])) after++;
      return after;
    }

    // 3. Newline boundary closest to the upper bound.
    const newlineAt = text.lastIndexOf("\n", upperBound - 1);
    if (newlineAt >= lowerBound) return newlineAt + 1;
  }

  // 4. Last whitespace boundary.
  for (let idx = upperBound - 1; idx >= lowerBound; idx--) {
    if (whitespace.test(text[idx])) return idx;
  }

  // 5. Nothing better: hard split at the upper bound.
  return upperBound;
}
|
|
146
|
-
|
|
147
|
-
/**
 * Slice `text[start, end)`, trim surrounding whitespace, and report where the
 * trimmed chunk sits in the original text.
 *
 * The amount of trimmed whitespace is derived from `trimStart()` /
 * `trimEnd()` lengths rather than from `/^\s*​/` and `/\s*$/` matches: the
 * unanchored `\s*$` search backtracks quadratically on long whitespace runs
 * inside the slice, while the string methods are linear and strip exactly the
 * same character set.
 *
 * @param text Full document text.
 * @param start Inclusive start offset of the raw slice.
 * @param end Exclusive end offset of the raw slice.
 * @returns The trimmed chunk and its metadata. For an all-whitespace slice the
 *   chunk is empty and `startIndex === endIndex === end`.
 */
function sliceTrimWithIndices(text: string, start: number, end: number): { chunk: string; meta: ChunkMetadata } {
  const raw = text.slice(start, end);
  const leading = raw.length - raw.trimStart().length;
  const trailing = raw.length - raw.trimEnd().length;
  const chunk = raw.trim();

  const trimmedStart = start + leading;
  const trimmedEnd = end - trailing;

  return {
    chunk,
    meta: {
      startIndex: trimmedStart,
      // For an all-whitespace slice trimmedEnd falls before trimmedStart.
      endIndex: Math.max(trimmedStart, trimmedEnd),
      length: chunk.length,
    },
  };
}
|
|
165
|
-
|
|
166
|
-
// ============================================================================
|
|
167
|
-
// Chunking Core
|
|
168
|
-
// ============================================================================
|
|
169
|
-
|
|
170
|
-
/**
 * Split `text` into overlapping, whitespace-trimmed chunks.
 *
 * Each chunk is at most `config.maxChunkSize` characters. Split points are
 * chosen by `findSplitEnd`; if trimming leaves a chunk shorter than
 * `config.minChunkSize`, a hard split at the maximum size is used instead.
 * Consecutive chunks overlap by `config.overlapSize` characters.
 *
 * Differences from the earlier implementation:
 * - The whole document is always covered. The old iteration guard assumed
 *   every step advanced by `maxChunkSize - overlapSize`; when splits landed
 *   early (for example on the line limit) the loop ran out of iterations and
 *   the rest of the text was silently dropped.
 * - Every step advances by at least half of the span it just consumed, so an
 *   overlap that is as large as the chunk can no longer reduce progress to
 *   one character per chunk. The overlap is only shortened in that case.
 * - Chunks that are empty after trimming are never emitted.
 *
 * @param text Document to split. Empty or whitespace-only input yields no chunks.
 * @param config Chunking parameters; defaults to `DEFAULT_CHUNKER_CONFIG`.
 * @returns Chunks with matching metadata, the input length, and the chunk count.
 */
export function chunkDocument(text: string, config: ChunkerConfig = DEFAULT_CHUNKER_CONFIG): ChunkResult {
  if (!text || text.trim().length === 0) {
    return { chunks: [], metadatas: [], totalOriginalLength: 0, chunkCount: 0 };
  }

  const chunks: string[] = [];
  const metadatas: ChunkMetadata[] = [];

  let pos = 0;
  // Terminates because `pos` grows by at least 1 on every iteration.
  while (pos < text.length) {
    // Tail fits into a single chunk: emit it and stop.
    if (text.length - pos <= config.maxChunkSize) {
      const tail = sliceTrimWithIndices(text, pos, text.length);
      if (tail.chunk.length > 0) {
        chunks.push(tail.chunk);
        metadatas.push(tail.meta);
      }
      break;
    }

    const maxEnd = Math.min(pos + config.maxChunkSize, text.length);
    const minEnd = Math.min(pos + config.minChunkSize, maxEnd);

    let end = findSplitEnd(text, pos, maxEnd, minEnd, config);
    let piece = sliceTrimWithIndices(text, pos, end);

    // If trimming made it too small, fall back to a hard split.
    if (piece.chunk.length < config.minChunkSize) {
      end = maxEnd;
      piece = sliceTrimWithIndices(text, pos, end);
    }

    if (piece.chunk.length > 0) {
      chunks.push(piece.chunk);
      metadatas.push(piece.meta);
    }

    if (end >= text.length) break;

    // Move forward with overlap, but never by less than half the span just
    // consumed (and never by less than one character).
    const minStep = Math.max(1, Math.ceil((end - pos) / 2));
    pos = Math.max(end - config.overlapSize, pos + minStep);
  }

  return {
    chunks,
    metadatas,
    totalOriginalLength: text.length,
    chunkCount: chunks.length,
  };
}
|
|
232
|
-
|
|
233
|
-
/**
 * Chunk `text` with sizes derived from the embedding model's context limit.
 *
 * The limit is looked up in `EMBEDDING_CONTEXT_LIMITS` (8192 when the model is
 * unknown or not given). Because token/character ratios vary, sizes are taken
 * as conservative fractions of that limit, with floors:
 * - max chunk: 70% of the limit, at least 1000 characters;
 * - overlap: 5% of the limit;
 * - min chunk: 10% of the limit, at least 100 characters.
 *
 * @param text Document to split.
 * @param embedderModel Optional model name used to look up the context limit.
 */
export function smartChunk(text: string, embedderModel?: string): ChunkResult {
  const knownLimit = embedderModel ? EMBEDDING_CONTEXT_LIMITS[embedderModel] : undefined;
  const budget = knownLimit ?? 8192;
  const share = (ratio: number): number => Math.floor(budget * ratio);

  return chunkDocument(text, {
    maxChunkSize: Math.max(1000, share(0.7)),
    overlapSize: Math.max(0, share(0.05)),
    minChunkSize: Math.max(100, share(0.1)),
    semanticSplit: true,
    maxLinesPerChunk: 50,
  });
}
|
|
253
|
-
|
|
254
|
-
export default chunkDocument;
|