@mnemoai/core 1.1.0 → 1.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. package/dist/cli.d.ts +2 -0
  2. package/dist/cli.d.ts.map +1 -0
  3. package/dist/cli.js +7 -0
  4. package/dist/cli.js.map +7 -0
  5. package/dist/index.d.ts +136 -0
  6. package/dist/index.d.ts.map +1 -0
  7. package/{index.ts → dist/index.js} +537 -1333
  8. package/dist/index.js.map +7 -0
  9. package/dist/src/access-tracker.d.ts +97 -0
  10. package/dist/src/access-tracker.d.ts.map +1 -0
  11. package/dist/src/access-tracker.js +184 -0
  12. package/dist/src/access-tracker.js.map +7 -0
  13. package/dist/src/adapters/chroma.d.ts +31 -0
  14. package/dist/src/adapters/chroma.d.ts.map +1 -0
  15. package/{src/adapters/chroma.ts → dist/src/adapters/chroma.js} +45 -107
  16. package/dist/src/adapters/chroma.js.map +7 -0
  17. package/dist/src/adapters/lancedb.d.ts +29 -0
  18. package/dist/src/adapters/lancedb.d.ts.map +1 -0
  19. package/{src/adapters/lancedb.ts → dist/src/adapters/lancedb.js} +41 -109
  20. package/dist/src/adapters/lancedb.js.map +7 -0
  21. package/dist/src/adapters/pgvector.d.ts +33 -0
  22. package/dist/src/adapters/pgvector.d.ts.map +1 -0
  23. package/{src/adapters/pgvector.ts → dist/src/adapters/pgvector.js} +42 -104
  24. package/dist/src/adapters/pgvector.js.map +7 -0
  25. package/dist/src/adapters/qdrant.d.ts +34 -0
  26. package/dist/src/adapters/qdrant.d.ts.map +1 -0
  27. package/dist/src/adapters/qdrant.js +132 -0
  28. package/dist/src/adapters/qdrant.js.map +7 -0
  29. package/dist/src/adaptive-retrieval.d.ts +14 -0
  30. package/dist/src/adaptive-retrieval.d.ts.map +1 -0
  31. package/dist/src/adaptive-retrieval.js +52 -0
  32. package/dist/src/adaptive-retrieval.js.map +7 -0
  33. package/dist/src/audit-log.d.ts +56 -0
  34. package/dist/src/audit-log.d.ts.map +1 -0
  35. package/dist/src/audit-log.js +139 -0
  36. package/dist/src/audit-log.js.map +7 -0
  37. package/dist/src/chunker.d.ts +45 -0
  38. package/dist/src/chunker.d.ts.map +1 -0
  39. package/dist/src/chunker.js +157 -0
  40. package/dist/src/chunker.js.map +7 -0
  41. package/dist/src/config.d.ts +70 -0
  42. package/dist/src/config.d.ts.map +1 -0
  43. package/dist/src/config.js +142 -0
  44. package/dist/src/config.js.map +7 -0
  45. package/dist/src/decay-engine.d.ts +73 -0
  46. package/dist/src/decay-engine.d.ts.map +1 -0
  47. package/dist/src/decay-engine.js +119 -0
  48. package/dist/src/decay-engine.js.map +7 -0
  49. package/dist/src/embedder.d.ts +94 -0
  50. package/dist/src/embedder.d.ts.map +1 -0
  51. package/{src/embedder.ts → dist/src/embedder.js} +119 -317
  52. package/dist/src/embedder.js.map +7 -0
  53. package/dist/src/extraction-prompts.d.ts +12 -0
  54. package/dist/src/extraction-prompts.d.ts.map +1 -0
  55. package/dist/src/extraction-prompts.js +311 -0
  56. package/dist/src/extraction-prompts.js.map +7 -0
  57. package/dist/src/license.d.ts +29 -0
  58. package/dist/src/license.d.ts.map +1 -0
  59. package/{src/license.ts → dist/src/license.js} +42 -113
  60. package/dist/src/license.js.map +7 -0
  61. package/dist/src/llm-client.d.ts +23 -0
  62. package/dist/src/llm-client.d.ts.map +1 -0
  63. package/{src/llm-client.ts → dist/src/llm-client.js} +22 -55
  64. package/dist/src/llm-client.js.map +7 -0
  65. package/dist/src/logger.d.ts +33 -0
  66. package/dist/src/logger.d.ts.map +1 -0
  67. package/dist/src/logger.js +35 -0
  68. package/dist/src/logger.js.map +7 -0
  69. package/dist/src/mcp-server.d.ts +16 -0
  70. package/dist/src/mcp-server.d.ts.map +1 -0
  71. package/{src/mcp-server.ts → dist/src/mcp-server.js} +81 -181
  72. package/dist/src/mcp-server.js.map +7 -0
  73. package/dist/src/memory-categories.d.ts +40 -0
  74. package/dist/src/memory-categories.d.ts.map +1 -0
  75. package/dist/src/memory-categories.js +33 -0
  76. package/dist/src/memory-categories.js.map +7 -0
  77. package/dist/src/memory-upgrader.d.ts +71 -0
  78. package/dist/src/memory-upgrader.d.ts.map +1 -0
  79. package/dist/src/memory-upgrader.js +238 -0
  80. package/dist/src/memory-upgrader.js.map +7 -0
  81. package/dist/src/migrate.d.ts +47 -0
  82. package/dist/src/migrate.d.ts.map +1 -0
  83. package/{src/migrate.ts → dist/src/migrate.js} +57 -165
  84. package/dist/src/migrate.js.map +7 -0
  85. package/dist/src/mnemo.d.ts +67 -0
  86. package/dist/src/mnemo.d.ts.map +1 -0
  87. package/dist/src/mnemo.js +66 -0
  88. package/dist/src/mnemo.js.map +7 -0
  89. package/dist/src/noise-filter.d.ts +23 -0
  90. package/dist/src/noise-filter.d.ts.map +1 -0
  91. package/dist/src/noise-filter.js +62 -0
  92. package/dist/src/noise-filter.js.map +7 -0
  93. package/dist/src/noise-prototypes.d.ts +40 -0
  94. package/dist/src/noise-prototypes.d.ts.map +1 -0
  95. package/dist/src/noise-prototypes.js +116 -0
  96. package/dist/src/noise-prototypes.js.map +7 -0
  97. package/dist/src/observability.d.ts +16 -0
  98. package/dist/src/observability.d.ts.map +1 -0
  99. package/dist/src/observability.js +53 -0
  100. package/dist/src/observability.js.map +7 -0
  101. package/dist/src/query-tracker.d.ts +27 -0
  102. package/dist/src/query-tracker.d.ts.map +1 -0
  103. package/dist/src/query-tracker.js +32 -0
  104. package/dist/src/query-tracker.js.map +7 -0
  105. package/dist/src/reflection-event-store.d.ts +44 -0
  106. package/dist/src/reflection-event-store.d.ts.map +1 -0
  107. package/dist/src/reflection-event-store.js +50 -0
  108. package/dist/src/reflection-event-store.js.map +7 -0
  109. package/dist/src/reflection-item-store.d.ts +58 -0
  110. package/dist/src/reflection-item-store.d.ts.map +1 -0
  111. package/dist/src/reflection-item-store.js +69 -0
  112. package/dist/src/reflection-item-store.js.map +7 -0
  113. package/dist/src/reflection-mapped-metadata.d.ts +47 -0
  114. package/dist/src/reflection-mapped-metadata.d.ts.map +1 -0
  115. package/dist/src/reflection-mapped-metadata.js +40 -0
  116. package/dist/src/reflection-mapped-metadata.js.map +7 -0
  117. package/dist/src/reflection-metadata.d.ts +11 -0
  118. package/dist/src/reflection-metadata.d.ts.map +1 -0
  119. package/dist/src/reflection-metadata.js +24 -0
  120. package/dist/src/reflection-metadata.js.map +7 -0
  121. package/dist/src/reflection-ranking.d.ts +13 -0
  122. package/dist/src/reflection-ranking.d.ts.map +1 -0
  123. package/{src/reflection-ranking.ts → dist/src/reflection-ranking.js} +12 -21
  124. package/dist/src/reflection-ranking.js.map +7 -0
  125. package/dist/src/reflection-retry.d.ts +30 -0
  126. package/dist/src/reflection-retry.d.ts.map +1 -0
  127. package/{src/reflection-retry.ts → dist/src/reflection-retry.js} +24 -64
  128. package/dist/src/reflection-retry.js.map +7 -0
  129. package/dist/src/reflection-slices.d.ts +42 -0
  130. package/dist/src/reflection-slices.d.ts.map +1 -0
  131. package/{src/reflection-slices.ts → dist/src/reflection-slices.js} +60 -136
  132. package/dist/src/reflection-slices.js.map +7 -0
  133. package/dist/src/reflection-store.d.ts +85 -0
  134. package/dist/src/reflection-store.d.ts.map +1 -0
  135. package/dist/src/reflection-store.js +407 -0
  136. package/dist/src/reflection-store.js.map +7 -0
  137. package/dist/src/resonance-state.d.ts +19 -0
  138. package/dist/src/resonance-state.d.ts.map +1 -0
  139. package/{src/resonance-state.ts → dist/src/resonance-state.js} +13 -42
  140. package/dist/src/resonance-state.js.map +7 -0
  141. package/dist/src/retriever.d.ts +228 -0
  142. package/dist/src/retriever.d.ts.map +1 -0
  143. package/dist/src/retriever.js +1006 -0
  144. package/dist/src/retriever.js.map +7 -0
  145. package/dist/src/scopes.d.ts +58 -0
  146. package/dist/src/scopes.d.ts.map +1 -0
  147. package/dist/src/scopes.js +252 -0
  148. package/dist/src/scopes.js.map +7 -0
  149. package/dist/src/self-improvement-files.d.ts +20 -0
  150. package/dist/src/self-improvement-files.d.ts.map +1 -0
  151. package/{src/self-improvement-files.ts → dist/src/self-improvement-files.js} +24 -49
  152. package/dist/src/self-improvement-files.js.map +7 -0
  153. package/dist/src/semantic-gate.d.ts +24 -0
  154. package/dist/src/semantic-gate.d.ts.map +1 -0
  155. package/dist/src/semantic-gate.js +86 -0
  156. package/dist/src/semantic-gate.js.map +7 -0
  157. package/dist/src/session-recovery.d.ts +9 -0
  158. package/dist/src/session-recovery.d.ts.map +1 -0
  159. package/{src/session-recovery.ts → dist/src/session-recovery.js} +40 -57
  160. package/dist/src/session-recovery.js.map +7 -0
  161. package/dist/src/smart-extractor.d.ts +107 -0
  162. package/dist/src/smart-extractor.d.ts.map +1 -0
  163. package/{src/smart-extractor.ts → dist/src/smart-extractor.js} +130 -383
  164. package/dist/src/smart-extractor.js.map +7 -0
  165. package/dist/src/smart-metadata.d.ts +103 -0
  166. package/dist/src/smart-metadata.d.ts.map +1 -0
  167. package/dist/src/smart-metadata.js +361 -0
  168. package/dist/src/smart-metadata.js.map +7 -0
  169. package/dist/src/storage-adapter.d.ts +102 -0
  170. package/dist/src/storage-adapter.d.ts.map +1 -0
  171. package/dist/src/storage-adapter.js +22 -0
  172. package/dist/src/storage-adapter.js.map +7 -0
  173. package/dist/src/store.d.ts +108 -0
  174. package/dist/src/store.d.ts.map +1 -0
  175. package/dist/src/store.js +939 -0
  176. package/dist/src/store.js.map +7 -0
  177. package/dist/src/tier-manager.d.ts +57 -0
  178. package/dist/src/tier-manager.d.ts.map +1 -0
  179. package/dist/src/tier-manager.js +80 -0
  180. package/dist/src/tier-manager.js.map +7 -0
  181. package/dist/src/tools.d.ts +43 -0
  182. package/dist/src/tools.d.ts.map +1 -0
  183. package/dist/src/tools.js +1075 -0
  184. package/dist/src/tools.js.map +7 -0
  185. package/dist/src/wal-recovery.d.ts +30 -0
  186. package/dist/src/wal-recovery.d.ts.map +1 -0
  187. package/{src/wal-recovery.ts → dist/src/wal-recovery.js} +26 -79
  188. package/dist/src/wal-recovery.js.map +7 -0
  189. package/package.json +21 -2
  190. package/openclaw.plugin.json +0 -815
  191. package/src/access-tracker.ts +0 -341
  192. package/src/adapters/README.md +0 -78
  193. package/src/adapters/qdrant.ts +0 -191
  194. package/src/adaptive-retrieval.ts +0 -90
  195. package/src/audit-log.ts +0 -238
  196. package/src/chunker.ts +0 -254
  197. package/src/config.ts +0 -271
  198. package/src/decay-engine.ts +0 -238
  199. package/src/extraction-prompts.ts +0 -339
  200. package/src/memory-categories.ts +0 -71
  201. package/src/memory-upgrader.ts +0 -388
  202. package/src/mnemo.ts +0 -142
  203. package/src/noise-filter.ts +0 -97
  204. package/src/noise-prototypes.ts +0 -164
  205. package/src/observability.ts +0 -81
  206. package/src/query-tracker.ts +0 -57
  207. package/src/reflection-event-store.ts +0 -98
  208. package/src/reflection-item-store.ts +0 -112
  209. package/src/reflection-mapped-metadata.ts +0 -84
  210. package/src/reflection-metadata.ts +0 -23
  211. package/src/reflection-store.ts +0 -602
  212. package/src/retriever.ts +0 -1510
  213. package/src/scopes.ts +0 -375
  214. package/src/semantic-gate.ts +0 -121
  215. package/src/smart-metadata.ts +0 -561
  216. package/src/storage-adapter.ts +0 -153
  217. package/src/store.ts +0 -1330
  218. package/src/tier-manager.ts +0 -189
  219. package/src/tools.ts +0 -1292
  220. package/test/core.test.mjs +0 -301
package/src/audit-log.ts DELETED
@@ -1,238 +0,0 @@
1
- // SPDX-License-Identifier: LicenseRef-Mnemo-Pro
2
- /**
3
- * Mnemo Audit Log — GDPR/EU AI Act compliance
4
- *
5
- * Records all memory CRUD operations with:
6
- * - WHO: agent/user identity
7
- * - WHAT: operation type + affected memory IDs
8
- * - WHEN: ISO timestamp
9
- * - WHY: source/trigger (auto-capture, manual, contradiction, etc.)
10
- *
11
- * Stored as append-only JSONL file. Supports retention policies.
12
- */
13
-
14
- import { appendFile, mkdir, readFile, stat } from "node:fs/promises";
15
- import { join } from "node:path";
16
- import { homedir } from "node:os";
17
-
18
- const AUDIT_DIR = join(homedir(), ".mnemo", "audit");
19
- const MAX_FILE_SIZE = 10 * 1024 * 1024; // 10MB per file, then rotate
20
-
21
- export type AuditAction =
22
- | "create"
23
- | "update"
24
- | "delete"
25
- | "bulk_delete"
26
- | "expire"
27
- | "merge"
28
- | "recall"
29
- | "export";
30
-
31
- export interface AuditEntry {
32
- timestamp: string;
33
- action: AuditAction;
34
- actor: string; // agent ID, user ID, or "system"
35
- memoryIds: string[]; // affected memory IDs
36
- scope?: string;
37
- reason?: string; // "auto-capture", "contradiction", "user-request", "decay", etc.
38
- details?: string; // additional context (text preview, old→new value, etc.)
39
- ip?: string; // for API-based access
40
- }
41
-
42
- let _initialized = false;
43
- let _currentFile = "";
44
- let _enabled = true;
45
-
46
- /**
47
- * Initialize the audit log directory.
48
- */
49
- async function ensureDir(): Promise<void> {
50
- if (_initialized) return;
51
- try {
52
- await mkdir(AUDIT_DIR, { recursive: true });
53
- _currentFile = getLogFileName();
54
- _initialized = true;
55
- } catch {
56
- _enabled = false;
57
- }
58
- }
59
-
60
- function getLogFileName(): string {
61
- const date = new Date().toISOString().slice(0, 10); // YYYY-MM-DD
62
- return join(AUDIT_DIR, `audit-${date}.jsonl`);
63
- }
64
-
65
- /**
66
- * Append an audit entry. Fire-and-forget — never blocks the main flow.
67
- */
68
- export async function audit(entry: AuditEntry): Promise<void> {
69
- if (!_enabled) return;
70
-
71
- try {
72
- await ensureDir();
73
-
74
- // Rotate file daily
75
- const expectedFile = getLogFileName();
76
- if (expectedFile !== _currentFile) {
77
- _currentFile = expectedFile;
78
- }
79
-
80
- // Check file size for rotation
81
- try {
82
- const stats = await stat(_currentFile);
83
- if (stats.size > MAX_FILE_SIZE) {
84
- const rotatedName = _currentFile.replace(".jsonl", `-${Date.now()}.jsonl`);
85
- _currentFile = rotatedName;
86
- }
87
- } catch {
88
- // File doesn't exist yet, that's fine
89
- }
90
-
91
- const line = JSON.stringify({
92
- ...entry,
93
- timestamp: entry.timestamp || new Date().toISOString(),
94
- }) + "\n";
95
-
96
- await appendFile(_currentFile, line);
97
- } catch {
98
- // Audit log failure should never break the main flow
99
- }
100
- }
101
-
102
- /**
103
- * Convenience: log a memory creation.
104
- */
105
- export function auditCreate(
106
- memoryId: string,
107
- actor: string,
108
- scope: string,
109
- reason: string,
110
- textPreview?: string,
111
- ): void {
112
- audit({
113
- timestamp: new Date().toISOString(),
114
- action: "create",
115
- actor,
116
- memoryIds: [memoryId],
117
- scope,
118
- reason,
119
- details: textPreview ? textPreview.slice(0, 200) : undefined,
120
- }).catch(() => {});
121
- }
122
-
123
- /**
124
- * Convenience: log a memory deletion.
125
- */
126
- export function auditDelete(
127
- memoryIds: string[],
128
- actor: string,
129
- reason: string,
130
- ): void {
131
- audit({
132
- timestamp: new Date().toISOString(),
133
- action: memoryIds.length > 1 ? "bulk_delete" : "delete",
134
- actor,
135
- memoryIds,
136
- reason,
137
- }).catch(() => {});
138
- }
139
-
140
- /**
141
- * Convenience: log a memory update (e.g., importance change, tier change).
142
- */
143
- export function auditUpdate(
144
- memoryId: string,
145
- actor: string,
146
- reason: string,
147
- details?: string,
148
- ): void {
149
- audit({
150
- timestamp: new Date().toISOString(),
151
- action: "update",
152
- actor,
153
- memoryIds: [memoryId],
154
- reason,
155
- details,
156
- }).catch(() => {});
157
- }
158
-
159
- /**
160
- * Convenience: log a memory expiration (contradiction resolution).
161
- */
162
- export function auditExpire(
163
- memoryId: string,
164
- actor: string,
165
- reason: string,
166
- details?: string,
167
- ): void {
168
- audit({
169
- timestamp: new Date().toISOString(),
170
- action: "expire",
171
- actor,
172
- memoryIds: [memoryId],
173
- reason,
174
- details,
175
- }).catch(() => {});
176
- }
177
-
178
- /**
179
- * Convenience: log a memory recall (for access audit trail).
180
- */
181
- export function auditRecall(
182
- memoryIds: string[],
183
- actor: string,
184
- query?: string,
185
- ): void {
186
- audit({
187
- timestamp: new Date().toISOString(),
188
- action: "recall",
189
- actor,
190
- memoryIds,
191
- reason: "retrieval",
192
- details: query ? query.slice(0, 200) : undefined,
193
- }).catch(() => {});
194
- }
195
-
196
- /**
197
- * Read audit log entries for a date range.
198
- * Useful for compliance exports.
199
- */
200
- export async function readAuditLog(
201
- startDate: string,
202
- endDate: string,
203
- ): Promise<AuditEntry[]> {
204
- await ensureDir();
205
- const entries: AuditEntry[] = [];
206
-
207
- const start = new Date(startDate);
208
- const end = new Date(endDate);
209
- const current = new Date(start);
210
-
211
- while (current <= end) {
212
- const dateStr = current.toISOString().slice(0, 10);
213
- const filePath = join(AUDIT_DIR, `audit-${dateStr}.jsonl`);
214
-
215
- try {
216
- const content = await readFile(filePath, "utf8");
217
- const lines = content.trim().split("\n").filter(Boolean);
218
- for (const line of lines) {
219
- try {
220
- entries.push(JSON.parse(line));
221
- } catch { /* skip malformed */ }
222
- }
223
- } catch {
224
- // File doesn't exist for this date, skip
225
- }
226
-
227
- current.setDate(current.getDate() + 1);
228
- }
229
-
230
- return entries;
231
- }
232
-
233
- /**
234
- * Enable or disable audit logging.
235
- */
236
- export function setAuditEnabled(enabled: boolean): void {
237
- _enabled = enabled;
238
- }
package/src/chunker.ts DELETED
@@ -1,254 +0,0 @@
1
- // SPDX-License-Identifier: MIT
2
- /**
3
- * Long Context Chunking System
4
- *
5
- * Goal: split documents that exceed embedding model context limits into smaller,
6
- * semantically coherent chunks with overlap.
7
- *
8
- * Notes:
9
- * - We use *character counts* as a conservative proxy for tokens.
10
- * - The embedder triggers this only after a provider throws a context-length error.
11
- */
12
-
13
- // ============================================================================
14
- // Types & Constants
15
- // ============================================================================
16
-
17
- export interface ChunkMetadata {
18
- startIndex: number;
19
- endIndex: number;
20
- length: number;
21
- }
22
-
23
- export interface ChunkResult {
24
- chunks: string[];
25
- metadatas: ChunkMetadata[];
26
- totalOriginalLength: number;
27
- chunkCount: number;
28
- }
29
-
30
- export interface ChunkerConfig {
31
- /** Maximum characters per chunk. */
32
- maxChunkSize: number;
33
- /** Overlap between chunks in characters. */
34
- overlapSize: number;
35
- /** Minimum chunk size (except the final chunk). */
36
- minChunkSize: number;
37
- /** Attempt to split on sentence boundaries for better semantic coherence. */
38
- semanticSplit: boolean;
39
- /** Max lines per chunk before we try to split earlier on a line boundary. */
40
- maxLinesPerChunk: number;
41
- }
42
-
43
- // Common embedding context limits (provider/model specific). These are typically
44
- // token limits, but we treat them as inputs to a conservative char-based heuristic.
45
- export const EMBEDDING_CONTEXT_LIMITS: Record<string, number> = {
46
- // Jina v5
47
- "jina-embeddings-v5-text-small": 8192,
48
- "jina-embeddings-v5-text-nano": 8192,
49
-
50
- // OpenAI
51
- "text-embedding-3-small": 8192,
52
- "text-embedding-3-large": 8192,
53
-
54
- // Google
55
- "text-embedding-004": 8192,
56
- "gemini-embedding-001": 2048,
57
-
58
- // Local/common
59
- "nomic-embed-text": 8192,
60
- "all-MiniLM-L6-v2": 512,
61
- "all-mpnet-base-v2": 512,
62
- };
63
-
64
- export const DEFAULT_CHUNKER_CONFIG: ChunkerConfig = {
65
- maxChunkSize: 4000,
66
- overlapSize: 200,
67
- minChunkSize: 200,
68
- semanticSplit: true,
69
- maxLinesPerChunk: 50,
70
- };
71
-
72
- // Sentence ending patterns (English + CJK-ish punctuation)
73
- const SENTENCE_ENDING = /[.!?。!?]/;
74
-
75
- // ============================================================================
76
- // Helpers
77
- // ============================================================================
78
-
79
- function clamp(n: number, lo: number, hi: number): number {
80
- return Math.max(lo, Math.min(hi, n));
81
- }
82
-
83
- function countLines(s: string): number {
84
- // Count \n (treat CRLF as one line break)
85
- return s.split(/\r\n|\n|\r/).length;
86
- }
87
-
88
- function findLastIndexWithin(text: string, re: RegExp, start: number, end: number): number {
89
- // Find last match start index for regex within [start, end).
90
- // NOTE: `re` must NOT be global; we will scan manually.
91
- let last = -1;
92
- for (let i = end - 1; i >= start; i--) {
93
- if (re.test(text[i])) return i;
94
- }
95
- return last;
96
- }
97
-
98
- function findSplitEnd(text: string, start: number, maxEnd: number, minEnd: number, config: ChunkerConfig): number {
99
- const safeMinEnd = clamp(minEnd, start + 1, maxEnd);
100
- const safeMaxEnd = clamp(maxEnd, safeMinEnd, text.length);
101
-
102
- // Respect line limit: if we exceed maxLinesPerChunk, force earlier split at a line break.
103
- if (config.maxLinesPerChunk > 0) {
104
- const candidate = text.slice(start, safeMaxEnd);
105
- if (countLines(candidate) > config.maxLinesPerChunk) {
106
- // Find the position of the Nth line break.
107
- let breaks = 0;
108
- for (let i = start; i < safeMaxEnd; i++) {
109
- const ch = text[i];
110
- if (ch === "\n") {
111
- breaks++;
112
- if (breaks >= config.maxLinesPerChunk) {
113
- // Split right after this newline.
114
- return Math.max(i + 1, safeMinEnd);
115
- }
116
- }
117
- }
118
- }
119
- }
120
-
121
- if (config.semanticSplit) {
122
- // Prefer a sentence boundary near the end.
123
- // Scan backward from safeMaxEnd to safeMinEnd.
124
- for (let i = safeMaxEnd - 1; i >= safeMinEnd; i--) {
125
- if (SENTENCE_ENDING.test(text[i])) {
126
- // Include trailing whitespace after punctuation.
127
- let j = i + 1;
128
- while (j < safeMaxEnd && /\s/.test(text[j])) j++;
129
- return j;
130
- }
131
- }
132
-
133
- // Next best: newline boundary.
134
- for (let i = safeMaxEnd - 1; i >= safeMinEnd; i--) {
135
- if (text[i] === "\n") return i + 1;
136
- }
137
- }
138
-
139
- // Fallback: last whitespace boundary.
140
- for (let i = safeMaxEnd - 1; i >= safeMinEnd; i--) {
141
- if (/\s/.test(text[i])) return i;
142
- }
143
-
144
- return safeMaxEnd;
145
- }
146
-
147
- function sliceTrimWithIndices(text: string, start: number, end: number): { chunk: string; meta: ChunkMetadata } {
148
- const raw = text.slice(start, end);
149
- const leading = raw.match(/^\s*/)?.[0]?.length ?? 0;
150
- const trailing = raw.match(/\s*$/)?.[0]?.length ?? 0;
151
- const chunk = raw.trim();
152
-
153
- const trimmedStart = start + leading;
154
- const trimmedEnd = end - trailing;
155
-
156
- return {
157
- chunk,
158
- meta: {
159
- startIndex: trimmedStart,
160
- endIndex: Math.max(trimmedStart, trimmedEnd),
161
- length: chunk.length,
162
- },
163
- };
164
- }
165
-
166
- // ============================================================================
167
- // Chunking Core
168
- // ============================================================================
169
-
170
- export function chunkDocument(text: string, config: ChunkerConfig = DEFAULT_CHUNKER_CONFIG): ChunkResult {
171
- if (!text || text.trim().length === 0) {
172
- return { chunks: [], metadatas: [], totalOriginalLength: 0, chunkCount: 0 };
173
- }
174
-
175
- const totalOriginalLength = text.length;
176
- const chunks: string[] = [];
177
- const metadatas: ChunkMetadata[] = [];
178
-
179
- let pos = 0;
180
- const maxGuard = Math.max(4, Math.ceil(text.length / Math.max(1, config.maxChunkSize - config.overlapSize)) + 5);
181
- let guard = 0;
182
-
183
- while (pos < text.length && guard < maxGuard) {
184
- guard++;
185
-
186
- const remaining = text.length - pos;
187
- if (remaining <= config.maxChunkSize) {
188
- const { chunk, meta } = sliceTrimWithIndices(text, pos, text.length);
189
- if (chunk.length > 0) {
190
- chunks.push(chunk);
191
- metadatas.push(meta);
192
- }
193
- break;
194
- }
195
-
196
- const maxEnd = Math.min(pos + config.maxChunkSize, text.length);
197
- const minEnd = Math.min(pos + config.minChunkSize, maxEnd);
198
-
199
- const end = findSplitEnd(text, pos, maxEnd, minEnd, config);
200
- const { chunk, meta } = sliceTrimWithIndices(text, pos, end);
201
-
202
- // If trimming made it too small, fall back to a hard split.
203
- if (chunk.length < config.minChunkSize) {
204
- const hardEnd = Math.min(pos + config.maxChunkSize, text.length);
205
- const hard = sliceTrimWithIndices(text, pos, hardEnd);
206
- if (hard.chunk.length > 0) {
207
- chunks.push(hard.chunk);
208
- metadatas.push(hard.meta);
209
- }
210
- if (hardEnd >= text.length) break;
211
- pos = Math.max(hardEnd - config.overlapSize, pos + 1);
212
- continue;
213
- }
214
-
215
- chunks.push(chunk);
216
- metadatas.push(meta);
217
-
218
- if (end >= text.length) break;
219
-
220
- // Move forward with overlap.
221
- const nextPos = Math.max(end - config.overlapSize, pos + 1);
222
- pos = nextPos;
223
- }
224
-
225
- return {
226
- chunks,
227
- metadatas,
228
- totalOriginalLength,
229
- chunkCount: chunks.length,
230
- };
231
- }
232
-
233
- /**
234
- * Smart chunker that adapts to model context limits.
235
- *
236
- * We intentionally pick conservative char limits (70% of the reported limit)
237
- * since token/char ratios vary.
238
- */
239
- export function smartChunk(text: string, embedderModel?: string): ChunkResult {
240
- const limit = embedderModel ? EMBEDDING_CONTEXT_LIMITS[embedderModel] : undefined;
241
- const base = limit ?? 8192;
242
-
243
- const config: ChunkerConfig = {
244
- maxChunkSize: Math.max(1000, Math.floor(base * 0.7)),
245
- overlapSize: Math.max(0, Math.floor(base * 0.05)),
246
- minChunkSize: Math.max(100, Math.floor(base * 0.1)),
247
- semanticSplit: true,
248
- maxLinesPerChunk: 50,
249
- };
250
-
251
- return chunkDocument(text, config);
252
- }
253
-
254
- export default chunkDocument;