clawmem 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/AGENTS.md +660 -0
  2. package/CLAUDE.md +660 -0
  3. package/LICENSE +21 -0
  4. package/README.md +993 -0
  5. package/SKILL.md +717 -0
  6. package/bin/clawmem +75 -0
  7. package/package.json +72 -0
  8. package/src/amem.ts +797 -0
  9. package/src/beads.ts +263 -0
  10. package/src/clawmem.ts +1849 -0
  11. package/src/collections.ts +405 -0
  12. package/src/config.ts +178 -0
  13. package/src/consolidation.ts +123 -0
  14. package/src/directory-context.ts +248 -0
  15. package/src/errors.ts +41 -0
  16. package/src/formatter.ts +427 -0
  17. package/src/graph-traversal.ts +247 -0
  18. package/src/hooks/context-surfacing.ts +317 -0
  19. package/src/hooks/curator-nudge.ts +89 -0
  20. package/src/hooks/decision-extractor.ts +639 -0
  21. package/src/hooks/feedback-loop.ts +214 -0
  22. package/src/hooks/handoff-generator.ts +345 -0
  23. package/src/hooks/postcompact-inject.ts +226 -0
  24. package/src/hooks/precompact-extract.ts +314 -0
  25. package/src/hooks/pretool-inject.ts +79 -0
  26. package/src/hooks/session-bootstrap.ts +324 -0
  27. package/src/hooks/staleness-check.ts +130 -0
  28. package/src/hooks.ts +367 -0
  29. package/src/indexer.ts +327 -0
  30. package/src/intent.ts +294 -0
  31. package/src/limits.ts +26 -0
  32. package/src/llm.ts +1175 -0
  33. package/src/mcp.ts +2138 -0
  34. package/src/memory.ts +336 -0
  35. package/src/mmr.ts +93 -0
  36. package/src/observer.ts +269 -0
  37. package/src/openclaw/engine.ts +283 -0
  38. package/src/openclaw/index.ts +221 -0
  39. package/src/openclaw/plugin.json +83 -0
  40. package/src/openclaw/shell.ts +207 -0
  41. package/src/openclaw/tools.ts +304 -0
  42. package/src/profile.ts +346 -0
  43. package/src/promptguard.ts +218 -0
  44. package/src/retrieval-gate.ts +106 -0
  45. package/src/search-utils.ts +127 -0
  46. package/src/server.ts +783 -0
  47. package/src/splitter.ts +325 -0
  48. package/src/store.ts +4062 -0
  49. package/src/validation.ts +67 -0
  50. package/src/watcher.ts +58 -0
package/src/llm.ts ADDED
@@ -0,0 +1,1175 @@
1
+ /**
2
+ * llm.ts - LLM abstraction layer for QMD using node-llama-cpp
3
+ *
4
+ * Provides embeddings, text generation, and reranking using local GGUF models.
5
+ * Embeddings can use a remote server (CLAWMEM_EMBED_URL), cloud API, or local node-llama-cpp fallback.
6
+ */
7
+
8
+ // node-llama-cpp is loaded lazily to avoid ~630ms import cost when all
9
+ // operations route to remote GPU servers. Only loaded if a local fallback
10
+ // is actually needed (GPU server down).
11
+ let _nodeLlamaCpp: typeof import("node-llama-cpp") | null = null;
12
+ async function getNodeLlamaCpp() {
13
+ if (!_nodeLlamaCpp) {
14
+ _nodeLlamaCpp = await import("node-llama-cpp");
15
+ }
16
+ return _nodeLlamaCpp;
17
+ }
18
+
19
// Re-export type aliases for internal use (structural, no runtime cost).
// NOTE(review): these are deliberately `any` so nothing here forces an eager
// import of node-llama-cpp. Type-only references like
// `import("node-llama-cpp").Llama` are erased at compile time and would keep
// the lazy-load benefit while restoring type safety — worth confirming.
type Llama = any;
type LlamaModel = any;
type LlamaEmbeddingContext = any;
type LlamaToken = any;
24
+
25
+ import { homedir } from "os";
26
+ import { join } from "path";
27
+ import { existsSync, mkdirSync } from "fs";
28
+
29
+ // =============================================================================
30
+ // Embedding Formatting Functions
31
+ // =============================================================================
32
+
33
+ /**
34
+ * Format a query for embedding.
35
+ * Uses task prefix format for embedding models.
36
+ */
37
+ export function formatQueryForEmbedding(query: string): string {
38
+ return `task: search result | query: ${query}`;
39
+ }
40
+
41
+ /**
42
+ * Format a document for embedding.
43
+ * Uses title + text format for embedding models.
44
+ */
45
+ export function formatDocForEmbedding(text: string, title?: string): string {
46
+ return `title: ${title || "none"} | text: ${text}`;
47
+ }
48
+
49
// =============================================================================
// Types
// =============================================================================

/**
 * Token with log probability
 */
export type TokenLogProb = {
  token: string;
  logprob: number;
};

/**
 * Embedding result: dense vector plus the model (or remote URL) that produced it
 */
export type EmbeddingResult = {
  embedding: number[];
  model: string;
};

/**
 * Generation result with optional logprobs
 */
export type GenerateResult = {
  text: string;
  model: string;
  logprobs?: TokenLogProb[];
  done: boolean;
};

/**
 * Rerank result for a single document
 */
export type RerankDocumentResult = {
  file: string;
  score: number; // higher = more relevant (see LLM.rerank contract)
  index: number;
};

/**
 * Batch rerank result
 */
export type RerankResult = {
  results: RerankDocumentResult[];
  model: string;
};

/**
 * Model info
 */
export type ModelInfo = {
  name: string;
  exists: boolean;
  path?: string;
};

/**
 * Options for embedding
 */
export type EmbedOptions = {
  model?: string;
  isQuery?: boolean; // queries get provider-specific params (see getCloudEmbedParams)
  title?: string;
};

/**
 * Options for text generation
 */
export type GenerateOptions = {
  model?: string;
  maxTokens?: number;
  temperature?: number;
  signal?: AbortSignal;
};

/**
 * Options for reranking
 */
export type RerankOptions = {
  model?: string;
};

/**
 * Supported query types for different search backends
 */
export type QueryType = 'lex' | 'vec' | 'hyde';

/**
 * A single query and its target backend type
 */
export type Queryable = {
  type: QueryType;
  text: string;
};

/**
 * Document to rerank
 */
export type RerankDocument = {
  file: string;
  text: string;
  title?: string;
};
152
+
153
// =============================================================================
// Model Configuration
// =============================================================================

// HuggingFace model URIs for node-llama-cpp
// Format: hf:<user>/<repo>/<file>
const DEFAULT_EMBED_MODEL = "hf:ggml-org/embeddinggemma-300M-GGUF/embeddinggemma-300M-Q8_0.gguf";
const DEFAULT_RERANK_MODEL = "hf:ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF/qwen3-reranker-0.6b-q8_0.gguf";
const DEFAULT_GENERATE_MODEL = "hf:tobil/qmd-query-expansion-1.7B-gguf/qmd-query-expansion-1.7B-q4_k_m.gguf";

// Local model cache directory (created on demand by ensureModelCacheDir)
const MODEL_CACHE_DIR = join(homedir(), ".cache", "qmd", "models");
165
+
166
// =============================================================================
// LLM Interface
// =============================================================================

/**
 * Abstract LLM interface - implement this for different backends.
 * All methods return null (or empty results) on failure rather than throwing.
 */
export interface LLM {
  /**
   * Get embeddings for text. Returns null when no backend could embed.
   */
  embed(text: string, options?: EmbedOptions): Promise<EmbeddingResult | null>;

  /**
   * Generate text completion. Returns null on backend failure.
   */
  generate(prompt: string, options?: GenerateOptions): Promise<GenerateResult | null>;

  /**
   * Check if a model exists/is available
   */
  modelExists(model: string): Promise<ModelInfo>;

  /**
   * Expand a search query into multiple variations for different backends.
   * Returns a list of Queryable objects.
   */
  expandQuery(query: string, options?: { context?: string, includeLexical?: boolean, intent?: string }): Promise<Queryable[]>;

  /**
   * Rerank documents by relevance to a query.
   * Returns list of documents with relevance scores (higher = more relevant).
   */
  rerank(query: string, documents: RerankDocument[], options?: RerankOptions): Promise<RerankResult>;

  /**
   * Dispose of resources (models, contexts, timers).
   */
  dispose(): Promise<void>;
}
206
+
207
// =============================================================================
// node-llama-cpp Implementation
// =============================================================================

export type LlamaCppConfig = {
  embedModel?: string;
  generateModel?: string;
  rerankModel?: string;
  modelCacheDir?: string;
  /**
   * Remote embedding server URL (e.g. "http://your-gpu-server:8088").
   * When set, embed() uses HTTP POST to /v1/embeddings instead of local node-llama-cpp.
   * Env: CLAWMEM_EMBED_URL
   */
  remoteEmbedUrl?: string;
  /**
   * API key for remote embedding service (e.g. OpenAI, Voyage AI, Jina AI, Cohere).
   * When set, sent as Authorization: Bearer header with embedding requests.
   * Also disables local fallback: a configured cloud provider is authoritative.
   * Env: CLAWMEM_EMBED_API_KEY
   */
  remoteEmbedApiKey?: string;
  /**
   * Model name to send with embedding requests (e.g. "text-embedding-3-small",
   * "voyage-4-large", "jina-embeddings-v3", "embed-v4.0").
   * Defaults to "embedding" (llama-server convention).
   * Env: CLAWMEM_EMBED_MODEL
   */
  remoteEmbedModel?: string;
  /**
   * Remote LLM server URL for text generation (e.g. http://localhost:8089).
   * When set, generate() calls /v1/chat/completions instead of local node-llama-cpp.
   */
  remoteLlmUrl?: string;
  /**
   * Inactivity timeout in ms before unloading contexts (default: 2 minutes, 0 to disable).
   *
   * Per node-llama-cpp lifecycle guidance, we prefer keeping models loaded and only disposing
   * contexts when idle, since contexts (and their sequences) are the heavy per-session objects.
   * @see https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
   */
  inactivityTimeoutMs?: number;
  /**
   * Whether to dispose models on inactivity (default: false).
   *
   * Keeping models loaded avoids repeated VRAM thrash; set to true only if you need aggressive
   * memory reclaim.
   */
  disposeModelsOnInactivity?: boolean;
};

// Default inactivity timeout: 2 minutes (see LlamaCppConfig.inactivityTimeoutMs)
const DEFAULT_INACTIVITY_TIMEOUT_MS = 2 * 60 * 1000;

/**
 * LLM implementation using node-llama-cpp
 */
262
+
263
export class LlamaCpp implements LLM {
  // Lazily-created node-llama-cpp runtime and model/context handles.
  // Lifecycle: model → context (contexts are disposed on inactivity, models kept).
  private llama: Llama | null = null;
  private embedModel: LlamaModel | null = null;
  private embedContext: LlamaEmbeddingContext | null = null;
  private generateModel: LlamaModel | null = null;
  private rerankModel: LlamaModel | null = null;
  private rerankContext: Awaited<ReturnType<LlamaModel["createRankingContext"]>> | null = null;

  // Resolved configuration (see LlamaCppConfig for semantics).
  private embedModelUri: string;
  private generateModelUri: string;
  private rerankModelUri: string;
  private modelCacheDir: string;
  private remoteEmbedUrl: string | null;
  private remoteEmbedApiKey: string | null;
  private remoteEmbedModel: string;
  private remoteLlmUrl: string | null;

  // Ensure we don't load the same model concurrently (which can allocate duplicate VRAM).
  private embedModelLoadPromise: Promise<LlamaModel> | null = null;
  private generateModelLoadPromise: Promise<LlamaModel> | null = null;
  private rerankModelLoadPromise: Promise<LlamaModel> | null = null;

  // Inactivity timer for auto-unloading contexts (and optionally models).
  private inactivityTimer: ReturnType<typeof setTimeout> | null = null;
  private inactivityTimeoutMs: number;
  private disposeModelsOnInactivity: boolean;

  // Track disposal state to prevent double-dispose
  private disposed = false;
294
/**
 * Build a LlamaCpp backend from optional config; unset fields fall back to
 * bundled defaults / null (meaning "use in-process fallback").
 */
constructor(config: LlamaCppConfig = {}) {
  // Model selection and cache location.
  this.embedModelUri = config.embedModel || DEFAULT_EMBED_MODEL;
  this.rerankModelUri = config.rerankModel || DEFAULT_RERANK_MODEL;
  this.generateModelUri = config.generateModel || DEFAULT_GENERATE_MODEL;
  this.modelCacheDir = config.modelCacheDir || MODEL_CACHE_DIR;

  // Remote endpoints (null → local node-llama-cpp path).
  this.remoteEmbedUrl = config.remoteEmbedUrl || null;
  this.remoteEmbedApiKey = config.remoteEmbedApiKey || null;
  this.remoteEmbedModel = config.remoteEmbedModel || "embedding";
  this.remoteLlmUrl = config.remoteLlmUrl || null;

  // Idle-resource policy (?? so an explicit 0 disables the timer).
  this.inactivityTimeoutMs = config.inactivityTimeoutMs ?? DEFAULT_INACTIVITY_TIMEOUT_MS;
  this.disposeModelsOnInactivity = config.disposeModelsOnInactivity ?? false;
}
306
+
307
+ /**
308
+ * Reset the inactivity timer. Called after each model operation.
309
+ * When timer fires, models are unloaded to free memory.
310
+ */
311
+ private touchActivity(): void {
312
+ // Clear existing timer
313
+ if (this.inactivityTimer) {
314
+ clearTimeout(this.inactivityTimer);
315
+ this.inactivityTimer = null;
316
+ }
317
+
318
+ // Only set timer if we have disposable contexts and timeout is enabled
319
+ if (this.inactivityTimeoutMs > 0 && this.hasLoadedContexts()) {
320
+ this.inactivityTimer = setTimeout(() => {
321
+ this.unloadIdleResources().catch(err => {
322
+ console.error("Error unloading idle resources:", err);
323
+ });
324
+ }, this.inactivityTimeoutMs);
325
+ // Don't keep process alive just for this timer
326
+ this.inactivityTimer.unref();
327
+ }
328
+ }
329
+
330
+ /**
331
+ * Check if any contexts are currently loaded (and therefore worth unloading on inactivity).
332
+ */
333
+ private hasLoadedContexts(): boolean {
334
+ return !!this.embedContext || !!this.rerankContext;
335
+ }
336
+
337
/**
 * Unload idle resources but keep the instance alive for future use.
 *
 * By default, this disposes contexts (and their dependent sequences), while keeping models loaded.
 * This matches the intended lifecycle: model → context → sequence, where contexts are per-session.
 * Safe to call repeatedly; a no-op after dispose().
 */
async unloadIdleResources(): Promise<void> {
  // Don't unload if already disposed
  if (this.disposed) {
    return;
  }

  // Clear timer so a pending callback doesn't fire mid-unload.
  if (this.inactivityTimer) {
    clearTimeout(this.inactivityTimer);
    this.inactivityTimer = null;
  }

  // Dispose contexts first — they depend on their models.
  if (this.embedContext) {
    await this.embedContext.dispose();
    this.embedContext = null;
  }
  if (this.rerankContext) {
    await this.rerankContext.dispose();
    this.rerankContext = null;
  }

  // Optionally dispose models too (opt-in via disposeModelsOnInactivity)
  if (this.disposeModelsOnInactivity) {
    if (this.embedModel) {
      await this.embedModel.dispose();
      this.embedModel = null;
    }
    if (this.generateModel) {
      await this.generateModel.dispose();
      this.generateModel = null;
    }
    if (this.rerankModel) {
      await this.rerankModel.dispose();
      this.rerankModel = null;
    }
    // Reset load promises so models can be reloaded later
    this.embedModelLoadPromise = null;
    this.generateModelLoadPromise = null;
    this.rerankModelLoadPromise = null;
  }

  // Note: We keep llama instance alive - it's lightweight
}
387
+
388
+ /**
389
+ * Ensure model cache directory exists
390
+ */
391
+ private ensureModelCacheDir(): void {
392
+ if (!existsSync(this.modelCacheDir)) {
393
+ mkdirSync(this.modelCacheDir, { recursive: true });
394
+ }
395
+ }
396
+
397
+ /**
398
+ * Initialize the llama instance (lazy)
399
+ */
400
+ private async ensureLlama(): Promise<Llama> {
401
+ if (!this.llama) {
402
+ const { getLlama, LlamaLogLevel } = await getNodeLlamaCpp();
403
+ this.llama = await getLlama({ logLevel: LlamaLogLevel.error });
404
+ }
405
+ return this.llama;
406
+ }
407
+
408
+ /**
409
+ * Resolve a model URI to a local path, downloading if needed.
410
+ * Set CLAWMEM_NO_LOCAL_MODELS=true to prevent auto-downloads (GPU-only mode).
411
+ */
412
+ private async resolveModel(modelUri: string): Promise<string> {
413
+ if (process.env.CLAWMEM_NO_LOCAL_MODELS === "true") {
414
+ throw new Error(`Local model download blocked (CLAWMEM_NO_LOCAL_MODELS=true). Model: ${modelUri}. Set CLAWMEM_EMBED_URL / CLAWMEM_LLM_URL / CLAWMEM_RERANK_URL to use GPU endpoints.`);
415
+ }
416
+ this.ensureModelCacheDir();
417
+ const { resolveModelFile } = await getNodeLlamaCpp();
418
+ return await resolveModelFile(modelUri, this.modelCacheDir);
419
+ }
420
+
421
/**
 * Load embedding model (lazy) — used for in-process CPU fallback when no remote embed server.
 * Auto-downloads EmbeddingGemma-300M from HuggingFace on first use (~300MB).
 */
private async ensureEmbedModel(): Promise<LlamaModel> {
  if (this.embedModel) {
    return this.embedModel;
  }
  // A load is already in flight — piggyback on it instead of allocating twice.
  if (this.embedModelLoadPromise) {
    return await this.embedModelLoadPromise;
  }

  this.embedModelLoadPromise = (async () => {
    const llama = await this.ensureLlama();
    const modelPath = await this.resolveModel(this.embedModelUri);
    const model = await llama.loadModel({ modelPath });
    this.embedModel = model;
    this.touchActivity();
    return model;
  })();

  try {
    return await this.embedModelLoadPromise;
  } finally {
    // Clear the in-flight marker on success AND failure so a failed
    // download can be retried on a later call.
    this.embedModelLoadPromise = null;
  }
}
448
+
449
/**
 * Get or create a single embedding context (lazy).
 *
 * NOTE(review): unlike model loading, there is no in-flight promise guard here —
 * two concurrent first calls could each create a context, leaking one. Confirm
 * callers serialize embedding before relying on this.
 */
private async ensureEmbedContext(): Promise<LlamaEmbeddingContext> {
  if (this.embedContext) {
    this.touchActivity();
    return this.embedContext;
  }
  const model = await this.ensureEmbedModel();
  this.embedContext = await model.createEmbeddingContext();
  this.touchActivity();
  return this.embedContext;
}
462
+
463
+ /**
464
+ * Load generation model (lazy) - context is created fresh per call
465
+ */
466
+ private async ensureGenerateModel(): Promise<LlamaModel> {
467
+ if (!this.generateModel) {
468
+ if (this.generateModelLoadPromise) {
469
+ return await this.generateModelLoadPromise;
470
+ }
471
+
472
+ this.generateModelLoadPromise = (async () => {
473
+ const llama = await this.ensureLlama();
474
+ const modelPath = await this.resolveModel(this.generateModelUri);
475
+ const model = await llama.loadModel({ modelPath });
476
+ this.generateModel = model;
477
+ return model;
478
+ })();
479
+
480
+ try {
481
+ await this.generateModelLoadPromise;
482
+ } finally {
483
+ this.generateModelLoadPromise = null;
484
+ }
485
+ }
486
+ this.touchActivity();
487
+ if (!this.generateModel) {
488
+ throw new Error("Generate model not loaded");
489
+ }
490
+ return this.generateModel;
491
+ }
492
+
493
/**
 * Load rerank model (lazy)
 */
private async ensureRerankModel(): Promise<LlamaModel> {
  if (this.rerankModel) {
    return this.rerankModel;
  }
  // A load is already in flight — piggyback on it instead of allocating twice.
  if (this.rerankModelLoadPromise) {
    return await this.rerankModelLoadPromise;
  }

  this.rerankModelLoadPromise = (async () => {
    const llama = await this.ensureLlama();
    const modelPath = await this.resolveModel(this.rerankModelUri);
    const model = await llama.loadModel({ modelPath });
    this.rerankModel = model;
    return model;
  })();

  try {
    return await this.rerankModelLoadPromise;
  } finally {
    // Clear on success and failure so a failed download can be retried.
    this.rerankModelLoadPromise = null;
  }
}
518
+
519
/**
 * Load rerank context (lazy). Context can be disposed and recreated without reloading the model.
 *
 * NOTE(review): no in-flight guard — two concurrent first calls could each create
 * a ranking context and leak one; confirm callers serialize reranks.
 */
private async ensureRerankContext(): Promise<Awaited<ReturnType<LlamaModel["createRankingContext"]>>> {
  if (!this.rerankContext) {
    const model = await this.ensureRerankModel();
    this.rerankContext = await model.createRankingContext();
  }
  this.touchActivity();
  return this.rerankContext;
}
530
+
531
+ // ==========================================================================
532
+ // Tokenization
533
+ // ==========================================================================
534
+
535
+ /**
536
+ * Tokenize text using the generate model's tokenizer
537
+ * Returns tokenizer tokens (opaque type from node-llama-cpp)
538
+ */
539
+ async tokenize(text: string): Promise<readonly LlamaToken[]> {
540
+ const model = await this.ensureGenerateModel();
541
+ return model.tokenize(text);
542
+ }
543
+
544
+ /**
545
+ * Count tokens in text using the generate model's tokenizer
546
+ */
547
+ async countTokens(text: string): Promise<number> {
548
+ const tokens = await this.tokenize(text);
549
+ return tokens.length;
550
+ }
551
+
552
+ /**
553
+ * Detokenize token IDs back to text
554
+ */
555
+ async detokenize(tokens: readonly LlamaToken[]): Promise<string> {
556
+ const model = await this.ensureGenerateModel();
557
+ return model.detokenize(tokens);
558
+ }
559
+
560
// ==========================================================================
// Core API methods
// ==========================================================================

/**
 * Embed a single text.
 *
 * Resolution order:
 *  1. Remote server / cloud API when remoteEmbedUrl is set.
 *  2. In-process node-llama-cpp fallback — but only when no API key is
 *     configured (a cloud provider is treated as authoritative; see below).
 * Returns null when no backend produced an embedding.
 * NOTE(review): options.model and options.title are currently unused here.
 */
async embed(text: string, options: EmbedOptions = {}): Promise<EmbeddingResult | null> {
  // Remote server or cloud API — preferred path
  if (this.remoteEmbedUrl) {
    const extraParams = this.getCloudEmbedParams(!!options.isQuery);
    const result = await this.embedRemote(text, extraParams);
    if (result) return result;
    // Cloud providers don't fall back — if API key is set, the user chose cloud
    if (this.isCloudEmbedding()) return null;
    // Local server unreachable — fall through to in-process fallback
    console.error("[embed] Remote server unreachable, falling back to in-process embedding");
  }

  // In-process fallback via node-llama-cpp (auto-downloads EmbeddingGemma on first use)
  return this.embedLocal(text);
}
579
+
580
/**
 * Batch embed multiple texts efficiently.
 * Remote: single HTTP request with up to 50 texts.
 * Local: sequential via node-llama-cpp embedding context.
 * Returns one result (or null on per-item failure) per input, in input order.
 */
async embedBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
  if (texts.length === 0) return [];

  // Remote server or cloud API
  if (this.remoteEmbedUrl) {
    const extraParams = this.getCloudEmbedParams(false);
    const results = await this.embedRemoteBatch(texts, extraParams);
    // If we got at least one result, remote is working
    if (results.some(r => r !== null)) return results;
    // Cloud providers don't fall back — an all-null result is returned as-is
    if (this.isCloudEmbedding()) return results;
    // Local server unreachable — fall through to in-process fallback
    console.error("[embed] Remote server unreachable, falling back to in-process embedding");
  }

  // In-process fallback via node-llama-cpp
  return this.embedLocalBatch(texts);
}
603
+
604
+ /** In-process embedding via node-llama-cpp with truncation guard */
605
+ private async embedLocal(text: string): Promise<EmbeddingResult | null> {
606
+ try {
607
+ const context = await this.ensureEmbedContext();
608
+ const safeText = this.truncateForLocalEmbed(text);
609
+ const embedding = await context.getEmbeddingFor(safeText);
610
+ return {
611
+ embedding: Array.from(embedding.vector),
612
+ model: this.embedModelUri,
613
+ };
614
+ } catch (error) {
615
+ console.error("[embed] Local embedding error:", error);
616
+ return null;
617
+ }
618
+ }
619
+
620
+ /** In-process batch embedding via node-llama-cpp with truncation guard */
621
+ private async embedLocalBatch(texts: string[]): Promise<(EmbeddingResult | null)[]> {
622
+ try {
623
+ const context = await this.ensureEmbedContext();
624
+ const results: (EmbeddingResult | null)[] = [];
625
+ for (const text of texts) {
626
+ try {
627
+ const safeText = this.truncateForLocalEmbed(text);
628
+ const embedding = await context.getEmbeddingFor(safeText);
629
+ results.push({ embedding: Array.from(embedding.vector), model: this.embedModelUri });
630
+ } catch (err) {
631
+ console.error("[embed] Local batch embedding error:", err);
632
+ results.push(null);
633
+ }
634
+ }
635
+ return results;
636
+ } catch (error) {
637
+ console.error("[embed] Failed to initialize local embedding:", error);
638
+ return texts.map(() => null);
639
+ }
640
+ }
641
+
642
+ /** Truncate text to maxRemoteEmbedChars for local in-process embedding (prevents context overflow crash) */
643
+ private truncateForLocalEmbed(text: string): string {
644
+ if (text.length <= this.maxRemoteEmbedChars) return text;
645
+ return text.slice(0, this.maxRemoteEmbedChars);
646
+ }
647
+
648
+ // ---------- Remote embedding (GPU server or cloud API via /v1/embeddings) ----------
649
+
650
+ // Default: 6000 chars for EmbeddingGemma-300M (2048-token context).
651
+ // At ~3 chars/token (mixed code+prose), 6000 chars ≈ 2000 tokens — safely under 2048.
652
+ // Pure code tokenizes at ~2 chars/token (3000 tokens) but chunks are pre-split
653
+ // at 900 tokens so this only applies to the formatting wrapper.
654
+ // Override via CLAWMEM_EMBED_MAX_CHARS (e.g. 1100 for granite-278m, 512-token context).
655
+ // Cloud providers (API key set) skip truncation entirely.
656
+ private readonly maxRemoteEmbedChars: number =
657
+ parseInt(process.env.CLAWMEM_EMBED_MAX_CHARS || "6000", 10);
658
+
659
+ private isCloudEmbedding(): boolean {
660
+ return !!this.remoteEmbedApiKey;
661
+ }
662
+
663
/** Detect cloud provider from embed URL and return provider-specific request params */
private getCloudEmbedParams(isQuery: boolean): Record<string, unknown> {
  // Only cloud providers (API key set) get extra params; llama-server ignores them.
  if (!this.isCloudEmbedding() || !this.remoteEmbedUrl) return {};
  const url = this.remoteEmbedUrl.toLowerCase();
  if (url.includes("jina.ai")) {
    return { task: isQuery ? "retrieval.query" : "retrieval.passage", truncate: true };
  }
  if (url.includes("voyageai.com")) {
    return { input_type: isQuery ? "query" : "document" };
  }
  if (url.includes("cohere.")) {
    return { input_type: isQuery ? "search_query" : "search_document", truncate: "END" };
  }
  if (url.includes("openai.com")) {
    // Optional reduced dimensionality; NaN (unset or garbage env) fails the > 0 check.
    const dims = parseInt(process.env.CLAWMEM_EMBED_DIMENSIONS || "", 10);
    return dims > 0 ? { dimensions: dims } : {};
  }
  // Unknown provider: send no extra params.
  return {};
}
682
+
683
+ private getEmbedHeaders(): Record<string, string> {
684
+ const headers: Record<string, string> = { "Content-Type": "application/json" };
685
+ if (this.remoteEmbedApiKey) {
686
+ headers["Authorization"] = `Bearer ${this.remoteEmbedApiKey}`;
687
+ }
688
+ return headers;
689
+ }
690
+
691
+ private truncateForEmbed(text: string): string {
692
+ // Cloud providers handle their own context window limits
693
+ if (this.isCloudEmbedding()) return text;
694
+ return text.length > this.maxRemoteEmbedChars
695
+ ? text.slice(0, this.maxRemoteEmbedChars) : text;
696
+ }
697
+
698
+ /** Parse Retry-After header (seconds or HTTP-date) into milliseconds to wait */
699
+ private parseRetryAfter(resp: Response): number | null {
700
+ const header = resp.headers.get("retry-after");
701
+ if (!header) return null;
702
+ const secs = parseInt(header, 10);
703
+ if (!isNaN(secs)) return secs * 1000;
704
+ const date = Date.parse(header);
705
+ if (!isNaN(date)) return Math.max(0, date - Date.now());
706
+ return null;
707
+ }
708
+
709
+ /** Add ±25% jitter to a delay to prevent synchronized retries */
710
+ private jitter(delayMs: number): number {
711
+ return Math.floor(delayMs * (0.75 + Math.random() * 0.5));
712
+ }
713
+
714
+ private async embedRemote(text: string, extraParams: Record<string, unknown> = {}, retries = 5): Promise<EmbeddingResult | null> {
715
+ const input = this.truncateForEmbed(text);
716
+ for (let attempt = 0; attempt < retries; attempt++) {
717
+ try {
718
+ const body: Record<string, unknown> = { input, model: this.remoteEmbedModel, ...extraParams };
719
+ const resp = await fetch(`${this.remoteEmbedUrl}/v1/embeddings`, {
720
+ method: "POST",
721
+ headers: this.getEmbedHeaders(),
722
+ body: JSON.stringify(body),
723
+ });
724
+ if (resp.status === 429) {
725
+ const retryAfter = this.parseRetryAfter(resp);
726
+ const delay = retryAfter ?? Math.min(1000 * 2 ** attempt, 30000);
727
+ console.error(`Remote embed rate-limited, retry ${attempt + 1}/${retries} in ${this.jitter(delay)}ms`);
728
+ await new Promise(r => setTimeout(r, this.jitter(delay)));
729
+ continue;
730
+ }
731
+ if (!resp.ok) {
732
+ console.error(`Remote embed HTTP ${resp.status}: ${await resp.text()}`);
733
+ return null;
734
+ }
735
+ const data = await resp.json() as {
736
+ data: { embedding: number[] }[];
737
+ model?: string;
738
+ };
739
+ return {
740
+ embedding: data.data[0]!.embedding,
741
+ model: data.model || this.remoteEmbedUrl!,
742
+ };
743
+ } catch (error) {
744
+ console.error("Remote embed error:", error);
745
+ return null;
746
+ }
747
+ }
748
+ console.error("Remote embed: max retries exceeded (rate limit)");
749
+ return null;
750
+ }
751
+
752
/**
 * Token usage from the last successful batch embed call (for adaptive pacing).
 * Written by embedRemoteBatch from the response's usage field; 0 when the
 * server reported no usage.
 */
lastBatchTokens = 0;
754
+
755
/**
 * Batch embed via the remote /v1/embeddings endpoint.
 * Retries on HTTP 429 (Retry-After or exponential backoff, 60s cap); any other
 * failure returns an all-null array of the same length as the input.
 */
private async embedRemoteBatch(texts: string[], extraParams: Record<string, unknown> = {}, retries = 3): Promise<(EmbeddingResult | null)[]> {
  const truncated = texts.map(t => this.truncateForEmbed(t));
  for (let attempt = 0; attempt < retries; attempt++) {
    try {
      const body: Record<string, unknown> = { input: truncated, model: this.remoteEmbedModel, ...extraParams };
      const resp = await fetch(`${this.remoteEmbedUrl}/v1/embeddings`, {
        method: "POST",
        headers: this.getEmbedHeaders(),
        body: JSON.stringify(body),
      });
      if (resp.status === 429) {
        const retryAfter = this.parseRetryAfter(resp);
        const delay = retryAfter ?? Math.min(5000 * 2 ** attempt, 60000);
        const jittered = this.jitter(delay);
        console.error(`Remote batch embed rate-limited, retry ${attempt + 1}/${retries} in ${(jittered / 1000).toFixed(1)}s${retryAfter ? ` (Retry-After: ${Math.ceil(retryAfter / 1000)}s)` : ""}`);
        await new Promise(r => setTimeout(r, jittered));
        continue;
      }
      if (!resp.ok) {
        console.error(`Remote batch embed HTTP ${resp.status}: ${await resp.text()}`);
        return texts.map(() => null);
      }
      const data = await resp.json() as {
        data: { embedding: number[]; index: number }[];
        model?: string;
        usage?: { total_tokens?: number; prompt_tokens?: number };
      };
      // Record token usage so callers can pace subsequent batches.
      this.lastBatchTokens = data.usage?.total_tokens ?? data.usage?.prompt_tokens ?? 0;
      const modelName = data.model || this.remoteEmbedUrl!;
      // Place results by the server-provided index; positions the server
      // omitted stay null.
      const results: (EmbeddingResult | null)[] = new Array(texts.length).fill(null);
      for (const item of data.data) {
        results[item.index] = { embedding: item.embedding, model: modelName };
      }
      return results;
    } catch (error) {
      console.error("Remote batch embed error:", error);
      return texts.map(() => null);
    }
  }
  console.error("Remote batch embed: max retries exceeded (rate limit)");
  return texts.map(() => null);
}
797
+
798
/**
 * Generate a text completion.
 * Routes to the remote LLM server when configured; otherwise runs the local
 * generate model via node-llama-cpp, creating (and always disposing) a fresh
 * context per call. Defaults: 150 max tokens, temperature 0.
 */
async generate(prompt: string, options: GenerateOptions = {}): Promise<GenerateResult | null> {
  const maxTokens = options.maxTokens ?? 150;
  const temperature = options.temperature ?? 0;

  // Remote LLM server (GPU) — preferred path
  if (this.remoteLlmUrl) {
    return this.generateRemote(prompt, maxTokens, temperature, options.signal);
  }

  // Local fallback via node-llama-cpp (CPU)
  await this.ensureGenerateModel();

  const context = await this.generateModel!.createContext();
  const sequence = context.getSequence();
  const { LlamaChatSession } = await getNodeLlamaCpp();
  const session = new LlamaChatSession({ contextSequence: sequence });

  // Accumulate streamed chunks; with stopOnAbortSignal, an abort is expected to
  // end generation (leaving the partial text in `result`) — TODO confirm against
  // node-llama-cpp docs rather than rejecting the promise.
  let result = "";
  try {
    await session.prompt(prompt, {
      maxTokens,
      temperature,
      signal: options.signal,
      stopOnAbortSignal: true,
      onTextChunk: (text) => {
        result += text;
      },
    });

    return {
      text: result,
      model: this.generateModelUri,
      done: true,
    };
  } finally {
    // Always release the per-call context (and its sequence), even on error.
    await context.dispose();
  }
}
836
+
837
+ private async generateRemote(
838
+ prompt: string,
839
+ maxTokens: number,
840
+ temperature: number,
841
+ signal?: AbortSignal
842
+ ): Promise<GenerateResult | null> {
843
+ try {
844
+ const resp = await fetch(`${this.remoteLlmUrl}/v1/chat/completions`, {
845
+ method: "POST",
846
+ headers: { "Content-Type": "application/json" },
847
+ body: JSON.stringify({
848
+ model: "qwen3",
849
+ messages: [{ role: "user", content: `${prompt} /no_think` }],
850
+ max_tokens: maxTokens,
851
+ temperature,
852
+ }),
853
+ signal,
854
+ });
855
+
856
+ if (!resp.ok) {
857
+ console.error(`[generate] Remote LLM error: ${resp.status} ${resp.statusText}`);
858
+ return null;
859
+ }
860
+
861
+ const data = await resp.json() as {
862
+ choices: { message: { content: string } }[];
863
+ model?: string;
864
+ };
865
+
866
+ return {
867
+ text: data.choices[0]?.message?.content || "",
868
+ model: data.model || this.remoteLlmUrl!,
869
+ done: true,
870
+ };
871
+ } catch (error) {
872
+ console.error("[generate] Remote LLM error:", error);
873
+ return null;
874
+ }
875
+ }
876
+
877
+ private async expandQueryRemote(query: string, includeLexical: boolean, context?: string, intent?: string): Promise<Queryable[]> {
878
+ const prompt = `Rewrite this search query for better retrieval. Output lines in format "type: text" where type is lex, vec, or hyde.
879
+ - lex: keyword search terms (1-3 lines)
880
+ - vec: semantic search queries (1-3 lines)
881
+ - hyde: hypothetical document passage that answers the query (1 line)
882
+
883
+ Query: ${query}${intent ? `\nQuery intent: ${intent}` : ""}${context ? `\nContext: ${context}` : ""}
884
+
885
+ Output:`;
886
+
887
+ const result = await this.generateRemote(prompt, 500, 0.7);
888
+ if (!result?.text) {
889
+ const fallback: Queryable[] = [{ type: 'vec', text: query }];
890
+ if (includeLexical) fallback.unshift({ type: 'lex', text: query });
891
+ return fallback;
892
+ }
893
+
894
+ const lines = result.text.trim().split("\n");
895
+ const queryables: Queryable[] = lines.map(line => {
896
+ const colonIdx = line.indexOf(":");
897
+ if (colonIdx === -1) return null;
898
+ const type = line.slice(0, colonIdx).trim();
899
+ if (type !== 'lex' && type !== 'vec' && type !== 'hyde') return null;
900
+ const text = line.slice(colonIdx + 1).trim();
901
+ if (!text) return null;
902
+ return { type: type as QueryType, text };
903
+ }).filter((q): q is Queryable => q !== null);
904
+
905
+ if (queryables.length === 0) {
906
+ const fallback: Queryable[] = [{ type: 'vec', text: query }];
907
+ if (includeLexical) fallback.unshift({ type: 'lex', text: query });
908
+ return fallback;
909
+ }
910
+
911
+ if (!includeLexical) {
912
+ return queryables.filter(q => q.type !== 'lex');
913
+ }
914
+ return queryables;
915
+ }
916
+
917
+ async modelExists(modelUri: string): Promise<ModelInfo> {
918
+ // For HuggingFace URIs, we assume they exist
919
+ // For local paths, check if file exists
920
+ if (modelUri.startsWith("hf:")) {
921
+ return { name: modelUri, exists: true };
922
+ }
923
+
924
+ const exists = existsSync(modelUri);
925
+ return {
926
+ name: modelUri,
927
+ exists,
928
+ path: exists ? modelUri : undefined,
929
+ };
930
+ }
931
+
932
+ // ==========================================================================
933
+ // High-level abstractions
934
+ // ==========================================================================
935
+
936
+ async expandQuery(query: string, options: { context?: string, includeLexical?: boolean, intent?: string } = {}): Promise<Queryable[]> {
937
+ const includeLexical = options.includeLexical ?? true;
938
+ const context = options.context;
939
+ const intent = options.intent;
940
+
941
+ // Remote LLM path — no grammar constraint, parse output instead
942
+ if (this.remoteLlmUrl) {
943
+ return this.expandQueryRemote(query, includeLexical, context, intent);
944
+ }
945
+
946
+ const llama = await this.ensureLlama();
947
+ await this.ensureGenerateModel();
948
+
949
+ const grammar = await llama.createGrammar({
950
+ grammar: `
951
+ root ::= line+
952
+ line ::= type ": " content "\\n"
953
+ type ::= "lex" | "vec" | "hyde"
954
+ content ::= [^\\n]+
955
+ `
956
+ });
957
+
958
+ const prompt = `You are a search query optimization expert. Your task is to improve retrieval by rewriting queries and generating hypothetical documents.
959
+
960
+ Original Query: ${query}
961
+ ${intent ? `\nQuery intent: ${intent}` : ""}
962
+ ${context ? `Additional Context, ONLY USE IF RELEVANT:\n\n<context>${context}</context>` : ""}
963
+
964
+ ## Step 1: Query Analysis
965
+ Identify entities, search intent, and missing context.
966
+
967
+ ## Step 2: Generate Hypothetical Document
968
+ Write a focused sentence passage that would answer the query. Include specific terminology and domain vocabulary.
969
+
970
+ ## Step 3: Query Rewrites
971
+ Generate 2-3 alternative search queries that resolve ambiguities. Use terminology from the hypothetical document.
972
+
973
+ ## Step 4: Final Retrieval Text
974
+ Output exactly 1-3 'lex' lines, 1-3 'vec' lines, and MAX ONE 'hyde' line.
975
+
976
+ <format>
977
+ lex: {single search term}
978
+ vec: {single vector query}
979
+ hyde: {complete hypothetical document passage from Step 2 on a SINGLE LINE}
980
+ </format>
981
+
982
+ <example>
983
+ Example (FOR FORMAT ONLY - DO NOT COPY THIS CONTENT):
984
+ lex: example keyword 1
985
+ lex: example keyword 2
986
+ vec: example semantic query
987
+ hyde: This is an example of a hypothetical document passage that would answer the example query. It contains multiple sentences and relevant vocabulary.
988
+ </example>
989
+
990
+ <rules>
991
+ - DO NOT repeat the same line.
992
+ - Each 'lex:' line MUST be a different keyword variation based on the ORIGINAL QUERY.
993
+ - Each 'vec:' line MUST be a different semantic variation based on the ORIGINAL QUERY.
994
+ - The 'hyde:' line MUST be the full sentence passage from Step 2, but all on one line.
995
+ - DO NOT use the example content above.
996
+ ${!includeLexical ? "- Do NOT output any 'lex:' lines" : ""}
997
+ </rules>
998
+
999
+ Final Output:`;
1000
+
1001
+ // Create fresh context for each call
1002
+ const genContext = await this.generateModel!.createContext();
1003
+ const sequence = genContext.getSequence();
1004
+ const { LlamaChatSession } = await getNodeLlamaCpp();
1005
+ const session = new LlamaChatSession({ contextSequence: sequence });
1006
+
1007
+ try {
1008
+ const result = await session.prompt(prompt, {
1009
+ grammar,
1010
+ maxTokens: 1000,
1011
+ temperature: 1,
1012
+ });
1013
+
1014
+ const lines = result.trim().split("\n");
1015
+ const queryables: Queryable[] = lines.map(line => {
1016
+ const colonIdx = line.indexOf(":");
1017
+ if (colonIdx === -1) return null;
1018
+ const type = line.slice(0, colonIdx).trim();
1019
+ if (type !== 'lex' && type !== 'vec' && type !== 'hyde') return null;
1020
+ const text = line.slice(colonIdx + 1).trim();
1021
+ return { type: type as QueryType, text };
1022
+ }).filter((q): q is Queryable => q !== null);
1023
+
1024
+ // Filter out lex entries if not requested
1025
+ if (!includeLexical) {
1026
+ return queryables.filter(q => q.type !== 'lex');
1027
+ }
1028
+ return queryables;
1029
+ } catch (error) {
1030
+ console.error("Structured query expansion failed:", error);
1031
+ // Fallback to original query
1032
+ const fallback: Queryable[] = [{ type: 'vec', text: query }];
1033
+ if (includeLexical) fallback.unshift({ type: 'lex', text: query });
1034
+ return fallback;
1035
+ } finally {
1036
+ await genContext.dispose();
1037
+ }
1038
+ }
1039
+
1040
+ async rerank(
1041
+ query: string,
1042
+ documents: RerankDocument[],
1043
+ options: RerankOptions = {}
1044
+ ): Promise<RerankResult> {
1045
+ const context = await this.ensureRerankContext();
1046
+
1047
+ // Build a map from document text to original indices (for lookup after sorting)
1048
+ const textToDoc = new Map<string, { file: string; index: number }>();
1049
+ documents.forEach((doc, index) => {
1050
+ textToDoc.set(doc.text, { file: doc.file, index });
1051
+ });
1052
+
1053
+ // Extract just the text for ranking
1054
+ const texts = documents.map((doc) => doc.text);
1055
+
1056
+ // Use the proper ranking API - returns [{document: string, score: number}] sorted by score
1057
+ const ranked = await context.rankAndSort(query, texts);
1058
+
1059
+ // Map back to our result format using the text-to-doc map
1060
+ const results: RerankDocumentResult[] = ranked.map((item: { document: string; score: number }) => {
1061
+ const docInfo = textToDoc.get(item.document)!;
1062
+ return {
1063
+ file: docInfo.file,
1064
+ score: item.score,
1065
+ index: docInfo.index,
1066
+ };
1067
+ });
1068
+
1069
+ return {
1070
+ results,
1071
+ model: this.rerankModelUri,
1072
+ };
1073
+ }
1074
+
1075
+ async dispose(): Promise<void> {
1076
+ // Prevent double-dispose
1077
+ if (this.disposed) {
1078
+ return;
1079
+ }
1080
+ this.disposed = true;
1081
+
1082
+ // Clear inactivity timer
1083
+ if (this.inactivityTimer) {
1084
+ clearTimeout(this.inactivityTimer);
1085
+ this.inactivityTimer = null;
1086
+ }
1087
+
1088
+ // Disposing llama cascades to models and contexts automatically
1089
+ // See: https://node-llama-cpp.withcat.ai/guide/objects-lifecycle
1090
+ // Note: llama.dispose() can hang indefinitely, so we use a timeout
1091
+ if (this.llama) {
1092
+ const disposePromise = this.llama.dispose();
1093
+ const timeoutPromise = new Promise<void>((resolve) => setTimeout(resolve, 1000));
1094
+ await Promise.race([disposePromise, timeoutPromise]);
1095
+ }
1096
+
1097
+ // Clear references
1098
+ this.embedContext = null;
1099
+ this.embedModel = null;
1100
+ this.rerankContext = null;
1101
+ this.generateModel = null;
1102
+ this.rerankModel = null;
1103
+ this.llama = null;
1104
+
1105
+ // Clear any in-flight load promises
1106
+ this.embedModelLoadPromise = null;
1107
+ this.generateModelLoadPromise = null;
1108
+ this.rerankModelLoadPromise = null;
1109
+ }
1110
+ }
1111
+
1112
// =============================================================================
// Singleton for default LlamaCpp instance
// =============================================================================

// Lazily-created shared instance; managed by getDefaultLlamaCpp(),
// setDefaultLlamaCpp(), and disposeDefaultLlamaCpp() below.
let defaultLlamaCpp: LlamaCpp | null = null;

/**
 * Get the default LlamaCpp instance (creates one if needed).
 * Reads CLAWMEM_EMBED_URL, CLAWMEM_EMBED_API_KEY, CLAWMEM_EMBED_MODEL env vars.
 *
 * Cloud embedding providers (set CLAWMEM_EMBED_API_KEY + CLAWMEM_EMBED_URL):
 *   OpenAI: CLAWMEM_EMBED_URL=https://api.openai.com CLAWMEM_EMBED_MODEL=text-embedding-3-small
 *   Voyage: CLAWMEM_EMBED_URL=https://api.voyageai.com CLAWMEM_EMBED_MODEL=voyage-4-large
 *   Jina: CLAWMEM_EMBED_URL=https://api.jina.ai CLAWMEM_EMBED_MODEL=jina-embeddings-v3
 *   Cohere: CLAWMEM_EMBED_URL=https://api.cohere.com CLAWMEM_EMBED_MODEL=embed-v4.0
 */
// One-shot flag: the localhost API-key warning prints at most once per process.
let _apiKeyLocalhostWarned = false;
1129
+
1130
+ export function getDefaultLlamaCpp(): LlamaCpp {
1131
+ if (!defaultLlamaCpp) {
1132
+ const embedUrl = process.env.CLAWMEM_EMBED_URL || undefined;
1133
+ const embedApiKey = process.env.CLAWMEM_EMBED_API_KEY || undefined;
1134
+
1135
+ // Warn once if API key is set but URL points to localhost
1136
+ if (embedApiKey && embedUrl && !_apiKeyLocalhostWarned) {
1137
+ const lower = embedUrl.toLowerCase();
1138
+ if (lower.includes("localhost") || lower.includes("127.0.0.1")) {
1139
+ console.warn(
1140
+ "[clawmem] Warning: CLAWMEM_EMBED_API_KEY is set but CLAWMEM_EMBED_URL points to " +
1141
+ `${embedUrl}. API key will be sent as Bearer token to local server. ` +
1142
+ "If this is intentional (local gateway), ignore this warning."
1143
+ );
1144
+ _apiKeyLocalhostWarned = true;
1145
+ }
1146
+ }
1147
+
1148
+ defaultLlamaCpp = new LlamaCpp({
1149
+ remoteEmbedUrl: embedUrl,
1150
+ remoteEmbedApiKey: embedApiKey,
1151
+ remoteEmbedModel: process.env.CLAWMEM_EMBED_MODEL || undefined,
1152
+ remoteLlmUrl: process.env.CLAWMEM_LLM_URL || undefined,
1153
+ });
1154
+ }
1155
+ return defaultLlamaCpp;
1156
+ }
1157
+
1158
/**
 * Set a custom default LlamaCpp instance (useful for testing).
 * Pass null to clear the singleton so getDefaultLlamaCpp() rebuilds it.
 * Note: does NOT dispose any previous instance — callers own that.
 */
export function setDefaultLlamaCpp(llm: LlamaCpp | null): void {
  defaultLlamaCpp = llm;
}
1164
+
1165
+ /**
1166
+ * Dispose the default LlamaCpp instance if it exists.
1167
+ * Call this before process exit to prevent NAPI crashes.
1168
+ */
1169
+ export async function disposeDefaultLlamaCpp(): Promise<void> {
1170
+ if (defaultLlamaCpp) {
1171
+ await defaultLlamaCpp.dispose();
1172
+ defaultLlamaCpp = null;
1173
+ }
1174
+ }
1175
+