@vellumai/assistant 0.3.2 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. package/README.md +82 -21
  2. package/package.json +1 -1
  3. package/src/__tests__/__snapshots__/ipc-snapshot.test.ts.snap +16 -0
  4. package/src/__tests__/app-git-history.test.ts +22 -27
  5. package/src/__tests__/app-git-service.test.ts +44 -78
  6. package/src/__tests__/call-orchestrator.test.ts +321 -0
  7. package/src/__tests__/channel-approval-routes.test.ts +1267 -93
  8. package/src/__tests__/channel-approval.test.ts +2 -0
  9. package/src/__tests__/channel-approvals.test.ts +51 -2
  10. package/src/__tests__/channel-delivery-store.test.ts +130 -1
  11. package/src/__tests__/channel-guardian.test.ts +371 -1
  12. package/src/__tests__/config-schema.test.ts +1 -1
  13. package/src/__tests__/credential-security-invariants.test.ts +1 -0
  14. package/src/__tests__/daemon-lifecycle.test.ts +635 -0
  15. package/src/__tests__/daemon-server-session-init.test.ts +5 -0
  16. package/src/__tests__/gateway-only-enforcement.test.ts +106 -21
  17. package/src/__tests__/handlers-telegram-config.test.ts +82 -0
  18. package/src/__tests__/handlers-twilio-config.test.ts +738 -5
  19. package/src/__tests__/ingress-url-consistency.test.ts +64 -0
  20. package/src/__tests__/ipc-snapshot.test.ts +10 -0
  21. package/src/__tests__/run-orchestrator.test.ts +1 -1
  22. package/src/__tests__/secret-scanner.test.ts +223 -0
  23. package/src/__tests__/session-process-bridge.test.ts +2 -0
  24. package/src/__tests__/shell-parser-property.test.ts +357 -2
  25. package/src/__tests__/system-prompt.test.ts +25 -1
  26. package/src/__tests__/tool-executor-lifecycle-events.test.ts +34 -1
  27. package/src/__tests__/tool-permission-simulate-handler.test.ts +2 -2
  28. package/src/__tests__/user-reference.test.ts +68 -0
  29. package/src/calls/call-orchestrator.ts +63 -11
  30. package/src/calls/twilio-config.ts +10 -1
  31. package/src/calls/twilio-rest.ts +70 -0
  32. package/src/cli/map.ts +6 -0
  33. package/src/commands/__tests__/cc-command-registry.test.ts +67 -0
  34. package/src/commands/cc-command-registry.ts +14 -1
  35. package/src/config/bundled-skills/claude-code/TOOLS.json +10 -3
  36. package/src/config/bundled-skills/email-setup/SKILL.md +56 -0
  37. package/src/config/bundled-skills/messaging/SKILL.md +4 -0
  38. package/src/config/bundled-skills/subagent/SKILL.md +4 -0
  39. package/src/config/bundled-skills/subagent/TOOLS.json +4 -0
  40. package/src/config/defaults.ts +1 -1
  41. package/src/config/schema.ts +6 -3
  42. package/src/config/skills.ts +5 -32
  43. package/src/config/system-prompt.ts +16 -0
  44. package/src/config/user-reference.ts +29 -0
  45. package/src/config/vellum-skills/catalog.json +52 -0
  46. package/src/config/vellum-skills/telegram-setup/SKILL.md +6 -1
  47. package/src/config/vellum-skills/twilio-setup/SKILL.md +49 -4
  48. package/src/daemon/auth-manager.ts +103 -0
  49. package/src/daemon/computer-use-session.ts +8 -1
  50. package/src/daemon/config-watcher.ts +253 -0
  51. package/src/daemon/handlers/config.ts +193 -17
  52. package/src/daemon/handlers/sessions.ts +5 -3
  53. package/src/daemon/handlers/skills.ts +60 -17
  54. package/src/daemon/ipc-contract-inventory.json +4 -0
  55. package/src/daemon/ipc-contract.ts +16 -0
  56. package/src/daemon/ipc-handler.ts +87 -0
  57. package/src/daemon/lifecycle.ts +16 -4
  58. package/src/daemon/ride-shotgun-handler.ts +11 -1
  59. package/src/daemon/server.ts +105 -502
  60. package/src/daemon/session-agent-loop.ts +9 -14
  61. package/src/daemon/session-process.ts +20 -3
  62. package/src/daemon/session-runtime-assembly.ts +60 -44
  63. package/src/daemon/session-slash.ts +50 -2
  64. package/src/daemon/session-surfaces.ts +17 -1
  65. package/src/daemon/session.ts +8 -1
  66. package/src/inbound/public-ingress-urls.ts +20 -3
  67. package/src/index.ts +1 -23
  68. package/src/memory/app-git-service.ts +24 -0
  69. package/src/memory/app-store.ts +0 -21
  70. package/src/memory/channel-delivery-store.ts +74 -3
  71. package/src/memory/channel-guardian-store.ts +54 -26
  72. package/src/memory/conversation-key-store.ts +20 -0
  73. package/src/memory/conversation-store.ts +14 -2
  74. package/src/memory/db-connection.ts +28 -0
  75. package/src/memory/db-init.ts +1019 -0
  76. package/src/memory/db.ts +2 -1995
  77. package/src/memory/embedding-backend.ts +79 -11
  78. package/src/memory/indexer.ts +2 -0
  79. package/src/memory/job-utils.ts +64 -4
  80. package/src/memory/jobs-worker.ts +7 -1
  81. package/src/memory/recall-cache.ts +107 -0
  82. package/src/memory/retriever.ts +30 -1
  83. package/src/memory/schema-migration.ts +984 -0
  84. package/src/memory/schema.ts +6 -0
  85. package/src/memory/search/types.ts +2 -0
  86. package/src/permissions/prompter.ts +14 -3
  87. package/src/permissions/trust-store.ts +7 -0
  88. package/src/runtime/channel-approvals.ts +17 -3
  89. package/src/runtime/gateway-client.ts +2 -1
  90. package/src/runtime/http-server.ts +28 -9
  91. package/src/runtime/routes/channel-routes.ts +279 -100
  92. package/src/runtime/routes/run-routes.ts +7 -1
  93. package/src/runtime/run-orchestrator.ts +8 -1
  94. package/src/security/secret-scanner.ts +218 -0
  95. package/src/skills/clawhub.ts +6 -2
  96. package/src/skills/frontmatter.ts +63 -0
  97. package/src/skills/slash-commands.ts +23 -0
  98. package/src/skills/vellum-catalog-remote.ts +107 -0
  99. package/src/subagent/manager.ts +4 -1
  100. package/src/subagent/types.ts +2 -0
  101. package/src/tools/browser/auto-navigate.ts +132 -24
  102. package/src/tools/browser/browser-manager.ts +67 -61
  103. package/src/tools/claude-code/claude-code.ts +55 -3
  104. package/src/tools/executor.ts +10 -2
  105. package/src/tools/skills/vellum-catalog.ts +75 -127
  106. package/src/tools/subagent/spawn.ts +2 -0
  107. package/src/tools/terminal/parser.ts +21 -5
  108. package/src/util/platform.ts +8 -1
  109. package/src/util/retry.ts +4 -4
@@ -1,3 +1,4 @@
1
+ import { createHash } from 'node:crypto';
1
2
  import type { AssistantConfig } from '../config/types.js';
2
3
  import { getLogger } from '../util/logger.js';
3
4
  import { GeminiEmbeddingBackend } from './embedding-gemini.js';
@@ -10,9 +11,41 @@ const log = getLogger('memory-embeddings');
10
11
  /** Global cache of embedding backend instances, keyed by "provider:model". */
11
12
  const backendCache = new Map<string, EmbeddingBackend>();
12
13
 
13
- /** Clear cached embedding backends so new instances pick up fresh credentials. */
14
+ // ── In-memory embedding vector cache ──────────────────────────────
15
+ // LRU cache keyed by sha256(provider + model + text) → embedding vector.
16
+ // Avoids redundant API calls / local compute for identical content.
17
+ const VECTOR_CACHE_MAX_ENTRIES = 4096;
18
+ const vectorCache = new Map<string, number[]>();
19
+
20
+ function vectorCacheKey(provider: string, model: string, text: string): string {
21
+ return createHash('sha256').update(`${provider}\0${model}\0${text}`).digest('hex');
22
+ }
23
+
24
+ function getFromVectorCache(provider: string, model: string, text: string): number[] | undefined {
25
+ const key = vectorCacheKey(provider, model, text);
26
+ const v = vectorCache.get(key);
27
+ if (v !== undefined) {
28
+ // LRU refresh: move to end of insertion order
29
+ vectorCache.delete(key);
30
+ vectorCache.set(key, v);
31
+ }
32
+ return v;
33
+ }
34
+
35
+ function putInVectorCache(provider: string, model: string, text: string, vector: number[]): void {
36
+ const key = vectorCacheKey(provider, model, text);
37
+ vectorCache.delete(key);
38
+ if (vectorCache.size >= VECTOR_CACHE_MAX_ENTRIES) {
39
+ const oldest = vectorCache.keys().next().value;
40
+ if (oldest !== undefined) vectorCache.delete(oldest);
41
+ }
42
+ vectorCache.set(key, vector);
43
+ }
44
+
45
+ /** Clear cached embedding backends and the in-memory vector cache. */
14
46
  export function clearEmbeddingBackendCache(): void {
15
47
  backendCache.clear();
48
+ vectorCache.clear();
16
49
  }
17
50
 
18
51
  function cacheKey(provider: string, model: string): string {
@@ -153,22 +186,44 @@ export async function embedWithBackend(
153
186
  throw new Error(selection.reason ?? 'No memory embedding backend configured');
154
187
  }
155
188
 
156
- // In auto mode, build a fallback list of backends to try
157
- const backends: EmbeddingBackend[] = [selection.backend];
158
- if (config.memory.embeddings.provider === 'auto' && selection.backend.provider === 'local') {
159
- for (const fallback of selectFallbackBackends(config, 'local')) {
160
- backends.push(fallback);
161
- }
189
+ const expectedDim = config.memory.qdrant.vectorSize;
190
+ const { provider: primaryProvider, model: primaryModel } = selection.backend;
191
+
192
+ // ── Build fallback backends list (needed for embed fallback) ──
193
+ const fallbacks: EmbeddingBackend[] =
194
+ config.memory.embeddings.provider === 'auto' && selection.backend.provider === 'local'
195
+ ? selectFallbackBackends(config, 'local')
196
+ : [];
197
+
198
+ // ── In-memory cache check (primary provider only) ──────────────
199
+ const cached: (number[] | null)[] = texts.map(t => {
200
+ const v = getFromVectorCache(primaryProvider, primaryModel, t);
201
+ if (v && v.length === expectedDim) return v;
202
+ return null;
203
+ });
204
+ const uncachedIndices: number[] = [];
205
+ for (let i = 0; i < cached.length; i++) {
206
+ if (!cached[i]) uncachedIndices.push(i);
207
+ }
208
+ if (uncachedIndices.length === 0) {
209
+ return { provider: primaryProvider, model: primaryModel, vectors: cached as number[][] };
162
210
  }
163
211
 
212
+ // ── Embed uncached texts ────────────────────────────────────────
213
+ const backends: EmbeddingBackend[] = [selection.backend, ...fallbacks];
214
+
164
215
  let lastErr: unknown;
165
216
  for (const backend of backends) {
217
+ const isPrimary = backend === selection.backend;
218
+ // For the primary backend, only embed uncached texts and merge with cached.
219
+ // For fallback backends, embed ALL texts since the cache was keyed to the primary.
220
+ const textsToEmbed = isPrimary ? uncachedIndices.map(i => texts[i]) : texts;
221
+
166
222
  try {
167
- const vectors = await backend.embed(texts, options);
168
- if (vectors.length !== texts.length) {
169
- throw new Error(`Embedding backend returned ${vectors.length} vectors for ${texts.length} texts`);
223
+ const vectors = await backend.embed(textsToEmbed, options);
224
+ if (vectors.length !== textsToEmbed.length) {
225
+ throw new Error(`Embedding backend returned ${vectors.length} vectors for ${textsToEmbed.length} texts`);
170
226
  }
171
- const expectedDim = config.memory.qdrant.vectorSize;
172
227
  for (const vec of vectors) {
173
228
  if (vec.length !== expectedDim) {
174
229
  throw new Error(
@@ -176,6 +231,19 @@ export async function embedWithBackend(
176
231
  );
177
232
  }
178
233
  }
234
+
235
+ // Populate cache with freshly embedded vectors
236
+ for (let i = 0; i < textsToEmbed.length; i++) {
237
+ putInVectorCache(backend.provider, backend.model, textsToEmbed[i], vectors[i]);
238
+ }
239
+
240
+ if (isPrimary) {
241
+ const merged = [...cached] as number[][];
242
+ for (let i = 0; i < uncachedIndices.length; i++) {
243
+ merged[uncachedIndices[i]] = vectors[i];
244
+ }
245
+ return { provider: backend.provider, model: backend.model, vectors: merged };
246
+ }
179
247
  return { provider: backend.provider, model: backend.model, vectors };
180
248
  } catch (err) {
181
249
  lastErr = err;
@@ -7,6 +7,7 @@ import { getDb } from './db.js';
7
7
  import { enqueueMemoryJob, enqueueResolvePendingConflictsForMessageJob } from './jobs-store.js';
8
8
  import { extractTextFromStoredMessageContent } from './message-content.js';
9
9
  import { segmentText } from './segmenter.js';
10
+ import { bumpMemoryVersion } from './recall-cache.js';
10
11
  import { memorySegments } from './schema.js';
11
12
 
12
13
  const log = getLogger('memory-indexer');
@@ -108,6 +109,7 @@ export function indexMessageNow(
108
109
  log.debug(`Skipped ${skippedEmbedJobs}/${segments.length} embed_segment jobs (content unchanged)`);
109
110
  }
110
111
 
112
+ bumpMemoryVersion();
111
113
  enqueueSummaryRollupJobsIfDue();
112
114
 
113
115
  const enqueuedJobs = (segments.length - skippedEmbedJobs) + (shouldExtract ? 2 : 1) + (shouldResolveConflicts ? 1 : 0);
@@ -1,6 +1,10 @@
1
+ import { createHash, randomUUID } from 'node:crypto';
2
+ import { eq, and } from 'drizzle-orm';
1
3
  import { getLogger } from '../util/logger.js';
2
4
  import { embedWithBackend, getMemoryBackendStatus } from './embedding-backend.js';
5
+ import { getDb } from './db.js';
3
6
  import { getQdrantClient } from './qdrant-client.js';
7
+ import { memoryEmbeddings } from './schema.js';
4
8
  import type { AssistantConfig } from '../config/types.js';
5
9
 
6
10
  const log = getLogger('memory-jobs-worker');
@@ -111,9 +115,66 @@ export async function embedAndUpsert(
111
115
  );
112
116
  }
113
117
 
114
- const embedded = await embedWithBackend(config, [text]);
115
- const vector = embedded.vectors[0];
116
- if (!vector) return;
118
+ const contentHash = createHash('sha256').update(text).digest('hex');
119
+ let provider = status.provider;
120
+ let model = status.model!;
121
+ let vector: number[];
122
+
123
+ // Check SQLite embedding cache for a matching content hash (primary provider only).
124
+ const db = getDb();
125
+ const expectedDim = config.memory.qdrant.vectorSize;
126
+ let cachedRow = db
127
+ .select({ vectorJson: memoryEmbeddings.vectorJson, dimensions: memoryEmbeddings.dimensions })
128
+ .from(memoryEmbeddings)
129
+ .where(
130
+ and(
131
+ eq(memoryEmbeddings.contentHash, contentHash),
132
+ eq(memoryEmbeddings.provider, provider),
133
+ eq(memoryEmbeddings.model, model),
134
+ ),
135
+ )
136
+ .get();
137
+ if (cachedRow && cachedRow.dimensions !== expectedDim) cachedRow = undefined;
138
+
139
+ if (cachedRow) {
140
+ vector = JSON.parse(cachedRow.vectorJson);
141
+ } else {
142
+ const embedded = await embedWithBackend(config, [text]);
143
+ vector = embedded.vectors[0];
144
+ if (!vector) return;
145
+ provider = embedded.provider;
146
+ model = embedded.model;
147
+ }
148
+
149
+ // Persist embedding in SQLite for cross-restart cache
150
+ const now = Date.now();
151
+ try {
152
+ db.insert(memoryEmbeddings)
153
+ .values({
154
+ id: randomUUID(),
155
+ targetType,
156
+ targetId,
157
+ provider,
158
+ model,
159
+ dimensions: vector.length,
160
+ vectorJson: JSON.stringify(vector),
161
+ contentHash,
162
+ createdAt: now,
163
+ updatedAt: now,
164
+ })
165
+ .onConflictDoUpdate({
166
+ target: [memoryEmbeddings.targetType, memoryEmbeddings.targetId, memoryEmbeddings.provider, memoryEmbeddings.model],
167
+ set: {
168
+ vectorJson: JSON.stringify(vector),
169
+ dimensions: vector.length,
170
+ contentHash,
171
+ updatedAt: now,
172
+ },
173
+ })
174
+ .run();
175
+ } catch (err) {
176
+ log.warn({ err, targetType, targetId }, 'Failed to write embedding cache');
177
+ }
117
178
 
118
179
  let qdrant;
119
180
  try {
@@ -123,7 +184,6 @@ export async function embedAndUpsert(
123
184
  }
124
185
 
125
186
  try {
126
- const now = Date.now();
127
187
  await qdrant.upsert(targetType, targetId, vector, {
128
188
  text,
129
189
  created_at: (extraPayload?.created_at as number) ?? now,
@@ -18,6 +18,7 @@ import {
18
18
  retryDelayForAttempt,
19
19
  RETRY_MAX_ATTEMPTS,
20
20
  } from './job-utils.js';
21
+ import { bumpMemoryVersion } from './recall-cache.js';
21
22
 
22
23
  // ── Per-job-type handlers ──────────────────────────────────────────
23
24
 
@@ -121,9 +122,14 @@ export async function runMemoryJobsOnce(
121
122
  try {
122
123
  await processJob(job, config);
123
124
  completeMemoryJob(job.id);
125
+ bumpMemoryVersion();
124
126
  groupProcessed += 1;
125
127
  } catch (err) {
126
- handleJobError(job, err);
128
+ try {
129
+ handleJobError(job, err);
130
+ } catch (handlerErr) {
131
+ log.error({ err: handlerErr, jobId: job.id, type: job.type }, 'handleJobError itself threw, job left in running status');
132
+ }
127
133
  }
128
134
  }
129
135
  return groupProcessed;
@@ -0,0 +1,107 @@
1
+ import { createHash } from 'crypto';
2
+ import type { MemoryRecallResult, MemoryRecallOptions } from './search/types.js';
3
+
4
+ /**
5
+ * In-memory cache for memory recall results.
6
+ *
7
+ * The full retrieval pipeline (FTS5 + Qdrant + entity graph + RRF merge) is
8
+ * expensive. When the same query is issued multiple turns in a row (common
9
+ * when the conversation context hasn't changed), we can serve the cached
10
+ * result instantly.
11
+ *
12
+ * Invalidation: a monotonic version counter is bumped whenever new memory
13
+ * is indexed (segments, items, embeddings). Cache entries are only valid
14
+ * when their version matches the current global version.
15
+ */
16
+
17
+ interface CacheEntry {
18
+ version: number;
19
+ createdAt: number;
20
+ result: MemoryRecallResult;
21
+ }
22
+
23
+ const MAX_ENTRIES = 32;
24
+ const TTL_MS = 60_000; // 60 seconds
25
+
26
+ let _version = 0;
27
+ const _cache = new Map<string, CacheEntry>();
28
+
29
+ /** Bump the global memory version, invalidating all cached recall results. */
30
+ export function bumpMemoryVersion(): void {
31
+ _version++;
32
+ }
33
+
34
+ /** Return the current memory version (for snapshot-based staleness checks). */
35
+ export function getMemoryVersion(): number {
36
+ return _version;
37
+ }
38
+
39
+ /** Build a deterministic cache key from the recall inputs. */
40
+ function buildCacheKey(
41
+ query: string,
42
+ conversationId: string,
43
+ options?: MemoryRecallOptions,
44
+ ): string {
45
+ const parts = [
46
+ query,
47
+ conversationId,
48
+ options?.scopeId ?? '',
49
+ options?.scopePolicyOverride
50
+ ? `${options.scopePolicyOverride.scopeId}:${options.scopePolicyOverride.fallbackToDefault}`
51
+ : '',
52
+ options?.excludeMessageIds ? [...options.excludeMessageIds].sort().join(',') : '',
53
+ options?.maxInjectTokensOverride != null ? String(options.maxInjectTokensOverride) : '',
54
+ ];
55
+ return createHash('sha256').update(parts.join('\0')).digest('hex');
56
+ }
57
+
58
+ /** Look up a cached recall result. Returns undefined on miss or stale entry. */
59
+ export function getCachedRecall(
60
+ query: string,
61
+ conversationId: string,
62
+ options?: MemoryRecallOptions,
63
+ ): MemoryRecallResult | undefined {
64
+ const key = buildCacheKey(query, conversationId, options);
65
+ const entry = _cache.get(key);
66
+ if (!entry) return undefined;
67
+ if (entry.version !== _version || Date.now() - entry.createdAt > TTL_MS) {
68
+ _cache.delete(key);
69
+ return undefined;
70
+ }
71
+ return entry.result;
72
+ }
73
+
74
+ /**
75
+ * Store a recall result in the cache. Evicts oldest entries when full.
76
+ *
77
+ * When `snapshotVersion` is provided, the entry is only stored if the
78
+ * snapshot still matches the current global version — this prevents a
79
+ * stale result from being cached under a version that was bumped while
80
+ * the retrieval pipeline was in flight.
81
+ */
82
+ export function setCachedRecall(
83
+ query: string,
84
+ conversationId: string,
85
+ options: MemoryRecallOptions | undefined,
86
+ result: MemoryRecallResult,
87
+ snapshotVersion?: number,
88
+ ): void {
89
+ // If a snapshot version was provided, only cache when it still matches
90
+ // the current version — otherwise the result may be stale.
91
+ if (snapshotVersion !== undefined && snapshotVersion !== _version) return;
92
+
93
+ const key = buildCacheKey(query, conversationId, options);
94
+
95
+ // Evict oldest entries if at capacity
96
+ if (_cache.size >= MAX_ENTRIES && !_cache.has(key)) {
97
+ const oldest = _cache.keys().next().value;
98
+ if (oldest !== undefined) _cache.delete(oldest);
99
+ }
100
+
101
+ _cache.set(key, { version: _version, createdAt: Date.now(), result });
102
+ }
103
+
104
+ /** Clear the entire cache (useful for testing). */
105
+ export function clearRecallCache(): void {
106
+ _cache.clear();
107
+ }
@@ -19,6 +19,7 @@ import { semanticSearch, isQdrantConnectionError } from './search/semantic.js';
19
19
  import { entitySearch } from './search/entity.js';
20
20
  import { mergeCandidates, applySourceCaps, rerankWithLLM, trimToTokenBudget, markItemUsage } from './search/ranking.js';
21
21
  import { buildInjectedText, MEMORY_CONTEXT_ACK } from './search/formatting.js';
22
+ import { getCachedRecall, setCachedRecall, getMemoryVersion } from './recall-cache.js';
22
23
 
23
24
  // Re-export public types and functions so existing importers continue to work
24
25
  export type {
@@ -161,10 +162,12 @@ async function collectAndMergeCandidates(
161
162
 
162
163
  // -- Phase 2: expensive searches (skipped on early termination) --
163
164
  let semantic: Candidate[] = [];
165
+ let semanticSearchFailed = false;
164
166
  if (queryVector && !canTerminateEarly) {
165
167
  try {
166
168
  semantic = await semanticSearch(queryVector, opts?.provider ?? 'unknown', opts?.model ?? 'unknown', config.memory.retrieval.semanticTopK, excludeMessageIds, scopeIds);
167
169
  } catch (err) {
170
+ semanticSearchFailed = true;
168
171
  if (isQdrantConnectionError(err)) {
169
172
  log.warn({ err }, 'Qdrant is unavailable — semantic search disabled, memory recall will be degraded');
170
173
  } else {
@@ -214,6 +217,7 @@ async function collectAndMergeCandidates(
214
217
  relationNeighborEntityCount,
215
218
  relationExpandedItemCount,
216
219
  earlyTerminated: canTerminateEarly,
220
+ semanticSearchFailed,
217
221
  merged,
218
222
  };
219
223
  }
@@ -225,6 +229,7 @@ export async function buildMemoryRecall(
225
229
  options?: MemoryRecallOptions,
226
230
  ): Promise<MemoryRecallResult> {
227
231
  const start = Date.now();
232
+ const versionSnapshot = getMemoryVersion();
228
233
  const excludeMessageIds = options?.excludeMessageIds?.filter((id) => id.length > 0) ?? [];
229
234
  const signal = options?.signal;
230
235
  if (!config.memory.enabled) {
@@ -234,6 +239,14 @@ export async function buildMemoryRecall(
234
239
  return emptyResult({ enabled: true, degraded: false, reason: 'memory.aborted', latencyMs: Date.now() - start });
235
240
  }
236
241
 
242
+ // Check recall cache — serves identical results instantly when the query
243
+ // and memory state haven't changed since the last recall.
244
+ const cached = getCachedRecall(query, conversationId, options);
245
+ if (cached) {
246
+ log.debug({ query: truncate(query, 120), latencyMs: Date.now() - start }, 'Memory recall served from cache');
247
+ return { ...cached, latencyMs: Date.now() - start };
248
+ }
249
+
237
250
  const backendStatus = getMemoryBackendStatus(config);
238
251
  let queryVector: number[] | null = null;
239
252
  let provider: string | undefined;
@@ -326,7 +339,15 @@ export async function buildMemoryRecall(
326
339
  relationNeighborEntityCount,
327
340
  relationExpandedItemCount,
328
341
  earlyTerminated,
342
+ semanticSearchFailed,
329
343
  } = collected;
344
+
345
+ // Mark as degraded when semantic search failed — the recall is based on
346
+ // lexical/recency only and should not be cached.
347
+ if (semanticSearchFailed) {
348
+ degraded = true;
349
+ reason = reason ?? 'memory.semantic_search_failure';
350
+ }
330
351
  let merged = applySourceCaps(collected.merged, config);
331
352
 
332
353
  // LLM re-ranking: send top candidates to Haiku for relevance scoring
@@ -395,7 +416,7 @@ export async function buildMemoryRecall(
395
416
  latencyMs,
396
417
  }, 'Memory recall completed');
397
418
 
398
- return {
419
+ const result: MemoryRecallResult = {
399
420
  enabled: true,
400
421
  degraded,
401
422
  reason,
@@ -418,6 +439,14 @@ export async function buildMemoryRecall(
418
439
  latencyMs,
419
440
  topCandidates,
420
441
  };
442
+
443
+ // Only cache non-degraded results — degraded results (e.g. lexical-only
444
+ // fallback when embeddings fail) would delay quality recovery once the
445
+ // embedding backend comes back.
446
+ if (!result.degraded) {
447
+ setCachedRecall(query, conversationId, options, result, versionSnapshot);
448
+ }
449
+ return result;
421
450
  }
422
451
 
423
452
  export function stripMemoryRecallMessages<T extends { role: 'user' | 'assistant'; content: Array<{ type: string; text?: string }> }>(