@swarmclawai/swarmclaw 1.5.39 → 1.5.40

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -389,6 +389,15 @@ Operational docs: https://swarmclaw.ai/docs/observability
 
  ## Releases
 
+ ### v1.5.40 Highlights
+
+ - **Current-thread recall routing**: the message classifier now emits four explicit flags (`isCurrentThreadRecall`, `isGreeting`, `isAcknowledgement`, `isMemoryWriteIntent`) so the chat router stops treating in-thread pronouns ("your last reply", "both answers", "what I just said") as durable-memory queries. Previously, small OSS models (`devstral-small-2:24b` and similar) would run `memory_search` for these, come back empty, and truthfully report "no memories found" even when the answer was three messages up.
+ - **`memory_search` short-circuits thread-recall queries**: when the search query itself contains phrases like "just", "last reply", "my last", or "both answers", the tool now returns a redirect pointing the model back to the visible chat history instead of executing a pointless vector search. Explicit cross-session phrasing ("yesterday", "last week", "in a previous conversation") still runs the normal search path.
+ - **Explicit Routing Matrix in the system prompt**: spells out the boundary between "read the thread above" and "call a memory tool" in plain language, so routing doesn't depend on the model extrapolating a terse rule. Memory-tool lines are now tagged `(not this thread)` so the distinction is unmissable.
+ - **Tool-summary retry threshold tightened**: the "trivial response" threshold used to decide whether to force a redundant `tool_summary` continuation dropped from 150 to 80 characters. A 119-char response like "I wrote X, stored Y, and confirmed both." is substantive; the old threshold forced the model to re-stream the same answer twice.
+ - **Classifier timeout raised to 10 s**: 2 s was too tight for Ollama Cloud with a fully configured agent (observed 4–6 s calls). Result caching means the latency tax only applies to first-seen messages.
+ - **Reflection memories dedup across runs**: the supervisor reflection writer now compares candidate notes against reflection memories stored for the same agent within the last 7 days and skips ones that already exist, stopping the ~7-per-turn rediscovery churn on top of the within-run dedup shipped in v1.5.38.
+
  ### v1.5.39 Highlights
 
  - **Agents default to scoped tool access**: new agents (and existing agents whose `tools` list is non-empty) now only see the tools they've been given in the system prompt. This trims ~3 k input tokens per turn — an observed CEO/coordinator agent with 14 tools and 4 loaded skills went from 62 k to 38 k chars of system prompt. Opt back into the old firehose by toggling **Universal tool access** in the agent sheet's new "Context & Tool Access" section. Memory, context management, and `ask_human` are always included regardless of the scoped list.
@@ -423,14 +432,6 @@ Operational docs: https://swarmclaw.ai/docs/observability
  - **Desktop release CI**: new `desktop-release.yml` workflow builds and publishes installers for all three platforms to GitHub Releases on every version tag.
  - **UI cleanup**: removed sibling-product navigation links from the in-app sidebar rail and login gate so the open-source app focuses on SwarmClaw itself. Those links remain in the project README and on swarmclaw.ai.
 
- ### v1.5.35 Highlights
-
- - **Update safety: prevent DB corruption on Linux**: `npm run update:easy`, `swarmclaw update`, and the in-app update endpoint now stop the running server (or checkpoint the SQLite WAL) before rebuilding native modules, preventing the WAL journal corruption that forced some Linux users back to the setup wizard.
- - **SQLite graceful shutdown**: the server now checkpoints and closes the database on SIGTERM/SIGINT, eliminating stale WAL state after any clean stop.
- - **Doctor: detect dangling gateway credentials**: the setup doctor now flags gateway profiles that reference deleted or missing credentials, explaining the "gateway token missing" connection errors.
- - **Gateway credential resolution logging**: when a gateway credential can't be resolved, the server now logs a clear warning identifying the missing credential ID.
- - **Credential decryption error logging**: when a stored credential can't be decrypted (e.g. after `CREDENTIAL_SECRET` changes), the server now logs the credential ID and provider so users know which key to re-add.
-
  Older releases: https://swarmclaw.ai/docs/release-notes
 
  - GitHub releases: https://github.com/swarmclawai/swarmclaw/releases
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
    "name": "@swarmclawai/swarmclaw",
-   "version": "1.5.39",
+   "version": "1.5.40",
    "description": "Build and run autonomous AI agents with OpenClaw, Hermes, multiple model providers, orchestration, delegation, memory, skills, schedules, and chat connectors.",
    "main": "electron-dist/main.js",
    "license": "MIT",
@@ -787,6 +787,28 @@ function writeReflectionMemories(params: {
  const normalizeNote = (note: string): string =>
    note.toLowerCase().replace(/\s+/g, ' ').trim().slice(0, 240)
 
+ // Cross-run dedup: skip notes that already exist as a recent reflection
+ // memory for this agent. Different reflection runs over successive turns
+ // often rediscover the same invariant/lesson because the model re-derives
+ // them from the same pattern. Without this guard the reflection table
+ // grows ~7 entries per test turn; with it, repeat reflections are absorbed.
+ const CROSS_RUN_DEDUP_WINDOW_MS = 7 * 24 * 3600_000 // 7 days
+ const crossRunDedupCutoff = createdAt - CROSS_RUN_DEDUP_WINDOW_MS
+ try {
+   if (params.agentId) {
+     const recent = memoryDb.list(params.agentId, 500)
+     for (const entry of recent) {
+       if (!entry.category || !entry.category.startsWith('reflection/')) continue
+       if ((entry.updatedAt || 0) < crossRunDedupCutoff) continue
+       const norm = normalizeNote(entry.content || '')
+       if (norm) seenNormalized.add(norm)
+     }
+   }
+ } catch {
+   // Memory DB lookup is best-effort — if it fails, fall back to within-run
+   // dedup only rather than blocking the reflection write.
+ }
+
  for (const group of groups) {
    for (const note of group.notes) {
      const norm = normalizeNote(note)
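Both dedup layers feed the same `seenNormalized` set, so a single normalization governs them. A self-contained sketch of the idea, using standalone names rather than the package's internals:

```ts
// Illustration only — standalone names, not the package's code.
const normalizeNote = (note: string): string =>
  note.toLowerCase().replace(/\s+/g, ' ').trim().slice(0, 240)

function filterNewNotes(candidates: string[], recentStored: string[]): string[] {
  // Normalized forms of everything already stored in recent runs.
  const seen = new Set(recentStored.map(normalizeNote))
  const fresh: string[] = []
  for (const note of candidates) {
    const norm = normalizeNote(note)
    if (!norm || seen.has(norm)) continue // dropped: stored in a prior run
    seen.add(norm)                        // also dedups within this batch
    fresh.push(note)
  }
  return fresh
}

// Case and whitespace differences collapse to one normalized form:
// filterNewNotes(['Always checkpoint the  WAL'], ['always checkpoint the WAL']) → []
```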
@@ -370,8 +370,14 @@ function checkToolSummary(ctx: ContinuationContext): ContinuationDecision | null
    isConnectorSession: ctx.isConnectorSession,
  })
  if (skipToolSummaryForShortResponse) return null
+ // A 119-char response like "I wrote X, stored Y, and confirmed both." is
+ // substantive after two tool calls — it names each action. The prior
+ // 150-char threshold treated such responses as trivial preambles and
+ // forced a redundant retry that streamed the same answer twice. Tightened
+ // to 80 so only genuinely short preambles ("Done.", "Let me do that…")
+ // trigger the summary continuation.
  const textIsTrivial = !ctx.state.fullText.trim() || (
-   !ctx.isConnectorSession && ctx.state.fullText.trim().length < 150
+   !ctx.isConnectorSession && ctx.state.fullText.trim().length < 80
    && (
      ctx.state.streamedToolEvents.length >= 2
      || ctx.likelyResearchSynthesisTask
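To make the cutoffs concrete, here is where a few hypothetical post-tool replies land, measured the same way as the guard above:

```ts
// Hypothetical sample replies, measured like ctx.state.fullText.trim().length.
const samples = [
  'Done.',                      //  5 chars — trivial under both thresholds
  'Let me pull that together…', // 26 chars — trivial under both thresholds
  'I wrote the report to report.md, stored the key findings in memory, and confirmed both steps.', // 93 chars
]
for (const text of samples) {
  const len = text.trim().length
  console.log(len, len < 80 ? 'would force a tool_summary retry' : 'accepted as final')
}
// The 93-char reply counted as "trivial" under the old < 150 check, but
// passes the new < 80 check without a redundant continuation.
```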
@@ -32,6 +32,10 @@ export const MessageClassificationSchema = z.object({
  isDeliverableTask: z.boolean(),
  isBroadGoal: z.boolean(),
  isLightweightDirectChat: z.boolean().optional().default(false),
+ isCurrentThreadRecall: z.boolean().optional().default(false),
+ isGreeting: z.boolean().optional().default(false),
+ isAcknowledgement: z.boolean().optional().default(false),
+ isMemoryWriteIntent: z.boolean().optional().default(false),
  hasHumanSignals: z.boolean(),
  hasSignificantEvent: z.boolean(),
  isResearchSynthesis: z.boolean(),
@@ -48,6 +52,10 @@ export interface MessageClassification {
  isDeliverableTask: boolean
  isBroadGoal: boolean
  isLightweightDirectChat?: boolean
+ isCurrentThreadRecall?: boolean
+ isGreeting?: boolean
+ isAcknowledgement?: boolean
+ isMemoryWriteIntent?: boolean
  hasHumanSignals: boolean
  hasSignificantEvent: boolean
  isResearchSynthesis: boolean
@@ -103,6 +111,10 @@ function buildClassificationPrompt(message: string, recentHistory: string): stri
  '- isDeliverableTask (bool): The user wants a concrete artifact produced — a document, report, plan, proposal, landing page, dashboard, HTML file, markdown file, brief, copy, screenshots, or similar deliverable. NOT simple Q&A, code fixes, or single-command tasks.',
  '- isBroadGoal (bool): The message describes a broad, multi-step goal (50+ chars, no code blocks, no file paths, no numbered lists). Short questions ending with "?" are NOT broad goals.',
  '- isLightweightDirectChat (bool): This is a low-signal direct chat turn that should get a natural lightweight reply, such as a greeting, acknowledgment, check-in, or simple social/direct question that does NOT require research, file work, planning, delegation, or tool execution.',
+ '- isCurrentThreadRecall (bool): The user is asking about something from THIS CURRENT CHAT THREAD — e.g. "what were both answers you just gave?", "tell me that number again", "what did I just ask?", "your last reply mentioned X — expand on it". The answer is in the visible conversation history above. Return FALSE when the user is asking about prior conversations, sessions from other days, or things they remember from outside this thread (e.g. "remember when we talked about X last week", "what did we decide yesterday"). Regardless of language or exact phrasing, the signal is: does the answer live in the messages above, or does it require a memory/history lookup?',
+ '- isGreeting (bool): A standalone greeting with no other task — "hi", "hello", "hey there", "good morning", "yo". Returns FALSE if the greeting is followed by a real request.',
+ '- isAcknowledgement (bool): A short acknowledgement / social reply with no action required — "ok", "thanks", "got it", "cool", "makes sense", "sounds good", "nope". Returns FALSE if there is a follow-up question or directive.',
+ '- isMemoryWriteIntent (bool): The user is explicitly asking the assistant to remember, store, save, memorize, forget, or correct a durable fact about themselves, a preference, or a standing instruction — "remember my wife is called Anna", "save this as a preference", "forget what I told you about X", "update your memory: I now prefer Y". Returns FALSE for passive statements that happen to mention memory/remembering without asking for a write.',
  '- hasHumanSignals (bool): The message contains personal signals — preferences ("I prefer", "call me"), relationships ("my wife", "my partner", "my kid"), life events ("birthday", "wedding", "promotion", "moving", "graduation", "hospital"), or personal disclosures.',
  '- hasSignificantEvent (bool): The message mentions a notable life/work event or milestone (birthday, anniversary, wedding, graduation, promotion, new job, relocation, illness, funeral, travel, house, deadline, launch).',
  '- isResearchSynthesis (bool): The task requires gathering information from multiple sources and synthesizing it — research reports, competitive analysis, market overviews, literature reviews, multi-source comparisons. NOT simple factual lookups.',
@@ -121,7 +133,7 @@ function buildClassificationPrompt(message: string, recentHistory: string): stri
  '- Prefer the most execution-relevant taskIntent. Example: "research this and send me a voice note" is "research", not "outreach".',
  '',
  'Output shape:',
- '{"taskIntent":"coding|research|browsing|outreach|scheduling|general","isDeliverableTask":bool,"isBroadGoal":bool,"isLightweightDirectChat":bool,"hasHumanSignals":bool,"hasSignificantEvent":bool,"isResearchSynthesis":bool,"workType":"coding|research|writing|review|operations|general","wantsScreenshots":bool,"wantsOutboundDelivery":bool,"wantsVoiceDelivery":bool,"explicitToolRequests":[],"confidence":0.0-1.0}',
+ '{"taskIntent":"coding|research|browsing|outreach|scheduling|general","isDeliverableTask":bool,"isBroadGoal":bool,"isLightweightDirectChat":bool,"isCurrentThreadRecall":bool,"isGreeting":bool,"isAcknowledgement":bool,"isMemoryWriteIntent":bool,"hasHumanSignals":bool,"hasSignificantEvent":bool,"isResearchSynthesis":bool,"workType":"coding|research|writing|review|operations|general","wantsScreenshots":bool,"wantsOutboundDelivery":bool,"wantsVoiceDelivery":bool,"explicitToolRequests":[],"confidence":0.0-1.0}',
  '',
  recentHistory ? `Recent context:\n${recentHistory}\n` : '',
  `User message: ${JSON.stringify(message)}`,
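For reference, a response satisfying the expanded output shape for a message like "what were both answers you just gave?" might look as follows (all values illustrative, not captured from a real run):

```ts
// Illustrative classifier output — values are hypothetical.
const example = {
  taskIntent: 'general',
  isDeliverableTask: false,
  isBroadGoal: false,
  isLightweightDirectChat: true,
  isCurrentThreadRecall: true, // the answer lives in the visible thread
  isGreeting: false,
  isAcknowledgement: false,
  isMemoryWriteIntent: false,
  hasHumanSignals: false,
  hasSignificantEvent: false,
  isResearchSynthesis: false,
  workType: 'general',
  wantsScreenshots: false,
  wantsOutboundDelivery: false,
  wantsVoiceDelivery: false,
  explicitToolRequests: [],
  confidence: 0.9,
}
```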
@@ -206,7 +218,13 @@ export interface ClassifyMessageInput {
    history?: Message[]
  }
 
- const CLASSIFIER_TIMEOUT_MS = 2_000
+ // Timeout sized for Ollama Cloud with a fully configured agent: classifier
+ // calls observed in the 4-6 s range during live testing, and the expanded
+ // 4-flag semantic schema requires a slightly larger JSON output. 10 s
+ // accommodates that tail without stalling chat turns for too long when the
+ // call fails outright. Results are cached per message, so the latency tax
+ // only applies to first-seen messages.
+ const CLASSIFIER_TIMEOUT_MS = 10_000
 
  /**
   * Classify a user message using a single LLM call.
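The enforcement mechanism itself is elided from this diff; a deadline like this is commonly applied by racing the call against a timer. A minimal sketch, assuming a hypothetical `withTimeout` helper and `callClassifier` stand-in (neither name is from the package):

```ts
// Hypothetical sketch — not the package's actual wrapper.
function withTimeout<T>(work: Promise<T>, ms: number): Promise<T> {
  return Promise.race([
    work,
    new Promise<T>((_, reject) =>
      setTimeout(() => reject(new Error(`classifier timed out after ${ms}ms`)), ms),
    ),
  ])
}

// Usage (callClassifier is a stand-in for the real LLM call):
// const responseText = await withTimeout(callClassifier(prompt), CLASSIFIER_TIMEOUT_MS)
```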
@@ -240,6 +258,9 @@ export async function classifyMessage(
  options?.generateText
    ? options.generateText(prompt)
    : (async () => {
+       // Uses the agent's configured LLM (same model/credential), but
+       // with a lightweight prompt-only call — no agent system prompt,
+       // no tools, no memory injection, no history replay.
        const { llm } = await buildLLM({
          sessionId: input.sessionId,
          agentId: input.agentId || null,
@@ -253,9 +274,16 @@ export async function classifyMessage(
  ])
 
  const durationMs = Date.now() - startMs
- log.info(TAG, `session=${input.sessionId} completed in ${durationMs}ms`)
-
  const classification = parseClassificationResponse(responseText)
+ log.info(TAG, `session=${input.sessionId} completed in ${durationMs}ms`, classification ? {
+   taskIntent: classification.taskIntent,
+   isCurrentThreadRecall: classification.isCurrentThreadRecall || false,
+   isGreeting: classification.isGreeting || false,
+   isAcknowledgement: classification.isAcknowledgement || false,
+   isMemoryWriteIntent: classification.isMemoryWriteIntent || false,
+   isLightweightDirectChat: classification.isLightweightDirectChat || false,
+   confidence: classification.confidence,
+ } : { parsed: false })
  if (classification) {
    setCache(message, classification)
  }
@@ -344,10 +344,17 @@ export function buildAgenticExecutionPolicy(opts: {
  if (hasTooling) {
    parts.push(
      '## Routing Matrix',
+     // Smaller open-source models (observed with devstral-small-2:24b) routinely
+     // ignore a terse "use the thread first" line and call `memory_search`
+     // whenever a user message contains referential words like "that", "those",
+     // "both", "my last", "your previous". Spell out the boundary explicitly
+     // so compliance is consistent regardless of model size.
      'Current-thread facts already visible in this chat: answer directly from the thread before using tools.',
+     'References in the user\'s message to things from THIS conversation — e.g. "that", "those", "both", "your last reply", "the number you gave", "what I just said" — are already in the thread history above. Read the prior messages to answer. Do NOT call `memory_search`, `sessions_tool`, or any recall tool for these.',
+     'Only use memory or session-history tools when the user explicitly asks about a PRIOR conversation ("what did we discuss yesterday", "remember when I told you X last week") or names something not present in the current thread.',
      hasMemoryTools
-       ? 'Facts from previous conversations: start with `memory_search`, then `memory_get` only for a targeted follow-up read.'
-       : 'Facts from previous conversations: rely on the visible thread only and state when memory tools are unavailable.',
+       ? 'Facts from previous conversations (not this thread): start with `memory_search`, then `memory_get` only for a targeted follow-up read.'
+       : 'Facts from previous conversations (not this thread): rely on the visible thread only and state when memory tools are unavailable.',
      hasManageSessions
        ? 'Harness/session context, lineage, project attachment, or enabled-tool questions: use `sessions_tool` action `identity`.'
        : 'Harness/session introspection is limited here; rely on the runtime orientation block and visible context.',
@@ -450,7 +457,10 @@ export function buildAgenticExecutionPolicy(opts: {
    const exactStructureBlock = buildExactStructureBlock(opts.userMessage)
    if (exactStructureBlock) parts.push(exactStructureBlock)
  }
- if (opts.userMessage && isCurrentThreadRecallRequest(opts.userMessage)) {
+ // Delegate to isCurrentThreadRecallRequest, which prefers the LLM
+ // classifier's judgment and falls back to the regexes only when the
+ // classifier is unavailable.
+ if (opts.userMessage && isCurrentThreadRecallRequest(opts.userMessage, opts.classification ?? null)) {
    parts.push(buildCurrentThreadRecallBlock(opts.history || []))
  }
}
@@ -1,10 +1,23 @@
  import type { MemoryEntry } from '@/types'
 
+ // Shape subset — we only need the boolean signals the LLM classifier emits.
+ // Typed loosely here to avoid a circular import with chat-execution.
+ type ClassificationHint = {
+   isCurrentThreadRecall?: boolean
+   isGreeting?: boolean
+   isAcknowledgement?: boolean
+   isMemoryWriteIntent?: boolean
+ } | null | undefined
+
+ // The regexes below are kept as fallbacks: when the LLM classifier returns
+ // null (timeout, no provider), these cover the common English phrasings so
+ // the system degrades gracefully. Paraphrases, non-English, or novel wordings
+ // are handled by the classifier path in callers.
  const ACK_RE = /^(?:ok(?:ay)?|cool|nice|got it|makes sense|thanks|thank you|thx|roger|copy|sounds good|sgtm|yep|yup|y|nope?|nah|kk|done)[.! ]*$/i
  const GREETING_RE = /^(?:hi|hello|hey|yo|morning|good morning|good afternoon|good evening)[.! ]*$/i
  const MEMORY_META_RE = /\b(?:remember|memory|memorize|store this|save this|forget)\b/i
  const LOW_SIGNAL_RESPONSE_RE = /^(?:HEARTBEAT_OK|NO_MESSAGE)\b/i
- const CURRENT_THREAD_RECALL_MARKER_RE = /\b(?:this conversation|this chat|this thread|current conversation|current chat|current thread|same thread|same chat|same conversation|earlier in (?:this )?(?:conversation|chat|thread)|from (?:this|our) (?:conversation|chat|thread)|you just stored|you just said|we just discussed|we just decided)\b/i
+ const CURRENT_THREAD_RECALL_MARKER_RE = /\b(?:this conversation|this chat|this thread|current conversation|current chat|current thread|same thread|same chat|same conversation|earlier in (?:this )?(?:conversation|chat|thread)|from (?:this|our) (?:conversation|chat|thread)|you just stored|you just said|you just gave|you just told|you just answered|you just replied|i just (?:said|asked|gave|told|mentioned)|we just (?:discussed|decided|talked)|your last (?:reply|answer|response|message)|my last (?:question|message)|above in (?:this |the )?(?:chat|thread|conversation)|(?:both|two|all) (?:answers|numbers|results|replies|responses))\b/i
  const CURRENT_THREAD_RECALL_INTENT_RE = /\b(?:what|which|who|when|where|did|remind|recap|summarize|repeat|list|tell me|answer|confirm|recall|mention)\b/i
  const DIRECT_MEMORY_WRITE_MARKER_RE = /\b(?:remember|memorize|store (?:this|that|the fact|it)|save (?:this|that|the fact|it) (?:to|in) memory|write to memory|add to memory|update.*memory|correct.*memory)\b/i
  const DIRECT_MEMORY_WRITE_FOLLOWUP_RE = /\b(?:confirm|recap|repeat|summarize|what you just stored|what you saved|what you updated)\b/i
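A few spot-checks against the expanded marker regex (sample messages are hypothetical):

```ts
// true — "your last reply" is one of the newly added alternatives.
CURRENT_THREAD_RECALL_MARKER_RE.test('your last reply mentioned a number')

// true — "both answers" matches the new (?:both|two|all) (?:answers|…) branch.
CURRENT_THREAD_RECALL_MARKER_RE.test('what were both answers?')

// false — cross-session phrasing carries no current-thread marker, so the
// normal memory path stays available for it.
CURRENT_THREAD_RECALL_MARKER_RE.test('remember when we talked about X last week?')
```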
@@ -17,17 +30,36 @@ function lower(value: string | null | undefined): string {
    return normalizeWhitespace(value || '').toLowerCase()
  }
 
- export function shouldInjectMemoryContext(message: string): boolean {
+ export function shouldInjectMemoryContext(
+   message: string,
+   classification?: ClassificationHint,
+ ): boolean {
    const trimmed = normalizeWhitespace(message)
    if (!trimmed) return false
+   // Prefer the LLM classifier's judgment when available — it generalizes across
+   // paraphrases and non-English phrasings that the static regexes miss.
+   if (classification) {
+     if (classification.isGreeting === true) return false
+     if (classification.isAcknowledgement === true) return false
+     if (classification.isMemoryWriteIntent === true && trimmed.length < 24) return false
+     return true
+   }
+   // Regex fallback for when the classifier is unavailable.
    if (trimmed.length < 16 && (ACK_RE.test(trimmed) || GREETING_RE.test(trimmed))) return false
    if (trimmed.length < 24 && MEMORY_META_RE.test(trimmed)) return false
    return true
  }
 
- export function isCurrentThreadRecallRequest(message: string): boolean {
+ export function isCurrentThreadRecallRequest(
+   message: string,
+   classification?: ClassificationHint,
+ ): boolean {
    const trimmed = normalizeWhitespace(message)
    if (!trimmed) return false
+   if (classification?.isCurrentThreadRecall === true) return true
+   // Regex fallback. Skip it when the classifier explicitly said "not thread
+   // recall" (isCurrentThreadRecall === false, as opposed to merely missing).
+   if (classification && classification.isCurrentThreadRecall === false) return false
    if (!CURRENT_THREAD_RECALL_MARKER_RE.test(trimmed)) return false
    if (DIRECT_MEMORY_WRITE_MARKER_RE.test(trimmed) && DIRECT_MEMORY_WRITE_FOLLOWUP_RE.test(trimmed)) return false
    if (/\b(?:remember|store|save)\b/i.test(trimmed) && !/\?\s*$/.test(trimmed) && !/\b(?:what|which|who|when|where|did|confirm|recap|summarize|repeat|list|tell me|answer|recall)\b/i.test(trimmed)) {
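Putting the two functions together: the classifier hint decides whenever it is present, and the regexes only run when it is absent. A few illustrative calls (messages hypothetical; the fallback results follow from the markers shown above):

```ts
// Classifier says thread recall → true, regexes never consulted.
isCurrentThreadRecallRequest('what were both answers you just gave?', { isCurrentThreadRecall: true })

// No classifier available (timeout, no provider) → regex fallback; "both
// answers" hits the marker regex, so this still resolves as thread recall.
isCurrentThreadRecallRequest('what were both answers you just gave?', null)

// Classifier explicitly said false → the regex fallback is skipped entirely.
isCurrentThreadRecallRequest('what did we decide yesterday?', { isCurrentThreadRecall: false }) // false

// A greeting hint suppresses memory-context injection outright.
shouldInjectMemoryContext('hey there', { isGreeting: true }) // false
```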
@@ -65,6 +65,40 @@ type MemoryActionContext = Partial<Session> & {
 
  type MemorySearchSource = 'durable' | 'working' | 'archive' | 'all'
  type NarrowMemoryAction = 'search' | 'get' | 'store' | 'update'
+
+ // Heuristic for detecting queries that actually refer to the current chat
+ // thread, not durable memory. Phrases like "just", "last reply", "both"
+ // (without any "yesterday/last week/before/earlier conversation" qualifier)
+ // are almost always pronouns targeting the visible thread. Small open-source
+ // models routinely run memory_search for these and then truthfully report
+ // "no memories found" even though the answer is three messages up.
+ const THREAD_RECALL_SIGNALS = [
+   /\bjust\b/i,
+   /\blast reply\b/i,
+   /\bmy last\b/i,
+   /\byour last\b/i,
+   /\bprevious (reply|answer|response|message)\b/i,
+   /\babove\b/i,
+   /\bwhat (i|you) (just|last) (said|asked|answered|gave|told)\b/i,
+   /\b(both|two|all) (answers|numbers|replies|responses)\b/i,
+   /\bthe (answer|number|result) you (just|last) (gave|said)\b/i,
+ ]
+ const PRIOR_CONVERSATION_SIGNALS = [
+   /\byesterday\b/i,
+   /\blast (week|month|year|time)\b/i,
+   /\bearlier (today|conversation|session|chat)\b/i,
+   /\bbefore we\b/i,
+   /\bremember (that|when|the time)\b/i,
+   /\bin a (previous|prior) (chat|session|conversation)\b/i,
+ ]
+ function isLikelyThreadRecallQuery(query: string): boolean {
+   if (typeof query !== 'string' || !query.trim()) return false
+   // If the user explicitly mentions a prior conversation/session, it's NOT
+   // a thread recall — let memory_search run normally.
+   if (PRIOR_CONVERSATION_SIGNALS.some((rx) => rx.test(query))) return false
+   return THREAD_RECALL_SIGNALS.some((rx) => rx.test(query))
+ }
+
  type CanonicalMemoryCandidate = {
    entry: MemoryEntry
    score: number
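A few illustrative queries (hypothetical) against the heuristic:

```ts
isLikelyThreadRecallQuery('both answers I just got')       // true  — thread signal ("just"), no prior-conversation qualifier
isLikelyThreadRecallQuery('what did we decide yesterday?') // false — "yesterday" is a prior-conversation signal, normal search runs
isLikelyThreadRecallQuery('notes about the Q3 launch')     // false — no thread signal at all, normal search runs
```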
@@ -567,6 +601,15 @@ export async function executeMemoryAction(input: unknown, ctx: MemoryActionConte
  }
 
  if (resolvedAction === 'search') {
+   // Short-circuit when the query obviously refers to something in the
+   // current chat thread (e.g. "both answers I just got", "your last reply",
+   // "what I just said"). Small open-source models repeatedly call
+   // memory_search for this pattern instead of reading the thread above,
+   // then truthfully report "no memories found" even though the answer is
+   // three messages up. Redirect them back to the thread.
+   if (queryText && isLikelyThreadRecallQuery(queryText)) {
+     return 'No stored memories match this query, and the phrasing looks like a reference to the current chat thread (e.g. "just", "last reply", "both"). The information is already in the conversation history above — read the prior messages in this thread to answer instead of searching memory.'
+   }
    const queries = queryText ? await expandQuery(queryText) : [keyText]
    const allResults: MemoryEntry[] = []
    const seenIds = new Set<string>()