@swarmclawai/swarmclaw 1.5.39 → 1.5.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -389,6 +389,19 @@ Operational docs: https://swarmclaw.ai/docs/observability
389
389
 
390
390
  ## Releases
391
391
 
392
+ ### v1.5.41 Highlights
393
+
394
+ - **Moonshot / Kimi compatibility — duplicate `files` tool name fixed**: any agent with the default `files` extension was sending two tools both literally named `files` to the LLM. Most providers tolerated the duplicate; Moonshot's strict tool-schema validation rejected it with `MoonshotException - function name files is duplicated` ([#39](https://github.com/swarmclawai/swarmclaw/issues/39), reported by [@SteamedFish](https://github.com/SteamedFish)). Three fixes: the v2 file builder is now correctly gated on `files_v2` (not `files`), it registers under the matching capability key, and the session-tools assembler now shares a single dedup Set across native, CRUD, and extension phases so any future name collision is rejected with a clear warning instead of a silent double-register.
395
+
396
+ ### v1.5.40 Highlights
397
+
398
+ - **Current-thread recall routing**: the message classifier now emits four explicit flags (`isCurrentThreadRecall`, `isGreeting`, `isAcknowledgement`, `isMemoryWriteIntent`) so the chat router stops treating in-thread pronouns ("your last reply", "both answers", "what I just said") as durable-memory queries. Previously small OSS models (`devstral-small-2:24b` and similar) would run `memory_search` for these, come back empty, and truthfully report "no memories found" even when the answer was three messages up.
399
+ - **`memory_search` short-circuits thread-recall queries**: when the search query itself contains phrases like "just", "last reply", "my last", "both answers", the tool now returns a redirect pointing the model back to the visible chat history instead of executing a pointless vector search. Explicit cross-session phrasing ("yesterday", "last week", "in a previous conversation") still runs the normal search path.
400
+ - **Explicit Routing Matrix in the system prompt**: spells out the boundary between "read the thread above" and "call a memory tool" in plain language, so routing doesn't depend on the model extrapolating a terse rule. Memory-tool lines are now tagged `(not this thread)` so the distinction is unmissable.
401
+ - **Tool-summary retry threshold tightened**: the "trivial response" threshold used to decide whether to force a redundant `tool_summary` continuation dropped from 150 to 80 characters. A 119-char response like "I wrote X, stored Y, and confirmed both." is substantive; the old threshold forced the model to re-stream the same answer twice.
402
+ - **Classifier timeout raised to 10 s**: 2 s was too tight for Ollama Cloud with a fully-configured agent (observed 4–6 s calls). Result caching means the latency tax only applies to first-seen messages.
403
+ - **Reflection memories dedup across runs**: the supervisor reflection writer now compares candidate notes against recent (last 7 days) reflection memories for the same agent and skips ones that have already been stored, stopping the ~7-per-turn rediscovery churn on top of the within-run dedup shipped in v1.5.38.
404
+
392
405
  ### v1.5.39 Highlights
393
406
 
394
407
  - **Agents default to scoped tool access**: new agents (and existing agents whose `tools` list is non-empty) now only see the tools they've been given in the system prompt. This trims ~3 k input tokens per turn — an observed CEO/coordinator agent with 14 tools and 4 loaded skills went from 62 k to 38 k chars of system prompt. Opt back into the old firehose by toggling **Universal tool access** in the agent sheet's new "Context & Tool Access" section. Memory, context management, and `ask_human` are always included regardless of the scoped list.
@@ -417,20 +430,6 @@ Operational docs: https://swarmclaw.ai/docs/observability
417
430
  - Pins `outputFileTracingRoot` in `next.config.ts` to the project root so the Next.js build no longer walks `C:\Users\<user>\Application Data` (a legacy NTFS junction that throws EPERM on Windows runners).
418
431
  - Pins Python 3.11 in the desktop-release workflow so `node-gyp` rebuilds of native modules (`node-liblzma`, etc.) succeed on Python 3.12+ runners where `distutils` was removed from the stdlib.
419
432
 
420
- ### v1.5.36 Highlights
421
-
422
- - **Desktop app (Electron)**: SwarmClaw now ships as a native desktop app for macOS (Apple Silicon + Intel), Windows, and Linux (AppImage + .deb). Download from [swarmclaw.ai/downloads](https://swarmclaw.ai/downloads). The app wraps the existing standalone server inside an Electron shell, stores data in the OS app-data directory, and auto-updates via GitHub Releases (notify-only on unsigned macOS builds).
423
- - **Desktop release CI**: new `desktop-release.yml` workflow builds and publishes installers for all three platforms to GitHub Releases on every version tag.
424
- - **UI cleanup**: removed sibling-product navigation links from the in-app sidebar rail and login gate so the open-source app focuses on SwarmClaw itself. Those links remain in the project README and on swarmclaw.ai.
425
-
426
- ### v1.5.35 Highlights
427
-
428
- - **Update safety: prevent DB corruption on Linux**: `npm run update:easy`, `swarmclaw update`, and the in-app update endpoint now stop the running server (or checkpoint the SQLite WAL) before rebuilding native modules, preventing the WAL journal corruption that forced some Linux users back to the setup wizard.
429
- - **SQLite graceful shutdown**: the server now checkpoints and closes the database on SIGTERM/SIGINT, eliminating stale WAL state after any clean stop.
430
- - **Doctor: detect dangling gateway credentials**: the setup doctor now flags gateway profiles that reference deleted or missing credentials, explaining the "gateway token missing" connection errors.
431
- - **Gateway credential resolution logging**: when a gateway credential can't be resolved, the server now logs a clear warning identifying the missing credential ID.
432
- - **Credential decryption error logging**: when a stored credential can't be decrypted (e.g. after `CREDENTIAL_SECRET` changes), the server now logs the credential ID and provider so users know which key to re-add.
433
-
434
433
  Older releases: https://swarmclaw.ai/docs/release-notes
435
434
 
436
435
  - GitHub releases: https://github.com/swarmclawai/swarmclaw/releases
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@swarmclawai/swarmclaw",
3
- "version": "1.5.39",
3
+ "version": "1.5.41",
4
4
  "description": "Build and run autonomous AI agents with OpenClaw, Hermes, multiple model providers, orchestration, delegation, memory, skills, schedules, and chat connectors.",
5
5
  "main": "electron-dist/main.js",
6
6
  "license": "MIT",
@@ -787,6 +787,28 @@ function writeReflectionMemories(params: {
787
787
  const normalizeNote = (note: string): string =>
788
788
  note.toLowerCase().replace(/\s+/g, ' ').trim().slice(0, 240)
789
789
 
790
+ // Cross-run dedup: skip notes that already exist as a recent reflection
791
+ // memory for this agent. Different reflection runs over successive turns
792
+ // often rediscover the same invariant/lesson because the model re-derives
793
+ // them from the same pattern. Without this guard the reflection table
794
+ // grows ~7 entries per test turn; with it, repeat reflections are absorbed.
795
+ const CROSS_RUN_DEDUP_WINDOW_MS = 7 * 24 * 3600_000 // 7 days
796
+ const crossRunDedupCutoff = createdAt - CROSS_RUN_DEDUP_WINDOW_MS
797
+ try {
798
+ if (params.agentId) {
799
+ const recent = memoryDb.list(params.agentId, 500)
800
+ for (const entry of recent) {
801
+ if (!entry.category || !entry.category.startsWith('reflection/')) continue
802
+ if ((entry.updatedAt || 0) < crossRunDedupCutoff) continue
803
+ const norm = normalizeNote(entry.content || '')
804
+ if (norm) seenNormalized.add(norm)
805
+ }
806
+ }
807
+ } catch {
808
+ // Memory DB lookup is best-effort — if it fails, fall back to within-run
809
+ // dedup only rather than blocking the reflection write.
810
+ }
811
+
790
812
  for (const group of groups) {
791
813
  for (const note of group.notes) {
792
814
  const norm = normalizeNote(note)
@@ -370,8 +370,14 @@ function checkToolSummary(ctx: ContinuationContext): ContinuationDecision | null
370
370
  isConnectorSession: ctx.isConnectorSession,
371
371
  })
372
372
  if (skipToolSummaryForShortResponse) return null
373
+ // A 119-char response like "I wrote X, stored Y, and confirmed both." is
374
+ // substantive after two tool calls — it names each action. The prior
375
+ // 150-char threshold treated such responses as trivial preambles and
376
+ // forced a redundant retry that streamed the same answer twice. Tightened
377
+ // to 80 so only genuinely short preambles ("Done.", "Let me do that…")
378
+ // trigger the summary continuation.
373
379
  const textIsTrivial = !ctx.state.fullText.trim() || (
374
- !ctx.isConnectorSession && ctx.state.fullText.trim().length < 150
380
+ !ctx.isConnectorSession && ctx.state.fullText.trim().length < 80
375
381
  && (
376
382
  ctx.state.streamedToolEvents.length >= 2
377
383
  || ctx.likelyResearchSynthesisTask
@@ -32,6 +32,10 @@ export const MessageClassificationSchema = z.object({
32
32
  isDeliverableTask: z.boolean(),
33
33
  isBroadGoal: z.boolean(),
34
34
  isLightweightDirectChat: z.boolean().optional().default(false),
35
+ isCurrentThreadRecall: z.boolean().optional().default(false),
36
+ isGreeting: z.boolean().optional().default(false),
37
+ isAcknowledgement: z.boolean().optional().default(false),
38
+ isMemoryWriteIntent: z.boolean().optional().default(false),
35
39
  hasHumanSignals: z.boolean(),
36
40
  hasSignificantEvent: z.boolean(),
37
41
  isResearchSynthesis: z.boolean(),
@@ -48,6 +52,10 @@ export interface MessageClassification {
48
52
  isDeliverableTask: boolean
49
53
  isBroadGoal: boolean
50
54
  isLightweightDirectChat?: boolean
55
+ isCurrentThreadRecall?: boolean
56
+ isGreeting?: boolean
57
+ isAcknowledgement?: boolean
58
+ isMemoryWriteIntent?: boolean
51
59
  hasHumanSignals: boolean
52
60
  hasSignificantEvent: boolean
53
61
  isResearchSynthesis: boolean
@@ -103,6 +111,10 @@ function buildClassificationPrompt(message: string, recentHistory: string): stri
103
111
  '- isDeliverableTask (bool): The user wants a concrete artifact produced — a document, report, plan, proposal, landing page, dashboard, HTML file, markdown file, brief, copy, screenshots, or similar deliverable. NOT simple Q&A, code fixes, or single-command tasks.',
104
112
  '- isBroadGoal (bool): The message describes a broad, multi-step goal (50+ chars, no code blocks, no file paths, no numbered lists). Short questions ending with "?" are NOT broad goals.',
105
113
  '- isLightweightDirectChat (bool): This is a low-signal direct chat turn that should get a natural lightweight reply, such as a greeting, acknowledgment, check-in, or simple social/direct question that does NOT require research, file work, planning, delegation, or tool execution.',
114
+ '- isCurrentThreadRecall (bool): The user is asking about something from THIS CURRENT CHAT THREAD — e.g. "what were both answers you just gave?", "tell me that number again", "what did I just ask?", "your last reply mentioned X — expand on it". The answer is in the visible conversation history above. Return FALSE when the user is asking about prior conversations, sessions from other days, or things they remember from outside this thread (e.g. "remember when we talked about X last week", "what did we decide yesterday"). Regardless of language or exact phrasing, the signal is: does the answer live in the messages above, or does it require a memory/history lookup?',
115
+ '- isGreeting (bool): A standalone greeting with no other task — "hi", "hello", "hey there", "good morning", "yo". Returns FALSE if the greeting is followed by a real request.',
116
+ '- isAcknowledgement (bool): A short acknowledgement / social reply with no action required — "ok", "thanks", "got it", "cool", "makes sense", "sounds good", "nope". Returns FALSE if there is a follow-up question or directive.',
117
+ '- isMemoryWriteIntent (bool): The user is explicitly asking the assistant to remember, store, save, memorize, forget, or correct a durable fact about themselves, a preference, or a standing instruction — "remember my wife is called Anna", "save this as a preference", "forget what I told you about X", "update your memory: I now prefer Y". Returns FALSE for passive statements that happen to mention memory/remembering without asking for a write.',
106
118
  '- hasHumanSignals (bool): The message contains personal signals — preferences ("I prefer", "call me"), relationships ("my wife", "my partner", "my kid"), life events ("birthday", "wedding", "promotion", "moving", "graduation", "hospital"), or personal disclosures.',
107
119
  '- hasSignificantEvent (bool): The message mentions a notable life/work event or milestone (birthday, anniversary, wedding, graduation, promotion, new job, relocation, illness, funeral, travel, house, deadline, launch).',
108
120
  '- isResearchSynthesis (bool): The task requires gathering information from multiple sources and synthesizing it — research reports, competitive analysis, market overviews, literature reviews, multi-source comparisons. NOT simple factual lookups.',
@@ -121,7 +133,7 @@ function buildClassificationPrompt(message: string, recentHistory: string): stri
121
133
  '- Prefer the most execution-relevant taskIntent. Example: "research this and send me a voice note" is "research", not "outreach".',
122
134
  '',
123
135
  'Output shape:',
124
- '{"taskIntent":"coding|research|browsing|outreach|scheduling|general","isDeliverableTask":bool,"isBroadGoal":bool,"isLightweightDirectChat":bool,"hasHumanSignals":bool,"hasSignificantEvent":bool,"isResearchSynthesis":bool,"workType":"coding|research|writing|review|operations|general","wantsScreenshots":bool,"wantsOutboundDelivery":bool,"wantsVoiceDelivery":bool,"explicitToolRequests":[],"confidence":0.0-1.0}',
136
+ '{"taskIntent":"coding|research|browsing|outreach|scheduling|general","isDeliverableTask":bool,"isBroadGoal":bool,"isLightweightDirectChat":bool,"isCurrentThreadRecall":bool,"isGreeting":bool,"isAcknowledgement":bool,"isMemoryWriteIntent":bool,"hasHumanSignals":bool,"hasSignificantEvent":bool,"isResearchSynthesis":bool,"workType":"coding|research|writing|review|operations|general","wantsScreenshots":bool,"wantsOutboundDelivery":bool,"wantsVoiceDelivery":bool,"explicitToolRequests":[],"confidence":0.0-1.0}',
125
137
  '',
126
138
  recentHistory ? `Recent context:\n${recentHistory}\n` : '',
127
139
  `User message: ${JSON.stringify(message)}`,
@@ -206,7 +218,13 @@ export interface ClassifyMessageInput {
206
218
  history?: Message[]
207
219
  }
208
220
 
209
- const CLASSIFIER_TIMEOUT_MS = 2_000
221
+ // Timeout sized for Ollama Cloud with a fully-configured agent: observed
222
+ // classifier calls in the 4-6 s range during live testing, plus the expanded
223
+ // 4-flag semantic schema requires a slightly larger JSON output. 10 s
224
+ // accommodates the tail without blocking chat turns for long on a total
225
+ // failure. Result is cached per-message so the latency tax only applies to
226
+ // first-seen messages.
227
+ const CLASSIFIER_TIMEOUT_MS = 10_000
210
228
 
211
229
  /**
212
230
  * Classify a user message using a single LLM call.
@@ -240,6 +258,9 @@ export async function classifyMessage(
240
258
  options?.generateText
241
259
  ? options.generateText(prompt)
242
260
  : (async () => {
261
+ // Uses the agent's configured LLM (same model/credential), but
262
+ // with a lightweight prompt-only call — no agent system prompt,
263
+ // no tools, no memory injection, no history replay.
243
264
  const { llm } = await buildLLM({
244
265
  sessionId: input.sessionId,
245
266
  agentId: input.agentId || null,
@@ -253,9 +274,16 @@ export async function classifyMessage(
253
274
  ])
254
275
 
255
276
  const durationMs = Date.now() - startMs
256
- log.info(TAG, `session=${input.sessionId} completed in ${durationMs}ms`)
257
-
258
277
  const classification = parseClassificationResponse(responseText)
278
+ log.info(TAG, `session=${input.sessionId} completed in ${durationMs}ms`, classification ? {
279
+ taskIntent: classification.taskIntent,
280
+ isCurrentThreadRecall: classification.isCurrentThreadRecall || false,
281
+ isGreeting: classification.isGreeting || false,
282
+ isAcknowledgement: classification.isAcknowledgement || false,
283
+ isMemoryWriteIntent: classification.isMemoryWriteIntent || false,
284
+ isLightweightDirectChat: classification.isLightweightDirectChat || false,
285
+ confidence: classification.confidence,
286
+ } : { parsed: false })
259
287
  if (classification) {
260
288
  setCache(message, classification)
261
289
  }
@@ -344,10 +344,17 @@ export function buildAgenticExecutionPolicy(opts: {
344
344
  if (hasTooling) {
345
345
  parts.push(
346
346
  '## Routing Matrix',
347
+ // Smaller open-source models (observed with devstral-small-2:24b) routinely
348
+ // ignore a terse "use the thread first" line and call `memory_search`
349
+ // whenever a user message contains referential words like "that", "those",
350
+ // "both", "my last", "your previous". Spell out the boundary explicitly
351
+ // so compliance is consistent regardless of model size.
347
352
  'Current-thread facts already visible in this chat: answer directly from the thread before using tools.',
353
+ 'References in the user\'s message to things from THIS conversation — e.g. "that", "those", "both", "your last reply", "the number you gave", "what I just said" — are already in the thread history above. Read the prior messages to answer. Do NOT call `memory_search`, `sessions_tool`, or any recall tool for these.',
354
+ 'Only use memory or session-history tools when the user explicitly asks about a PRIOR conversation ("what did we discuss yesterday", "remember when I told you X last week") or names something not present in the current thread.',
348
355
  hasMemoryTools
349
- ? 'Facts from previous conversations: start with `memory_search`, then `memory_get` only for a targeted follow-up read.'
350
- : 'Facts from previous conversations: rely on the visible thread only and state when memory tools are unavailable.',
356
+ ? 'Facts from previous conversations (not this thread): start with `memory_search`, then `memory_get` only for a targeted follow-up read.'
357
+ : 'Facts from previous conversations (not this thread): rely on the visible thread only and state when memory tools are unavailable.',
351
358
  hasManageSessions
352
359
  ? 'Harness/session context, lineage, project attachment, or enabled-tool questions: use `sessions_tool` action `identity`.'
353
360
  : 'Harness/session introspection is limited here; rely on the runtime orientation block and visible context.',
@@ -450,7 +457,10 @@ export function buildAgenticExecutionPolicy(opts: {
450
457
  const exactStructureBlock = buildExactStructureBlock(opts.userMessage)
451
458
  if (exactStructureBlock) parts.push(exactStructureBlock)
452
459
  }
453
- if (opts.userMessage && isCurrentThreadRecallRequest(opts.userMessage)) {
460
+ // Delegate to isCurrentThreadRecallRequest, which trusts the LLM
461
+ // classifier's explicit true/false verdict and falls back to the regex
462
+ // heuristics when the classifier is unavailable or the flag is unset.
463
+ if (opts.userMessage && isCurrentThreadRecallRequest(opts.userMessage, opts.classification ?? null)) {
454
464
  parts.push(buildCurrentThreadRecallBlock(opts.history || []))
455
465
  }
456
466
  }
@@ -1,10 +1,23 @@
1
1
  import type { MemoryEntry } from '@/types'
2
2
 
3
+ // Shape subset — we only need the boolean signals the LLM classifier emits.
4
+ // Typed loosely here to avoid a circular import with chat-execution.
5
+ type ClassificationHint = {
6
+ isCurrentThreadRecall?: boolean
7
+ isGreeting?: boolean
8
+ isAcknowledgement?: boolean
9
+ isMemoryWriteIntent?: boolean
10
+ } | null | undefined
11
+
12
+ // The regexes below are kept as fallbacks: when the LLM classifier returns
13
+ // null (timeout, no provider), these cover the common English phrasings so
14
+ // the system degrades gracefully. Paraphrases, non-English, or novel wordings
15
+ // are handled by the classifier path in callers.
3
16
  const ACK_RE = /^(?:ok(?:ay)?|cool|nice|got it|makes sense|thanks|thank you|thx|roger|copy|sounds good|sgtm|yep|yup|y|nope?|nah|kk|done)[.! ]*$/i
4
17
  const GREETING_RE = /^(?:hi|hello|hey|yo|morning|good morning|good afternoon|good evening)[.! ]*$/i
5
18
  const MEMORY_META_RE = /\b(?:remember|memory|memorize|store this|save this|forget)\b/i
6
19
  const LOW_SIGNAL_RESPONSE_RE = /^(?:HEARTBEAT_OK|NO_MESSAGE)\b/i
7
- const CURRENT_THREAD_RECALL_MARKER_RE = /\b(?:this conversation|this chat|this thread|current conversation|current chat|current thread|same thread|same chat|same conversation|earlier in (?:this )?(?:conversation|chat|thread)|from (?:this|our) (?:conversation|chat|thread)|you just stored|you just said|we just discussed|we just decided)\b/i
20
+ const CURRENT_THREAD_RECALL_MARKER_RE = /\b(?:this conversation|this chat|this thread|current conversation|current chat|current thread|same thread|same chat|same conversation|earlier in (?:this )?(?:conversation|chat|thread)|from (?:this|our) (?:conversation|chat|thread)|you just stored|you just said|you just gave|you just told|you just answered|you just replied|i just (?:said|asked|gave|told|mentioned)|we just (?:discussed|decided|talked)|your last (?:reply|answer|response|message)|my last (?:question|message)|above in (?:this |the )?(?:chat|thread|conversation)|(?:both|two|all) (?:answers|numbers|results|replies|responses))\b/i
8
21
  const CURRENT_THREAD_RECALL_INTENT_RE = /\b(?:what|which|who|when|where|did|remind|recap|summarize|repeat|list|tell me|answer|confirm|recall|mention)\b/i
9
22
  const DIRECT_MEMORY_WRITE_MARKER_RE = /\b(?:remember|memorize|store (?:this|that|the fact|it)|save (?:this|that|the fact|it) (?:to|in) memory|write to memory|add to memory|update.*memory|correct.*memory)\b/i
10
23
  const DIRECT_MEMORY_WRITE_FOLLOWUP_RE = /\b(?:confirm|recap|repeat|summarize|what you just stored|what you saved|what you updated)\b/i
@@ -17,17 +30,36 @@ function lower(value: string | null | undefined): string {
17
30
  return normalizeWhitespace(value || '').toLowerCase()
18
31
  }
19
32
 
20
- export function shouldInjectMemoryContext(message: string): boolean {
33
+ export function shouldInjectMemoryContext(
34
+ message: string,
35
+ classification?: ClassificationHint,
36
+ ): boolean {
21
37
  const trimmed = normalizeWhitespace(message)
22
38
  if (!trimmed) return false
39
+ // Prefer the LLM classifier's judgment when available — it generalizes across
40
+ // paraphrases and non-English phrasings that the static regexes miss.
41
+ if (classification) {
42
+ if (classification.isGreeting === true) return false
43
+ if (classification.isAcknowledgement === true) return false
44
+ if (classification.isMemoryWriteIntent === true && trimmed.length < 24) return false
45
+ return true
46
+ }
47
+ // Regex fallback for when classifier is unavailable.
23
48
  if (trimmed.length < 16 && (ACK_RE.test(trimmed) || GREETING_RE.test(trimmed))) return false
24
49
  if (trimmed.length < 24 && MEMORY_META_RE.test(trimmed)) return false
25
50
  return true
26
51
  }
27
52
 
28
- export function isCurrentThreadRecallRequest(message: string): boolean {
53
+ export function isCurrentThreadRecallRequest(
54
+ message: string,
55
+ classification?: ClassificationHint,
56
+ ): boolean {
29
57
  const trimmed = normalizeWhitespace(message)
30
58
  if (!trimmed) return false
59
+ if (classification?.isCurrentThreadRecall === true) return true
60
+ // Regex fallback. Skip when classifier confidently said "not thread recall"
61
+ // (isCurrentThreadRecall === false explicitly — not just missing).
62
+ if (classification && classification.isCurrentThreadRecall === false) return false
31
63
  if (!CURRENT_THREAD_RECALL_MARKER_RE.test(trimmed)) return false
32
64
  if (DIRECT_MEMORY_WRITE_MARKER_RE.test(trimmed) && DIRECT_MEMORY_WRITE_FOLLOWUP_RE.test(trimmed)) return false
33
65
  if (/\b(?:remember|store|save)\b/i.test(trimmed) && !/\?\s*$/.test(trimmed) && !/\b(?:what|which|who|when|where|did|confirm|recap|summarize|repeat|list|tell me|answer|recall)\b/i.test(trimmed)) {
@@ -0,0 +1,127 @@
1
+ import { describe, it } from 'node:test'
2
+ import assert from 'node:assert/strict'
3
+
4
+ // Issue #39 (Moonshot/Kimi rejecting duplicate tool names) showed that the
5
+ // Phase 1 native-tool loop in `session-tools/index.ts` was pushing tools
6
+ // without checking for duplicate names. Phase 2 already had a dedup Set; the
7
+ // fix lifts that Set above Phase 1 so all phases share it.
8
+ //
9
+ // This test mirrors the dedup algorithm in pure form so it can be verified
10
+ // without booting the full session-tools graph (which OOMs in test workers
11
+ // when run alongside the dev server).
12
+
13
+ type FakeTool = { name: string }
14
+ type Builder = () => FakeTool[]
15
+
16
+ interface DedupWarn {
17
+ toolName: string
18
+ source: 'native' | 'crud' | 'extension'
19
+ extensionId?: string
20
+ }
21
+
22
+ function dedupAssemble(
23
+ nativeBuilders: ReadonlyArray<readonly [string, Builder]>,
24
+ crudBuilder: Builder,
25
+ extensionTools: ReadonlyArray<{ extensionId: string; tool: FakeTool }>,
26
+ ): { tools: FakeTool[]; warnings: DedupWarn[] } {
27
+ const tools: FakeTool[] = []
28
+ const warnings: DedupWarn[] = []
29
+ const existingNames = new Set<string>()
30
+
31
+ for (const [extensionId, builder] of nativeBuilders) {
32
+ for (const t of builder()) {
33
+ if (existingNames.has(t.name)) {
34
+ warnings.push({ toolName: t.name, source: 'native', extensionId })
35
+ continue
36
+ }
37
+ existingNames.add(t.name)
38
+ tools.push(t)
39
+ }
40
+ }
41
+
42
+ for (const t of crudBuilder()) {
43
+ if (existingNames.has(t.name)) {
44
+ warnings.push({ toolName: t.name, source: 'crud' })
45
+ continue
46
+ }
47
+ existingNames.add(t.name)
48
+ tools.push(t)
49
+ }
50
+
51
+ for (const entry of extensionTools) {
52
+ if (existingNames.has(entry.tool.name)) {
53
+ warnings.push({ toolName: entry.tool.name, source: 'extension', extensionId: entry.extensionId })
54
+ continue
55
+ }
56
+ existingNames.add(entry.tool.name)
57
+ tools.push(entry.tool)
58
+ }
59
+
60
+ return { tools, warnings }
61
+ }
62
+
63
+ describe('session-tools assembler dedup (issue #39 regression)', () => {
64
+ it('emits a single `files` tool when two native builders both produce one (the original issue #39 scenario)', () => {
65
+ const result = dedupAssemble(
66
+ [
67
+ ['files', () => [{ name: 'files' }]],
68
+ ['files_v2', () => [{ name: 'files' }]],
69
+ ],
70
+ () => [],
71
+ [],
72
+ )
73
+
74
+ const fileTools = result.tools.filter((t) => t.name === 'files')
75
+ assert.equal(fileTools.length, 1, 'must emit exactly one tool named "files"')
76
+ assert.equal(result.warnings.length, 1)
77
+ assert.equal(result.warnings[0].toolName, 'files')
78
+ assert.equal(result.warnings[0].source, 'native')
79
+ assert.equal(result.warnings[0].extensionId, 'files_v2')
80
+ })
81
+
82
+ it('first builder wins when names collide', () => {
83
+ const t1 = { name: 'shared' }
84
+ const t2 = { name: 'shared' }
85
+ const result = dedupAssemble(
86
+ [
87
+ ['ext-a', () => [t1]],
88
+ ['ext-b', () => [t2]],
89
+ ],
90
+ () => [],
91
+ [],
92
+ )
93
+ assert.equal(result.tools.length, 1)
94
+ assert.strictEqual(result.tools[0], t1)
95
+ })
96
+
97
+ it('CRUD tools cannot collide with native tools', () => {
98
+ const result = dedupAssemble(
99
+ [['ext-a', () => [{ name: 'crud_op' }]]],
100
+ () => [{ name: 'crud_op' }],
101
+ [],
102
+ )
103
+ assert.equal(result.tools.length, 1)
104
+ assert.equal(result.warnings[0].source, 'crud')
105
+ })
106
+
107
+ it('extension tools dedup against the same shared Set', () => {
108
+ const result = dedupAssemble(
109
+ [['ext-a', () => [{ name: 'foo' }]]],
110
+ () => [],
111
+ [{ extensionId: 'ext-b', tool: { name: 'foo' } }],
112
+ )
113
+ assert.equal(result.tools.length, 1)
114
+ assert.equal(result.warnings[0].source, 'extension')
115
+ assert.equal(result.warnings[0].extensionId, 'ext-b')
116
+ })
117
+
118
+ it('lets distinct names through unchanged', () => {
119
+ const result = dedupAssemble(
120
+ [['ext-a', () => [{ name: 'a' }, { name: 'b' }]]],
121
+ () => [{ name: 'c' }],
122
+ [{ extensionId: 'ext-b', tool: { name: 'd' } }],
123
+ )
124
+ assert.deepEqual(result.tools.map((t) => t.name), ['a', 'b', 'c', 'd'])
125
+ assert.equal(result.warnings.length, 0)
126
+ })
127
+ })
@@ -0,0 +1,56 @@
1
+ import { describe, it } from 'node:test'
2
+ import assert from 'node:assert/strict'
3
+ import { buildFilesTools } from '@/lib/server/session-tools/files-tool'
4
+ import type { ToolBuildContext } from '@/lib/server/session-tools/context'
5
+
6
+ function makeBctx(enabled: Set<string>): ToolBuildContext {
7
+ return {
8
+ cwd: '/tmp',
9
+ ctx: undefined,
10
+ hasExtension: (name) => enabled.has(name),
11
+ hasTool: (name) => enabled.has(name),
12
+ cleanupFns: [],
13
+ commandTimeoutMs: 0,
14
+ claudeTimeoutMs: 0,
15
+ cliProcessTimeoutMs: 0,
16
+ persistDelegateResumeId: () => {},
17
+ readStoredDelegateResumeId: () => null,
18
+ resolveCurrentSession: () => null,
19
+ activeExtensions: Array.from(enabled),
20
+ filesystemScope: 'workspace',
21
+ }
22
+ }
23
+
24
+ describe('buildFilesTools (issue #39)', () => {
25
+ it('returns no tools when only the legacy `files` extension is enabled', () => {
26
+ // Pre-fix this returned a tool named "files", on top of the v1 builder
27
+ // which already produced a tool with the same name. Moonshot/Kimi rejected
28
+ // the duplicate with `function name files is duplicated`.
29
+ const bctx = makeBctx(new Set(['files']))
30
+ const out = buildFilesTools(bctx)
31
+ assert.equal(out.length, 0)
32
+ })
33
+
34
+ it('returns no tools when no relevant extension is enabled', () => {
35
+ const bctx = makeBctx(new Set(['shell', 'web']))
36
+ const out = buildFilesTools(bctx)
37
+ assert.equal(out.length, 0)
38
+ })
39
+
40
+ it('returns one `files` tool when the v2 extension is explicitly enabled', () => {
41
+ const bctx = makeBctx(new Set(['files_v2']))
42
+ const out = buildFilesTools(bctx)
43
+ assert.equal(out.length, 1)
44
+ assert.equal(out[0].name, 'files')
45
+ })
46
+
47
+ it('returns one `files` tool when both `files` and `files_v2` are enabled', () => {
48
+ // Defensive: even with both enabled, this builder emits exactly one tool.
49
+ // (The duplicate-with-v1 protection lives in the session-tools assembler
50
+ // dedup loop, covered by build-session-tools-dedup.test.ts.)
51
+ const bctx = makeBctx(new Set(['files', 'files_v2']))
52
+ const out = buildFilesTools(bctx)
53
+ assert.equal(out.length, 1)
54
+ assert.equal(out[0].name, 'files')
55
+ })
56
+ })
@@ -608,14 +608,22 @@ const FilesExtension: Extension = {
608
608
  ],
609
609
  }
610
610
 
611
- registerNativeCapability('files', FilesExtension)
611
+ // Registered under 'files_v2' to avoid colliding with the v1 FileExtension
612
+ // in `file.ts`, which registers under the literal key 'files'. The builder
613
+ // below is wired into `session-tools/index.ts` under the same 'files_v2' key.
614
+ registerNativeCapability('files_v2', FilesExtension)
612
615
 
613
616
  // ---------------------------------------------------------------------------
614
617
  // Tool builder (called from session-tools/index.ts)
615
618
  // ---------------------------------------------------------------------------
616
619
 
617
620
  export function buildFilesTools(bctx: ToolBuildContext) {
618
- if (!bctx.hasExtension('files')) return []
621
+ // Gate on 'files_v2' (not 'files'). Previously this checked 'files', which
622
+ // meant that enabling the v1 `files` extension activated BOTH builders and
623
+ // registered two tools literally named "files". Most providers tolerate
624
+ // duplicate tool names; Moonshot/Kimi rejects them with `function name
625
+ // files is duplicated`. Reported as issue #39.
626
+ if (!bctx.hasExtension('files_v2')) return []
619
627
 
620
628
  return [
621
629
  tool(
@@ -221,24 +221,41 @@ export async function buildSessionTools(cwd: string, enabledExtensions: string[]
221
221
  ['swarmdock', buildSwarmDockTools],
222
222
  ]
223
223
 
224
+ // Track tool names across all phases so duplicates are rejected
225
+ // consistently. Issue #39: Moonshot rejects duplicate tool names that
226
+ // most providers silently tolerate, so guarding only Phase 2 (as the
227
+ // pre-fix code did) was not enough.
228
+ const existingNames = new Set<string>()
224
229
  for (const [extensionId, builder] of nativeBuilders) {
225
230
  const builtTools = builder(bctx)
226
231
  for (const t of builtTools) {
232
+ if (existingNames.has(t.name)) {
233
+ log.warn('session-tools', 'Skipping native tool due to duplicate name', {
234
+ toolName: t.name,
235
+ extensionId,
236
+ })
237
+ continue
238
+ }
239
+ existingNames.add(t.name)
227
240
  toolToExtensionMap[t.name] = extensionId
241
+ tools.push(t)
228
242
  }
229
- tools.push(...builtTools)
230
243
  }
231
244
 
232
245
  const crudTools = buildCrudTools(bctx)
233
246
  for (const toolEntry of crudTools) {
247
+ if (existingNames.has(toolEntry.name)) {
248
+ log.warn('session-tools', 'Skipping CRUD tool due to duplicate name', { toolName: toolEntry.name })
249
+ continue
250
+ }
251
+ existingNames.add(toolEntry.name)
234
252
  toolToExtensionMap[toolEntry.name] = toolEntry.name
253
+ tools.push(toolEntry)
235
254
  }
236
- tools.push(...crudTools)
237
255
 
238
256
  // 2. Build Extension Tools (Built-in + External)
239
257
  try {
240
258
  const extensionTools = extensionManager.getTools(activeExtensions)
241
- const existingNames = new Set(tools.map((t) => t.name))
242
259
 
243
260
  for (const entry of extensionTools) {
244
261
  const pt = entry.tool
@@ -65,6 +65,40 @@ type MemoryActionContext = Partial<Session> & {
65
65
 
66
66
  type MemorySearchSource = 'durable' | 'working' | 'archive' | 'all'
67
67
  type NarrowMemoryAction = 'search' | 'get' | 'store' | 'update'
68
+
69
+ // Heuristic for detecting queries that actually refer to the current chat
70
+ // thread, not durable memory. Phrases like "just", "last reply", "both"
71
+ // (without any "yesterday/last week/before/earlier conversation" qualifier)
72
+ // are almost always pronouns targeting the visible thread. Small open-source
73
+ // models routinely run memory_search for these and then truthfully report
74
+ // "no memories found" even though the answer is three messages up.
75
+ const THREAD_RECALL_SIGNALS = [
76
+ /\bjust\b/i,
77
+ /\blast reply\b/i,
78
+ /\bmy last\b/i,
79
+ /\byour last\b/i,
80
+ /\bprevious (reply|answer|response|message)\b/i,
81
+ /\babove\b/i,
82
+ /\bwhat (i|you) (just|last) (said|asked|answered|gave|told)\b/i,
83
+ /\b(both|two|all) (answers|numbers|replies|responses)\b/i,
84
+ /\bthe (answer|number|result) you (just|last) (gave|said)\b/i,
85
+ ]
86
+ const PRIOR_CONVERSATION_SIGNALS = [
87
+ /\byesterday\b/i,
88
+ /\blast (week|month|year|time)\b/i,
89
+ /\bearlier (today|conversation|session|chat)\b/i,
90
+ /\bbefore we\b/i,
91
+ /\bremember (that|when|the time)\b/i,
92
+ /\bin a (previous|prior) (chat|session|conversation)\b/i,
93
+ ]
94
+ function isLikelyThreadRecallQuery(query: string): boolean {
95
+ if (typeof query !== 'string' || !query.trim()) return false
96
+ // If the user explicitly mentions a prior conversation/session, it's NOT
97
+ // a thread recall — let memory_search run normally.
98
+ if (PRIOR_CONVERSATION_SIGNALS.some((rx) => rx.test(query))) return false
99
+ return THREAD_RECALL_SIGNALS.some((rx) => rx.test(query))
100
+ }
101
+
68
102
  type CanonicalMemoryCandidate = {
69
103
  entry: MemoryEntry
70
104
  score: number
@@ -567,6 +601,15 @@ export async function executeMemoryAction(input: unknown, ctx: MemoryActionConte
567
601
  }
568
602
 
569
603
  if (resolvedAction === 'search') {
604
+ // Short-circuit when the query obviously refers to something in the
605
+ // current chat thread (e.g. "both answers I just got", "your last reply",
606
+ // "what I just said"). Small open-source models repeatedly call
607
+ // memory_search for this pattern instead of reading the thread above,
608
+ // then truthfully report "no memories found" even though the answer is
609
+ // three messages up. Redirect them back to the thread.
610
+ if (queryText && isLikelyThreadRecallQuery(queryText)) {
611
+ return 'No stored memories match this query, and the phrasing looks like a reference to the current chat thread (e.g. "just", "last reply", "both"). The information is already in the conversation history above — read the prior messages in this thread to answer instead of searching memory.'
612
+ }
570
613
  const queries = queryText ? await expandQuery(queryText) : [keyText]
571
614
  const allResults: MemoryEntry[] = []
572
615
  const seenIds = new Set<string>()