@swarmclawai/swarmclaw 1.5.38 → 1.5.40
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +18 -14
- package/package.json +1 -1
- package/src/components/agents/agent-sheet.tsx +33 -0
- package/src/lib/providers/openclaw.ts +15 -0
- package/src/lib/server/autonomy/supervisor-reflection.ts +22 -0
- package/src/lib/server/chat-execution/chat-turn-preparation.ts +21 -5
- package/src/lib/server/chat-execution/continuation-evaluator.ts +7 -1
- package/src/lib/server/chat-execution/message-classifier.ts +32 -4
- package/src/lib/server/chat-execution/prompt-builder.ts +13 -3
- package/src/lib/server/memory/memory-policy.ts +35 -3
- package/src/lib/server/runtime/perf.ts +5 -1
- package/src/lib/server/runtime/queue/core.ts +13 -0
- package/src/lib/server/runtime/queue-recovery.test.ts +61 -0
- package/src/lib/server/session-tools/memory.ts +43 -0
- package/src/lib/server/skills/runtime-skill-resolver.ts +34 -1
- package/src/lib/server/universal-tool-access.test.ts +71 -0
- package/src/lib/server/universal-tool-access.ts +23 -0
- package/src/types/agent.ts +7 -0
package/README.md
CHANGED
|
@@ -389,6 +389,24 @@ Operational docs: https://swarmclaw.ai/docs/observability
|
|
|
389
389
|
|
|
390
390
|
## Releases
|
|
391
391
|
|
|
392
|
+
### v1.5.40 Highlights
|
|
393
|
+
|
|
394
|
+
- **Current-thread recall routing**: the message classifier now emits four explicit flags (`isCurrentThreadRecall`, `isGreeting`, `isAcknowledgement`, `isMemoryWriteIntent`) so the chat router stops treating in-thread pronouns ("your last reply", "both answers", "what I just said") as durable-memory queries. Previously small OSS models (`devstral-small-2:24b` and similar) would run `memory_search` for these, come back empty, and truthfully report "no memories found" even when the answer was three messages up.
|
|
395
|
+
- **`memory_search` short-circuits thread-recall queries**: when the search query itself contains phrases like "just", "last reply", "my last", "both answers", the tool now returns a redirect pointing the model back to the visible chat history instead of executing a pointless vector search. Explicit cross-session phrasing ("yesterday", "last week", "in a previous conversation") still runs the normal search path.
|
|
396
|
+
- **Explicit Routing Matrix in the system prompt**: spells out the boundary between "read the thread above" and "call a memory tool" in plain language, so routing doesn't depend on the model extrapolating a terse rule. Memory-tool lines are now tagged `(not this thread)` so the distinction is unmissable.
|
|
397
|
+
- **Tool-summary retry threshold tightened**: the "trivial response" threshold used to decide whether to force a redundant `tool_summary` continuation dropped from 150 → 80 characters. A 119-char response like "I wrote X, stored Y, and confirmed both." is substantive; the old threshold forced the model to re-stream the same answer twice.
|
|
398
|
+
- **Classifier timeout raised to 10 s**: 2 s was too tight for Ollama Cloud with a fully-configured agent (observed 4–6 s calls). Result caching means the latency tax only applies to first-seen messages.
|
|
399
|
+
- **Reflection memories dedup across runs**: the supervisor reflection writer now compares candidate notes against recent (last 7 days) reflection memories for the same agent and skips ones that have already been stored, stopping the ~7-per-turn rediscovery churn on top of the within-run dedup shipped in v1.5.38.
|
|
400
|
+
|
|
401
|
+
### v1.5.39 Highlights
|
|
402
|
+
|
|
403
|
+
- **Agents default to scoped tool access**: new agents (and existing agents whose `tools` list is non-empty) now only see the tools they've been given in the system prompt. This trims ~3 k input tokens per turn — an observed CEO/coordinator agent with 14 tools and 4 loaded skills went from 62 k to 38 k chars of system prompt. Opt back into the old firehose by toggling **Universal tool access** in the agent sheet's new "Context & Tool Access" section. Memory, context management, and `ask_human` are always included regardless of the scoped list.
|
|
404
|
+
- **Pinned skills budget hardening**: one long markdown skill was eating 24 k of a 62 k prompt. Inlined pinned-skill content is now capped at 3 k chars with a pointer to `use_skill` action="load" for the full guide, and auto-attached *learned* skills get a dedicated sub-budget (max 6 skills / 8 k chars) so they cannot dominate the main pinned-skills section.
|
|
405
|
+
- **OpenClaw chat fast-fails on dangling credentials**: v1.5.38 added gateway-side fast-fail; the chat streaming path now does the same, emitting a clear `err` event naming the missing credential instead of dialing the gateway unauthenticated and waiting 120 s for the timeout.
|
|
406
|
+
- **Queue: orphan-recovery auto-heals stale checkouts**: pre-1.5.38 storage could leave `queued` tasks with a stale `checkoutRunId` that `checkoutTask()` refused forever. Orphan recovery now clears the stale id in the same sweep that re-queues the task, and `reconcileFinishedRunningTasks` / agent-not-found / capability-mismatch paths also null out the checkout when they terminally fail a task.
|
|
407
|
+
- **Perf ring buffer raised to 2 000 entries**: queue/task repository events fire ~20 Hz during task processing and were evicting chat-execution/prompt perf entries out of the 200-entry buffer before they could be read. The larger buffer lets the perf viewer actually show a full turn.
|
|
408
|
+
- **Tests**: added regression tests for pre-1.5.38 stale-checkout orphan recovery and for the scoped-tool-access algorithm.
|
|
409
|
+
|
|
392
410
|
### v1.5.38 Highlights
|
|
393
411
|
|
|
394
412
|
- **Task queue: reclaim stale checkouts**: `checkoutTask()` now reclaims a lingering `checkoutRunId` on a `queued` task instead of refusing it forever. An ungraceful server exit mid-turn (crash, SIGKILL, HMR reload) previously left tasks uncheckoutable, producing a dispatch → orphan-recovery → failed-checkout spin that logged "Recovering orphaned queued task" tens of thousands of times per session. `scheduleRetryOrDeadLetter()` also clears the prior checkout when scheduling a retry or dead-lettering.
|
|
@@ -414,20 +432,6 @@ Operational docs: https://swarmclaw.ai/docs/observability
|
|
|
414
432
|
- **Desktop release CI**: new `desktop-release.yml` workflow builds and publishes installers for all three platforms to GitHub Releases on every version tag.
|
|
415
433
|
- **UI cleanup**: removed sibling-product navigation links from the in-app sidebar rail and login gate so the open-source app focuses on SwarmClaw itself. Those links remain in the project README and on swarmclaw.ai.
|
|
416
434
|
|
|
417
|
-
### v1.5.35 Highlights
|
|
418
|
-
|
|
419
|
-
- **Update safety: prevent DB corruption on Linux**: `npm run update:easy`, `swarmclaw update`, and the in-app update endpoint now stop the running server (or checkpoint the SQLite WAL) before rebuilding native modules, preventing the WAL journal corruption that forced some Linux users back to the setup wizard.
|
|
420
|
-
- **SQLite graceful shutdown**: the server now checkpoints and closes the database on SIGTERM/SIGINT, eliminating stale WAL state after any clean stop.
|
|
421
|
-
- **Doctor: detect dangling gateway credentials**: the setup doctor now flags gateway profiles that reference deleted or missing credentials, explaining the "gateway token missing" connection errors.
|
|
422
|
-
- **Gateway credential resolution logging**: when a gateway credential can't be resolved, the server now logs a clear warning identifying the missing credential ID.
|
|
423
|
-
- **Credential decryption error logging**: when a stored credential can't be decrypted (e.g. after `CREDENTIAL_SECRET` changes), the server now logs the credential ID and provider so users know which key to re-add.
|
|
424
|
-
|
|
425
|
-
### v1.5.34 Highlights
|
|
426
|
-
|
|
427
|
-
- **Ollama Cloud auth fix**: SwarmClaw now normalizes `api.ollama.com` and `www.ollama.com` to `ollama.com` before making authenticated requests, avoiding the redirect that was dropping authorization headers and causing false provider-health/runtime failures.
|
|
428
|
-
- **Chat execution context hardening**: tool invocation now resolves names case-insensitively, oversized tool results are truncated before they are fed back into the model, and proactive grounding/heartbeat prompts stay smaller under pressure to reduce avoidable context blowouts.
|
|
429
|
-
- **API compatibility fixes**: OpenAI-compatible streaming now captures reasoning deltas from providers that emit them outside `delta.content`, and A2A endpoints are exempt from the main proxy access-key gate so they can rely on their own auth scheme.
|
|
430
|
-
|
|
431
435
|
Older releases: https://swarmclaw.ai/docs/release-notes
|
|
432
436
|
|
|
433
437
|
- GitHub releases: https://github.com/swarmclawai/swarmclaw/releases
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@swarmclawai/swarmclaw",
|
|
3
|
-
"version": "1.5.38",
|
|
3
|
+
"version": "1.5.40",
|
|
4
4
|
"description": "Build and run autonomous AI agents with OpenClaw, Hermes, multiple model providers, orchestration, delegation, memory, skills, schedules, and chat connectors.",
|
|
5
5
|
"main": "electron-dist/main.js",
|
|
6
6
|
"license": "MIT",
|
|
@@ -210,6 +210,11 @@ export function AgentSheet() {
|
|
|
210
210
|
const [delegationTargetMode, setDelegationTargetMode] = useState<'all' | 'selected'>('all')
|
|
211
211
|
const [delegationTargetAgentIds, setDelegationTargetAgentIds] = useState<string[]>([])
|
|
212
212
|
const [tools, setTools] = useState<string[]>([])
|
|
213
|
+
// Scoped tool access is the default for new agents (cuts ~3 k input tokens
|
|
214
|
+
// per turn). Existing agents with no toolAccessMode field persisted stay
|
|
215
|
+
// universal server-side for backward compat; the new-agent setup path
|
|
216
|
+
// below also explicitly writes 'scoped' so it persists on save.
|
|
217
|
+
const [toolAccessMode, setToolAccessMode] = useState<'universal' | 'scoped'>('scoped')
|
|
213
218
|
const [extensions, setExtensions] = useState<string[]>([])
|
|
214
219
|
const [enabledExtensionIds, setEnabledExtensionIds] = useState<Set<string> | null>(null)
|
|
215
220
|
const [skills, setSkills] = useState<string[]>([])
|
|
@@ -415,6 +420,7 @@ export function AgentSheet() {
|
|
|
415
420
|
setDelegationTargetMode(editing.delegationTargetMode === 'selected' ? 'selected' : 'all')
|
|
416
421
|
setDelegationTargetAgentIds(editing.delegationTargetAgentIds || [])
|
|
417
422
|
setTools(getEnabledToolIds(editing))
|
|
423
|
+
setToolAccessMode(editing.toolAccessMode === 'scoped' ? 'scoped' : 'universal')
|
|
418
424
|
setExtensions(getEnabledExtensionIds(editing))
|
|
419
425
|
setSkills(editing.skills || [])
|
|
420
426
|
setSkillIds(editing.skillIds || [])
|
|
@@ -497,6 +503,7 @@ export function AgentSheet() {
|
|
|
497
503
|
setDelegationTargetMode(src.delegationTargetMode === 'selected' ? 'selected' : 'all')
|
|
498
504
|
setDelegationTargetAgentIds(src.delegationTargetAgentIds || [])
|
|
499
505
|
setTools(getEnabledToolIds(src))
|
|
506
|
+
setToolAccessMode(src.toolAccessMode === 'scoped' ? 'scoped' : 'universal')
|
|
500
507
|
setExtensions(getEnabledExtensionIds(src))
|
|
501
508
|
setSkills(src.skills || [])
|
|
502
509
|
setSkillIds(src.skillIds || [])
|
|
@@ -576,6 +583,7 @@ export function AgentSheet() {
|
|
|
576
583
|
setDelegationTargetMode('all')
|
|
577
584
|
setDelegationTargetAgentIds([])
|
|
578
585
|
setTools(getDefaultAgentToolIds())
|
|
586
|
+
setToolAccessMode('scoped')
|
|
579
587
|
setExtensions([])
|
|
580
588
|
setSkills([])
|
|
581
589
|
setSkillIds([])
|
|
@@ -783,6 +791,7 @@ export function AgentSheet() {
|
|
|
783
791
|
delegationTargetMode: delegationEnabled || role === 'coordinator' ? delegationTargetMode : 'all',
|
|
784
792
|
delegationTargetAgentIds: (delegationEnabled || role === 'coordinator') && delegationTargetMode === 'selected' ? delegationTargetAgentIds : [],
|
|
785
793
|
tools,
|
|
794
|
+
toolAccessMode,
|
|
786
795
|
extensions,
|
|
787
796
|
skills,
|
|
788
797
|
skillIds,
|
|
@@ -2005,6 +2014,30 @@ export function AgentSheet() {
|
|
|
2005
2014
|
summary={advancedSummary}
|
|
2006
2015
|
badges={agentAdvancedBadges}
|
|
2007
2016
|
>
|
|
2017
|
+
<SectionCard
|
|
2018
|
+
title="Context & Tool Access"
|
|
2019
|
+
description="Control how many tools are described in this agent's system prompt. Scoped (default) keeps the agent focused and saves ~3 k input tokens per turn; Universal gives it visibility into every built-in tool."
|
|
2020
|
+
className="mb-6 border-white/[0.05] bg-white/[0.01]"
|
|
2021
|
+
>
|
|
2022
|
+
<div className="space-y-3">
|
|
2023
|
+
<label className="flex items-center gap-3 cursor-pointer">
|
|
2024
|
+
<div
|
|
2025
|
+
onClick={() => setToolAccessMode((current) => current === 'universal' ? 'scoped' : 'universal')}
|
|
2026
|
+
className={`w-11 h-6 rounded-full transition-all duration-200 relative cursor-pointer shrink-0 ${toolAccessMode === 'universal' ? 'bg-accent-bright' : 'bg-white/[0.08]'}`}
|
|
2027
|
+
>
|
|
2028
|
+
<div className={`absolute top-0.5 w-5 h-5 rounded-full bg-white transition-all duration-200 ${toolAccessMode === 'universal' ? 'left-[22px]' : 'left-0.5'}`} />
|
|
2029
|
+
</div>
|
|
2030
|
+
<span className="text-[13px] text-text-2">Universal tool access</span>
|
|
2031
|
+
<HintTip text="Off (default, recommended): the agent only sees tools enabled in its Tools list. On: every built-in tool is described in the system prompt. Turn on only for coordinator agents that need visibility across every possible downstream tool, or temporarily for debugging." />
|
|
2032
|
+
</label>
|
|
2033
|
+
<p className="text-[12px] text-text-3/70 pl-[56px] -mt-1">
|
|
2034
|
+
{toolAccessMode === 'universal'
|
|
2035
|
+
? 'Full tool universe is injected into the prompt. Costs ~3 k more input tokens per turn.'
|
|
2036
|
+
: 'Only the tools enabled above are visible to the agent — this is the focused default.'}
|
|
2037
|
+
</p>
|
|
2038
|
+
</div>
|
|
2039
|
+
</SectionCard>
|
|
2040
|
+
|
|
2008
2041
|
<SectionCard
|
|
2009
2042
|
title="Voice & Autonomy"
|
|
2010
2043
|
description="Tune voice and the detailed heartbeat behavior for this agent."
|
|
@@ -422,6 +422,21 @@ export function streamOpenClawChat({ session, message, imagePath, apiKey, write,
|
|
|
422
422
|
|
|
423
423
|
const wsUrl = session.apiEndpoint ? deriveOpenClawWsUrl(session.apiEndpoint) : 'ws://127.0.0.1:18789'
|
|
424
424
|
const token = apiKey || session.apiKey || undefined
|
|
425
|
+
// If the session references a credential but nothing resolved, the credential
|
|
426
|
+
// was deleted or corrupted. Fail fast with a clear error instead of dialing
|
|
427
|
+
// the gateway unauthenticated and timing out 120 s later (the original symptom
|
|
428
|
+
// behind the "OpenClaw gateway timed out after 120 s" report).
|
|
429
|
+
const credentialIdSet = typeof session.credentialId === 'string' && session.credentialId.trim().length > 0
|
|
430
|
+
if (credentialIdSet && !token) {
|
|
431
|
+
return Promise.resolve().then(() => {
|
|
432
|
+
active.delete(session.id)
|
|
433
|
+
write(`data: ${JSON.stringify({
|
|
434
|
+
t: 'err',
|
|
435
|
+
text: `OpenClaw credential "${session.credentialId}" is missing from the credential store. Reattach an existing credential or create a new one in Settings → Credentials.`,
|
|
436
|
+
})}\n\n`)
|
|
437
|
+
return ''
|
|
438
|
+
})
|
|
439
|
+
}
|
|
425
440
|
return new Promise((resolve) => {
|
|
426
441
|
let fullResponse = ''
|
|
427
442
|
let settled = false
|
|
@@ -787,6 +787,28 @@ function writeReflectionMemories(params: {
|
|
|
787
787
|
const normalizeNote = (note: string): string =>
|
|
788
788
|
note.toLowerCase().replace(/\s+/g, ' ').trim().slice(0, 240)
|
|
789
789
|
|
|
790
|
+
// Cross-run dedup: skip notes that already exist as a recent reflection
|
|
791
|
+
// memory for this agent. Different reflection runs over successive turns
|
|
792
|
+
// often rediscover the same invariant/lesson because the model re-derives
|
|
793
|
+
// them from the same pattern. Without this guard the reflection table
|
|
794
|
+
// grows ~7 entries per test turn; with it, repeat reflections are absorbed.
|
|
795
|
+
const CROSS_RUN_DEDUP_WINDOW_MS = 7 * 24 * 3600_000 // 7 days
|
|
796
|
+
const crossRunDedupCutoff = createdAt - CROSS_RUN_DEDUP_WINDOW_MS
|
|
797
|
+
try {
|
|
798
|
+
if (params.agentId) {
|
|
799
|
+
const recent = memoryDb.list(params.agentId, 500)
|
|
800
|
+
for (const entry of recent) {
|
|
801
|
+
if (!entry.category || !entry.category.startsWith('reflection/')) continue
|
|
802
|
+
if ((entry.updatedAt || 0) < crossRunDedupCutoff) continue
|
|
803
|
+
const norm = normalizeNote(entry.content || '')
|
|
804
|
+
if (norm) seenNormalized.add(norm)
|
|
805
|
+
}
|
|
806
|
+
}
|
|
807
|
+
} catch {
|
|
808
|
+
// Memory DB lookup is best-effort — if it fails, fall back to within-run
|
|
809
|
+
// dedup only rather than blocking the reflection write.
|
|
810
|
+
}
|
|
811
|
+
|
|
790
812
|
for (const group of groups) {
|
|
791
813
|
for (const note of group.notes) {
|
|
792
814
|
const norm = normalizeNote(note)
|
|
@@ -15,7 +15,7 @@ import { loadSettings } from '@/lib/server/settings/settings-repository'
|
|
|
15
15
|
import { loadSkills } from '@/lib/server/skills/skill-repository'
|
|
16
16
|
import { resolveImagePath } from '@/lib/server/resolve-image'
|
|
17
17
|
import { resolveSessionToolPolicy } from '@/lib/server/tool-capability-policy'
|
|
18
|
-
import { listUniversalToolAccessExtensionIds } from '@/lib/server/universal-tool-access'
|
|
18
|
+
import { listUniversalToolAccessExtensionIds, listScopedToolAccessExtensionIds } from '@/lib/server/universal-tool-access'
|
|
19
19
|
import {
|
|
20
20
|
buildAgentDisabledMessage,
|
|
21
21
|
isAgentDisabled,
|
|
@@ -332,9 +332,17 @@ function buildAgentSystemPrompt(
|
|
|
332
332
|
const allowSilentReplies = isDirectConnectorSession(session)
|
|
333
333
|
const lightweightDirectChat = options?.lightweightDirectChat === true
|
|
334
334
|
const parts: string[] = []
|
|
335
|
-
const
|
|
336
|
-
|
|
337
|
-
|
|
335
|
+
const capabilityIds = getEnabledCapabilityIds(session).length > 0
|
|
336
|
+
? getEnabledCapabilityIds(session)
|
|
337
|
+
: getEnabledCapabilityIds(agent)
|
|
338
|
+
// Scoped tool access is the new default: if the agent declares a non-empty
|
|
339
|
+
// `tools` list, the system prompt only describes those tools. Explicit
|
|
340
|
+
// `toolAccessMode: 'universal'` opts into the full firehose (for coordinators
|
|
341
|
+
// or debugging). Agents with no declared tools fall back to universal so
|
|
342
|
+
// empty-config agents aren't crippled.
|
|
343
|
+
const enabledExtensions = agent.toolAccessMode !== 'universal' && Array.isArray(agent.tools) && agent.tools.length > 0
|
|
344
|
+
? listScopedToolAccessExtensionIds(agent.tools, capabilityIds)
|
|
345
|
+
: listUniversalToolAccessExtensionIds(capabilityIds)
|
|
338
346
|
|
|
339
347
|
const identityLines = ['## My Identity']
|
|
340
348
|
identityLines.push(`Name: ${agent.name}`)
|
|
@@ -547,8 +555,16 @@ export async function prepareChatTurn(input: ExecuteChatTurnInput): Promise<Prep
|
|
|
547
555
|
const runtimeCapabilityIds = filterRuntimeCapabilityIds(getEnabledCapabilityIds(session), {
|
|
548
556
|
delegationEnabled: agentForSession?.delegationEnabled === true,
|
|
549
557
|
})
|
|
558
|
+
// Match the resolver in buildAgentSystemPrompt: default to scoped whenever
|
|
559
|
+
// the agent declares a non-empty tools list, unless explicitly set to
|
|
560
|
+
// 'universal'. Agents with no declared tools stay universal.
|
|
561
|
+
const scopedAccess = agentForSession?.toolAccessMode !== 'universal'
|
|
562
|
+
&& Array.isArray(agentForSession?.tools)
|
|
563
|
+
&& (agentForSession!.tools!.length > 0)
|
|
550
564
|
const requestedCapabilityIds = runtimeCapabilityIds.length > 0
|
|
551
|
-
?
|
|
565
|
+
? (scopedAccess
|
|
566
|
+
? listScopedToolAccessExtensionIds(agentForSession!.tools!, runtimeCapabilityIds)
|
|
567
|
+
: listUniversalToolAccessExtensionIds(runtimeCapabilityIds))
|
|
552
568
|
: []
|
|
553
569
|
const toolPolicy = resolveSessionToolPolicy(requestedCapabilityIds, appSettings)
|
|
554
570
|
const isHeartbeatRun = input.internal === true && source === 'heartbeat'
|
|
@@ -370,8 +370,14 @@ function checkToolSummary(ctx: ContinuationContext): ContinuationDecision | null
|
|
|
370
370
|
isConnectorSession: ctx.isConnectorSession,
|
|
371
371
|
})
|
|
372
372
|
if (skipToolSummaryForShortResponse) return null
|
|
373
|
+
// A 119-char response like "I wrote X, stored Y, and confirmed both." is
|
|
374
|
+
// substantive after two tool calls — it names each action. The prior
|
|
375
|
+
// 150-char threshold treated such responses as trivial preambles and
|
|
376
|
+
// forced a redundant retry that streamed the same answer twice. Tightened
|
|
377
|
+
// to 80 so only genuinely short preambles ("Done.", "Let me do that…")
|
|
378
|
+
// trigger the summary continuation.
|
|
373
379
|
const textIsTrivial = !ctx.state.fullText.trim() || (
|
|
374
|
-
!ctx.isConnectorSession && ctx.state.fullText.trim().length < 150
|
|
380
|
+
!ctx.isConnectorSession && ctx.state.fullText.trim().length < 80
|
|
375
381
|
&& (
|
|
376
382
|
ctx.state.streamedToolEvents.length >= 2
|
|
377
383
|
|| ctx.likelyResearchSynthesisTask
|
|
@@ -32,6 +32,10 @@ export const MessageClassificationSchema = z.object({
|
|
|
32
32
|
isDeliverableTask: z.boolean(),
|
|
33
33
|
isBroadGoal: z.boolean(),
|
|
34
34
|
isLightweightDirectChat: z.boolean().optional().default(false),
|
|
35
|
+
isCurrentThreadRecall: z.boolean().optional().default(false),
|
|
36
|
+
isGreeting: z.boolean().optional().default(false),
|
|
37
|
+
isAcknowledgement: z.boolean().optional().default(false),
|
|
38
|
+
isMemoryWriteIntent: z.boolean().optional().default(false),
|
|
35
39
|
hasHumanSignals: z.boolean(),
|
|
36
40
|
hasSignificantEvent: z.boolean(),
|
|
37
41
|
isResearchSynthesis: z.boolean(),
|
|
@@ -48,6 +52,10 @@ export interface MessageClassification {
|
|
|
48
52
|
isDeliverableTask: boolean
|
|
49
53
|
isBroadGoal: boolean
|
|
50
54
|
isLightweightDirectChat?: boolean
|
|
55
|
+
isCurrentThreadRecall?: boolean
|
|
56
|
+
isGreeting?: boolean
|
|
57
|
+
isAcknowledgement?: boolean
|
|
58
|
+
isMemoryWriteIntent?: boolean
|
|
51
59
|
hasHumanSignals: boolean
|
|
52
60
|
hasSignificantEvent: boolean
|
|
53
61
|
isResearchSynthesis: boolean
|
|
@@ -103,6 +111,10 @@ function buildClassificationPrompt(message: string, recentHistory: string): stri
|
|
|
103
111
|
'- isDeliverableTask (bool): The user wants a concrete artifact produced — a document, report, plan, proposal, landing page, dashboard, HTML file, markdown file, brief, copy, screenshots, or similar deliverable. NOT simple Q&A, code fixes, or single-command tasks.',
|
|
104
112
|
'- isBroadGoal (bool): The message describes a broad, multi-step goal (50+ chars, no code blocks, no file paths, no numbered lists). Short questions ending with "?" are NOT broad goals.',
|
|
105
113
|
'- isLightweightDirectChat (bool): This is a low-signal direct chat turn that should get a natural lightweight reply, such as a greeting, acknowledgment, check-in, or simple social/direct question that does NOT require research, file work, planning, delegation, or tool execution.',
|
|
114
|
+
'- isCurrentThreadRecall (bool): The user is asking about something from THIS CURRENT CHAT THREAD — e.g. "what were both answers you just gave?", "tell me that number again", "what did I just ask?", "your last reply mentioned X — expand on it". The answer is in the visible conversation history above. Return FALSE when the user is asking about prior conversations, sessions from other days, or things they remember from outside this thread (e.g. "remember when we talked about X last week", "what did we decide yesterday"). Regardless of language or exact phrasing, the signal is: does the answer live in the messages above, or does it require a memory/history lookup?',
|
|
115
|
+
'- isGreeting (bool): A standalone greeting with no other task — "hi", "hello", "hey there", "good morning", "yo". Returns FALSE if the greeting is followed by a real request.',
|
|
116
|
+
'- isAcknowledgement (bool): A short acknowledgement / social reply with no action required — "ok", "thanks", "got it", "cool", "makes sense", "sounds good", "nope". Returns FALSE if there is a follow-up question or directive.',
|
|
117
|
+
'- isMemoryWriteIntent (bool): The user is explicitly asking the assistant to remember, store, save, memorize, forget, or correct a durable fact about themselves, a preference, or a standing instruction — "remember my wife is called Anna", "save this as a preference", "forget what I told you about X", "update your memory: I now prefer Y". Returns FALSE for passive statements that happen to mention memory/remembering without asking for a write.',
|
|
106
118
|
'- hasHumanSignals (bool): The message contains personal signals — preferences ("I prefer", "call me"), relationships ("my wife", "my partner", "my kid"), life events ("birthday", "wedding", "promotion", "moving", "graduation", "hospital"), or personal disclosures.',
|
|
107
119
|
'- hasSignificantEvent (bool): The message mentions a notable life/work event or milestone (birthday, anniversary, wedding, graduation, promotion, new job, relocation, illness, funeral, travel, house, deadline, launch).',
|
|
108
120
|
'- isResearchSynthesis (bool): The task requires gathering information from multiple sources and synthesizing it — research reports, competitive analysis, market overviews, literature reviews, multi-source comparisons. NOT simple factual lookups.',
|
|
@@ -121,7 +133,7 @@ function buildClassificationPrompt(message: string, recentHistory: string): stri
|
|
|
121
133
|
'- Prefer the most execution-relevant taskIntent. Example: "research this and send me a voice note" is "research", not "outreach".',
|
|
122
134
|
'',
|
|
123
135
|
'Output shape:',
|
|
124
|
-
'{"taskIntent":"coding|research|browsing|outreach|scheduling|general","isDeliverableTask":bool,"isBroadGoal":bool,"isLightweightDirectChat":bool,"hasHumanSignals":bool,"hasSignificantEvent":bool,"isResearchSynthesis":bool,"workType":"coding|research|writing|review|operations|general","wantsScreenshots":bool,"wantsOutboundDelivery":bool,"wantsVoiceDelivery":bool,"explicitToolRequests":[],"confidence":0.0-1.0}',
|
|
136
|
+
'{"taskIntent":"coding|research|browsing|outreach|scheduling|general","isDeliverableTask":bool,"isBroadGoal":bool,"isLightweightDirectChat":bool,"isCurrentThreadRecall":bool,"isGreeting":bool,"isAcknowledgement":bool,"isMemoryWriteIntent":bool,"hasHumanSignals":bool,"hasSignificantEvent":bool,"isResearchSynthesis":bool,"workType":"coding|research|writing|review|operations|general","wantsScreenshots":bool,"wantsOutboundDelivery":bool,"wantsVoiceDelivery":bool,"explicitToolRequests":[],"confidence":0.0-1.0}',
|
|
125
137
|
'',
|
|
126
138
|
recentHistory ? `Recent context:\n${recentHistory}\n` : '',
|
|
127
139
|
`User message: ${JSON.stringify(message)}`,
|
|
@@ -206,7 +218,13 @@ export interface ClassifyMessageInput {
|
|
|
206
218
|
history?: Message[]
|
|
207
219
|
}
|
|
208
220
|
|
|
209
|
-
|
|
221
|
+
// Timeout sized for Ollama Cloud with a fully-configured agent: observed
|
|
222
|
+
// classifier calls in the 4-6 s range during live testing, plus the expanded
|
|
223
|
+
// 4-flag semantic schema requires a slightly larger JSON output. 10 s
|
|
224
|
+
// accommodates the tail without blocking chat turns for long on a total
|
|
225
|
+
// failure. Result is cached per-message so the latency tax only applies to
|
|
226
|
+
// first-seen messages.
|
|
227
|
+
const CLASSIFIER_TIMEOUT_MS = 10_000
|
|
210
228
|
|
|
211
229
|
/**
|
|
212
230
|
* Classify a user message using a single LLM call.
|
|
@@ -240,6 +258,9 @@ export async function classifyMessage(
|
|
|
240
258
|
options?.generateText
|
|
241
259
|
? options.generateText(prompt)
|
|
242
260
|
: (async () => {
|
|
261
|
+
// Uses the agent's configured LLM (same model/credential), but
|
|
262
|
+
// with a lightweight prompt-only call — no agent system prompt,
|
|
263
|
+
// no tools, no memory injection, no history replay.
|
|
243
264
|
const { llm } = await buildLLM({
|
|
244
265
|
sessionId: input.sessionId,
|
|
245
266
|
agentId: input.agentId || null,
|
|
@@ -253,9 +274,16 @@ export async function classifyMessage(
|
|
|
253
274
|
])
|
|
254
275
|
|
|
255
276
|
const durationMs = Date.now() - startMs
|
|
256
|
-
log.info(TAG, `session=${input.sessionId} completed in ${durationMs}ms`)
|
|
257
|
-
|
|
258
277
|
const classification = parseClassificationResponse(responseText)
|
|
278
|
+
log.info(TAG, `session=${input.sessionId} completed in ${durationMs}ms`, classification ? {
|
|
279
|
+
taskIntent: classification.taskIntent,
|
|
280
|
+
isCurrentThreadRecall: classification.isCurrentThreadRecall || false,
|
|
281
|
+
isGreeting: classification.isGreeting || false,
|
|
282
|
+
isAcknowledgement: classification.isAcknowledgement || false,
|
|
283
|
+
isMemoryWriteIntent: classification.isMemoryWriteIntent || false,
|
|
284
|
+
isLightweightDirectChat: classification.isLightweightDirectChat || false,
|
|
285
|
+
confidence: classification.confidence,
|
|
286
|
+
} : { parsed: false })
|
|
259
287
|
if (classification) {
|
|
260
288
|
setCache(message, classification)
|
|
261
289
|
}
|
|
@@ -344,10 +344,17 @@ export function buildAgenticExecutionPolicy(opts: {
|
|
|
344
344
|
if (hasTooling) {
|
|
345
345
|
parts.push(
|
|
346
346
|
'## Routing Matrix',
|
|
347
|
+
// Smaller open-source models (observed with devstral-small-2:24b) routinely
|
|
348
|
+
// ignore a terse "use the thread first" line and call `memory_search`
|
|
349
|
+
// whenever a user message contains referential words like "that", "those",
|
|
350
|
+
// "both", "my last", "your previous". Spell out the boundary explicitly
|
|
351
|
+
// so compliance is consistent regardless of model size.
|
|
347
352
|
'Current-thread facts already visible in this chat: answer directly from the thread before using tools.',
|
|
353
|
+
'References in the user\'s message to things from THIS conversation — e.g. "that", "those", "both", "your last reply", "the number you gave", "what I just said" — are already in the thread history above. Read the prior messages to answer. Do NOT call `memory_search`, `sessions_tool`, or any recall tool for these.',
|
|
354
|
+
'Only use memory or session-history tools when the user explicitly asks about a PRIOR conversation ("what did we discuss yesterday", "remember when I told you X last week") or names something not present in the current thread.',
|
|
348
355
|
hasMemoryTools
|
|
349
|
-
? 'Facts from previous conversations: start with `memory_search`, then `memory_get` only for a targeted follow-up read.'
|
|
350
|
-
: 'Facts from previous conversations: rely on the visible thread only and state when memory tools are unavailable.',
|
|
356
|
+
? 'Facts from previous conversations (not this thread): start with `memory_search`, then `memory_get` only for a targeted follow-up read.'
|
|
357
|
+
: 'Facts from previous conversations (not this thread): rely on the visible thread only and state when memory tools are unavailable.',
|
|
351
358
|
hasManageSessions
|
|
352
359
|
? 'Harness/session context, lineage, project attachment, or enabled-tool questions: use `sessions_tool` action `identity`.'
|
|
353
360
|
: 'Harness/session introspection is limited here; rely on the runtime orientation block and visible context.',
|
|
@@ -450,7 +457,10 @@ export function buildAgenticExecutionPolicy(opts: {
|
|
|
450
457
|
const exactStructureBlock = buildExactStructureBlock(opts.userMessage)
|
|
451
458
|
if (exactStructureBlock) parts.push(exactStructureBlock)
|
|
452
459
|
}
|
|
453
|
-
|
|
460
|
+
// Delegate to isCurrentThreadRecallRequest which internally prefers the
|
|
461
|
+
// LLM classifier's judgment and falls back to regex only when classifier
|
|
462
|
+
// is unavailable.
|
|
463
|
+
if (opts.userMessage && isCurrentThreadRecallRequest(opts.userMessage, opts.classification ?? null)) {
|
|
454
464
|
parts.push(buildCurrentThreadRecallBlock(opts.history || []))
|
|
455
465
|
}
|
|
456
466
|
}
|
|
@@ -1,10 +1,23 @@
|
|
|
1
1
|
import type { MemoryEntry } from '@/types'
|
|
2
2
|
|
|
3
|
+
// Shape subset — we only need the boolean signals the LLM classifier emits.
|
|
4
|
+
// Typed loosely here to avoid a circular import with chat-execution.
|
|
5
|
+
type ClassificationHint = {
|
|
6
|
+
isCurrentThreadRecall?: boolean
|
|
7
|
+
isGreeting?: boolean
|
|
8
|
+
isAcknowledgement?: boolean
|
|
9
|
+
isMemoryWriteIntent?: boolean
|
|
10
|
+
} | null | undefined
|
|
11
|
+
|
|
12
|
+
// The regexes below are kept as fallbacks: when the LLM classifier returns
|
|
13
|
+
// null (timeout, no provider), these cover the common English phrasings so
|
|
14
|
+
// the system degrades gracefully. Paraphrases, non-English, or novel wordings
|
|
15
|
+
// are handled by the classifier path in callers.
|
|
3
16
|
const ACK_RE = /^(?:ok(?:ay)?|cool|nice|got it|makes sense|thanks|thank you|thx|roger|copy|sounds good|sgtm|yep|yup|y|nope?|nah|kk|done)[.! ]*$/i
|
|
4
17
|
const GREETING_RE = /^(?:hi|hello|hey|yo|morning|good morning|good afternoon|good evening)[.! ]*$/i
|
|
5
18
|
const MEMORY_META_RE = /\b(?:remember|memory|memorize|store this|save this|forget)\b/i
|
|
6
19
|
const LOW_SIGNAL_RESPONSE_RE = /^(?:HEARTBEAT_OK|NO_MESSAGE)\b/i
|
|
7
|
-
const CURRENT_THREAD_RECALL_MARKER_RE = /\b(?:this conversation|this chat|this thread|current conversation|current chat|current thread|same thread|same chat|same conversation|earlier in (?:this )?(?:conversation|chat|thread)|from (?:this|our) (?:conversation|chat|thread)|you just stored|you just said|
|
|
20
|
+
const CURRENT_THREAD_RECALL_MARKER_RE = /\b(?:this conversation|this chat|this thread|current conversation|current chat|current thread|same thread|same chat|same conversation|earlier in (?:this )?(?:conversation|chat|thread)|from (?:this|our) (?:conversation|chat|thread)|you just stored|you just said|you just gave|you just told|you just answered|you just replied|i just (?:said|asked|gave|told|mentioned)|we just (?:discussed|decided|talked)|your last (?:reply|answer|response|message)|my last (?:question|message)|above in (?:this |the )?(?:chat|thread|conversation)|(?:both|two|all) (?:answers|numbers|results|replies|responses))\b/i
|
|
8
21
|
const CURRENT_THREAD_RECALL_INTENT_RE = /\b(?:what|which|who|when|where|did|remind|recap|summarize|repeat|list|tell me|answer|confirm|recall|mention)\b/i
|
|
9
22
|
const DIRECT_MEMORY_WRITE_MARKER_RE = /\b(?:remember|memorize|store (?:this|that|the fact|it)|save (?:this|that|the fact|it) (?:to|in) memory|write to memory|add to memory|update.*memory|correct.*memory)\b/i
|
|
10
23
|
const DIRECT_MEMORY_WRITE_FOLLOWUP_RE = /\b(?:confirm|recap|repeat|summarize|what you just stored|what you saved|what you updated)\b/i
|
|
@@ -17,17 +30,36 @@ function lower(value: string | null | undefined): string {
|
|
|
17
30
|
return normalizeWhitespace(value || '').toLowerCase()
|
|
18
31
|
}
|
|
19
32
|
|
|
20
|
-
export function shouldInjectMemoryContext(
|
|
33
|
+
export function shouldInjectMemoryContext(
|
|
34
|
+
message: string,
|
|
35
|
+
classification?: ClassificationHint,
|
|
36
|
+
): boolean {
|
|
21
37
|
const trimmed = normalizeWhitespace(message)
|
|
22
38
|
if (!trimmed) return false
|
|
39
|
+
// Prefer the LLM classifier's judgment when available — it generalizes across
|
|
40
|
+
// paraphrases and non-English phrasings that the static regexes miss.
|
|
41
|
+
if (classification) {
|
|
42
|
+
if (classification.isGreeting === true) return false
|
|
43
|
+
if (classification.isAcknowledgement === true) return false
|
|
44
|
+
if (classification.isMemoryWriteIntent === true && trimmed.length < 24) return false
|
|
45
|
+
return true
|
|
46
|
+
}
|
|
47
|
+
// Regex fallback for when classifier is unavailable.
|
|
23
48
|
if (trimmed.length < 16 && (ACK_RE.test(trimmed) || GREETING_RE.test(trimmed))) return false
|
|
24
49
|
if (trimmed.length < 24 && MEMORY_META_RE.test(trimmed)) return false
|
|
25
50
|
return true
|
|
26
51
|
}
|
|
27
52
|
|
|
28
|
-
export function isCurrentThreadRecallRequest(
|
|
53
|
+
export function isCurrentThreadRecallRequest(
|
|
54
|
+
message: string,
|
|
55
|
+
classification?: ClassificationHint,
|
|
56
|
+
): boolean {
|
|
29
57
|
const trimmed = normalizeWhitespace(message)
|
|
30
58
|
if (!trimmed) return false
|
|
59
|
+
if (classification?.isCurrentThreadRecall === true) return true
|
|
60
|
+
// Regex fallback. Skip when classifier confidently said "not thread recall"
|
|
61
|
+
// (isCurrentThreadRecall === false explicitly — not just missing).
|
|
62
|
+
if (classification && classification.isCurrentThreadRecall === false) return false
|
|
31
63
|
if (!CURRENT_THREAD_RECALL_MARKER_RE.test(trimmed)) return false
|
|
32
64
|
if (DIRECT_MEMORY_WRITE_MARKER_RE.test(trimmed) && DIRECT_MEMORY_WRITE_FOLLOWUP_RE.test(trimmed)) return false
|
|
33
65
|
if (/\b(?:remember|store|save)\b/i.test(trimmed) && !/\?\s*$/.test(trimmed) && !/\b(?:what|which|who|when|where|did|confirm|recap|summarize|repeat|list|tell me|answer|recall)\b/i.test(trimmed)) {
|
|
@@ -34,7 +34,11 @@ const perfState = hmrSingleton('__swarmclaw_perf__', () => ({
|
|
|
34
34
|
recentEntries: [] as PerfEntry[],
|
|
35
35
|
}))
|
|
36
36
|
|
|
37
|
-
|
|
37
|
+
// Keep a generous ring buffer so perf entries from a chat turn survive the
|
|
38
|
+
// flurry of repository/queue events that fire between them. 200 was too small
|
|
39
|
+
// — queue.get/tasks.list fire ~20/s during task processing and would evict
|
|
40
|
+
// chat-execution/prompt entries before they could be read.
|
|
41
|
+
const MAX_RECENT = 2000
|
|
38
42
|
|
|
39
43
|
function emitEntry(entry: PerfEntry): void {
|
|
40
44
|
perfState.recentEntries.push(entry)
|
|
@@ -700,6 +700,7 @@ export function reconcileFinishedRunningTasks(): { reconciled: number; deadLette
|
|
|
700
700
|
if (!fallbackText && !task.result) {
|
|
701
701
|
task.status = 'failed'
|
|
702
702
|
task.result = 'Agent session finished without producing output.'
|
|
703
|
+
task.checkoutRunId = null
|
|
703
704
|
task.updatedAt = now
|
|
704
705
|
tasksDirty = true
|
|
705
706
|
continue
|
|
@@ -1105,13 +1106,23 @@ export async function processNext() {
|
|
|
1105
1106
|
const currentQueue = loadQueue()
|
|
1106
1107
|
const queueSet = new Set(currentQueue)
|
|
1107
1108
|
let recovered = false
|
|
1109
|
+
let tasksDirty = false
|
|
1108
1110
|
for (const [id, t] of Object.entries(allTasks) as [string, BoardTask][]) {
|
|
1109
1111
|
if (t.status === 'queued' && !queueSet.has(id)) {
|
|
1110
1112
|
log.info(TAG, `[queue] Recovering orphaned queued task: "${t.title}" (${id})`)
|
|
1113
|
+
// Defence in depth: a queued task must not carry a stale checkoutRunId
|
|
1114
|
+
// (left over from pre-1.5.38 retries). If it does, checkoutTask() will
|
|
1115
|
+
// reject every attempt and this orphan-recovery loop will spin at 100%
|
|
1116
|
+
// CPU re-queueing a task that can never run.
|
|
1117
|
+
if (t.checkoutRunId) {
|
|
1118
|
+
t.checkoutRunId = null
|
|
1119
|
+
tasksDirty = true
|
|
1120
|
+
}
|
|
1111
1121
|
pushQueueUnique(currentQueue, id)
|
|
1112
1122
|
recovered = true
|
|
1113
1123
|
}
|
|
1114
1124
|
}
|
|
1125
|
+
if (tasksDirty) saveTasks(allTasks)
|
|
1115
1126
|
if (recovered) saveQueue(currentQueue)
|
|
1116
1127
|
}
|
|
1117
1128
|
|
|
@@ -1152,6 +1163,7 @@ export async function processNext() {
|
|
|
1152
1163
|
if (!agent) {
|
|
1153
1164
|
task.status = 'failed'
|
|
1154
1165
|
task.deadLetteredAt = Date.now()
|
|
1166
|
+
task.checkoutRunId = null
|
|
1155
1167
|
task.error = `Agent ${task.agentId} not found`
|
|
1156
1168
|
task.updatedAt = Date.now()
|
|
1157
1169
|
saveTasks(latestTasks)
|
|
@@ -1182,6 +1194,7 @@ export async function processNext() {
|
|
|
1182
1194
|
} else {
|
|
1183
1195
|
task.status = 'failed'
|
|
1184
1196
|
task.deadLetteredAt = Date.now()
|
|
1197
|
+
task.checkoutRunId = null
|
|
1185
1198
|
task.error = `No agent matches required capabilities: [${reqCaps.join(', ')}]`
|
|
1186
1199
|
task.updatedAt = Date.now()
|
|
1187
1200
|
saveTasks(latestTasks)
|
|
@@ -309,6 +309,67 @@ describe('queue recovery', () => {
|
|
|
309
309
|
assert.equal(output.attempts, 1)
|
|
310
310
|
})
|
|
311
311
|
|
|
312
|
+
it('processNext orphan recovery clears stale checkoutRunId on queued tasks', () => {
|
|
313
|
+
// Regression: tasks written before the 1.5.38 fix could land in storage with
|
|
314
|
+
// status='queued' + a set checkoutRunId (because the old scheduleRetryOrDeadLetter
|
|
315
|
+
// forgot to release the checkout). Orphan recovery must repair this invalid combo
|
|
316
|
+
// so the next checkoutTask() can succeed — otherwise the loop spins forever.
|
|
317
|
+
const output = runWithTempDataDir<{
|
|
318
|
+
status: string | null
|
|
319
|
+
checkoutRunId: string | null
|
|
320
|
+
queued: string[]
|
|
321
|
+
}>(`
|
|
322
|
+
const storageMod = await import('@/lib/server/storage')
|
|
323
|
+
const queueMod = await import('@/lib/server/runtime/queue')
|
|
324
|
+
const storage = storageMod.default || storageMod
|
|
325
|
+
const queue = queueMod.default || queueMod
|
|
326
|
+
|
|
327
|
+
const now = Date.now()
|
|
328
|
+
storage.saveAgents({
|
|
329
|
+
'agent-a': {
|
|
330
|
+
id: 'agent-a',
|
|
331
|
+
name: 'Agent A',
|
|
332
|
+
provider: 'openai',
|
|
333
|
+
model: 'gpt-test',
|
|
334
|
+
createdAt: now,
|
|
335
|
+
updatedAt: now,
|
|
336
|
+
},
|
|
337
|
+
})
|
|
338
|
+
storage.saveTasks({
|
|
339
|
+
stale: {
|
|
340
|
+
id: 'stale',
|
|
341
|
+
title: 'Pre-1.5.38 stuck task',
|
|
342
|
+
description: 'Queued but still holds a stale checkoutRunId from a prior failed run',
|
|
343
|
+
status: 'queued',
|
|
344
|
+
agentId: 'agent-a',
|
|
345
|
+
checkoutRunId: 'stale-run-id',
|
|
346
|
+
createdAt: now - 10_000,
|
|
347
|
+
updatedAt: now - 10_000,
|
|
348
|
+
},
|
|
349
|
+
})
|
|
350
|
+
// Intentionally NOT in the queue array — simulates the orphan condition.
|
|
351
|
+
storage.saveQueue([])
|
|
352
|
+
|
|
353
|
+
await queue.processNext()
|
|
354
|
+
|
|
355
|
+
const task = storage.loadTasks().stale
|
|
356
|
+
console.log(JSON.stringify({
|
|
357
|
+
status: task?.status ?? null,
|
|
358
|
+
checkoutRunId: task?.checkoutRunId ?? null,
|
|
359
|
+
queued: storage.loadQueue(),
|
|
360
|
+
}))
|
|
361
|
+
`)
|
|
362
|
+
|
|
363
|
+
// Orphan recovery should have put the task back in the queue AND cleared the stale id.
|
|
364
|
+
assert.equal(output.checkoutRunId, null, 'orphan recovery must clear stale checkoutRunId')
|
|
365
|
+
// After recovery the task either stayed queued or moved to running (depending on concurrency).
|
|
366
|
+
// Either way it must not still be stuck in an orphan state.
|
|
367
|
+
assert.ok(
|
|
368
|
+
output.status === 'queued' || output.status === 'running' || output.status === 'failed',
|
|
369
|
+
`unexpected status after recovery: ${output.status}`,
|
|
370
|
+
)
|
|
371
|
+
})
|
|
372
|
+
|
|
312
373
|
it('dead-letter path clears checkoutRunId so terminal tasks do not appear checked-out', () => {
|
|
313
374
|
const output = runWithTempDataDir<{
|
|
314
375
|
status: string | null
|
|
@@ -65,6 +65,40 @@ type MemoryActionContext = Partial<Session> & {
|
|
|
65
65
|
|
|
66
66
|
type MemorySearchSource = 'durable' | 'working' | 'archive' | 'all'
|
|
67
67
|
type NarrowMemoryAction = 'search' | 'get' | 'store' | 'update'
|
|
68
|
+
|
|
69
|
+
// Heuristic for detecting queries that actually refer to the current chat
|
|
70
|
+
// thread, not durable memory. Phrases like "just", "last reply", "both"
|
|
71
|
+
// (without any "yesterday/last week/before/earlier conversation" qualifier)
|
|
72
|
+
// are almost always pronouns targeting the visible thread. Small open-source
|
|
73
|
+
// models routinely run memory_search for these and then truthfully report
|
|
74
|
+
// "no memories found" even though the answer is three messages up.
|
|
75
|
+
const THREAD_RECALL_SIGNALS = [
|
|
76
|
+
/\bjust\b/i,
|
|
77
|
+
/\blast reply\b/i,
|
|
78
|
+
/\bmy last\b/i,
|
|
79
|
+
/\byour last\b/i,
|
|
80
|
+
/\bprevious (reply|answer|response|message)\b/i,
|
|
81
|
+
/\babove\b/i,
|
|
82
|
+
/\bwhat (i|you) (just|last) (said|asked|answered|gave|told)\b/i,
|
|
83
|
+
/\b(both|two|all) (answers|numbers|replies|responses)\b/i,
|
|
84
|
+
/\bthe (answer|number|result) you (just|last) (gave|said)\b/i,
|
|
85
|
+
]
|
|
86
|
+
const PRIOR_CONVERSATION_SIGNALS = [
|
|
87
|
+
/\byesterday\b/i,
|
|
88
|
+
/\blast (week|month|year|time)\b/i,
|
|
89
|
+
/\bearlier (today|conversation|session|chat)\b/i,
|
|
90
|
+
/\bbefore we\b/i,
|
|
91
|
+
/\bremember (that|when|the time)\b/i,
|
|
92
|
+
/\bin a (previous|prior) (chat|session|conversation)\b/i,
|
|
93
|
+
]
|
|
94
|
+
function isLikelyThreadRecallQuery(query: string): boolean {
|
|
95
|
+
if (typeof query !== 'string' || !query.trim()) return false
|
|
96
|
+
// If the user explicitly mentions a prior conversation/session, it's NOT
|
|
97
|
+
// a thread recall — let memory_search run normally.
|
|
98
|
+
if (PRIOR_CONVERSATION_SIGNALS.some((rx) => rx.test(query))) return false
|
|
99
|
+
return THREAD_RECALL_SIGNALS.some((rx) => rx.test(query))
|
|
100
|
+
}
|
|
101
|
+
|
|
68
102
|
type CanonicalMemoryCandidate = {
|
|
69
103
|
entry: MemoryEntry
|
|
70
104
|
score: number
|
|
@@ -567,6 +601,15 @@ export async function executeMemoryAction(input: unknown, ctx: MemoryActionConte
|
|
|
567
601
|
}
|
|
568
602
|
|
|
569
603
|
if (resolvedAction === 'search') {
|
|
604
|
+
// Short-circuit when the query obviously refers to something in the
|
|
605
|
+
// current chat thread (e.g. "both answers I just got", "your last reply",
|
|
606
|
+
// "what I just said"). Small open-source models repeatedly call
|
|
607
|
+
// memory_search for this pattern instead of reading the thread above,
|
|
608
|
+
// then truthfully report "no memories found" even though the answer is
|
|
609
|
+
// three messages up. Redirect them back to the thread.
|
|
610
|
+
if (queryText && isLikelyThreadRecallQuery(queryText)) {
|
|
611
|
+
return 'No stored memories match this query, and the phrasing looks like a reference to the current chat thread (e.g. "just", "last reply", "both"). The information is already in the conversation history above — read the prior messages in this thread to answer instead of searching memory.'
|
|
612
|
+
}
|
|
570
613
|
const queries = queryText ? await expandQuery(queryText) : [keyText]
|
|
571
614
|
const allResults: MemoryEntry[] = []
|
|
572
615
|
const seenIds = new Set<string>()
|
|
@@ -654,6 +654,16 @@ export function resolveRuntimeSkills(options: ResolveRuntimeSkillsOptions = {}):
|
|
|
654
654
|
}
|
|
655
655
|
}
|
|
656
656
|
|
|
657
|
+
// Dedicated sub-budget for auto-attached learned skills. buildSeedFromLearned
|
|
658
|
+
// marks every learned skill as `attached`, which means a single coordinator
|
|
659
|
+
// agent with 100+ historical learnings could flood the whole 30 k pinned-skill
|
|
660
|
+
// block every turn (observed: 178 learned skills / 176 k chars candidate pool
|
|
661
|
+
// → 24 k-char Pinned Skills section on every CEO turn). We cap learned-skill
|
|
662
|
+
// injection well below the full budget so explicitly-pinned/always-on skills
|
|
663
|
+
// still fit afterward.
|
|
664
|
+
const MAX_LEARNED_SKILLS_PROMPT_CHARS = 8000
|
|
665
|
+
const MAX_LEARNED_SKILLS_IN_PROMPT = 6
|
|
666
|
+
|
|
657
667
|
function selectPromptSkills(skills: ResolvedRuntimeSkill[]): ResolvedRuntimeSkill[] {
|
|
658
668
|
const ordered = [...skills]
|
|
659
669
|
.filter((skill) =>
|
|
@@ -670,16 +680,39 @@ function selectPromptSkills(skills: ResolvedRuntimeSkill[]): ResolvedRuntimeSkil
|
|
|
670
680
|
|
|
671
681
|
const selected: ResolvedRuntimeSkill[] = []
|
|
672
682
|
let totalChars = 0
|
|
683
|
+
let learnedChars = 0
|
|
684
|
+
let learnedCount = 0
|
|
673
685
|
for (const skill of ordered) {
|
|
674
686
|
if (selected.length >= MAX_SKILLS_IN_PROMPT) break
|
|
675
687
|
const contentLen = skill.name.length + skill.content.length + 12
|
|
676
688
|
if (totalChars + contentLen > MAX_SKILLS_PROMPT_CHARS) continue
|
|
689
|
+
const isLearned = skill.source === 'learned'
|
|
690
|
+
if (isLearned) {
|
|
691
|
+
if (learnedCount >= MAX_LEARNED_SKILLS_IN_PROMPT) continue
|
|
692
|
+
if (learnedChars + contentLen > MAX_LEARNED_SKILLS_PROMPT_CHARS) continue
|
|
693
|
+
learnedChars += contentLen
|
|
694
|
+
learnedCount += 1
|
|
695
|
+
}
|
|
677
696
|
totalChars += contentLen
|
|
678
697
|
selected.push(skill)
|
|
679
698
|
}
|
|
680
699
|
return selected
|
|
681
700
|
}
|
|
682
701
|
|
|
702
|
+
// Hard cap on how much skill content we inline per pinned skill. Long skill
|
|
703
|
+
// files (multi-page markdown guides) were dominating the system prompt — one
|
|
704
|
+
// coordinator agent had 24,402 chars (39% of its 62 k budget) from a single
|
|
705
|
+
// pinned skill. When content exceeds the cap we truncate and instruct the
|
|
706
|
+
// agent to pull the rest on demand via `use_skill` action="load".
|
|
707
|
+
const INLINED_SKILL_CHAR_CAP = 3000
|
|
708
|
+
|
|
709
|
+
function truncateInlinedSkillContent(content: string, skillName: string): string {
|
|
710
|
+
const trimmed = content.trim()
|
|
711
|
+
if (trimmed.length <= INLINED_SKILL_CHAR_CAP) return trimmed
|
|
712
|
+
const head = trimmed.slice(0, INLINED_SKILL_CHAR_CAP)
|
|
713
|
+
return `${head}\n\n[Skill content truncated at ${INLINED_SKILL_CHAR_CAP} chars to save context. Call \`use_skill\` with action="load" and skillId for "${skillName}" to load the full guide when you need it.]`
|
|
714
|
+
}
|
|
715
|
+
|
|
683
716
|
function sectionFromSkills(params: {
|
|
684
717
|
title: string
|
|
685
718
|
preface: string
|
|
@@ -688,7 +721,7 @@ function sectionFromSkills(params: {
|
|
|
688
721
|
const usable = params.skills.filter((skill) => skill.content.trim())
|
|
689
722
|
if (usable.length === 0) return ''
|
|
690
723
|
const body = usable
|
|
691
|
-
.map((skill) => `### ${skill.name}\n${skill.content}`)
|
|
724
|
+
.map((skill) => `### ${skill.name}\n${truncateInlinedSkillContent(skill.content, skill.name)}`)
|
|
692
725
|
.join('\n\n')
|
|
693
726
|
return [params.title, params.preface, '', body].join('\n')
|
|
694
727
|
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import { describe, it } from 'node:test'
|
|
2
|
+
import assert from 'node:assert/strict'
|
|
3
|
+
|
|
4
|
+
// NOTE: we intentionally avoid importing the real universal-tool-access
|
|
5
|
+
// module here — it pulls in the extension manager which transitively loads
|
|
6
|
+
// the whole plugin system and OOMs in test workers. We re-declare the pure
|
|
7
|
+
// logic and verify the algorithmic behavior. Integration coverage for the
|
|
8
|
+
// extension-manager branch happens via live-chat profiling instead.
|
|
9
|
+
|
|
10
|
+
const SCOPED_TOOL_BASELINE = ['memory', 'context_mgmt', 'ask_human'] as const
|
|
11
|
+
const UNIVERSAL_SAMPLE = new Set([
|
|
12
|
+
'shell', 'files', 'edit_file', 'delegate', 'web', 'browser', 'memory',
|
|
13
|
+
'manage_platform', 'manage_tasks', 'context_mgmt', 'ask_human',
|
|
14
|
+
'schedule_wake', 'email', 'image_gen',
|
|
15
|
+
])
|
|
16
|
+
|
|
17
|
+
function normalize(value: string[] | undefined | null): string[] {
|
|
18
|
+
if (!Array.isArray(value)) return []
|
|
19
|
+
return value.map((entry) => (typeof entry === 'string' ? entry.trim() : '')).filter(Boolean)
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
function scoped(declared: string[] | null | undefined, universe: Set<string> = UNIVERSAL_SAMPLE): string[] {
|
|
23
|
+
const picks = normalize(declared).filter((t) => universe.has(t))
|
|
24
|
+
return Array.from(new Set([...SCOPED_TOOL_BASELINE, ...picks]))
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
describe('scoped tool access algorithm', () => {
|
|
28
|
+
it('intersects declared tools with the universe and keeps the baseline', () => {
|
|
29
|
+
const out = scoped(['shell', 'files', 'edit_file', 'web'])
|
|
30
|
+
assert.ok(out.includes('memory'))
|
|
31
|
+
assert.ok(out.includes('context_mgmt'))
|
|
32
|
+
assert.ok(out.includes('ask_human'))
|
|
33
|
+
assert.ok(out.includes('shell'))
|
|
34
|
+
assert.ok(out.includes('files'))
|
|
35
|
+
assert.ok(out.includes('edit_file'))
|
|
36
|
+
assert.ok(out.includes('web'))
|
|
37
|
+
assert.ok(!out.includes('browser'))
|
|
38
|
+
assert.ok(!out.includes('manage_platform'))
|
|
39
|
+
assert.ok(!out.includes('delegate'))
|
|
40
|
+
})
|
|
41
|
+
|
|
42
|
+
it('drops declared tools that are not in the universe', () => {
|
|
43
|
+
const out = scoped(['shell', 'not_a_real_tool'])
|
|
44
|
+
assert.ok(out.includes('shell'))
|
|
45
|
+
assert.ok(!out.includes('not_a_real_tool'))
|
|
46
|
+
})
|
|
47
|
+
|
|
48
|
+
it('returns only the baseline when declared tools is empty', () => {
|
|
49
|
+
assert.deepEqual(scoped([]).sort(), ['ask_human', 'context_mgmt', 'memory'])
|
|
50
|
+
})
|
|
51
|
+
|
|
52
|
+
it('produces a strictly smaller set than the universe for a focused agent', () => {
|
|
53
|
+
assert.ok(scoped(['shell', 'files', 'web']).length < UNIVERSAL_SAMPLE.size)
|
|
54
|
+
})
|
|
55
|
+
|
|
56
|
+
it('deduplicates when baseline overlaps with declared tools', () => {
|
|
57
|
+
const out = scoped(['memory', 'shell'])
|
|
58
|
+
assert.equal(out.filter((t) => t === 'memory').length, 1)
|
|
59
|
+
})
|
|
60
|
+
|
|
61
|
+
it('treats null / undefined / non-array declared tools as empty', () => {
|
|
62
|
+
assert.deepEqual(scoped(null).sort(), ['ask_human', 'context_mgmt', 'memory'])
|
|
63
|
+
assert.deepEqual(scoped(undefined).sort(), ['ask_human', 'context_mgmt', 'memory'])
|
|
64
|
+
})
|
|
65
|
+
|
|
66
|
+
it('trims whitespace in declared tool names', () => {
|
|
67
|
+
const out = scoped([' shell ', '\tfiles\n'])
|
|
68
|
+
assert.ok(out.includes('shell'))
|
|
69
|
+
assert.ok(out.includes('files'))
|
|
70
|
+
})
|
|
71
|
+
})
|
|
@@ -57,3 +57,26 @@ export function listUniversalToolAccessExtensionIds(extraExtensions?: string[] |
|
|
|
57
57
|
...normalizeExtensionList(extraExtensions),
|
|
58
58
|
])
|
|
59
59
|
}
|
|
60
|
+
|
|
61
|
+
// Minimum extensions that a 'scoped' agent always gets regardless of its
|
|
62
|
+
// declared tool list. Memory + context management are required for the agent
|
|
63
|
+
// to function (remembering things, noticing when it's out of context), and
|
|
64
|
+
// ask_human lets it escalate to the user when stuck. Everything else is
|
|
65
|
+
// filterable through agent.tools.
|
|
66
|
+
const SCOPED_TOOL_BASELINE = ['memory', 'context_mgmt', 'ask_human'] as const
|
|
67
|
+
|
|
68
|
+
/**
|
|
69
|
+
* Returns the set of enabled extension IDs for a scoped-access agent: the
|
|
70
|
+
* intersection of `listUniversalToolAccessExtensionIds()` with the agent's
|
|
71
|
+
* declared tools, plus the non-negotiable baseline. Use this when an agent
|
|
72
|
+
* has opted into `toolAccessMode: 'scoped'` to shrink per-turn context.
|
|
73
|
+
*/
|
|
74
|
+
export function listScopedToolAccessExtensionIds(
|
|
75
|
+
declaredTools: string[] | null | undefined,
|
|
76
|
+
extraExtensions?: string[] | null,
|
|
77
|
+
): string[] {
|
|
78
|
+
const universe = new Set(listUniversalToolAccessExtensionIds(extraExtensions))
|
|
79
|
+
const declared = normalizeExtensionList(declaredTools)
|
|
80
|
+
const scoped = declared.filter((tool) => universe.has(tool))
|
|
81
|
+
return dedup([...SCOPED_TOOL_BASELINE, ...scoped])
|
|
82
|
+
}
|
package/src/types/agent.ts
CHANGED
|
@@ -68,6 +68,13 @@ export interface Agent {
|
|
|
68
68
|
delegationTargetMode?: DelegationTargetMode
|
|
69
69
|
delegationTargetAgentIds?: string[]
|
|
70
70
|
tools?: string[]
|
|
71
|
+
// When 'scoped', the chat turn restricts enabled extensions to the
|
|
72
|
+
// intersection of the universal core list and agent.tools (plus a small
|
|
73
|
+
// non-negotiable baseline for memory + context management). Default
|
|
74
|
+
// 'universal' preserves existing behavior. Opt in to cut per-turn tool
|
|
75
|
+
// guidance dramatically — a focused agent with 5 tools drops ~15 k chars
|
|
76
|
+
// of tool-related prompt text vs. the full 33-tool universe.
|
|
77
|
+
toolAccessMode?: 'universal' | 'scoped'
|
|
71
78
|
extensions?: string[]
|
|
72
79
|
skills?: string[] // e.g. ['frontend-design'] — pinned Claude Code skills to mention explicitly
|
|
73
80
|
skillIds?: string[] // IDs of pinned managed skills to keep always-on for this agent
|