@swarmclawai/swarmclaw 0.7.7 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -14
- package/next.config.ts +13 -2
- package/package.json +4 -2
- package/src/app/api/agents/[id]/thread/route.ts +9 -0
- package/src/app/api/agents/route.ts +4 -0
- package/src/app/api/agents/thread-route.test.ts +133 -0
- package/src/app/api/approvals/route.test.ts +148 -0
- package/src/app/api/canvas/[sessionId]/route.ts +3 -1
- package/src/app/api/chatrooms/[id]/chat/route.ts +4 -2
- package/src/app/api/chats/[id]/devserver/route.ts +48 -7
- package/src/app/api/chats/[id]/messages/route.ts +42 -18
- package/src/app/api/chats/[id]/route.ts +1 -1
- package/src/app/api/chats/[id]/stop/route.ts +5 -4
- package/src/app/api/chats/route.ts +23 -2
- package/src/app/api/clawhub/install/route.ts +28 -8
- package/src/app/api/connectors/[id]/route.ts +46 -3
- package/src/app/api/connectors/route.ts +12 -8
- package/src/app/api/external-agents/route.test.ts +165 -0
- package/src/app/api/gateways/[id]/health/route.ts +27 -12
- package/src/app/api/gateways/[id]/route.ts +2 -0
- package/src/app/api/gateways/health-route.test.ts +135 -0
- package/src/app/api/gateways/route.ts +2 -0
- package/src/app/api/mcp-servers/route.test.ts +130 -0
- package/src/app/api/openclaw/deploy/route.ts +38 -5
- package/src/app/api/plugins/install/route.ts +46 -6
- package/src/app/api/plugins/marketplace/route.ts +48 -15
- package/src/app/api/preview-server/route.ts +26 -11
- package/src/app/api/projects/[id]/route.ts +6 -2
- package/src/app/api/projects/route.ts +4 -3
- package/src/app/api/schedules/[id]/run/route.ts +4 -0
- package/src/app/api/schedules/route.test.ts +86 -0
- package/src/app/api/schedules/route.ts +6 -1
- package/src/app/api/secrets/[id]/route.ts +1 -0
- package/src/app/api/secrets/route.ts +2 -1
- package/src/app/api/settings/route.ts +2 -0
- package/src/app/api/setup/check-provider/route.test.ts +19 -0
- package/src/app/api/setup/check-provider/route.ts +40 -10
- package/src/app/api/skills/[id]/route.ts +12 -0
- package/src/app/api/skills/import/route.ts +14 -12
- package/src/app/api/skills/route.ts +13 -1
- package/src/app/api/tasks/[id]/route.ts +10 -1
- package/src/app/api/tasks/import/github/route.test.ts +65 -0
- package/src/app/api/tasks/import/github/route.ts +337 -0
- package/src/app/api/wallets/[id]/approve/route.ts +17 -3
- package/src/app/api/wallets/[id]/route.ts +79 -33
- package/src/app/api/wallets/[id]/send/route.ts +19 -33
- package/src/app/api/wallets/route.ts +78 -61
- package/src/app/api/webhooks/[id]/route.ts +33 -6
- package/src/app/api/webhooks/route.test.ts +272 -0
- package/src/cli/index.js +1 -0
- package/src/cli/spec.js +1 -0
- package/src/components/agents/agent-card.tsx +9 -2
- package/src/components/agents/agent-chat-list.tsx +18 -2
- package/src/components/agents/agent-list.tsx +1 -0
- package/src/components/agents/agent-sheet.tsx +257 -38
- package/src/components/agents/inspector-panel.tsx +41 -0
- package/src/components/canvas/canvas-panel.tsx +236 -65
- package/src/components/chat/chat-area.tsx +36 -19
- package/src/components/chat/chat-card.tsx +36 -13
- package/src/components/chat/chat-header.tsx +48 -16
- package/src/components/chat/chat-list.tsx +28 -4
- package/src/components/chat/checkpoint-timeline.tsx +50 -34
- package/src/components/chat/delegation-banner.test.ts +14 -1
- package/src/components/chat/delegation-banner.tsx +1 -1
- package/src/components/chat/message-bubble.tsx +208 -145
- package/src/components/chat/message-list.tsx +48 -19
- package/src/components/chatrooms/chatroom-message.tsx +2 -2
- package/src/components/chatrooms/chatroom-sheet.tsx +16 -2
- package/src/components/connectors/connector-health.tsx +1 -1
- package/src/components/connectors/connector-list.tsx +7 -2
- package/src/components/connectors/connector-sheet.tsx +337 -148
- package/src/components/gateways/gateway-sheet.tsx +2 -2
- package/src/components/layout/app-layout.tsx +40 -23
- package/src/components/mcp-servers/mcp-server-list.tsx +26 -5
- package/src/components/mcp-servers/mcp-server-sheet.tsx +19 -2
- package/src/components/openclaw/openclaw-deploy-panel.tsx +269 -21
- package/src/components/plugins/plugin-list.tsx +45 -9
- package/src/components/plugins/plugin-sheet.tsx +55 -7
- package/src/components/projects/project-detail.tsx +217 -0
- package/src/components/projects/project-sheet.tsx +176 -4
- package/src/components/providers/provider-list.tsx +2 -1
- package/src/components/providers/provider-sheet.tsx +21 -2
- package/src/components/schedules/schedule-card.tsx +25 -1
- package/src/components/schedules/schedule-sheet.tsx +44 -2
- package/src/components/secrets/secret-sheet.tsx +21 -2
- package/src/components/shared/agent-switch-dialog.tsx +12 -1
- package/src/components/shared/bottom-sheet.tsx +13 -3
- package/src/components/shared/command-palette.tsx +8 -1
- package/src/components/shared/confirm-dialog.tsx +19 -4
- package/src/components/shared/connector-platform-icon.test.ts +28 -0
- package/src/components/shared/connector-platform-icon.tsx +39 -6
- package/src/components/shared/settings/plugin-manager.tsx +29 -6
- package/src/components/shared/settings/section-capability-policy.tsx +45 -3
- package/src/components/shared/settings/section-voice.tsx +11 -3
- package/src/components/skills/skill-list.tsx +25 -0
- package/src/components/skills/skill-sheet.tsx +84 -12
- package/src/components/tasks/approvals-panel.tsx +289 -34
- package/src/components/tasks/task-board.tsx +410 -25
- package/src/components/tasks/task-card.tsx +66 -8
- package/src/components/tasks/task-sheet.tsx +16 -4
- package/src/components/ui/dialog.tsx +2 -2
- package/src/components/wallets/wallet-approval-dialog.tsx +4 -2
- package/src/components/wallets/wallet-panel.tsx +435 -90
- package/src/components/wallets/wallet-section.tsx +198 -48
- package/src/components/webhooks/webhook-sheet.tsx +22 -2
- package/src/lib/approval-display.ts +20 -0
- package/src/lib/canvas-content.ts +198 -0
- package/src/lib/chat-artifact-summary.ts +165 -0
- package/src/lib/chat-display.test.ts +91 -0
- package/src/lib/chat-display.ts +58 -0
- package/src/lib/chat-streaming-state.test.ts +47 -1
- package/src/lib/chat-streaming-state.ts +42 -0
- package/src/lib/ollama-model.ts +10 -0
- package/src/lib/openclaw-endpoint.test.ts +8 -0
- package/src/lib/openclaw-endpoint.ts +6 -1
- package/src/lib/plugin-install-cors.ts +46 -0
- package/src/lib/plugin-sources.test.ts +43 -0
- package/src/lib/plugin-sources.ts +77 -0
- package/src/lib/providers/ollama.ts +16 -6
- package/src/lib/providers/openclaw.test.ts +54 -0
- package/src/lib/providers/openclaw.ts +127 -11
- package/src/lib/schedule-dedupe-advanced.test.ts +1335 -0
- package/src/lib/schedule-dedupe.test.ts +66 -1
- package/src/lib/schedule-dedupe.ts +169 -12
- package/src/lib/schedule-origin.test.ts +20 -0
- package/src/lib/schedule-origin.ts +15 -0
- package/src/lib/server/__fixtures__/fake-mcp-stdio-server.mjs +27 -0
- package/src/lib/server/agent-availability.ts +16 -0
- package/src/lib/server/agent-runtime-config.ts +12 -4
- package/src/lib/server/agent-thread-session.test.ts +51 -0
- package/src/lib/server/agent-thread-session.ts +7 -0
- package/src/lib/server/approval-match.ts +205 -0
- package/src/lib/server/approvals-auto-approve.test.ts +538 -1
- package/src/lib/server/approvals.ts +214 -1
- package/src/lib/server/assistant-control.test.ts +29 -0
- package/src/lib/server/assistant-control.ts +23 -0
- package/src/lib/server/build-llm.test.ts +79 -0
- package/src/lib/server/build-llm.ts +14 -4
- package/src/lib/server/canvas-content.test.ts +32 -0
- package/src/lib/server/canvas-content.ts +6 -0
- package/src/lib/server/capability-router.test.ts +33 -0
- package/src/lib/server/capability-router.ts +80 -19
- package/src/lib/server/chat-execution-advanced.test.ts +651 -0
- package/src/lib/server/chat-execution-disabled.test.ts +94 -0
- package/src/lib/server/chat-execution-tool-events.test.ts +157 -0
- package/src/lib/server/chat-execution.ts +378 -73
- package/src/lib/server/clawhub-client.test.ts +14 -8
- package/src/lib/server/connectors/manager-reconnect.test.ts +47 -0
- package/src/lib/server/connectors/manager.test.ts +1147 -0
- package/src/lib/server/connectors/manager.ts +461 -137
- package/src/lib/server/connectors/pairing.ts +26 -5
- package/src/lib/server/connectors/types.ts +2 -0
- package/src/lib/server/connectors/whatsapp.test.ts +134 -0
- package/src/lib/server/connectors/whatsapp.ts +271 -47
- package/src/lib/server/context-manager.ts +6 -1
- package/src/lib/server/daemon-state.ts +84 -47
- package/src/lib/server/data-dir.test.ts +37 -0
- package/src/lib/server/data-dir.ts +20 -1
- package/src/lib/server/delegation-jobs-advanced.test.ts +513 -0
- package/src/lib/server/devserver-launch.test.ts +60 -0
- package/src/lib/server/devserver-launch.ts +85 -0
- package/src/lib/server/elevenlabs.test.ts +247 -1
- package/src/lib/server/elevenlabs.ts +147 -43
- package/src/lib/server/ethereum.ts +590 -0
- package/src/lib/server/eval/agent-regression-advanced.test.ts +302 -0
- package/src/lib/server/eval/agent-regression.test.ts +18 -1
- package/src/lib/server/eval/agent-regression.ts +383 -11
- package/src/lib/server/evm-swap.ts +475 -0
- package/src/lib/server/execution-log.ts +1 -0
- package/src/lib/server/heartbeat-service-timer.test.ts +173 -0
- package/src/lib/server/heartbeat-service.ts +20 -11
- package/src/lib/server/heartbeat-wake.test.ts +112 -0
- package/src/lib/server/heartbeat-wake.ts +338 -57
- package/src/lib/server/main-agent-loop-advanced.test.ts +538 -0
- package/src/lib/server/main-agent-loop.test.ts +260 -0
- package/src/lib/server/main-agent-loop.ts +559 -14
- package/src/lib/server/mcp-client.test.ts +16 -0
- package/src/lib/server/mcp-client.ts +25 -0
- package/src/lib/server/memory-integration.test.ts +719 -0
- package/src/lib/server/memory-policy.test.ts +43 -0
- package/src/lib/server/memory-policy.ts +132 -0
- package/src/lib/server/memory-tiers.test.ts +60 -0
- package/src/lib/server/memory-tiers.ts +16 -0
- package/src/lib/server/ollama-runtime.ts +58 -0
- package/src/lib/server/openclaw-deploy.test.ts +109 -1
- package/src/lib/server/openclaw-deploy.ts +557 -81
- package/src/lib/server/openclaw-gateway.test.ts +131 -0
- package/src/lib/server/openclaw-gateway.ts +10 -4
- package/src/lib/server/openclaw-health.test.ts +35 -0
- package/src/lib/server/openclaw-health.ts +215 -47
- package/src/lib/server/orchestrator-lg.ts +3 -2
- package/src/lib/server/orchestrator.ts +2 -0
- package/src/lib/server/plugins-advanced.test.ts +351 -0
- package/src/lib/server/plugins.ts +211 -6
- package/src/lib/server/project-context.ts +162 -0
- package/src/lib/server/project-utils.ts +150 -0
- package/src/lib/server/queue-advanced.test.ts +528 -0
- package/src/lib/server/queue-followups.test.ts +409 -2
- package/src/lib/server/queue-reconcile.test.ts +128 -0
- package/src/lib/server/queue.ts +527 -68
- package/src/lib/server/scheduler.ts +29 -1
- package/src/lib/server/session-note.test.ts +36 -0
- package/src/lib/server/session-note.ts +42 -0
- package/src/lib/server/session-run-manager.ts +83 -4
- package/src/lib/server/session-tools/canvas.ts +14 -12
- package/src/lib/server/session-tools/connector-inputs.test.ts +37 -0
- package/src/lib/server/session-tools/connector.test.ts +138 -0
- package/src/lib/server/session-tools/connector.ts +366 -54
- package/src/lib/server/session-tools/context.ts +17 -3
- package/src/lib/server/session-tools/crud.ts +484 -84
- package/src/lib/server/session-tools/delegate-fallback.test.ts +103 -0
- package/src/lib/server/session-tools/delegate-resume.test.ts +50 -0
- package/src/lib/server/session-tools/delegate.ts +102 -10
- package/src/lib/server/session-tools/discovery-approvals.test.ts +142 -0
- package/src/lib/server/session-tools/discovery.ts +80 -12
- package/src/lib/server/session-tools/file-normalize.test.ts +36 -0
- package/src/lib/server/session-tools/file.ts +43 -4
- package/src/lib/server/session-tools/human-loop.ts +35 -5
- package/src/lib/server/session-tools/index.ts +44 -9
- package/src/lib/server/session-tools/manage-connectors.test.ts +139 -0
- package/src/lib/server/session-tools/manage-schedules-advanced.test.ts +564 -0
- package/src/lib/server/session-tools/manage-schedules.test.ts +283 -0
- package/src/lib/server/session-tools/manage-tasks-advanced.test.ts +852 -0
- package/src/lib/server/session-tools/manage-tasks.test.ts +114 -0
- package/src/lib/server/session-tools/memory.test.ts +93 -0
- package/src/lib/server/session-tools/memory.ts +554 -75
- package/src/lib/server/session-tools/normalize-tool-args.ts +1 -1
- package/src/lib/server/session-tools/platform-access.test.ts +58 -0
- package/src/lib/server/session-tools/platform.ts +60 -19
- package/src/lib/server/session-tools/plugin-creator.ts +57 -1
- package/src/lib/server/session-tools/primitive-tools.test.ts +6 -0
- package/src/lib/server/session-tools/schedule.ts +6 -1
- package/src/lib/server/session-tools/shell-normalize.test.ts +25 -1
- package/src/lib/server/session-tools/shell.ts +22 -3
- package/src/lib/server/session-tools/wallet-tool.test.ts +254 -0
- package/src/lib/server/session-tools/wallet.ts +1374 -139
- package/src/lib/server/session-tools/web-inputs.test.ts +178 -0
- package/src/lib/server/session-tools/web.ts +621 -70
- package/src/lib/server/skill-discovery.ts +128 -0
- package/src/lib/server/skill-eligibility.test.ts +84 -0
- package/src/lib/server/skill-eligibility.ts +95 -0
- package/src/lib/server/skill-prompt-budget.test.ts +102 -0
- package/src/lib/server/skill-prompt-budget.ts +125 -0
- package/src/lib/server/skills-normalize.test.ts +54 -0
- package/src/lib/server/skills-normalize.ts +372 -26
- package/src/lib/server/solana.ts +214 -29
- package/src/lib/server/storage.ts +65 -36
- package/src/lib/server/stream-agent-chat.test.ts +437 -2
- package/src/lib/server/stream-agent-chat.ts +957 -79
- package/src/lib/server/system-events.ts +1 -1
- package/src/lib/server/tool-aliases.ts +2 -0
- package/src/lib/server/tool-capability-policy-advanced.test.ts +502 -0
- package/src/lib/server/tool-capability-policy.test.ts +24 -0
- package/src/lib/server/tool-capability-policy.ts +29 -1
- package/src/lib/server/tool-loop-detection.test.ts +105 -0
- package/src/lib/server/tool-loop-detection.ts +260 -0
- package/src/lib/server/tool-planning.test.ts +44 -0
- package/src/lib/server/tool-planning.ts +271 -0
- package/src/lib/server/wallet-execution.test.ts +198 -0
- package/src/lib/server/wallet-portfolio.test.ts +98 -0
- package/src/lib/server/wallet-portfolio.ts +724 -0
- package/src/lib/server/wallet-service.test.ts +57 -0
- package/src/lib/server/wallet-service.ts +213 -0
- package/src/lib/server/watch-jobs-advanced.test.ts +594 -0
- package/src/lib/server/watch-jobs.ts +17 -2
- package/src/lib/server/workspace-context.ts +111 -0
- package/src/lib/skill-save-payload.test.ts +39 -0
- package/src/lib/skill-save-payload.ts +37 -0
- package/src/lib/tasks.ts +28 -0
- package/src/lib/tool-definitions.ts +2 -1
- package/src/lib/tool-event-summary.test.ts +30 -0
- package/src/lib/tool-event-summary.ts +37 -0
- package/src/lib/validation/schemas.ts +1 -0
- package/src/lib/wallet-transactions.test.ts +75 -0
- package/src/lib/wallet-transactions.ts +43 -0
- package/src/lib/wallet.test.ts +17 -0
- package/src/lib/wallet.ts +183 -0
- package/src/proxy.test.ts +31 -0
- package/src/proxy.ts +34 -2
- package/src/stores/use-chat-store.ts +15 -1
- package/src/types/index.ts +249 -14
|
@@ -10,6 +10,7 @@ import { executeSessionChatTurn, type ExecuteChatTurnResult } from '../chat-exec
|
|
|
10
10
|
import { WORKSPACE_DIR } from '../data-dir'
|
|
11
11
|
import { getPluginManager } from '../plugins'
|
|
12
12
|
import { sendMailboxEnvelope, listMailbox } from '../session-mailbox'
|
|
13
|
+
import { canonicalizePluginId, expandPluginIds } from '../tool-aliases'
|
|
13
14
|
import { processDueWatchJobs } from '../watch-jobs'
|
|
14
15
|
import {
|
|
15
16
|
deleteApproval,
|
|
@@ -34,6 +35,7 @@ import {
|
|
|
34
35
|
} from '../storage'
|
|
35
36
|
|
|
36
37
|
export type RegressionApprovalMode = 'manual' | 'auto' | 'off'
|
|
38
|
+
export type RegressionPluginMode = 'scenario' | 'agent'
|
|
37
39
|
|
|
38
40
|
export interface RegressionAssertion {
|
|
39
41
|
name: string
|
|
@@ -46,12 +48,16 @@ export interface AgentRegressionScenarioResult {
|
|
|
46
48
|
scenarioId: string
|
|
47
49
|
name: string
|
|
48
50
|
approvalMode: RegressionApprovalMode
|
|
51
|
+
pluginMode: RegressionPluginMode
|
|
49
52
|
status: 'passed' | 'failed'
|
|
50
53
|
score: number
|
|
51
54
|
maxScore: number
|
|
52
55
|
assertions: RegressionAssertion[]
|
|
53
56
|
sessionId: string
|
|
54
57
|
workspaceDir: string
|
|
58
|
+
requiredPlugins: string[]
|
|
59
|
+
effectivePlugins: string[]
|
|
60
|
+
missingPlugins: string[]
|
|
55
61
|
toolNames: string[]
|
|
56
62
|
approvalIds: string[]
|
|
57
63
|
approvals: RegressionApprovalEvidence[]
|
|
@@ -82,8 +88,12 @@ interface ScenarioContext {
|
|
|
82
88
|
agentId: string
|
|
83
89
|
agent: Record<string, unknown>
|
|
84
90
|
approvalMode: RegressionApprovalMode
|
|
91
|
+
pluginMode: RegressionPluginMode
|
|
85
92
|
sessionId: string
|
|
86
93
|
workspaceDir: string
|
|
94
|
+
requiredPlugins: string[]
|
|
95
|
+
effectivePlugins: string[]
|
|
96
|
+
missingPlugins: string[]
|
|
87
97
|
responseTexts: string[]
|
|
88
98
|
toolEvents: MessageToolEvent[]
|
|
89
99
|
toolNames: Set<string>
|
|
@@ -97,6 +107,12 @@ interface AgentRegressionScenarioDefinition {
|
|
|
97
107
|
run: (ctx: ScenarioContext) => Promise<AgentRegressionScenarioResult>
|
|
98
108
|
}
|
|
99
109
|
|
|
110
|
+
/**
 * Result of reconciling the plugins a scenario requires with the plugins that
 * are in effect for a regression run (produced by `resolveRegressionPlugins`).
 */
interface RegressionPluginResolution {
  /** Required plugin ids after trimming, canonicalization and de-duplication. */
  requiredPlugins: string[]
  /**
   * Plugin ids in effect for the run: the normalized scenario requirements in
   * 'scenario' mode, otherwise the agent's own normalized plugin list.
   */
  effectivePlugins: string[]
  /**
   * Canonical required ids that are absent from the agent's expanded plugin
   * set. Always empty in 'scenario' mode.
   */
  missingPlugins: string[]
}
|
|
115
|
+
|
|
100
116
|
interface MockMailAccount {
|
|
101
117
|
email: string
|
|
102
118
|
chosenPassword: string
|
|
@@ -813,6 +829,48 @@ export function scoreAssertions(assertions: RegressionAssertion[]): { score: num
|
|
|
813
829
|
}
|
|
814
830
|
}
|
|
815
831
|
|
|
832
|
+
/**
 * Coerces an untrusted value into a clean list of plugin ids.
 *
 * Non-array input yields an empty list. Within an array, non-string entries
 * are ignored, strings are trimmed, blank strings are dropped, and duplicates
 * (compared after trimming) are removed while keeping first-seen order.
 *
 * @param values - Candidate plugin list of unknown shape.
 * @returns Trimmed, de-duplicated plugin ids in first-seen order.
 */
function normalizePluginList(values: unknown): string[] {
  if (!Array.isArray(values)) return []
  // A Set iterates in insertion order, so it serves as both the de-dupe index
  // and the ordered result; no parallel array is needed.
  const unique = new Set<string>()
  for (const value of values) {
    if (typeof value !== 'string') continue
    const trimmed = value.trim()
    if (trimmed) unique.add(trimmed)
  }
  return Array.from(unique)
}
|
|
845
|
+
|
|
846
|
+
/**
 * Works out which plugins a regression scenario runs with and which required
 * plugins are unavailable.
 *
 * The required list is normalized, canonicalized via `canonicalizePluginId`
 * (falsy results dropped) and de-duplicated.
 *
 * - In `'scenario'` mode the scenario's own normalized requirements are the
 *   effective plugins and nothing is reported missing.
 * - Otherwise the agent's plugins are used (`agent.plugins`, falling back to
 *   `agent.tools` when that is null or undefined), and a required plugin is
 *   missing when it is not in the `expandPluginIds` expansion of that list.
 *
 * @param requiredPlugins - Plugin ids the scenario needs.
 * @param agent - Agent record to read `plugins` / `tools` from.
 * @param pluginMode - Whether the scenario or the agent dictates the plugin set.
 * @returns Canonical requirements, effective plugins and missing plugins.
 */
export function resolveRegressionPlugins(
  requiredPlugins: string[],
  agent: Record<string, unknown>,
  pluginMode: RegressionPluginMode,
): RegressionPluginResolution {
  const scenarioPlugins = normalizePluginList(requiredPlugins)

  // Canonicalize first, then drop falsy ids; the Set removes aliases that
  // collapse to the same canonical id while keeping first-seen order.
  const canonicalIds = new Set<string>()
  for (const plugin of scenarioPlugins) {
    const canonical = canonicalizePluginId(plugin)
    if (canonical) canonicalIds.add(canonical)
  }
  const requiredCanonical = [...canonicalIds]

  if (pluginMode === 'scenario') {
    return {
      requiredPlugins: requiredCanonical,
      effectivePlugins: scenarioPlugins,
      missingPlugins: [],
    }
  }

  const agentPlugins = normalizePluginList(agent.plugins ?? agent.tools)
  const available = new Set(expandPluginIds(agentPlugins))
  const unavailable: string[] = []
  for (const plugin of requiredCanonical) {
    if (!available.has(plugin)) unavailable.push(plugin)
  }

  return {
    requiredPlugins: requiredCanonical,
    effectivePlugins: agentPlugins,
    missingPlugins: unavailable,
  }
}
|
|
873
|
+
|
|
816
874
|
function listSessionApprovals(sessionId: string): ApprovalRequest[] {
|
|
817
875
|
return Object.values(loadApprovals() as Record<string, ApprovalRequest>)
|
|
818
876
|
.filter((approval) => approval.sessionId === sessionId)
|
|
@@ -838,13 +896,23 @@ function listSessionSecrets(sessionId: string): Array<Record<string, unknown>> {
|
|
|
838
896
|
.filter((secret) => secret.createdInSessionId === sessionId)
|
|
839
897
|
}
|
|
840
898
|
|
|
841
|
-
function parseJsonRecord(raw: string | undefined): Record<string, unknown> | null {
|
|
899
|
+
function parseJsonRecord(raw: string | undefined, depth = 0): Record<string, unknown> | null {
|
|
842
900
|
if (!raw || !raw.trim()) return null
|
|
843
901
|
try {
|
|
844
902
|
const parsed = JSON.parse(raw)
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
903
|
+
if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) return null
|
|
904
|
+
const record = parsed as Record<string, unknown>
|
|
905
|
+
if (depth < 2) {
|
|
906
|
+
if (typeof record.input === 'string') {
|
|
907
|
+
const nested = parseJsonRecord(record.input, depth + 1)
|
|
908
|
+
if (nested) return nested
|
|
909
|
+
}
|
|
910
|
+
if (typeof record.data === 'string' && Object.keys(record).length === 1) {
|
|
911
|
+
const nested = parseJsonRecord(record.data, depth + 1)
|
|
912
|
+
if (nested) return nested
|
|
913
|
+
}
|
|
914
|
+
}
|
|
915
|
+
return record
|
|
848
916
|
} catch {
|
|
849
917
|
return null
|
|
850
918
|
}
|
|
@@ -935,12 +1003,28 @@ function buildRegressionSession(params: {
|
|
|
935
1003
|
}
|
|
936
1004
|
|
|
937
1005
|
async function runTurn(ctx: ScenarioContext, message: string): Promise<ExecuteChatTurnResult> {
|
|
938
|
-
const
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
1006
|
+
const timeoutMs = 120_000
|
|
1007
|
+
const controller = new AbortController()
|
|
1008
|
+
const abortTimer = setTimeout(() => controller.abort(), timeoutMs)
|
|
1009
|
+
const hardTimeout = setTimeout(() => controller.abort(), timeoutMs + 5_000)
|
|
1010
|
+
let result: ExecuteChatTurnResult
|
|
1011
|
+
try {
|
|
1012
|
+
result = await Promise.race([
|
|
1013
|
+
executeSessionChatTurn({
|
|
1014
|
+
sessionId: ctx.sessionId,
|
|
1015
|
+
message,
|
|
1016
|
+
internal: true,
|
|
1017
|
+
source: 'eval',
|
|
1018
|
+
signal: controller.signal,
|
|
1019
|
+
}),
|
|
1020
|
+
new Promise<never>((_, reject) => {
|
|
1021
|
+
setTimeout(() => reject(new Error(`Eval turn timed out after ${timeoutMs}ms.`)), timeoutMs + 10_000)
|
|
1022
|
+
}),
|
|
1023
|
+
])
|
|
1024
|
+
} finally {
|
|
1025
|
+
clearTimeout(abortTimer)
|
|
1026
|
+
clearTimeout(hardTimeout)
|
|
1027
|
+
}
|
|
944
1028
|
ctx.responseTexts.push(result.text)
|
|
945
1029
|
for (const event of result.toolEvents || []) {
|
|
946
1030
|
ctx.toolEvents.push(event)
|
|
@@ -1042,10 +1126,14 @@ async function runApprovalResumeScenario(ctx: ScenarioContext): Promise<AgentReg
|
|
|
1042
1126
|
scenarioId: 'approval-resume',
|
|
1043
1127
|
name: 'Approval Resume',
|
|
1044
1128
|
approvalMode: ctx.approvalMode,
|
|
1129
|
+
pluginMode: ctx.pluginMode,
|
|
1045
1130
|
...scored,
|
|
1046
1131
|
assertions,
|
|
1047
1132
|
sessionId: ctx.sessionId,
|
|
1048
1133
|
workspaceDir: ctx.workspaceDir,
|
|
1134
|
+
requiredPlugins: [...ctx.requiredPlugins],
|
|
1135
|
+
effectivePlugins: [...ctx.effectivePlugins],
|
|
1136
|
+
missingPlugins: [...ctx.missingPlugins],
|
|
1049
1137
|
toolNames: Array.from(ctx.toolNames),
|
|
1050
1138
|
approvalIds: shellApprovals.map((approval) => approval.id),
|
|
1051
1139
|
approvals: buildApprovalEvidence(ctx.sessionId),
|
|
@@ -1102,10 +1190,14 @@ async function runDelegateLiteralScenario(ctx: ScenarioContext): Promise<AgentRe
|
|
|
1102
1190
|
scenarioId: 'delegate-literal-artifact',
|
|
1103
1191
|
name: 'Delegate Literal Artifact',
|
|
1104
1192
|
approvalMode: ctx.approvalMode,
|
|
1193
|
+
pluginMode: ctx.pluginMode,
|
|
1105
1194
|
...scored,
|
|
1106
1195
|
assertions,
|
|
1107
1196
|
sessionId: ctx.sessionId,
|
|
1108
1197
|
workspaceDir: ctx.workspaceDir,
|
|
1198
|
+
requiredPlugins: [...ctx.requiredPlugins],
|
|
1199
|
+
effectivePlugins: [...ctx.effectivePlugins],
|
|
1200
|
+
missingPlugins: [...ctx.missingPlugins],
|
|
1109
1201
|
toolNames: Array.from(ctx.toolNames),
|
|
1110
1202
|
approvalIds: [],
|
|
1111
1203
|
approvals: buildApprovalEvidence(ctx.sessionId),
|
|
@@ -1167,10 +1259,14 @@ async function runScheduleScenario(ctx: ScenarioContext): Promise<AgentRegressio
|
|
|
1167
1259
|
scenarioId: 'schedule-script',
|
|
1168
1260
|
name: 'Schedule Script Workflow',
|
|
1169
1261
|
approvalMode: ctx.approvalMode,
|
|
1262
|
+
pluginMode: ctx.pluginMode,
|
|
1170
1263
|
...scored,
|
|
1171
1264
|
assertions,
|
|
1172
1265
|
sessionId: ctx.sessionId,
|
|
1173
1266
|
workspaceDir: ctx.workspaceDir,
|
|
1267
|
+
requiredPlugins: [...ctx.requiredPlugins],
|
|
1268
|
+
effectivePlugins: [...ctx.effectivePlugins],
|
|
1269
|
+
missingPlugins: [...ctx.missingPlugins],
|
|
1174
1270
|
toolNames: Array.from(ctx.toolNames),
|
|
1175
1271
|
approvalIds: [],
|
|
1176
1272
|
approvals: buildApprovalEvidence(ctx.sessionId),
|
|
@@ -1237,10 +1333,14 @@ async function runOpenEndedIterationScenario(ctx: ScenarioContext): Promise<Agen
|
|
|
1237
1333
|
scenarioId: 'open-ended-iteration',
|
|
1238
1334
|
name: 'Open-Ended Iteration Pack',
|
|
1239
1335
|
approvalMode: ctx.approvalMode,
|
|
1336
|
+
pluginMode: ctx.pluginMode,
|
|
1240
1337
|
...scored,
|
|
1241
1338
|
assertions,
|
|
1242
1339
|
sessionId: ctx.sessionId,
|
|
1243
1340
|
workspaceDir: ctx.workspaceDir,
|
|
1341
|
+
requiredPlugins: [...ctx.requiredPlugins],
|
|
1342
|
+
effectivePlugins: [...ctx.effectivePlugins],
|
|
1343
|
+
missingPlugins: [...ctx.missingPlugins],
|
|
1244
1344
|
toolNames: Array.from(ctx.toolNames),
|
|
1245
1345
|
approvalIds: [],
|
|
1246
1346
|
approvals: buildApprovalEvidence(ctx.sessionId),
|
|
@@ -1354,10 +1454,14 @@ async function runMockSignupSecretEmailScenario(ctx: ScenarioContext): Promise<A
|
|
|
1354
1454
|
scenarioId: 'mock-signup-secret-email',
|
|
1355
1455
|
name: 'Mock Signup Secret Email',
|
|
1356
1456
|
approvalMode: ctx.approvalMode,
|
|
1457
|
+
pluginMode: ctx.pluginMode,
|
|
1357
1458
|
...scored,
|
|
1358
1459
|
assertions,
|
|
1359
1460
|
sessionId: ctx.sessionId,
|
|
1360
1461
|
workspaceDir: ctx.workspaceDir,
|
|
1462
|
+
requiredPlugins: [...ctx.requiredPlugins],
|
|
1463
|
+
effectivePlugins: [...ctx.effectivePlugins],
|
|
1464
|
+
missingPlugins: [...ctx.missingPlugins],
|
|
1361
1465
|
toolNames: Array.from(ctx.toolNames),
|
|
1362
1466
|
approvalIds: [],
|
|
1363
1467
|
approvals: buildApprovalEvidence(ctx.sessionId),
|
|
@@ -1475,10 +1579,14 @@ async function runHumanVerifiedSignupScenario(ctx: ScenarioContext): Promise<Age
|
|
|
1475
1579
|
scenarioId: 'human-verified-signup',
|
|
1476
1580
|
name: 'Human Verified Signup',
|
|
1477
1581
|
approvalMode: ctx.approvalMode,
|
|
1582
|
+
pluginMode: ctx.pluginMode,
|
|
1478
1583
|
...scored,
|
|
1479
1584
|
assertions,
|
|
1480
1585
|
sessionId: ctx.sessionId,
|
|
1481
1586
|
workspaceDir: ctx.workspaceDir,
|
|
1587
|
+
requiredPlugins: [...ctx.requiredPlugins],
|
|
1588
|
+
effectivePlugins: [...ctx.effectivePlugins],
|
|
1589
|
+
missingPlugins: [...ctx.missingPlugins],
|
|
1482
1590
|
toolNames: Array.from(ctx.toolNames),
|
|
1483
1591
|
approvalIds: [],
|
|
1484
1592
|
approvals: buildApprovalEvidence(ctx.sessionId),
|
|
@@ -1581,10 +1689,14 @@ async function runResearchBuildDeployScenario(ctx: ScenarioContext): Promise<Age
|
|
|
1581
1689
|
scenarioId: 'research-build-deploy',
|
|
1582
1690
|
name: 'Research Build Deploy',
|
|
1583
1691
|
approvalMode: ctx.approvalMode,
|
|
1692
|
+
pluginMode: ctx.pluginMode,
|
|
1584
1693
|
...scored,
|
|
1585
1694
|
assertions,
|
|
1586
1695
|
sessionId: ctx.sessionId,
|
|
1587
1696
|
workspaceDir: ctx.workspaceDir,
|
|
1697
|
+
requiredPlugins: [...ctx.requiredPlugins],
|
|
1698
|
+
effectivePlugins: [...ctx.effectivePlugins],
|
|
1699
|
+
missingPlugins: [...ctx.missingPlugins],
|
|
1588
1700
|
toolNames: Array.from(ctx.toolNames),
|
|
1589
1701
|
approvalIds: [],
|
|
1590
1702
|
approvals: buildApprovalEvidence(ctx.sessionId),
|
|
@@ -1598,6 +1710,241 @@ async function runResearchBuildDeployScenario(ctx: ScenarioContext): Promise<Age
|
|
|
1598
1710
|
}
|
|
1599
1711
|
}
|
|
1600
1712
|
|
|
1713
|
+
/**
 * Tool-call efficiency scenario.
 *
 * Runs a single simple data-retrieval turn and scores how economically the
 * agent answered it. It is meant to surface regressions such as:
 * - duplicate tool events from nested tool wrappers
 * - requiredToolsPending forcing a redundant web_search after a shell-based curl
 * - response duplication from forced continuation loops
 *
 * Checks performed: a `shell` or `web` tool was used, at most three named tool
 * events were recorded, the combined response is longer than 10 characters,
 * and no later response segment repeats the first 40 characters of the first
 * segment (only evaluated when the first segment is longer than 20 characters).
 *
 * @param ctx - Scenario context; `runTurn` appends responses and tool events to it.
 * @returns The scored scenario result with collected evidence.
 */
async function runToolCallEfficiencyScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
  // NOTE(review): the prompt targets the public GitHub API, so this scenario
  // presumably still needs outbound network access — confirm.
  await runTurn(
    ctx,
    'Use the GitHub API to get the description of the openclaw/openclaw repository. Just the description text, nothing else.',
  )

  // Only tool events that carry a name count as tool calls.
  let namedToolEventCount = 0
  for (const event of ctx.toolEvents) {
    if (event.name) namedToolEventCount += 1
  }

  const segments = ctx.responseTexts
  const combinedLength = segments.join('\n').length

  // Duplication heuristic: a later segment containing the opening 40 chars of
  // the first segment, provided that first segment is non-trivial.
  let duplicated = false
  if (segments.length > 1 && segments[0].length > 20) {
    const openingSnippet = segments[0].slice(0, 40)
    duplicated = segments.slice(1).some((segment) => segment.includes(openingSnippet))
  }

  const usedRetrievalTool = ['shell', 'web'].some((toolName) => ctx.toolNames.has(toolName))

  const assertions: RegressionAssertion[] = [
    { name: 'used shell or web tool', passed: usedRetrievalTool },
    {
      name: 'completed in 3 or fewer tool calls',
      passed: namedToolEventCount <= 3,
      details: `${namedToolEventCount} tool calls`,
      weight: 2,
    },
    {
      name: 'response contains repo description text',
      passed: combinedLength > 10,
      details: `${combinedLength} chars`,
    },
    {
      name: 'no response duplication from forced continuations',
      passed: !duplicated,
      details: duplicated ? `${segments.length} response segments with overlap` : 'clean',
      weight: 2,
    },
  ]

  return {
    scenarioId: 'tool-call-efficiency',
    name: 'Tool Call Efficiency',
    approvalMode: ctx.approvalMode,
    pluginMode: ctx.pluginMode,
    ...scoreAssertions(assertions),
    assertions,
    sessionId: ctx.sessionId,
    workspaceDir: ctx.workspaceDir,
    requiredPlugins: [...ctx.requiredPlugins],
    effectivePlugins: [...ctx.effectivePlugins],
    missingPlugins: [...ctx.missingPlugins],
    toolNames: Array.from(ctx.toolNames),
    approvalIds: [],
    approvals: buildApprovalEvidence(ctx.sessionId),
    responseTexts: [...ctx.responseTexts],
    turns: [...ctx.turns],
    artifacts: buildArtifactEvidence(ctx, []),
    evidencePaths: writeScenarioEvidenceFiles(ctx),
  }
}
|
|
1781
|
+
|
|
1782
|
+
/**
 * File-creation followthrough scenario.
 *
 * Asks the agent to write a small JSON document to a fixed relative path and
 * then inspects the workspace to confirm the file was actually produced.
 * Intended to catch regressions such as:
 * - looksLikeOpenEndedDeliverableTask not matching file-save requests
 * - shouldForceDeliverableFollowthrough not triggering for HTML/JSON file tasks
 * - the agent stopping before writing the file
 *
 * @param ctx Scenario context; turns, tool names and response texts recorded on
 *   it are copied into the returned result.
 * @returns The scored scenario result together with its evidence.
 */
async function runFileCreationFollowthroughScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
  const targetRelativePath = 'output/planets.json'
  const targetPath = scenarioFile(ctx, targetRelativePath)

  await runTurn(
    ctx,
    `Create a JSON file at ${targetRelativePath} containing a list of the 3 largest planets in our solar system with their name and diameter in km.`,
  )
  // A single follow-up nudge is issued only when the first turn left no file behind.
  const missingAfterFirstTurn = !fs.existsSync(targetPath)
  if (missingAfterFirstTurn) {
    await runTurn(ctx, 'Complete the task. The file must exist at the specified path.')
  }

  const fileContent = readIfExists(targetPath)
  let parsedOk = false
  let planetsOk = false
  try {
    const document = JSON.parse(fileContent)
    // Parsing succeeded; anything that throws below only affects the shape check.
    parsedOk = true
    // Accept a bare array, or an object wrapping the list under `planets` / `data`.
    let entries = document
    if (!Array.isArray(document)) {
      entries = document.planets || document.data || []
    }
    if (Array.isArray(entries) && entries.length >= 3) {
      planetsOk = entries.every((entry: Record<string, unknown>) => entry.name && entry.diameter)
    }
  } catch {
    // Content was not valid JSON, or had a shape the probing above could not walk.
  }

  const usedFileOrShell = ['files', 'shell'].some((toolId) => ctx.toolNames.has(toolId))
  const outputExists = fs.existsSync(targetPath)
  const turnCount = ctx.turns.length

  const assertions: RegressionAssertion[] = []
  assertions.push({
    name: 'file tool or shell used',
    passed: usedFileOrShell,
  })
  assertions.push({
    name: 'output file exists',
    passed: outputExists,
    details: targetPath,
    weight: 2,
  })
  assertions.push({
    name: 'output is valid JSON',
    passed: parsedOk,
    weight: 2,
  })
  assertions.push({
    name: 'JSON contains 3+ planets with name and diameter',
    passed: planetsOk,
    details: fileContent.slice(0, 200),
  })
  assertions.push({
    name: 'completed within 2 turns',
    passed: turnCount <= 2,
    details: `${turnCount} turns`,
  })

  const scored = scoreAssertions(assertions)
  const result: AgentRegressionScenarioResult = {
    scenarioId: 'file-creation-followthrough',
    name: 'File Creation Followthrough',
    approvalMode: ctx.approvalMode,
    pluginMode: ctx.pluginMode,
    ...scored,
    assertions,
    sessionId: ctx.sessionId,
    workspaceDir: ctx.workspaceDir,
    requiredPlugins: [...ctx.requiredPlugins],
    effectivePlugins: [...ctx.effectivePlugins],
    missingPlugins: [...ctx.missingPlugins],
    toolNames: Array.from(ctx.toolNames),
    approvalIds: [],
    approvals: buildApprovalEvidence(ctx.sessionId),
    responseTexts: [...ctx.responseTexts],
    turns: [...ctx.turns],
    artifacts: buildArtifactEvidence(ctx, [targetRelativePath]),
    evidencePaths: writeScenarioEvidenceFiles(ctx),
  }
  return result
}
|
|
1863
|
+
|
|
1864
|
+
/**
 * Knowledge-first file creation scenario.
 *
 * Asks the agent to write well-known facts (three major cities) to a JSON
 * file and checks that it did so without calling any web tool. Modelled after
 * OpenClaw's approach where agents rely on knowledge for non-time-sensitive data.
 *
 * @param ctx Scenario context; turns, tool events, tool names and response
 *   texts recorded on it feed the assertions and the returned result.
 * @returns The scored scenario result together with its evidence.
 */
async function runKnowledgeFirstFileScenario(ctx: ScenarioContext): Promise<AgentRegressionScenarioResult> {
  const targetRelativePath = 'output/cities.json'
  const targetPath = scenarioFile(ctx, targetRelativePath)

  await runTurn(
    ctx,
    `Create a JSON file at ${targetRelativePath} containing name, population, and country for Tokyo, London, and New York City.`,
  )
  // Give the agent one more turn only when nothing was written the first time.
  const missingAfterFirstTurn = !fs.existsSync(targetPath)
  if (missingAfterFirstTurn) {
    await runTurn(ctx, 'Complete the task. Write the file now.')
  }

  const fileContent = readIfExists(targetPath)
  let parsedOk = false
  let citiesOk = false
  try {
    const document = JSON.parse(fileContent)
    // Parsing succeeded; anything that throws below only affects the shape check.
    parsedOk = true
    // Accept a bare array, or an object wrapping the list under `cities` / `data`.
    let entries = document
    if (!Array.isArray(document)) {
      entries = document.cities || document.data || []
    }
    if (Array.isArray(entries) && entries.length >= 3) {
      citiesOk = entries.every(
        (entry: Record<string, unknown>) => entry.name && entry.population && entry.country,
      )
    }
  } catch {
    // Content was not valid JSON, or had a shape the probing above could not walk.
  }

  // Tally web-related tool calls; commonly known data should need none.
  const webToolIds = ['web', 'web_search', 'web_fetch']
  const webToolCalls = ctx.toolEvents.reduce<number>((count, event) => {
    if (!event.name) return count
    const toolId = canonicalizePluginId(event.name) || event.name
    return webToolIds.includes(toolId) ? count + 1 : count
  }, 0)

  const usedFileOrShell = ['files', 'shell'].some((toolId) => ctx.toolNames.has(toolId))
  const outputExists = fs.existsSync(targetPath)
  const turnCount = ctx.turns.length

  const assertions: RegressionAssertion[] = []
  assertions.push({
    name: 'file tool used',
    passed: usedFileOrShell,
  })
  assertions.push({
    name: 'output file exists',
    passed: outputExists,
    weight: 2,
  })
  assertions.push({
    name: 'output is valid JSON with cities',
    passed: parsedOk && citiesOk,
    weight: 2,
  })
  assertions.push({
    name: 'no web searches for commonly known data (OpenClaw parity)',
    passed: webToolCalls === 0,
    details: `${webToolCalls} web tool calls`,
    weight: 3,
  })
  assertions.push({
    name: 'completed within 2 turns',
    passed: turnCount <= 2,
    details: `${turnCount} turns`,
  })

  const scored = scoreAssertions(assertions)
  const result: AgentRegressionScenarioResult = {
    scenarioId: 'knowledge-first-file',
    name: 'Knowledge-First File Creation',
    approvalMode: ctx.approvalMode,
    pluginMode: ctx.pluginMode,
    ...scored,
    assertions,
    sessionId: ctx.sessionId,
    workspaceDir: ctx.workspaceDir,
    requiredPlugins: [...ctx.requiredPlugins],
    effectivePlugins: [...ctx.effectivePlugins],
    missingPlugins: [...ctx.missingPlugins],
    toolNames: Array.from(ctx.toolNames),
    approvalIds: [],
    approvals: buildApprovalEvidence(ctx.sessionId),
    responseTexts: [...ctx.responseTexts],
    turns: [...ctx.turns],
    artifacts: buildArtifactEvidence(ctx, [targetRelativePath]),
    evidencePaths: writeScenarioEvidenceFiles(ctx),
  }
  return result
}
|
|
1947
|
+
|
|
1601
1948
|
export const AGENT_REGRESSION_SCENARIOS: AgentRegressionScenarioDefinition[] = [
|
|
1602
1949
|
{
|
|
1603
1950
|
id: 'approval-resume',
|
|
@@ -1641,6 +1988,24 @@ export const AGENT_REGRESSION_SCENARIOS: AgentRegressionScenarioDefinition[] = [
|
|
|
1641
1988
|
plugins: ['http_request', 'files', 'browser'],
|
|
1642
1989
|
run: runResearchBuildDeployScenario,
|
|
1643
1990
|
},
|
|
1991
|
+
{
|
|
1992
|
+
id: 'tool-call-efficiency',
|
|
1993
|
+
name: 'Tool Call Efficiency',
|
|
1994
|
+
plugins: ['shell', 'web'],
|
|
1995
|
+
run: runToolCallEfficiencyScenario,
|
|
1996
|
+
},
|
|
1997
|
+
{
|
|
1998
|
+
id: 'file-creation-followthrough',
|
|
1999
|
+
name: 'File Creation Followthrough',
|
|
2000
|
+
plugins: ['files', 'shell'],
|
|
2001
|
+
run: runFileCreationFollowthroughScenario,
|
|
2002
|
+
},
|
|
2003
|
+
{
|
|
2004
|
+
id: 'knowledge-first-file',
|
|
2005
|
+
name: 'Knowledge-First File Creation',
|
|
2006
|
+
plugins: ['files', 'web'],
|
|
2007
|
+
run: runKnowledgeFirstFileScenario,
|
|
2008
|
+
},
|
|
1644
2009
|
]
|
|
1645
2010
|
|
|
1646
2011
|
function resolveScenarioDefinitions(ids?: string[]): AgentRegressionScenarioDefinition[] {
|
|
@@ -1653,11 +2018,13 @@ export async function runAgentRegressionSuite(params?: {
|
|
|
1653
2018
|
agentId?: string
|
|
1654
2019
|
approvalModes?: RegressionApprovalMode[]
|
|
1655
2020
|
scenarioIds?: string[]
|
|
2021
|
+
pluginMode?: RegressionPluginMode
|
|
1656
2022
|
}): Promise<AgentRegressionSuiteResult> {
|
|
1657
2023
|
const agentId = params?.agentId || 'default'
|
|
1658
2024
|
const approvalModes: RegressionApprovalMode[] = params?.approvalModes?.length
|
|
1659
2025
|
? [...params.approvalModes]
|
|
1660
2026
|
: ['manual', 'auto', 'off']
|
|
2027
|
+
const pluginMode: RegressionPluginMode = params?.pluginMode === 'agent' ? 'agent' : 'scenario'
|
|
1661
2028
|
const agents = loadAgents() as Record<string, Record<string, unknown>>
|
|
1662
2029
|
const agent = agents[agentId]
|
|
1663
2030
|
if (!agent) throw new Error(`Unknown agent: ${agentId}`)
|
|
@@ -1681,11 +2048,12 @@ export async function runAgentRegressionSuite(params?: {
|
|
|
1681
2048
|
const scenarioDir = path.join(suiteDir, approvalMode, definition.id)
|
|
1682
2049
|
ensureDir(scenarioDir)
|
|
1683
2050
|
const sessionId = `${suiteId}-${approvalMode}-${definition.id}`
|
|
2051
|
+
const pluginResolution = resolveRegressionPlugins(definition.plugins, agent, pluginMode)
|
|
1684
2052
|
const session = buildRegressionSession({
|
|
1685
2053
|
agent,
|
|
1686
2054
|
sessionId,
|
|
1687
2055
|
cwd: scenarioDir,
|
|
1688
|
-
plugins:
|
|
2056
|
+
plugins: pluginResolution.effectivePlugins,
|
|
1689
2057
|
})
|
|
1690
2058
|
const sessions = loadSessions()
|
|
1691
2059
|
sessions[sessionId] = session
|
|
@@ -1696,8 +2064,12 @@ export async function runAgentRegressionSuite(params?: {
|
|
|
1696
2064
|
agentId,
|
|
1697
2065
|
agent,
|
|
1698
2066
|
approvalMode,
|
|
2067
|
+
pluginMode,
|
|
1699
2068
|
sessionId,
|
|
1700
2069
|
workspaceDir: scenarioDir,
|
|
2070
|
+
requiredPlugins: pluginResolution.requiredPlugins,
|
|
2071
|
+
effectivePlugins: pluginResolution.effectivePlugins,
|
|
2072
|
+
missingPlugins: pluginResolution.missingPlugins,
|
|
1701
2073
|
responseTexts: [],
|
|
1702
2074
|
toolEvents: [],
|
|
1703
2075
|
toolNames: new Set<string>(),
|