@swarmclawai/swarmclaw 0.7.7 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -14
- package/next.config.ts +13 -2
- package/package.json +4 -2
- package/src/app/api/agents/[id]/thread/route.ts +9 -0
- package/src/app/api/agents/route.ts +4 -0
- package/src/app/api/agents/thread-route.test.ts +133 -0
- package/src/app/api/approvals/route.test.ts +148 -0
- package/src/app/api/canvas/[sessionId]/route.ts +3 -1
- package/src/app/api/chatrooms/[id]/chat/route.ts +4 -2
- package/src/app/api/chats/[id]/devserver/route.ts +48 -7
- package/src/app/api/chats/[id]/messages/route.ts +42 -18
- package/src/app/api/chats/[id]/route.ts +1 -1
- package/src/app/api/chats/[id]/stop/route.ts +5 -4
- package/src/app/api/chats/route.ts +23 -2
- package/src/app/api/clawhub/install/route.ts +28 -8
- package/src/app/api/connectors/[id]/route.ts +46 -3
- package/src/app/api/connectors/route.ts +12 -8
- package/src/app/api/external-agents/route.test.ts +165 -0
- package/src/app/api/gateways/[id]/health/route.ts +27 -12
- package/src/app/api/gateways/[id]/route.ts +2 -0
- package/src/app/api/gateways/health-route.test.ts +135 -0
- package/src/app/api/gateways/route.ts +2 -0
- package/src/app/api/mcp-servers/route.test.ts +130 -0
- package/src/app/api/openclaw/deploy/route.ts +38 -5
- package/src/app/api/plugins/install/route.ts +46 -6
- package/src/app/api/plugins/marketplace/route.ts +48 -15
- package/src/app/api/preview-server/route.ts +26 -11
- package/src/app/api/projects/[id]/route.ts +6 -2
- package/src/app/api/projects/route.ts +4 -3
- package/src/app/api/schedules/[id]/run/route.ts +4 -0
- package/src/app/api/schedules/route.test.ts +86 -0
- package/src/app/api/schedules/route.ts +6 -1
- package/src/app/api/secrets/[id]/route.ts +1 -0
- package/src/app/api/secrets/route.ts +2 -1
- package/src/app/api/settings/route.ts +2 -0
- package/src/app/api/setup/check-provider/route.test.ts +19 -0
- package/src/app/api/setup/check-provider/route.ts +40 -10
- package/src/app/api/skills/[id]/route.ts +12 -0
- package/src/app/api/skills/import/route.ts +14 -12
- package/src/app/api/skills/route.ts +13 -1
- package/src/app/api/tasks/[id]/route.ts +10 -1
- package/src/app/api/tasks/import/github/route.test.ts +65 -0
- package/src/app/api/tasks/import/github/route.ts +337 -0
- package/src/app/api/wallets/[id]/approve/route.ts +17 -3
- package/src/app/api/wallets/[id]/route.ts +79 -33
- package/src/app/api/wallets/[id]/send/route.ts +19 -33
- package/src/app/api/wallets/route.ts +78 -61
- package/src/app/api/webhooks/[id]/route.ts +33 -6
- package/src/app/api/webhooks/route.test.ts +272 -0
- package/src/cli/index.js +1 -0
- package/src/cli/spec.js +1 -0
- package/src/components/agents/agent-card.tsx +9 -2
- package/src/components/agents/agent-chat-list.tsx +18 -2
- package/src/components/agents/agent-list.tsx +1 -0
- package/src/components/agents/agent-sheet.tsx +257 -38
- package/src/components/agents/inspector-panel.tsx +41 -0
- package/src/components/canvas/canvas-panel.tsx +236 -65
- package/src/components/chat/chat-area.tsx +36 -19
- package/src/components/chat/chat-card.tsx +36 -13
- package/src/components/chat/chat-header.tsx +48 -16
- package/src/components/chat/chat-list.tsx +28 -4
- package/src/components/chat/checkpoint-timeline.tsx +50 -34
- package/src/components/chat/delegation-banner.test.ts +14 -1
- package/src/components/chat/delegation-banner.tsx +1 -1
- package/src/components/chat/message-bubble.tsx +208 -145
- package/src/components/chat/message-list.tsx +48 -19
- package/src/components/chatrooms/chatroom-message.tsx +2 -2
- package/src/components/chatrooms/chatroom-sheet.tsx +16 -2
- package/src/components/connectors/connector-health.tsx +1 -1
- package/src/components/connectors/connector-list.tsx +7 -2
- package/src/components/connectors/connector-sheet.tsx +337 -148
- package/src/components/gateways/gateway-sheet.tsx +2 -2
- package/src/components/layout/app-layout.tsx +40 -23
- package/src/components/mcp-servers/mcp-server-list.tsx +26 -5
- package/src/components/mcp-servers/mcp-server-sheet.tsx +19 -2
- package/src/components/openclaw/openclaw-deploy-panel.tsx +269 -21
- package/src/components/plugins/plugin-list.tsx +45 -9
- package/src/components/plugins/plugin-sheet.tsx +55 -7
- package/src/components/projects/project-detail.tsx +217 -0
- package/src/components/projects/project-sheet.tsx +176 -4
- package/src/components/providers/provider-list.tsx +2 -1
- package/src/components/providers/provider-sheet.tsx +21 -2
- package/src/components/schedules/schedule-card.tsx +25 -1
- package/src/components/schedules/schedule-sheet.tsx +44 -2
- package/src/components/secrets/secret-sheet.tsx +21 -2
- package/src/components/shared/agent-switch-dialog.tsx +12 -1
- package/src/components/shared/bottom-sheet.tsx +13 -3
- package/src/components/shared/command-palette.tsx +8 -1
- package/src/components/shared/confirm-dialog.tsx +19 -4
- package/src/components/shared/connector-platform-icon.test.ts +28 -0
- package/src/components/shared/connector-platform-icon.tsx +39 -6
- package/src/components/shared/settings/plugin-manager.tsx +29 -6
- package/src/components/shared/settings/section-capability-policy.tsx +45 -3
- package/src/components/shared/settings/section-voice.tsx +11 -3
- package/src/components/skills/skill-list.tsx +25 -0
- package/src/components/skills/skill-sheet.tsx +84 -12
- package/src/components/tasks/approvals-panel.tsx +289 -34
- package/src/components/tasks/task-board.tsx +410 -25
- package/src/components/tasks/task-card.tsx +66 -8
- package/src/components/tasks/task-sheet.tsx +16 -4
- package/src/components/ui/dialog.tsx +2 -2
- package/src/components/wallets/wallet-approval-dialog.tsx +4 -2
- package/src/components/wallets/wallet-panel.tsx +435 -90
- package/src/components/wallets/wallet-section.tsx +198 -48
- package/src/components/webhooks/webhook-sheet.tsx +22 -2
- package/src/lib/approval-display.ts +20 -0
- package/src/lib/canvas-content.ts +198 -0
- package/src/lib/chat-artifact-summary.ts +165 -0
- package/src/lib/chat-display.test.ts +91 -0
- package/src/lib/chat-display.ts +58 -0
- package/src/lib/chat-streaming-state.test.ts +47 -1
- package/src/lib/chat-streaming-state.ts +42 -0
- package/src/lib/ollama-model.ts +10 -0
- package/src/lib/openclaw-endpoint.test.ts +8 -0
- package/src/lib/openclaw-endpoint.ts +6 -1
- package/src/lib/plugin-install-cors.ts +46 -0
- package/src/lib/plugin-sources.test.ts +43 -0
- package/src/lib/plugin-sources.ts +77 -0
- package/src/lib/providers/ollama.ts +16 -6
- package/src/lib/providers/openclaw.test.ts +54 -0
- package/src/lib/providers/openclaw.ts +127 -11
- package/src/lib/schedule-dedupe-advanced.test.ts +1335 -0
- package/src/lib/schedule-dedupe.test.ts +66 -1
- package/src/lib/schedule-dedupe.ts +169 -12
- package/src/lib/schedule-origin.test.ts +20 -0
- package/src/lib/schedule-origin.ts +15 -0
- package/src/lib/server/__fixtures__/fake-mcp-stdio-server.mjs +27 -0
- package/src/lib/server/agent-availability.ts +16 -0
- package/src/lib/server/agent-runtime-config.ts +12 -4
- package/src/lib/server/agent-thread-session.test.ts +51 -0
- package/src/lib/server/agent-thread-session.ts +7 -0
- package/src/lib/server/approval-match.ts +205 -0
- package/src/lib/server/approvals-auto-approve.test.ts +538 -1
- package/src/lib/server/approvals.ts +214 -1
- package/src/lib/server/assistant-control.test.ts +29 -0
- package/src/lib/server/assistant-control.ts +23 -0
- package/src/lib/server/build-llm.test.ts +79 -0
- package/src/lib/server/build-llm.ts +14 -4
- package/src/lib/server/canvas-content.test.ts +32 -0
- package/src/lib/server/canvas-content.ts +6 -0
- package/src/lib/server/capability-router.test.ts +33 -0
- package/src/lib/server/capability-router.ts +80 -19
- package/src/lib/server/chat-execution-advanced.test.ts +651 -0
- package/src/lib/server/chat-execution-disabled.test.ts +94 -0
- package/src/lib/server/chat-execution-tool-events.test.ts +157 -0
- package/src/lib/server/chat-execution.ts +378 -73
- package/src/lib/server/clawhub-client.test.ts +14 -8
- package/src/lib/server/connectors/manager-reconnect.test.ts +47 -0
- package/src/lib/server/connectors/manager.test.ts +1147 -0
- package/src/lib/server/connectors/manager.ts +461 -137
- package/src/lib/server/connectors/pairing.ts +26 -5
- package/src/lib/server/connectors/types.ts +2 -0
- package/src/lib/server/connectors/whatsapp.test.ts +134 -0
- package/src/lib/server/connectors/whatsapp.ts +271 -47
- package/src/lib/server/context-manager.ts +6 -1
- package/src/lib/server/daemon-state.ts +84 -47
- package/src/lib/server/data-dir.test.ts +37 -0
- package/src/lib/server/data-dir.ts +20 -1
- package/src/lib/server/delegation-jobs-advanced.test.ts +513 -0
- package/src/lib/server/devserver-launch.test.ts +60 -0
- package/src/lib/server/devserver-launch.ts +85 -0
- package/src/lib/server/elevenlabs.test.ts +247 -1
- package/src/lib/server/elevenlabs.ts +147 -43
- package/src/lib/server/ethereum.ts +590 -0
- package/src/lib/server/eval/agent-regression-advanced.test.ts +302 -0
- package/src/lib/server/eval/agent-regression.test.ts +18 -1
- package/src/lib/server/eval/agent-regression.ts +383 -11
- package/src/lib/server/evm-swap.ts +475 -0
- package/src/lib/server/execution-log.ts +1 -0
- package/src/lib/server/heartbeat-service-timer.test.ts +173 -0
- package/src/lib/server/heartbeat-service.ts +20 -11
- package/src/lib/server/heartbeat-wake.test.ts +112 -0
- package/src/lib/server/heartbeat-wake.ts +338 -57
- package/src/lib/server/main-agent-loop-advanced.test.ts +538 -0
- package/src/lib/server/main-agent-loop.test.ts +260 -0
- package/src/lib/server/main-agent-loop.ts +559 -14
- package/src/lib/server/mcp-client.test.ts +16 -0
- package/src/lib/server/mcp-client.ts +25 -0
- package/src/lib/server/memory-integration.test.ts +719 -0
- package/src/lib/server/memory-policy.test.ts +43 -0
- package/src/lib/server/memory-policy.ts +132 -0
- package/src/lib/server/memory-tiers.test.ts +60 -0
- package/src/lib/server/memory-tiers.ts +16 -0
- package/src/lib/server/ollama-runtime.ts +58 -0
- package/src/lib/server/openclaw-deploy.test.ts +109 -1
- package/src/lib/server/openclaw-deploy.ts +557 -81
- package/src/lib/server/openclaw-gateway.test.ts +131 -0
- package/src/lib/server/openclaw-gateway.ts +10 -4
- package/src/lib/server/openclaw-health.test.ts +35 -0
- package/src/lib/server/openclaw-health.ts +215 -47
- package/src/lib/server/orchestrator-lg.ts +3 -2
- package/src/lib/server/orchestrator.ts +2 -0
- package/src/lib/server/plugins-advanced.test.ts +351 -0
- package/src/lib/server/plugins.ts +211 -6
- package/src/lib/server/project-context.ts +162 -0
- package/src/lib/server/project-utils.ts +150 -0
- package/src/lib/server/queue-advanced.test.ts +528 -0
- package/src/lib/server/queue-followups.test.ts +409 -2
- package/src/lib/server/queue-reconcile.test.ts +128 -0
- package/src/lib/server/queue.ts +527 -68
- package/src/lib/server/scheduler.ts +29 -1
- package/src/lib/server/session-note.test.ts +36 -0
- package/src/lib/server/session-note.ts +42 -0
- package/src/lib/server/session-run-manager.ts +83 -4
- package/src/lib/server/session-tools/canvas.ts +14 -12
- package/src/lib/server/session-tools/connector-inputs.test.ts +37 -0
- package/src/lib/server/session-tools/connector.test.ts +138 -0
- package/src/lib/server/session-tools/connector.ts +366 -54
- package/src/lib/server/session-tools/context.ts +17 -3
- package/src/lib/server/session-tools/crud.ts +484 -84
- package/src/lib/server/session-tools/delegate-fallback.test.ts +103 -0
- package/src/lib/server/session-tools/delegate-resume.test.ts +50 -0
- package/src/lib/server/session-tools/delegate.ts +102 -10
- package/src/lib/server/session-tools/discovery-approvals.test.ts +142 -0
- package/src/lib/server/session-tools/discovery.ts +80 -12
- package/src/lib/server/session-tools/file-normalize.test.ts +36 -0
- package/src/lib/server/session-tools/file.ts +43 -4
- package/src/lib/server/session-tools/human-loop.ts +35 -5
- package/src/lib/server/session-tools/index.ts +44 -9
- package/src/lib/server/session-tools/manage-connectors.test.ts +139 -0
- package/src/lib/server/session-tools/manage-schedules-advanced.test.ts +564 -0
- package/src/lib/server/session-tools/manage-schedules.test.ts +283 -0
- package/src/lib/server/session-tools/manage-tasks-advanced.test.ts +852 -0
- package/src/lib/server/session-tools/manage-tasks.test.ts +114 -0
- package/src/lib/server/session-tools/memory.test.ts +93 -0
- package/src/lib/server/session-tools/memory.ts +554 -75
- package/src/lib/server/session-tools/normalize-tool-args.ts +1 -1
- package/src/lib/server/session-tools/platform-access.test.ts +58 -0
- package/src/lib/server/session-tools/platform.ts +60 -19
- package/src/lib/server/session-tools/plugin-creator.ts +57 -1
- package/src/lib/server/session-tools/primitive-tools.test.ts +6 -0
- package/src/lib/server/session-tools/schedule.ts +6 -1
- package/src/lib/server/session-tools/shell-normalize.test.ts +25 -1
- package/src/lib/server/session-tools/shell.ts +22 -3
- package/src/lib/server/session-tools/wallet-tool.test.ts +254 -0
- package/src/lib/server/session-tools/wallet.ts +1374 -139
- package/src/lib/server/session-tools/web-inputs.test.ts +178 -0
- package/src/lib/server/session-tools/web.ts +621 -70
- package/src/lib/server/skill-discovery.ts +128 -0
- package/src/lib/server/skill-eligibility.test.ts +84 -0
- package/src/lib/server/skill-eligibility.ts +95 -0
- package/src/lib/server/skill-prompt-budget.test.ts +102 -0
- package/src/lib/server/skill-prompt-budget.ts +125 -0
- package/src/lib/server/skills-normalize.test.ts +54 -0
- package/src/lib/server/skills-normalize.ts +372 -26
- package/src/lib/server/solana.ts +214 -29
- package/src/lib/server/storage.ts +65 -36
- package/src/lib/server/stream-agent-chat.test.ts +437 -2
- package/src/lib/server/stream-agent-chat.ts +957 -79
- package/src/lib/server/system-events.ts +1 -1
- package/src/lib/server/tool-aliases.ts +2 -0
- package/src/lib/server/tool-capability-policy-advanced.test.ts +502 -0
- package/src/lib/server/tool-capability-policy.test.ts +24 -0
- package/src/lib/server/tool-capability-policy.ts +29 -1
- package/src/lib/server/tool-loop-detection.test.ts +105 -0
- package/src/lib/server/tool-loop-detection.ts +260 -0
- package/src/lib/server/tool-planning.test.ts +44 -0
- package/src/lib/server/tool-planning.ts +271 -0
- package/src/lib/server/wallet-execution.test.ts +198 -0
- package/src/lib/server/wallet-portfolio.test.ts +98 -0
- package/src/lib/server/wallet-portfolio.ts +724 -0
- package/src/lib/server/wallet-service.test.ts +57 -0
- package/src/lib/server/wallet-service.ts +213 -0
- package/src/lib/server/watch-jobs-advanced.test.ts +594 -0
- package/src/lib/server/watch-jobs.ts +17 -2
- package/src/lib/server/workspace-context.ts +111 -0
- package/src/lib/skill-save-payload.test.ts +39 -0
- package/src/lib/skill-save-payload.ts +37 -0
- package/src/lib/tasks.ts +28 -0
- package/src/lib/tool-definitions.ts +2 -1
- package/src/lib/tool-event-summary.test.ts +30 -0
- package/src/lib/tool-event-summary.ts +37 -0
- package/src/lib/validation/schemas.ts +1 -0
- package/src/lib/wallet-transactions.test.ts +75 -0
- package/src/lib/wallet-transactions.ts +43 -0
- package/src/lib/wallet.test.ts +17 -0
- package/src/lib/wallet.ts +183 -0
- package/src/proxy.test.ts +31 -0
- package/src/proxy.ts +34 -2
- package/src/stores/use-chat-store.ts +15 -1
- package/src/types/index.ts +249 -14
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
import assert from 'node:assert/strict'
|
|
2
|
+
import { describe, it } from 'node:test'
|
|
3
|
+
|
|
4
|
+
import {
|
|
5
|
+
AGENT_REGRESSION_SCENARIOS,
|
|
6
|
+
resolveRegressionApprovalSettings,
|
|
7
|
+
resolveRegressionPlugins,
|
|
8
|
+
scoreAssertions,
|
|
9
|
+
} from './agent-regression'
|
|
10
|
+
|
|
11
|
+
import type { RegressionAssertion } from './agent-regression'
|
|
12
|
+
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
// scoreAssertions
|
|
15
|
+
// ---------------------------------------------------------------------------
|
|
16
|
+
|
|
17
|
+
describe('scoreAssertions', () => {
  /**
   * Scores `cases` and verifies the resulting score, maxScore and status,
   * in that order.
   */
  const expectOutcome = (
    cases: RegressionAssertion[],
    score: number,
    maxScore: number,
    status: string,
  ) => {
    const outcome = scoreAssertions(cases)
    assert.equal(outcome.score, score)
    assert.equal(outcome.maxScore, maxScore)
    assert.equal(outcome.status, status)
  }

  it('perfect score with weighted assertions', () => {
    expectOutcome(
      [
        { name: 'a', passed: true, weight: 1 },
        { name: 'b', passed: true, weight: 2 },
        { name: 'c', passed: true, weight: 3 },
        { name: 'd', passed: true, weight: 4 },
        { name: 'e', passed: true, weight: 5 },
      ],
      15,
      15,
      'passed',
    )
  })

  it('single failure tanks status even when most pass', () => {
    expectOutcome(
      [
        { name: 'a', passed: true, weight: 1 },
        { name: 'b', passed: true, weight: 1 },
        { name: 'c', passed: true, weight: 1 },
        { name: 'd', passed: true, weight: 1 },
        { name: 'e', passed: false, weight: 1 },
      ],
      4,
      5,
      'failed',
    )
  })

  it('zero-weight failing assertion does not affect score or status', () => {
    expectOutcome(
      [
        { name: 'high-value-1', passed: true, weight: 5 },
        { name: 'high-value-2', passed: true, weight: 5 },
        { name: 'cosmetic-check', passed: false, weight: 0 },
      ],
      10,
      10,
      'passed',
    )
  })

  it('defaults weight to 1 when not specified', () => {
    // Expected score: 3 (explicit) + 1 (implicit-1) = 4.
    // Expected maxScore: 3 + 1 + 1 = 5.
    expectOutcome(
      [
        { name: 'explicit', passed: true, weight: 3 },
        { name: 'implicit-1', passed: true },
        { name: 'implicit-2', passed: false },
      ],
      4,
      5,
      'failed',
    )
  })

  it('empty assertions produce score 0/0 with passed status (vacuous truth)', () => {
    expectOutcome([], 0, 0, 'passed')
  })

  it('all failures yield score 0 with failed status', () => {
    expectOutcome(
      [
        { name: 'a', passed: false, weight: 2 },
        { name: 'b', passed: false, weight: 3 },
        { name: 'c', passed: false, weight: 5 },
      ],
      0,
      10,
      'failed',
    )
  })

  it('handles a large batch of 100 assertions correctly', () => {
    // Fixed, repeatable pattern: weights cycle through 1..7 and every third
    // entry (indices 0, 3, 6, ...) is a failure.
    const indices = Array.from({ length: 100 }, (_, index) => index)
    const weightAt = (index: number) => (index % 7) + 1
    const passesAt = (index: number) => index % 3 !== 0

    const batch: RegressionAssertion[] = indices.map((index) => ({
      name: `assertion-${index}`,
      passed: passesAt(index),
      weight: weightAt(index),
    }))
    const totalWeight = indices.reduce((sum, index) => sum + weightAt(index), 0)
    const passingWeight = indices
      .filter(passesAt)
      .reduce((sum, index) => sum + weightAt(index), 0)

    // The expected status is derived from the two totals rather than hard-coded.
    expectOutcome(
      batch,
      passingWeight,
      totalWeight,
      passingWeight === totalWeight ? 'passed' : 'failed',
    )
  })

  it('handles negative and fractional weights without clamping', () => {
    // Pins the currently observed behavior: weights are summed as given.
    // score    = 0.5 (fractional-pass) + (-1) (negative-pass) + 0 (zero-pass) = -0.5
    // maxScore = 0.5 + 0.5 + (-1) + 0 = 0
    // The two totals differ, so the status is 'failed'.
    expectOutcome(
      [
        { name: 'fractional-pass', passed: true, weight: 0.5 },
        { name: 'fractional-fail', passed: false, weight: 0.5 },
        { name: 'negative-pass', passed: true, weight: -1 },
        { name: 'zero-pass', passed: true, weight: 0 },
      ],
      -0.5,
      0,
      'failed',
    )
  })
})
|
|
131
|
+
|
|
132
|
+
// ---------------------------------------------------------------------------
|
|
133
|
+
// resolveRegressionPlugins
|
|
134
|
+
// ---------------------------------------------------------------------------
|
|
135
|
+
|
|
136
|
+
describe('resolveRegressionPlugins', () => {
  it('scenario mode uses scenario plugins as effective plugins', () => {
    const { effectivePlugins, missingPlugins } = resolveRegressionPlugins(
      ['delegate', 'browser', 'email'],
      { plugins: ['delegate', 'files', 'web'] },
      'scenario',
    )

    assert.deepEqual(effectivePlugins, ['delegate', 'browser', 'email'])
    assert.deepEqual(missingPlugins, [])
  })

  it('agent mode uses agent plugins and reports missing ones', () => {
    const { effectivePlugins, requiredPlugins, missingPlugins } = resolveRegressionPlugins(
      ['delegate', 'browser', 'email'],
      { plugins: ['delegate', 'files', 'web'] },
      'agent',
    )

    assert.deepEqual(effectivePlugins, ['delegate', 'files', 'web'])
    assert.deepEqual(requiredPlugins, ['delegate', 'browser', 'email'])
    // The agent only provides 'delegate'; the other two required plugins
    // must be flagged as missing.
    for (const absent of ['browser', 'email']) {
      assert.ok(missingPlugins.includes(absent))
    }
    assert.ok(!missingPlugins.includes('delegate'))
  })

  it('reports no missing plugins when agent has all required', () => {
    const { missingPlugins, effectivePlugins } = resolveRegressionPlugins(
      ['delegate', 'browser'],
      { plugins: ['delegate', 'browser', 'email', 'files'] },
      'agent',
    )

    assert.deepEqual(missingPlugins, [])
    assert.deepEqual(effectivePlugins, ['delegate', 'browser', 'email', 'files'])
  })

  it('handles plugin aliases — web_search resolves to canonical web', () => {
    // The scenario asks for the alias 'web_search' while the agent lists the
    // canonical 'web'; the requirement is expected to count as satisfied.
    const { missingPlugins } = resolveRegressionPlugins(['web_search'], { plugins: ['web'] }, 'agent')

    assert.deepEqual(missingPlugins, [])
  })

  it('handles alias in scenario mode — effectivePlugins preserves original strings', () => {
    const { effectivePlugins, missingPlugins } = resolveRegressionPlugins(
      ['web_search', 'claude_code'],
      { plugins: [] },
      'scenario',
    )

    // Scenario mode is expected to hand back the alias spellings untouched.
    assert.deepEqual(effectivePlugins, ['web_search', 'claude_code'])
    assert.deepEqual(missingPlugins, [])
  })

  it('empty agent plugins — all scenario plugins are missing', () => {
    const { effectivePlugins, missingPlugins } = resolveRegressionPlugins(
      ['delegate', 'browser', 'web'],
      { plugins: [] },
      'agent',
    )

    assert.deepEqual(effectivePlugins, [])
    assert.equal(missingPlugins.length, 3)
    for (const absent of ['delegate', 'browser', 'web']) {
      assert.ok(missingPlugins.includes(absent))
    }
  })

  it('undefined agent plugins — all scenario plugins are missing', () => {
    // An agent record that carries no `plugins` key at all.
    const bareAgent: Record<string, unknown> = {}

    const { effectivePlugins, missingPlugins } = resolveRegressionPlugins(
      ['delegate', 'browser'],
      bareAgent,
      'agent',
    )

    assert.deepEqual(effectivePlugins, [])
    assert.equal(missingPlugins.length, 2)
  })

  it('requiredPlugins are canonicalized in both modes', () => {
    const aliases = ['claude_code', 'web_fetch']

    const viaScenario = resolveRegressionPlugins(aliases, {}, 'scenario')
    const viaAgent = resolveRegressionPlugins(aliases, { plugins: [] }, 'agent')

    // Expected canonical names: 'claude_code' → 'delegate', 'web_fetch' → 'web'.
    assert.deepEqual(viaScenario.requiredPlugins, ['delegate', 'web'])
    assert.deepEqual(viaAgent.requiredPlugins, ['delegate', 'web'])
  })
})
|
|
229
|
+
|
|
230
|
+
// ---------------------------------------------------------------------------
|
|
231
|
+
// resolveRegressionApprovalSettings
|
|
232
|
+
// ---------------------------------------------------------------------------
|
|
233
|
+
|
|
234
|
+
describe('resolveRegressionApprovalSettings', () => {
  it('manual mode enables approvals with no auto-approve categories', () => {
    const { approvalsEnabled, approvalAutoApproveCategories } =
      resolveRegressionApprovalSettings('manual')

    assert.equal(approvalsEnabled, true)
    assert.deepEqual(approvalAutoApproveCategories, [])
  })

  it('auto mode enables approvals with tool_access auto-approved', () => {
    const { approvalsEnabled, approvalAutoApproveCategories } =
      resolveRegressionApprovalSettings('auto')

    assert.equal(approvalsEnabled, true)
    assert.deepEqual(approvalAutoApproveCategories, ['tool_access'])
  })

  it('off mode disables approvals entirely', () => {
    const { approvalsEnabled, approvalAutoApproveCategories } =
      resolveRegressionApprovalSettings('off')

    assert.equal(approvalsEnabled, false)
    assert.deepEqual(approvalAutoApproveCategories, [])
  })
})
|
|
253
|
+
|
|
254
|
+
// ---------------------------------------------------------------------------
|
|
255
|
+
// AGENT_REGRESSION_SCENARIOS registry
|
|
256
|
+
// ---------------------------------------------------------------------------
|
|
257
|
+
|
|
258
|
+
describe('AGENT_REGRESSION_SCENARIOS registry', () => {
  it('contains the expected scenario IDs in order', () => {
    const ids = AGENT_REGRESSION_SCENARIOS.map((s) => s.id)
    // This is an exact, order-sensitive comparison, so the list has to track
    // every scenario in the registry. The last three entries are the scenarios
    // the sibling agent-regression test expects for this release; without them
    // the two suites disagree about the same exported registry.
    assert.deepEqual(ids, [
      'approval-resume',
      'delegate-literal-artifact',
      'schedule-script',
      'open-ended-iteration',
      'mock-signup-secret-email',
      'human-verified-signup',
      'research-build-deploy',
      'tool-call-efficiency',
      'file-creation-followthrough',
      'knowledge-first-file',
    ])
  })

  it('every scenario has all required fields', () => {
    // Structural check only: each entry needs a non-empty id and name,
    // a plugins array, and a callable run.
    for (const scenario of AGENT_REGRESSION_SCENARIOS) {
      assert.ok(typeof scenario.id === 'string' && scenario.id.length > 0,
        `scenario missing non-empty id`)
      assert.ok(typeof scenario.name === 'string' && scenario.name.length > 0,
        `scenario ${scenario.id} missing non-empty name`)
      assert.ok(Array.isArray(scenario.plugins),
        `scenario ${scenario.id} missing plugins array`)
      assert.ok(typeof scenario.run === 'function',
        `scenario ${scenario.id} missing run function`)
    }
  })

  it('no duplicate scenario IDs', () => {
    const ids = AGENT_REGRESSION_SCENARIOS.map((s) => s.id)
    // A Set drops repeats, so any size difference means an id occurs twice.
    const unique = new Set(ids)
    assert.equal(unique.size, ids.length, 'duplicate scenario IDs detected')
  })

  it('every scenario declares at least an empty plugins array', () => {
    for (const scenario of AGENT_REGRESSION_SCENARIOS) {
      assert.ok(Array.isArray(scenario.plugins),
        `scenario ${scenario.id}: plugins should be an array`)
      // An empty list is acceptable, but every entry that is present must be
      // a non-blank string.
      for (const plugin of scenario.plugins) {
        assert.ok(typeof plugin === 'string' && plugin.trim().length > 0,
          `scenario ${scenario.id}: plugin entries must be non-empty strings`)
      }
    }
  })
})
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import assert from 'node:assert/strict'
|
|
2
2
|
import { describe, it } from 'node:test'
|
|
3
|
-
import { AGENT_REGRESSION_SCENARIOS, resolveRegressionApprovalSettings, scoreAssertions } from './agent-regression'
|
|
3
|
+
import { AGENT_REGRESSION_SCENARIOS, resolveRegressionApprovalSettings, resolveRegressionPlugins, scoreAssertions } from './agent-regression'
|
|
4
4
|
|
|
5
5
|
describe('agent regression helpers', () => {
|
|
6
6
|
it('maps approval modes onto deterministic platform settings', () => {
|
|
@@ -42,6 +42,23 @@ describe('agent regression helpers', () => {
|
|
|
42
42
|
'mock-signup-secret-email',
|
|
43
43
|
'human-verified-signup',
|
|
44
44
|
'research-build-deploy',
|
|
45
|
+
'tool-call-efficiency',
|
|
46
|
+
'file-creation-followthrough',
|
|
47
|
+
'knowledge-first-file',
|
|
45
48
|
])
|
|
46
49
|
})
|
|
50
|
+
|
|
51
|
+
it('can resolve regressions against the agent capability set instead of injected scenario plugins', () => {
  // Scenario requirements and the agent's own plugin list, resolved in
  // 'agent' mode.
  const scenarioRequirements = ['delegate', 'browser', 'manage_secrets', 'email']
  const agentUnderTest = {
    plugins: ['codex_cli', 'browser', 'manage_secrets', 'files'],
  }

  const { requiredPlugins, effectivePlugins, missingPlugins } = resolveRegressionPlugins(
    scenarioRequirements,
    agentUnderTest,
    'agent',
  )

  assert.deepEqual(requiredPlugins, ['delegate', 'browser', 'manage_secrets', 'email'])
  assert.deepEqual(effectivePlugins, ['codex_cli', 'browser', 'manage_secrets', 'files'])
  // Only 'email' is expected to be reported as missing.
  assert.deepEqual(missingPlugins, ['email'])
})
|
|
47
64
|
})
|