@swarmclawai/swarmclaw 0.6.7 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +82 -39
- package/next.config.ts +31 -6
- package/package.json +3 -2
- package/src/app/api/agents/[id]/thread/route.ts +1 -0
- package/src/app/api/agents/route.ts +19 -5
- package/src/app/api/approvals/route.ts +22 -0
- package/src/app/api/chatrooms/[id]/chat/route.ts +4 -0
- package/src/app/api/clawhub/install/route.ts +2 -2
- package/src/app/api/eval/run/route.ts +37 -0
- package/src/app/api/eval/scenarios/route.ts +24 -0
- package/src/app/api/eval/suite/route.ts +29 -0
- package/src/app/api/mcp-servers/[id]/conformance/route.ts +26 -0
- package/src/app/api/mcp-servers/[id]/invoke/route.ts +81 -0
- package/src/app/api/memory/graph/route.ts +46 -0
- package/src/app/api/memory/route.ts +36 -5
- package/src/app/api/notifications/route.ts +3 -0
- package/src/app/api/plugins/install/route.ts +57 -5
- package/src/app/api/plugins/marketplace/route.ts +73 -22
- package/src/app/api/plugins/route.ts +61 -1
- package/src/app/api/plugins/ui/route.ts +34 -0
- package/src/app/api/sessions/[id]/checkpoints/route.ts +31 -0
- package/src/app/api/sessions/[id]/restore/route.ts +36 -0
- package/src/app/api/settings/route.ts +62 -0
- package/src/app/api/setup/doctor/route.ts +22 -5
- package/src/app/api/souls/[id]/route.ts +65 -0
- package/src/app/api/souls/route.ts +70 -0
- package/src/app/api/tasks/[id]/approve/route.ts +4 -3
- package/src/app/api/tasks/[id]/route.ts +16 -3
- package/src/app/api/tasks/route.ts +10 -2
- package/src/app/api/usage/route.ts +9 -2
- package/src/app/globals.css +27 -0
- package/src/app/page.tsx +10 -5
- package/src/cli/index.js +37 -0
- package/src/components/activity/activity-feed.tsx +9 -2
- package/src/components/agents/agent-avatar.tsx +5 -1
- package/src/components/agents/agent-card.tsx +55 -9
- package/src/components/agents/agent-sheet.tsx +112 -34
- package/src/components/agents/inspector-panel.tsx +1 -1
- package/src/components/agents/soul-library-picker.tsx +84 -13
- package/src/components/auth/access-key-gate.tsx +63 -54
- package/src/components/auth/user-picker.tsx +37 -32
- package/src/components/chat/activity-moment.tsx +2 -0
- package/src/components/chat/chat-area.tsx +11 -0
- package/src/components/chat/chat-header.tsx +69 -25
- package/src/components/chat/chat-tool-toggles.tsx +2 -2
- package/src/components/chat/checkpoint-timeline.tsx +112 -0
- package/src/components/chat/code-block.tsx +3 -1
- package/src/components/chat/exec-approval-card.tsx +8 -1
- package/src/components/chat/message-bubble.tsx +164 -4
- package/src/components/chat/message-list.tsx +46 -4
- package/src/components/chat/session-approval-card.tsx +80 -0
- package/src/components/chat/session-debug-panel.tsx +106 -84
- package/src/components/chat/streaming-bubble.tsx +6 -5
- package/src/components/chat/task-approval-card.tsx +78 -0
- package/src/components/chat/thinking-indicator.tsx +48 -12
- package/src/components/chat/tool-call-bubble.tsx +3 -0
- package/src/components/chat/tool-request-banner.tsx +39 -20
- package/src/components/chatrooms/chatroom-list.tsx +11 -4
- package/src/components/chatrooms/chatroom-sheet.tsx +7 -2
- package/src/components/connectors/connector-list.tsx +33 -11
- package/src/components/connectors/connector-sheet.tsx +37 -7
- package/src/components/home/home-view.tsx +54 -24
- package/src/components/input/chat-input.tsx +22 -1
- package/src/components/knowledge/knowledge-list.tsx +17 -18
- package/src/components/knowledge/knowledge-sheet.tsx +9 -5
- package/src/components/layout/app-layout.tsx +87 -19
- package/src/components/mcp-servers/mcp-server-list.tsx +352 -50
- package/src/components/mcp-servers/mcp-server-sheet.tsx +25 -9
- package/src/components/memory/memory-browser.tsx +73 -45
- package/src/components/memory/memory-graph-view.tsx +203 -0
- package/src/components/memory/memory-list.tsx +20 -13
- package/src/components/plugins/plugin-list.tsx +214 -60
- package/src/components/plugins/plugin-sheet.tsx +119 -24
- package/src/components/projects/project-list.tsx +17 -9
- package/src/components/providers/provider-list.tsx +21 -6
- package/src/components/providers/provider-sheet.tsx +42 -25
- package/src/components/runs/run-list.tsx +17 -13
- package/src/components/schedules/schedule-card.tsx +10 -3
- package/src/components/schedules/schedule-list.tsx +2 -2
- package/src/components/schedules/schedule-sheet.tsx +28 -9
- package/src/components/secrets/secret-sheet.tsx +7 -2
- package/src/components/secrets/secrets-list.tsx +18 -5
- package/src/components/sessions/new-session-sheet.tsx +183 -376
- package/src/components/sessions/session-card.tsx +10 -2
- package/src/components/settings/gateway-connection-panel.tsx +9 -8
- package/src/components/shared/command-palette.tsx +13 -5
- package/src/components/shared/empty-state.tsx +20 -8
- package/src/components/shared/hint-tip.tsx +31 -0
- package/src/components/shared/notification-center.tsx +134 -86
- package/src/components/shared/profile-sheet.tsx +4 -0
- package/src/components/shared/settings/plugin-manager.tsx +360 -135
- package/src/components/shared/settings/section-capability-policy.tsx +3 -3
- package/src/components/shared/settings/section-runtime-loop.tsx +149 -4
- package/src/components/skills/clawhub-browser.tsx +1 -0
- package/src/components/skills/skill-list.tsx +31 -12
- package/src/components/skills/skill-sheet.tsx +20 -7
- package/src/components/tasks/approvals-panel.tsx +224 -0
- package/src/components/tasks/task-board.tsx +20 -12
- package/src/components/tasks/task-card.tsx +21 -7
- package/src/components/tasks/task-column.tsx +4 -3
- package/src/components/tasks/task-list.tsx +1 -1
- package/src/components/tasks/task-sheet.tsx +130 -1
- package/src/components/ui/dialog.tsx +1 -0
- package/src/components/ui/sheet.tsx +1 -0
- package/src/components/usage/metrics-dashboard.tsx +72 -48
- package/src/components/wallets/wallet-panel.tsx +65 -41
- package/src/components/wallets/wallet-section.tsx +9 -3
- package/src/components/webhooks/webhook-list.tsx +21 -12
- package/src/components/webhooks/webhook-sheet.tsx +13 -3
- package/src/lib/approval-display.test.ts +45 -0
- package/src/lib/approval-display.ts +62 -0
- package/src/lib/clipboard.ts +38 -0
- package/src/lib/memory.ts +8 -0
- package/src/lib/providers/claude-cli.ts +5 -3
- package/src/lib/providers/index.ts +67 -21
- package/src/lib/runtime-loop.ts +3 -2
- package/src/lib/server/approvals.ts +150 -0
- package/src/lib/server/chat-execution.ts +319 -74
- package/src/lib/server/chatroom-helpers.ts +63 -5
- package/src/lib/server/chatroom-orchestration.ts +74 -0
- package/src/lib/server/clawhub-client.ts +82 -6
- package/src/lib/server/connectors/manager.ts +27 -1
- package/src/lib/server/context-manager.ts +132 -50
- package/src/lib/server/cost.test.ts +73 -0
- package/src/lib/server/cost.ts +165 -34
- package/src/lib/server/daemon-state.ts +112 -1
- package/src/lib/server/data-dir.ts +18 -1
- package/src/lib/server/eval/runner.ts +126 -0
- package/src/lib/server/eval/scenarios.ts +218 -0
- package/src/lib/server/eval/scorer.ts +96 -0
- package/src/lib/server/eval/store.ts +37 -0
- package/src/lib/server/eval/types.ts +48 -0
- package/src/lib/server/execution-log.ts +12 -8
- package/src/lib/server/guardian.ts +34 -0
- package/src/lib/server/heartbeat-service.ts +53 -1
- package/src/lib/server/integrity-monitor.ts +208 -0
- package/src/lib/server/langgraph-checkpoint.ts +10 -0
- package/src/lib/server/link-understanding.ts +55 -0
- package/src/lib/server/llm-response-cache.test.ts +102 -0
- package/src/lib/server/llm-response-cache.ts +227 -0
- package/src/lib/server/main-agent-loop.ts +115 -16
- package/src/lib/server/main-session.ts +6 -3
- package/src/lib/server/mcp-conformance.test.ts +18 -0
- package/src/lib/server/mcp-conformance.ts +233 -0
- package/src/lib/server/memory-db.ts +193 -19
- package/src/lib/server/memory-retrieval.test.ts +56 -0
- package/src/lib/server/mmr.ts +73 -0
- package/src/lib/server/orchestrator-lg.ts +7 -1
- package/src/lib/server/orchestrator.ts +4 -3
- package/src/lib/server/plugins.ts +662 -132
- package/src/lib/server/process-manager.ts +18 -0
- package/src/lib/server/query-expansion.ts +57 -0
- package/src/lib/server/queue.ts +280 -11
- package/src/lib/server/runtime-settings.ts +9 -0
- package/src/lib/server/session-run-manager.test.ts +23 -0
- package/src/lib/server/session-run-manager.ts +32 -2
- package/src/lib/server/session-tools/canvas.ts +85 -50
- package/src/lib/server/session-tools/chatroom.ts +130 -127
- package/src/lib/server/session-tools/connector.ts +233 -454
- package/src/lib/server/session-tools/context-mgmt.ts +87 -105
- package/src/lib/server/session-tools/crud.ts +84 -7
- package/src/lib/server/session-tools/delegate.ts +351 -752
- package/src/lib/server/session-tools/discovery.ts +198 -0
- package/src/lib/server/session-tools/edit_file.ts +82 -0
- package/src/lib/server/session-tools/file-send.test.ts +39 -0
- package/src/lib/server/session-tools/file.ts +257 -425
- package/src/lib/server/session-tools/git.ts +87 -47
- package/src/lib/server/session-tools/http.ts +95 -33
- package/src/lib/server/session-tools/index.ts +217 -138
- package/src/lib/server/session-tools/memory.ts +154 -239
- package/src/lib/server/session-tools/monitor.ts +126 -0
- package/src/lib/server/session-tools/normalize-tool-args.test.ts +61 -0
- package/src/lib/server/session-tools/normalize-tool-args.ts +48 -0
- package/src/lib/server/session-tools/openclaw-nodes.ts +82 -99
- package/src/lib/server/session-tools/openclaw-workspace.ts +103 -93
- package/src/lib/server/session-tools/platform.ts +86 -0
- package/src/lib/server/session-tools/plugin-creator.ts +239 -0
- package/src/lib/server/session-tools/sample-ui.ts +97 -0
- package/src/lib/server/session-tools/sandbox.ts +175 -148
- package/src/lib/server/session-tools/schedule.ts +78 -0
- package/src/lib/server/session-tools/session-info.ts +104 -410
- package/src/lib/server/session-tools/shell-normalize.test.ts +43 -0
- package/src/lib/server/session-tools/shell.ts +171 -143
- package/src/lib/server/session-tools/subagent.ts +77 -77
- package/src/lib/server/session-tools/wallet.ts +182 -106
- package/src/lib/server/session-tools/web.ts +181 -327
- package/src/lib/server/storage.ts +36 -0
- package/src/lib/server/stream-agent-chat.ts +348 -242
- package/src/lib/server/task-quality-gate.test.ts +44 -0
- package/src/lib/server/task-quality-gate.ts +67 -0
- package/src/lib/server/task-validation.test.ts +78 -0
- package/src/lib/server/task-validation.ts +67 -2
- package/src/lib/server/tool-aliases.ts +68 -0
- package/src/lib/server/tool-capability-policy.ts +24 -5
- package/src/lib/server/tool-retry.ts +62 -0
- package/src/lib/server/transcript-repair.ts +72 -0
- package/src/lib/setup-defaults.ts +1 -0
- package/src/lib/tasks.ts +7 -1
- package/src/lib/tool-definitions.ts +24 -23
- package/src/lib/validation/schemas.ts +13 -0
- package/src/lib/view-routes.ts +2 -23
- package/src/stores/use-app-store.ts +23 -1
- package/src/types/index.ts +155 -10
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
import type { EvalScenario } from './types'
|
|
2
|
+
|
|
3
|
+
/**
 * Built-in evaluation scenarios.
 *
 * Each scenario pairs a user message with weighted scoring criteria. The
 * `expected` field is interpreted per evaluator by the scorer:
 * - `contains`  — case-insensitive substring of the response text
 * - `regex`     — pattern compiled with the `i` flag and tested against the response text
 * - `tool_used` — exact name of a tool that must appear in the tool events
 * - `llm_judge` — natural-language criterion handed to the judge model
 *
 * Deterministic patterns use word boundaries / explicit units so that they
 * cannot be satisfied by incidental text (e.g. the digit 2 inside "2024",
 * "next" in "next step", or "3 forecasts" read as a temperature).
 */
export const EVAL_SCENARIOS: EvalScenario[] = [
  {
    id: 'coding-prime',
    name: 'Prime Number Function',
    category: 'coding',
    description: 'Create and test a function that checks if a number is prime',
    userMessage: 'Create a function that checks if a number is prime and test it with a few examples including 2, 7, 10, and 97.',
    expectedBehaviors: [
      'Writes a correct isPrime function',
      'Tests with the specified numbers',
      'Returns correct results for each test case',
    ],
    scoringCriteria: [
      { name: 'uses_shell', weight: 2, evaluator: 'tool_used', expected: 'shell' },
      { name: 'uses_files', weight: 2, evaluator: 'tool_used', expected: 'files' },
      { name: 'mentions_prime', weight: 1, evaluator: 'contains', expected: 'prime' },
      // Standalone numbers only: a plain substring check would also accept "12" or "197".
      { name: 'tests_number_2', weight: 1, evaluator: 'regex', expected: '\\b2\\b' },
      { name: 'tests_number_97', weight: 1, evaluator: 'regex', expected: '\\b97\\b' },
      { name: 'correctness', weight: 3, evaluator: 'llm_judge', expected: 'Did the response correctly implement an isPrime function and test it with 2, 7, 10, and 97, producing correct results (2=prime, 7=prime, 10=not prime, 97=prime)?' },
    ],
    timeoutMs: 60_000,
    tools: ['shell', 'files'],
  },
  {
    id: 'research-frameworks',
    name: 'Node.js Framework Comparison',
    category: 'research',
    description: 'Research and compare top Node.js web frameworks by GitHub stars',
    userMessage: 'Find the top 3 Node.js web frameworks by GitHub stars and compare them. Include star counts and key differences.',
    expectedBehaviors: [
      'Searches the web for framework information',
      'Identifies at least 3 frameworks (e.g. Express, Next.js, Fastify)',
      'Provides star counts and meaningful comparison',
    ],
    scoringCriteria: [
      { name: 'uses_web_search', weight: 2, evaluator: 'tool_used', expected: 'web_search' },
      // Word boundaries keep "expression" / "next step" from counting as framework mentions.
      { name: 'mentions_express', weight: 1, evaluator: 'regex', expected: '\\bexpress(\\.?js)?\\b' },
      { name: 'mentions_next', weight: 1, evaluator: 'regex', expected: '\\bnext\\.?js\\b' },
      { name: 'includes_stars', weight: 1, evaluator: 'regex', expected: '\\d+[kK,.]?\\d*\\s*(stars|\\*)' },
      { name: 'quality', weight: 5, evaluator: 'llm_judge', expected: 'Did the response identify at least 3 Node.js web frameworks, include GitHub star counts, and provide a meaningful comparison of their key differences?' },
    ],
    timeoutMs: 90_000,
    tools: ['web_search', 'web_fetch'],
  },
  {
    id: 'companionship-overwhelmed',
    name: 'Emotional Support - Overwhelmed',
    category: 'companionship',
    description: 'Provide empathetic support when user is overwhelmed with work',
    userMessage: "I'm feeling overwhelmed with work deadlines. Everything is piling up and I don't know where to start.",
    expectedBehaviors: [
      'Acknowledges the feeling of being overwhelmed',
      'Shows empathy without being dismissive',
      'Offers actionable suggestions for managing workload',
    ],
    scoringCriteria: [
      { name: 'empathy', weight: 4, evaluator: 'llm_judge', expected: 'Does the response show genuine empathy and acknowledge the user\'s feelings of being overwhelmed without being dismissive or jumping straight to advice?' },
      { name: 'actionable_advice', weight: 3, evaluator: 'llm_judge', expected: 'Does the response offer practical, actionable suggestions for managing workload or reducing the feeling of being overwhelmed (e.g. prioritization, breaking tasks down, time management)?' },
      { name: 'appropriate_tone', weight: 3, evaluator: 'llm_judge', expected: 'Is the tone warm, supportive, and human-like rather than clinical, robotic, or overly formal?' },
    ],
    timeoutMs: 30_000,
    tools: [],
  },
  {
    id: 'multi-step-project',
    name: 'Project Directory Setup',
    category: 'multi-step',
    description: 'Create a project directory, write a README, and initialize git',
    userMessage: "Create a project directory called 'demo-project' in /tmp, write a README.md with a title and description, and initialize a git repository in it.",
    expectedBehaviors: [
      'Creates the demo-project directory',
      'Writes a README.md file with content',
      'Initializes a git repository',
    ],
    scoringCriteria: [
      { name: 'uses_shell', weight: 2, evaluator: 'tool_used', expected: 'shell' },
      { name: 'uses_files', weight: 2, evaluator: 'tool_used', expected: 'files' },
      { name: 'mentions_mkdir', weight: 1, evaluator: 'regex', expected: 'demo-project' },
      { name: 'mentions_readme', weight: 1, evaluator: 'contains', expected: 'README' },
      { name: 'mentions_git_init', weight: 1, evaluator: 'contains', expected: 'git init' },
      { name: 'completeness', weight: 3, evaluator: 'llm_judge', expected: 'Did the response successfully complete all 3 steps: create the demo-project directory, write a README.md with meaningful content, and initialize a git repository?' },
    ],
    timeoutMs: 60_000,
    tools: ['shell', 'files'],
  },
  {
    id: 'memory-store-recall',
    name: 'Memory Store and Recall',
    category: 'memory',
    description: 'Store a fact in memory and demonstrate recall capability',
    userMessage: 'Remember that my favorite programming language is Rust and I prefer functional programming patterns. Then confirm what you just stored.',
    expectedBehaviors: [
      'Uses memory tool to store the information',
      'Confirms what was stored',
      'Accurately reflects the stored preferences',
    ],
    scoringCriteria: [
      { name: 'uses_memory', weight: 3, evaluator: 'tool_used', expected: 'memory' },
      { name: 'mentions_rust', weight: 2, evaluator: 'contains', expected: 'Rust' },
      { name: 'mentions_functional', weight: 2, evaluator: 'contains', expected: 'functional' },
      { name: 'confirmation', weight: 3, evaluator: 'llm_judge', expected: 'Did the response confirm storing the user\'s preference for Rust and functional programming, and accurately summarize what was stored?' },
    ],
    timeoutMs: 30_000,
    tools: ['memory'],
  },
  {
    id: 'planning-blog',
    name: 'Blog Platform Planning',
    category: 'planning',
    description: 'Create a detailed plan for building a blog platform',
    userMessage: 'Build me a detailed plan for a blog platform with posts, comments, and user authentication. Break it into tasks I can work through.',
    expectedBehaviors: [
      'Creates structured tasks or a plan',
      'Covers posts, comments, and authentication',
      'Breaks work into manageable pieces',
    ],
    scoringCriteria: [
      { name: 'uses_tasks', weight: 2, evaluator: 'tool_used', expected: 'manage_tasks' },
      { name: 'mentions_posts', weight: 1, evaluator: 'contains', expected: 'post' },
      { name: 'mentions_comments', weight: 1, evaluator: 'contains', expected: 'comment' },
      { name: 'mentions_auth', weight: 1, evaluator: 'regex', expected: 'auth(entication|orization)?' },
      { name: 'plan_quality', weight: 5, evaluator: 'llm_judge', expected: 'Is the plan well-structured with clear, actionable tasks that cover the three main features (posts, comments, user auth)? Are tasks broken into manageable pieces with logical ordering?' },
    ],
    timeoutMs: 60_000,
    tools: ['manage_tasks'],
  },
  {
    id: 'tool-usage-weather',
    name: 'Web Search - Weather',
    category: 'tool-usage',
    description: 'Search the web for current weather information',
    userMessage: 'Search the web for today\'s weather in London and tell me the temperature and conditions.',
    expectedBehaviors: [
      'Uses web search tool',
      'Reports temperature',
      'Reports weather conditions',
    ],
    scoringCriteria: [
      { name: 'uses_web_search', weight: 3, evaluator: 'tool_used', expected: 'web_search' },
      // Require a real unit (C/F as a standalone letter, or the spelled-out scale) so that
      // "3 forecasts" or "2 cloudy days" are not mistaken for a temperature reading.
      { name: 'mentions_temperature', weight: 2, evaluator: 'regex', expected: '\\d+\\s*(?:°|degrees?)?\\s*(?:[CF]\\b|celsius|fahrenheit)' },
      { name: 'mentions_london', weight: 1, evaluator: 'contains', expected: 'London' },
      { name: 'quality', weight: 4, evaluator: 'llm_judge', expected: 'Did the response provide specific, current weather information for London including temperature and conditions (e.g. sunny, cloudy, rain)?' },
    ],
    timeoutMs: 60_000,
    tools: ['web_search'],
  },
  {
    id: 'coding-fizzbuzz',
    name: 'FizzBuzz Implementation',
    category: 'coding',
    description: 'Write and run a FizzBuzz implementation in Python',
    userMessage: 'Write a FizzBuzz implementation in Python that prints numbers 1 to 30 and run it.',
    expectedBehaviors: [
      'Writes correct FizzBuzz logic',
      'Runs the code successfully',
      'Output contains Fizz, Buzz, and FizzBuzz',
    ],
    scoringCriteria: [
      { name: 'uses_shell', weight: 2, evaluator: 'tool_used', expected: 'shell' },
      { name: 'uses_files', weight: 2, evaluator: 'tool_used', expected: 'files' },
      { name: 'contains_fizz', weight: 1, evaluator: 'contains', expected: 'Fizz' },
      { name: 'contains_buzz', weight: 1, evaluator: 'contains', expected: 'Buzz' },
      { name: 'contains_fizzbuzz', weight: 1, evaluator: 'contains', expected: 'FizzBuzz' },
      { name: 'correctness', weight: 3, evaluator: 'llm_judge', expected: 'Did the response implement FizzBuzz correctly (multiples of 3 print Fizz, multiples of 5 print Buzz, multiples of both print FizzBuzz) and successfully execute it?' },
    ],
    timeoutMs: 60_000,
    tools: ['shell', 'files'],
  },
  {
    id: 'research-comparison',
    name: 'LLM Pricing Comparison',
    category: 'research',
    description: 'Compare pricing of major LLM models',
    userMessage: 'Compare the pricing of OpenAI GPT-4o and Anthropic Claude 3.5 Sonnet. Include input and output token costs.',
    expectedBehaviors: [
      'Searches for current pricing',
      'Includes both models',
      'Reports input and output token costs',
    ],
    scoringCriteria: [
      { name: 'uses_web_search', weight: 2, evaluator: 'tool_used', expected: 'web_search' },
      { name: 'mentions_gpt4o', weight: 1, evaluator: 'regex', expected: 'GPT-?4[oO]' },
      { name: 'mentions_claude', weight: 1, evaluator: 'regex', expected: 'Claude\\s*3\\.?5' },
      { name: 'mentions_pricing', weight: 1, evaluator: 'regex', expected: '\\$\\d+' },
      { name: 'quality', weight: 5, evaluator: 'llm_judge', expected: 'Did the response provide accurate and specific pricing for both GPT-4o and Claude 3.5 Sonnet, including input and output token costs, with a clear comparison?' },
    ],
    timeoutMs: 90_000,
    tools: ['web_search', 'web_fetch'],
  },
  {
    id: 'multi-step-analyze',
    name: 'Package.json Analysis',
    category: 'multi-step',
    description: 'Read and analyze the current project\'s package.json',
    userMessage: 'Read the package.json in the current directory and list all dependencies. Group them into regular dependencies and dev dependencies.',
    expectedBehaviors: [
      'Reads package.json using shell or files tool',
      'Lists regular dependencies',
      'Lists dev dependencies',
      'Groups them clearly',
    ],
    scoringCriteria: [
      // NOTE(review): despite its name this criterion only credits the `shell` tool; an agent
      // that reads the file through `files` scores 0 here. `tool_used` matches one exact tool
      // name, so an either/or check needs scorer support before this can be widened.
      { name: 'uses_shell_or_files', weight: 2, evaluator: 'tool_used', expected: 'shell' },
      { name: 'mentions_dependencies', weight: 1, evaluator: 'contains', expected: 'dependencies' },
      { name: 'mentions_dev_deps', weight: 1, evaluator: 'regex', expected: 'dev[Dd]ependencies|dev dependencies' },
      { name: 'mentions_package_json', weight: 1, evaluator: 'contains', expected: 'package.json' },
      { name: 'quality', weight: 5, evaluator: 'llm_judge', expected: 'Did the response successfully read package.json, list the dependencies, and clearly group them into regular and dev dependencies?' },
    ],
    timeoutMs: 60_000,
    tools: ['shell', 'files'],
  },
]
|
|
215
|
+
|
|
216
|
+
/**
 * Look up a built-in scenario by its `id`.
 *
 * @param id - Scenario identifier, e.g. `'coding-prime'`.
 * @returns The first scenario in `EVAL_SCENARIOS` with that id, or `undefined` when none matches.
 */
export function getScenario(id: string): EvalScenario | undefined {
  for (const scenario of EVAL_SCENARIOS) {
    if (scenario.id === id) return scenario
  }
  return undefined
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import type { ScoringCriterion, EvalCriterionResult } from './types'
|
|
2
|
+
import type { MessageToolEvent } from '@/types'
|
|
3
|
+
|
|
4
|
+
/** Neutral score (out of 10) applied when the judge reply contains no usable number. */
const JUDGE_FALLBACK_SCORE = 5

/** Flatten a chat-model `content` value (plain string or array of content parts) into text. */
function judgeReplyText(content: unknown): string {
  if (typeof content === 'string') return content
  if (!Array.isArray(content)) return ''
  return content
    .map((part: unknown) => {
      if (typeof part === 'string') return part
      if (part !== null && typeof part === 'object' && 'text' in part) {
        const text = (part as { text?: unknown }).text
        return typeof text === 'string' ? text : ''
      }
      return ''
    })
    .join('')
}

/** Extract a 0-10 score from a judge reply; returns `null` when no unambiguous number is present. */
function parseJudgeScore(reply: string): number | null {
  const text = reply.trim()
  // Preferred: the reply starts with the number, as the prompt requests.
  const leading = /^-?\d+(?:\.\d+)?/.exec(text)
  // Otherwise accept an explicit "N/10" or "N out of 10".
  const ratio = /(\d+(?:\.\d+)?)\s*(?:\/|out of)\s*10\b/i.exec(text)
  // Last resort: a single number anywhere (e.g. "Score: 8"); several numbers are ambiguous.
  const numbers = text.match(/-?\d+(?:\.\d+)?/g) ?? []
  const raw = leading?.[0] ?? ratio?.[1] ?? (numbers.length === 1 ? numbers[0] : undefined)
  if (raw === undefined) return null
  const value = Number.parseFloat(raw)
  return Number.isFinite(value) ? Math.max(0, Math.min(10, value)) : null
}

/**
 * Score a response against a list of weighted criteria.
 *
 * Exactly one result is produced per criterion, in input order, so that the
 * sum of `maxScore` always equals the sum of the criteria weights:
 * - `contains`  — full weight when `expected` is a case-insensitive substring of `responseText`.
 * - `regex`     — full weight when `expected` (compiled with the `i` flag) matches; an invalid
 *                 pattern scores 0 and is reported in `evidence` instead of throwing.
 * - `tool_used` — full weight when some tool event has `name === expected`.
 * - `llm_judge` — asks the judge model for a 0-10 rating and scales it by the weight. Scores 0
 *                 when `judgeOpts` is missing or the judge call fails; a reply with no usable
 *                 number falls back to a neutral 5/10 and says so in `evidence`.
 * - anything else — scores 0 with an "Unknown evaluator" evidence line.
 *
 * @param criteria - Criteria to evaluate.
 * @param responseText - The agent's final response text.
 * @param toolEvents - Tool events recorded during the run; only `name` is inspected.
 * @param judgeOpts - Provider/model settings for `llm_judge` criteria; omit to skip judging.
 * @returns One `EvalCriterionResult` per criterion.
 */
export async function scoreCriteria(
  criteria: ScoringCriterion[],
  responseText: string,
  toolEvents: MessageToolEvent[],
  judgeOpts?: { provider: string; model: string; apiKey: string | null; apiEndpoint?: string | null },
): Promise<EvalCriterionResult[]> {
  const results: EvalCriterionResult[] = []

  for (const criterion of criteria) {
    const { name, weight, expected } = criterion
    // Captured as a plain string so the fallback branch can report values outside the union.
    const evaluator: string = criterion.evaluator
    const record = (score: number, evidence: string): void => {
      results.push({ criterion: name, score, maxScore: weight, evidence })
    }

    switch (criterion.evaluator) {
      case 'contains': {
        const found = responseText.toLowerCase().includes(expected.toLowerCase())
        record(
          found ? weight : 0,
          found ? `Found "${expected}" in response` : `"${expected}" not found in response`,
        )
        break
      }

      case 'regex': {
        let pattern: RegExp
        try {
          pattern = new RegExp(expected, 'i')
        } catch (err: unknown) {
          // A malformed pattern is a scenario authoring error; do not abort the whole run for it.
          record(0, `Invalid pattern /${expected}/i: ${err instanceof Error ? err.message : String(err)}`)
          break
        }
        const matched = pattern.test(responseText)
        record(
          matched ? weight : 0,
          matched ? `Pattern /${expected}/i matched` : `Pattern /${expected}/i did not match`,
        )
        break
      }

      case 'tool_used': {
        const used = toolEvents.some(e => e.name === expected)
        record(
          used ? weight : 0,
          used ? `Tool "${expected}" was used` : `Tool "${expected}" was not used`,
        )
        break
      }

      case 'llm_judge': {
        if (!judgeOpts) {
          record(0, 'No judge provider configured; skipped')
          break
        }

        try {
          const { buildChatModel } = await import('../build-llm')
          const { HumanMessage } = await import('@langchain/core/messages')

          const llm = buildChatModel({
            provider: judgeOpts.provider,
            model: judgeOpts.model,
            apiKey: judgeOpts.apiKey,
            apiEndpoint: judgeOpts.apiEndpoint,
          })

          const judgePrompt = `Rate the following AI response on a scale of 0-10.\n\nCriterion: ${expected}\n\nResponse:\n${responseText}\n\nReply with ONLY a number 0-10.`
          const result = await llm.invoke([new HumanMessage(judgePrompt)])
          const parsed = parseJudgeScore(judgeReplyText(result.content))
          const rawScore = parsed ?? JUDGE_FALLBACK_SCORE

          record(
            (rawScore / 10) * weight,
            parsed === null
              ? `LLM judge: ${rawScore}/10 (reply was not a number; neutral default applied)`
              : `LLM judge: ${rawScore}/10`,
          )
        } catch (err: unknown) {
          record(0, `LLM judge error: ${err instanceof Error ? err.message : String(err)}`)
        }
        break
      }

      default: {
        // Reachable only with untyped/stored data; still emit a row so totals stay consistent.
        record(0, `Unknown evaluator "${evaluator}"; skipped`)
        break
      }
    }
  }

  return results
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import Database from 'better-sqlite3'
import fs from 'fs'
import path from 'path'
import type { EvalRun } from './types'
|
|
4
|
+
|
|
5
|
+
const DB_PATH = path.join(process.cwd(), 'data', 'eval-runs.db')
|
|
6
|
+
|
|
7
|
+
let db: Database.Database | null = null
|
|
8
|
+
|
|
9
|
+
/**
 * Return the shared eval-runs database handle, opening it on first use.
 *
 * On first call this creates the parent directory of `DB_PATH` if needed,
 * opens the database, switches it to WAL journaling and ensures the
 * `eval_runs` table exists. The handle is cached only after all of that
 * succeeds, so a failed initialisation is retried on the next call instead
 * of leaving a half-initialised connection behind.
 *
 * @returns The open database connection.
 * @throws Whatever the filesystem or SQLite layer throws while opening or initialising.
 */
function getDb(): Database.Database {
  if (db) return db

  // The database cannot be opened inside a directory that does not exist yet.
  fs.mkdirSync(path.dirname(DB_PATH), { recursive: true })

  const conn = new Database(DB_PATH)
  try {
    conn.pragma('journal_mode = WAL')
    conn.exec(`CREATE TABLE IF NOT EXISTS eval_runs (
      id TEXT PRIMARY KEY,
      data TEXT NOT NULL
    )`)
  } catch (err: unknown) {
    // Do not leak the connection or cache it without its schema.
    conn.close()
    throw err
  }

  db = conn
  return conn
}
|
|
20
|
+
|
|
21
|
+
/**
 * Persist an eval run, replacing any stored row that has the same `id`.
 *
 * The whole run object is stored as a JSON string in the `data` column.
 *
 * @param run - Run to insert or overwrite.
 */
export function saveEvalRun(run: EvalRun): void {
  const upsert = getDb().prepare('INSERT OR REPLACE INTO eval_runs (id, data) VALUES (?, ?)')
  const payload = JSON.stringify(run)
  upsert.run(run.id, payload)
}
|
|
24
|
+
|
|
25
|
+
/**
 * Load a single eval run by id.
 *
 * @param id - Run identifier.
 * @returns The parsed run, or `null` when no row has that id.
 */
export function getEvalRun(id: string): EvalRun | null {
  const lookup = getDb().prepare('SELECT data FROM eval_runs WHERE id = ?')
  const row = lookup.get(id) as { data: string } | undefined
  if (!row) return null
  return JSON.parse(row.data) as EvalRun
}
|
|
29
|
+
|
|
30
|
+
/**
 * List stored eval runs in descending `rowid` order.
 *
 * @param limit - Maximum number of rows to read (default 50).
 * @returns The parsed runs, highest `rowid` first.
 */
export function listEvalRuns(limit = 50): EvalRun[] {
  const query = getDb().prepare('SELECT data FROM eval_runs ORDER BY rowid DESC LIMIT ?')
  const rows = query.all(limit) as { data: string }[]
  const runs: EvalRun[] = []
  for (const { data } of rows) {
    runs.push(JSON.parse(data) as EvalRun)
  }
  return runs
}
|
|
34
|
+
|
|
35
|
+
/**
 * List the stored eval runs that belong to one agent, in descending `rowid` order.
 *
 * The agent filter is applied inside SQLite (via `json_extract` on the stored
 * JSON) before the limit, so an agent's runs are still returned when many
 * newer runs from other agents exist. Filtering a fixed-size window of recent
 * rows in memory would silently drop them.
 *
 * @param agentId - Value of the run's `agentId` field to match.
 * @param limit - Maximum number of runs to return (default 50).
 * @returns Up to `limit` parsed runs for the agent, highest `rowid` first.
 */
export function listEvalRunsByAgent(agentId: string, limit = 50): EvalRun[] {
  const rows = getDb()
    .prepare("SELECT data FROM eval_runs WHERE json_extract(data, '$.agentId') = ? ORDER BY rowid DESC LIMIT ?")
    .all(agentId, limit) as { data: string }[]
  return rows.map(row => JSON.parse(row.data) as EvalRun)
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/** One weighted check applied to an agent response during an eval run. */
export interface ScoringCriterion {
  /** Identifier of the criterion; copied into the matching result row. */
  name: string
  /** Points this criterion is worth; reported as the result's `maxScore`. */
  weight: number
  /** Strategy used to evaluate the criterion. */
  evaluator:
    | 'contains'
    | 'regex'
    | 'tool_used'
    | 'llm_judge'
  /**
   * Evaluator-specific target: a substring (`contains`), a regular-expression
   * source (`regex`), a tool name (`tool_used`) or the question put to the
   * judge model (`llm_judge`).
   */
  expected: string
}
|
|
7
|
+
|
|
8
|
+
/** A self-contained evaluation scenario: a prompt plus the criteria used to score the reply. */
export interface EvalScenario {
  /** Unique scenario identifier used for lookups. */
  id: string
  /** Short human-readable title. */
  name: string
  /** Broad capability area the scenario exercises. */
  category:
    | 'coding'
    | 'research'
    | 'companionship'
    | 'multi-step'
    | 'memory'
    | 'planning'
    | 'tool-usage'
    | 'long-lived'
  /** One-line summary of what the scenario checks. */
  description: string
  /** The user message given to the agent. */
  userMessage: string
  /** Plain-language list of behaviours a good response should show. */
  expectedBehaviors: Array<string>
  /** Weighted checks applied to the response. */
  scoringCriteria: Array<ScoringCriterion>
  /** Time budget for the scenario — presumably milliseconds, per the name; confirm against the runner. */
  timeoutMs: number
  /** Tool names associated with the scenario — presumably the tools enabled for the run; confirm against the runner. */
  tools: Array<string>
}
|
|
19
|
+
|
|
20
|
+
/** Record of one scenario execution for one agent. */
export interface EvalRun {
  /** Unique run identifier; used as the storage key. */
  id: string
  /** Id of the scenario that was executed. */
  scenarioId: string
  /** Id of the agent that was evaluated. */
  agentId: string
  /** Lifecycle state of the run. */
  status:
    | 'pending'
    | 'running'
    | 'completed'
    | 'failed'
  /** Start timestamp — presumably epoch milliseconds; confirm against the runner. */
  startedAt: number
  /** End timestamp, absent until the run has finished — same unit as `startedAt`. */
  endedAt?: number
  /** Total points earned across all criteria. */
  score: number
  /** Total points available across all criteria. */
  maxScore: number
  /** Per-criterion breakdown. */
  details: Array<EvalCriterionResult>
  /** Id of the chat session used for the run, when one was recorded. */
  sessionId?: string
  /** Error message, when one was recorded for the run. */
  error?: string
}
|
|
33
|
+
|
|
34
|
+
/** Outcome of evaluating a single scoring criterion. */
export interface EvalCriterionResult {
  /** Name of the criterion this result belongs to. */
  criterion: string
  /** Points awarded, between 0 and `maxScore`. */
  score: number
  /** Points available for this criterion (the criterion's weight). */
  maxScore: number
  /** Optional human-readable explanation of how the score was reached. */
  evidence?: string
}
|
|
40
|
+
|
|
41
|
+
/** Aggregate outcome of running a set of scenarios against one agent. */
export interface EvalSuiteResult {
  /** Id of the agent that was evaluated. */
  agentId: string
  /** Points earned across the suite — presumably summed over `runs`; confirm against the runner. */
  totalScore: number
  /** Points available across the suite. */
  maxScore: number
  /** Suite score as a percentage — presumably `totalScore` relative to `maxScore`; confirm scale against the producer. */
  percentage: number
  /** The individual scenario runs that make up the suite. */
  runs: Array<EvalRun>
  /** Completion timestamp — presumably epoch milliseconds; confirm against the runner. */
  completedAt: number
}
|
|
@@ -8,14 +8,18 @@ import { genId } from '@/lib/id'
|
|
|
8
8
|
// ---------------------------------------------------------------------------
|
|
9
9
|
|
|
10
10
|
export type LogCategory =
  // what kicked off the action
  | 'trigger'
  // reasoning / model choice
  | 'decision'
  // tool invocation with input
  | 'tool_call'
  // tool output
  | 'tool_result'
  // messages sent to users/platforms
  | 'outbound'
  // file read/write/delete with checksums
  | 'file_op'
  // git commit activity
  | 'commit'
  // errors during execution
  | 'error'
  // new mission/goal started
  | 'mission_start'
  // periodic mission state snapshot
  | 'mission_checkpoint'
  // mission reached ok status
  | 'mission_complete'
  // mission approaching or exceeding budget
  | 'budget_warning'
|
|
19
23
|
|
|
20
24
|
export interface ExecutionLogEntry {
|
|
21
25
|
id: string
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import { execSync } from 'child_process'
|
|
2
|
+
import path from 'path'
|
|
3
|
+
import fs from 'fs'
|
|
4
|
+
|
|
5
|
+
/**
 * OpenClaw Guardian — Auto-Rollback capability.
 *
 * Intended for the case where an agent fails a task critically and has autoRecovery
 * enabled: the workspace is rolled back to the last known good state, i.e. `HEAD`.
 *
 * The rollback is destructive: `git reset --hard HEAD` discards tracked changes and
 * `git clean -fd` deletes untracked files and directories.
 *
 * @param cwd Workspace directory. It must directly contain a `.git` entry; a directory
 *   nested inside a larger repository is rejected rather than resetting the parent repo.
 * @returns `{ ok: true }` once both git commands have run. Otherwise `{ ok: false, reason }`
 *   when the directory has no `.git`, when `git status --porcelain` reports nothing, or
 *   when any step throws (the error is also logged).
 */
export function performGuardianRollback(cwd: string): { ok: boolean; reason?: string } {
  const refuse = (reason: string): { ok: boolean; reason?: string } => ({ ok: false, reason })
  const git = (command: string): string => execSync(command, { cwd, encoding: 'utf8' })

  try {
    const hasGitEntry = fs.existsSync(path.join(cwd, '.git'))
    if (!hasGitEntry) {
      return refuse('Workspace is not a git repository. Cannot rollback.')
    }

    // Empty porcelain output means there are no tracked or untracked changes to undo.
    const pendingChanges = git('git status --porcelain').trim()
    if (pendingChanges.length === 0) {
      return refuse('Workspace is clean. Nothing to rollback.')
    }

    console.log(`[guardian] Auto-recovery triggered in ${cwd}. Rolling back changes...`)

    // Order matters: restore tracked files first, then remove whatever is left untracked.
    for (const command of ['git reset --hard HEAD', 'git clean -fd']) {
      git(command)
    }
    return { ok: true }
  } catch (err: unknown) {
    console.error('[guardian] Auto-rollback failed:', err)
    const detail = err instanceof Error ? err.message : String(err)
    return refuse(`Git operation failed: ${detail}`)
  }
}
|
|
@@ -132,6 +132,45 @@ function readHeartbeatFile(session: any): string {
|
|
|
132
132
|
return ''
|
|
133
133
|
}
|
|
134
134
|
|
|
135
|
+
/**
 * Read `IDENTITY.md` and turn its `label: value` lines into a lookup table.
 *
 * The file is looked up in `session.cwd` when that is a string, otherwise in `WORKSPACE_DIR`.
 * Each line is trimmed and a leading list dash is dropped; the text before the first `:`
 * becomes the key (with `*`/`_` removed, lower-cased) and the text after it becomes the
 * value (with leading/trailing `*`/`_` runs removed). Lines without a colon or with an
 * empty value are skipped; a repeated key keeps its last value.
 *
 * @param session Session-like record; only `cwd` is read.
 * @returns The parsed pairs, or an empty object when the file is missing or anything throws.
 */
function readIdentityFile(session: Record<string, unknown>): Record<string, string> {
  try {
    const baseDir = typeof session.cwd === 'string' ? session.cwd : WORKSPACE_DIR
    const identityPath = path.join(baseDir, 'IDENTITY.md')
    if (!fs.existsSync(identityPath)) return {}

    const parsed: Record<string, string> = {}
    fs.readFileSync(identityPath, 'utf-8').split('\n').forEach((rawLine) => {
      const entry = rawLine.trim().replace(/^\s*-\s*/, '')
      const separatorAt = entry.indexOf(':')
      if (separatorAt === -1) return

      // Markdown emphasis markers are stripped so `**Name:** Foo` yields name → Foo.
      const key = entry.slice(0, separatorAt).replace(/[*_]/g, '').trim().toLowerCase()
      const text = entry.slice(separatorAt + 1).replace(/^[*_]+|[*_]+$/g, '').trim()
      if (text) parsed[key] = text
    })
    return parsed
  } catch {
    // Best-effort read: an unreadable identity file is treated the same as a missing one.
    return {}
  }
}
|
|
154
|
+
|
|
155
|
+
/**
 * Build the "Your Identity" prompt section for an agent.
 *
 * For each of name, emoji, creature, vibe and theme, the value parsed from the session's
 * `IDENTITY.md` wins; the same-named field on `agent` is the fallback. Fields that end up
 * falsy are left out.
 *
 * @param session Session record used to locate `IDENTITY.md`; when null/undefined the file is not read.
 * @param agent Agent record supplying fallback values; may be null/undefined.
 * @returns A `## Your Identity` heading followed by one `Label: value` line per field, or `''` when no field has a value.
 */
export function buildIdentityContext(session: Record<string, unknown> | undefined | null, agent: Record<string, unknown> | undefined | null): string {
  const fromFile: Record<string, string> = session ? readIdentityFile(session) : {}

  // Output order is fixed by this table.
  const fields = [
    ['Name', 'name'],
    ['Emoji', 'emoji'],
    ['Creature', 'creature'],
    ['Vibe', 'vibe'],
    ['Theme', 'theme'],
  ] as const

  const rendered: string[] = []
  for (const [label, key] of fields) {
    const value = fromFile[key] || agent?.[key] || ''
    if (value) rendered.push(`${label}: ${value}`)
  }

  return rendered.length === 0 ? '' : `## Your Identity\n${rendered.join('\n')}`
}
|
|
173
|
+
|
|
135
174
|
/** Detect HEARTBEAT.md files that contain only skeleton structure (headers, empty list items) but no real content. */
|
|
136
175
|
export function isHeartbeatContentEffectivelyEmpty(content: string | undefined | null): boolean {
|
|
137
176
|
if (!content || typeof content !== 'string') return true
|
|
@@ -148,6 +187,7 @@ export function isHeartbeatContentEffectivelyEmpty(content: string | undefined |
|
|
|
148
187
|
function buildAgentHeartbeatPrompt(session: any, agent: any, fallbackPrompt: string, heartbeatFileContent: string): string {
|
|
149
188
|
if (!agent) return fallbackPrompt
|
|
150
189
|
|
|
190
|
+
const identityContext = buildIdentityContext(session, agent)
|
|
151
191
|
// Drain system events accumulated since last heartbeat
|
|
152
192
|
const events = drainSystemEvents(session.id)
|
|
153
193
|
const eventBlock = events.length > 0
|
|
@@ -178,7 +218,7 @@ function buildAgentHeartbeatPrompt(session: any, agent: any, fallbackPrompt: str
|
|
|
178
218
|
return [
|
|
179
219
|
'AGENT_HEARTBEAT_TICK',
|
|
180
220
|
`Time: ${new Date().toISOString()}`,
|
|
181
|
-
|
|
221
|
+
identityContext,
|
|
182
222
|
description ? `Description: ${description}` : '',
|
|
183
223
|
eventBlock ? `Events since last heartbeat:\n${eventBlock}` : '',
|
|
184
224
|
dynamicGoal
|
|
@@ -202,6 +242,14 @@ function buildAgentHeartbeatPrompt(session: any, agent: any, fallbackPrompt: str
|
|
|
202
242
|
].filter(Boolean).join('\n')
|
|
203
243
|
}
|
|
204
244
|
|
|
245
|
+
/**
 * Scale a heartbeat interval by the session's momentum score.
 *
 * A score of 80 or more halves the interval, a score below 40 doubles it, and anything
 * else leaves it unchanged. The rounded result is then clamped to the range 30–7200,
 * which also applies when the multiplier is 1.
 *
 * @param intervalSec Base interval in seconds.
 * @param momentumScore Momentum score compared against the 40 and 80 thresholds.
 * @returns The adjusted interval in seconds, between 30 and 7200 inclusive.
 */
function applyMomentumMultiplier(intervalSec: number, momentumScore: number): number {
  const floorSec = 30
  const ceilingSec = 7200

  const factor = momentumScore >= 80 ? 0.5 : momentumScore < 40 ? 2.0 : 1.0
  const scaled = Math.round(intervalSec * factor)

  if (scaled < floorSec) return floorSec
  if (scaled > ceilingSec) return ceilingSec
  return scaled
}
|
|
252
|
+
|
|
205
253
|
function resolveInterval(obj: Record<string, any>, currentSec: number): number {
|
|
206
254
|
// Prefer heartbeatInterval (duration string) over heartbeatIntervalSec (raw number)
|
|
207
255
|
if (obj.heartbeatInterval !== undefined && obj.heartbeatInterval !== null) {
|
|
@@ -347,6 +395,10 @@ async function tickHeartbeats() {
|
|
|
347
395
|
const cfg = heartbeatConfigForSession(session, settings, agents)
|
|
348
396
|
if (!cfg.enabled) continue
|
|
349
397
|
|
|
398
|
+
// Apply momentum-based multiplier to heartbeat interval
|
|
399
|
+
const momentumScore = session.mainLoopState?.momentumScore ?? 40
|
|
400
|
+
cfg.intervalSec = applyMomentumMultiplier(cfg.intervalSec, momentumScore)
|
|
401
|
+
|
|
350
402
|
// For sessions with explicit opt-in, use a shorter idle threshold (just intervalSec * 2).
|
|
351
403
|
// For inherited/global heartbeats, keep the 180s minimum to avoid noisy auto-fire.
|
|
352
404
|
const defaultIdleSec = explicitOptIn
|