@swarmclawai/swarmclaw 0.6.7 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. package/README.md +82 -39
  2. package/next.config.ts +31 -6
  3. package/package.json +3 -2
  4. package/src/app/api/agents/[id]/thread/route.ts +1 -0
  5. package/src/app/api/agents/route.ts +19 -5
  6. package/src/app/api/approvals/route.ts +22 -0
  7. package/src/app/api/chatrooms/[id]/chat/route.ts +4 -0
  8. package/src/app/api/clawhub/install/route.ts +2 -2
  9. package/src/app/api/eval/run/route.ts +37 -0
  10. package/src/app/api/eval/scenarios/route.ts +24 -0
  11. package/src/app/api/eval/suite/route.ts +29 -0
  12. package/src/app/api/mcp-servers/[id]/conformance/route.ts +26 -0
  13. package/src/app/api/mcp-servers/[id]/invoke/route.ts +81 -0
  14. package/src/app/api/memory/graph/route.ts +46 -0
  15. package/src/app/api/memory/route.ts +36 -5
  16. package/src/app/api/notifications/route.ts +3 -0
  17. package/src/app/api/plugins/install/route.ts +57 -5
  18. package/src/app/api/plugins/marketplace/route.ts +73 -22
  19. package/src/app/api/plugins/route.ts +61 -1
  20. package/src/app/api/plugins/ui/route.ts +34 -0
  21. package/src/app/api/sessions/[id]/checkpoints/route.ts +31 -0
  22. package/src/app/api/sessions/[id]/restore/route.ts +36 -0
  23. package/src/app/api/settings/route.ts +62 -0
  24. package/src/app/api/setup/doctor/route.ts +22 -5
  25. package/src/app/api/souls/[id]/route.ts +65 -0
  26. package/src/app/api/souls/route.ts +70 -0
  27. package/src/app/api/tasks/[id]/approve/route.ts +4 -3
  28. package/src/app/api/tasks/[id]/route.ts +16 -3
  29. package/src/app/api/tasks/route.ts +10 -2
  30. package/src/app/api/usage/route.ts +9 -2
  31. package/src/app/globals.css +27 -0
  32. package/src/app/page.tsx +10 -5
  33. package/src/cli/index.js +37 -0
  34. package/src/components/activity/activity-feed.tsx +9 -2
  35. package/src/components/agents/agent-avatar.tsx +5 -1
  36. package/src/components/agents/agent-card.tsx +55 -9
  37. package/src/components/agents/agent-sheet.tsx +112 -34
  38. package/src/components/agents/inspector-panel.tsx +1 -1
  39. package/src/components/agents/soul-library-picker.tsx +84 -13
  40. package/src/components/auth/access-key-gate.tsx +63 -54
  41. package/src/components/auth/user-picker.tsx +37 -32
  42. package/src/components/chat/activity-moment.tsx +2 -0
  43. package/src/components/chat/chat-area.tsx +11 -0
  44. package/src/components/chat/chat-header.tsx +69 -25
  45. package/src/components/chat/chat-tool-toggles.tsx +2 -2
  46. package/src/components/chat/checkpoint-timeline.tsx +112 -0
  47. package/src/components/chat/code-block.tsx +3 -1
  48. package/src/components/chat/exec-approval-card.tsx +8 -1
  49. package/src/components/chat/message-bubble.tsx +164 -4
  50. package/src/components/chat/message-list.tsx +46 -4
  51. package/src/components/chat/session-approval-card.tsx +80 -0
  52. package/src/components/chat/session-debug-panel.tsx +106 -84
  53. package/src/components/chat/streaming-bubble.tsx +6 -5
  54. package/src/components/chat/task-approval-card.tsx +78 -0
  55. package/src/components/chat/thinking-indicator.tsx +48 -12
  56. package/src/components/chat/tool-call-bubble.tsx +3 -0
  57. package/src/components/chat/tool-request-banner.tsx +39 -20
  58. package/src/components/chatrooms/chatroom-list.tsx +11 -4
  59. package/src/components/chatrooms/chatroom-sheet.tsx +7 -2
  60. package/src/components/connectors/connector-list.tsx +33 -11
  61. package/src/components/connectors/connector-sheet.tsx +37 -7
  62. package/src/components/home/home-view.tsx +54 -24
  63. package/src/components/input/chat-input.tsx +22 -1
  64. package/src/components/knowledge/knowledge-list.tsx +17 -18
  65. package/src/components/knowledge/knowledge-sheet.tsx +9 -5
  66. package/src/components/layout/app-layout.tsx +87 -19
  67. package/src/components/mcp-servers/mcp-server-list.tsx +352 -50
  68. package/src/components/mcp-servers/mcp-server-sheet.tsx +25 -9
  69. package/src/components/memory/memory-browser.tsx +73 -45
  70. package/src/components/memory/memory-graph-view.tsx +203 -0
  71. package/src/components/memory/memory-list.tsx +20 -13
  72. package/src/components/plugins/plugin-list.tsx +214 -60
  73. package/src/components/plugins/plugin-sheet.tsx +119 -24
  74. package/src/components/projects/project-list.tsx +17 -9
  75. package/src/components/providers/provider-list.tsx +21 -6
  76. package/src/components/providers/provider-sheet.tsx +42 -25
  77. package/src/components/runs/run-list.tsx +17 -13
  78. package/src/components/schedules/schedule-card.tsx +10 -3
  79. package/src/components/schedules/schedule-list.tsx +2 -2
  80. package/src/components/schedules/schedule-sheet.tsx +28 -9
  81. package/src/components/secrets/secret-sheet.tsx +7 -2
  82. package/src/components/secrets/secrets-list.tsx +18 -5
  83. package/src/components/sessions/new-session-sheet.tsx +183 -376
  84. package/src/components/sessions/session-card.tsx +10 -2
  85. package/src/components/settings/gateway-connection-panel.tsx +9 -8
  86. package/src/components/shared/command-palette.tsx +13 -5
  87. package/src/components/shared/empty-state.tsx +20 -8
  88. package/src/components/shared/hint-tip.tsx +31 -0
  89. package/src/components/shared/notification-center.tsx +134 -86
  90. package/src/components/shared/profile-sheet.tsx +4 -0
  91. package/src/components/shared/settings/plugin-manager.tsx +360 -135
  92. package/src/components/shared/settings/section-capability-policy.tsx +3 -3
  93. package/src/components/shared/settings/section-runtime-loop.tsx +149 -4
  94. package/src/components/skills/clawhub-browser.tsx +1 -0
  95. package/src/components/skills/skill-list.tsx +31 -12
  96. package/src/components/skills/skill-sheet.tsx +20 -7
  97. package/src/components/tasks/approvals-panel.tsx +224 -0
  98. package/src/components/tasks/task-board.tsx +20 -12
  99. package/src/components/tasks/task-card.tsx +21 -7
  100. package/src/components/tasks/task-column.tsx +4 -3
  101. package/src/components/tasks/task-list.tsx +1 -1
  102. package/src/components/tasks/task-sheet.tsx +130 -1
  103. package/src/components/ui/dialog.tsx +1 -0
  104. package/src/components/ui/sheet.tsx +1 -0
  105. package/src/components/usage/metrics-dashboard.tsx +72 -48
  106. package/src/components/wallets/wallet-panel.tsx +65 -41
  107. package/src/components/wallets/wallet-section.tsx +9 -3
  108. package/src/components/webhooks/webhook-list.tsx +21 -12
  109. package/src/components/webhooks/webhook-sheet.tsx +13 -3
  110. package/src/lib/approval-display.test.ts +45 -0
  111. package/src/lib/approval-display.ts +62 -0
  112. package/src/lib/clipboard.ts +38 -0
  113. package/src/lib/memory.ts +8 -0
  114. package/src/lib/providers/claude-cli.ts +5 -3
  115. package/src/lib/providers/index.ts +67 -21
  116. package/src/lib/runtime-loop.ts +3 -2
  117. package/src/lib/server/approvals.ts +150 -0
  118. package/src/lib/server/chat-execution.ts +319 -74
  119. package/src/lib/server/chatroom-helpers.ts +63 -5
  120. package/src/lib/server/chatroom-orchestration.ts +74 -0
  121. package/src/lib/server/clawhub-client.ts +82 -6
  122. package/src/lib/server/connectors/manager.ts +27 -1
  123. package/src/lib/server/context-manager.ts +132 -50
  124. package/src/lib/server/cost.test.ts +73 -0
  125. package/src/lib/server/cost.ts +165 -34
  126. package/src/lib/server/daemon-state.ts +112 -1
  127. package/src/lib/server/data-dir.ts +18 -1
  128. package/src/lib/server/eval/runner.ts +126 -0
  129. package/src/lib/server/eval/scenarios.ts +218 -0
  130. package/src/lib/server/eval/scorer.ts +96 -0
  131. package/src/lib/server/eval/store.ts +37 -0
  132. package/src/lib/server/eval/types.ts +48 -0
  133. package/src/lib/server/execution-log.ts +12 -8
  134. package/src/lib/server/guardian.ts +34 -0
  135. package/src/lib/server/heartbeat-service.ts +53 -1
  136. package/src/lib/server/integrity-monitor.ts +208 -0
  137. package/src/lib/server/langgraph-checkpoint.ts +10 -0
  138. package/src/lib/server/link-understanding.ts +55 -0
  139. package/src/lib/server/llm-response-cache.test.ts +102 -0
  140. package/src/lib/server/llm-response-cache.ts +227 -0
  141. package/src/lib/server/main-agent-loop.ts +115 -16
  142. package/src/lib/server/main-session.ts +6 -3
  143. package/src/lib/server/mcp-conformance.test.ts +18 -0
  144. package/src/lib/server/mcp-conformance.ts +233 -0
  145. package/src/lib/server/memory-db.ts +193 -19
  146. package/src/lib/server/memory-retrieval.test.ts +56 -0
  147. package/src/lib/server/mmr.ts +73 -0
  148. package/src/lib/server/orchestrator-lg.ts +7 -1
  149. package/src/lib/server/orchestrator.ts +4 -3
  150. package/src/lib/server/plugins.ts +662 -132
  151. package/src/lib/server/process-manager.ts +18 -0
  152. package/src/lib/server/query-expansion.ts +57 -0
  153. package/src/lib/server/queue.ts +280 -11
  154. package/src/lib/server/runtime-settings.ts +9 -0
  155. package/src/lib/server/session-run-manager.test.ts +23 -0
  156. package/src/lib/server/session-run-manager.ts +32 -2
  157. package/src/lib/server/session-tools/canvas.ts +85 -50
  158. package/src/lib/server/session-tools/chatroom.ts +130 -127
  159. package/src/lib/server/session-tools/connector.ts +233 -454
  160. package/src/lib/server/session-tools/context-mgmt.ts +87 -105
  161. package/src/lib/server/session-tools/crud.ts +84 -7
  162. package/src/lib/server/session-tools/delegate.ts +351 -752
  163. package/src/lib/server/session-tools/discovery.ts +198 -0
  164. package/src/lib/server/session-tools/edit_file.ts +82 -0
  165. package/src/lib/server/session-tools/file-send.test.ts +39 -0
  166. package/src/lib/server/session-tools/file.ts +257 -425
  167. package/src/lib/server/session-tools/git.ts +87 -47
  168. package/src/lib/server/session-tools/http.ts +95 -33
  169. package/src/lib/server/session-tools/index.ts +217 -138
  170. package/src/lib/server/session-tools/memory.ts +154 -239
  171. package/src/lib/server/session-tools/monitor.ts +126 -0
  172. package/src/lib/server/session-tools/normalize-tool-args.test.ts +61 -0
  173. package/src/lib/server/session-tools/normalize-tool-args.ts +48 -0
  174. package/src/lib/server/session-tools/openclaw-nodes.ts +82 -99
  175. package/src/lib/server/session-tools/openclaw-workspace.ts +103 -93
  176. package/src/lib/server/session-tools/platform.ts +86 -0
  177. package/src/lib/server/session-tools/plugin-creator.ts +239 -0
  178. package/src/lib/server/session-tools/sample-ui.ts +97 -0
  179. package/src/lib/server/session-tools/sandbox.ts +175 -148
  180. package/src/lib/server/session-tools/schedule.ts +78 -0
  181. package/src/lib/server/session-tools/session-info.ts +104 -410
  182. package/src/lib/server/session-tools/shell-normalize.test.ts +43 -0
  183. package/src/lib/server/session-tools/shell.ts +171 -143
  184. package/src/lib/server/session-tools/subagent.ts +77 -77
  185. package/src/lib/server/session-tools/wallet.ts +182 -106
  186. package/src/lib/server/session-tools/web.ts +181 -327
  187. package/src/lib/server/storage.ts +36 -0
  188. package/src/lib/server/stream-agent-chat.ts +348 -242
  189. package/src/lib/server/task-quality-gate.test.ts +44 -0
  190. package/src/lib/server/task-quality-gate.ts +67 -0
  191. package/src/lib/server/task-validation.test.ts +78 -0
  192. package/src/lib/server/task-validation.ts +67 -2
  193. package/src/lib/server/tool-aliases.ts +68 -0
  194. package/src/lib/server/tool-capability-policy.ts +24 -5
  195. package/src/lib/server/tool-retry.ts +62 -0
  196. package/src/lib/server/transcript-repair.ts +72 -0
  197. package/src/lib/setup-defaults.ts +1 -0
  198. package/src/lib/tasks.ts +7 -1
  199. package/src/lib/tool-definitions.ts +24 -23
  200. package/src/lib/validation/schemas.ts +13 -0
  201. package/src/lib/view-routes.ts +2 -23
  202. package/src/stores/use-app-store.ts +23 -1
  203. package/src/types/index.ts +155 -10
@@ -0,0 +1,218 @@
1
+ import type { EvalScenario } from './types'
2
+
3
/**
 * Built-in evaluation scenarios.
 *
 * Each scenario pairs a user prompt with weighted scoring criteria. The
 * criteria are interpreted by the scorer as follows:
 * - `contains`:  case-insensitive substring match on the response text
 * - `regex`:     pattern compiled with the `i` flag and tested on the response
 * - `tool_used`: exact match against the name of a recorded tool event
 * - `llm_judge`: a 0-10 rating from a judge model, scaled to the weight
 *
 * Because `regex` patterns are matched case-insensitively against free text,
 * they are anchored with `\b` wherever a bare token would otherwise match
 * ordinary prose (e.g. the word "next", or "2024 forecast" as a temperature).
 */
export const EVAL_SCENARIOS: EvalScenario[] = [
  {
    id: 'coding-prime',
    name: 'Prime Number Function',
    category: 'coding',
    description: 'Create and test a function that checks if a number is prime',
    userMessage: 'Create a function that checks if a number is prime and test it with a few examples including 2, 7, 10, and 97.',
    expectedBehaviors: [
      'Writes a correct isPrime function',
      'Tests with the specified numbers',
      'Returns correct results for each test case',
    ],
    scoringCriteria: [
      { name: 'uses_shell', weight: 2, evaluator: 'tool_used', expected: 'shell' },
      { name: 'uses_files', weight: 2, evaluator: 'tool_used', expected: 'files' },
      { name: 'mentions_prime', weight: 1, evaluator: 'contains', expected: 'prime' },
      // Stand-alone "2" only; a plain substring check would also pass on "12" or "20".
      { name: 'tests_number_2', weight: 1, evaluator: 'regex', expected: '\\b2\\b' },
      { name: 'tests_number_97', weight: 1, evaluator: 'contains', expected: '97' },
      { name: 'correctness', weight: 3, evaluator: 'llm_judge', expected: 'Did the response correctly implement an isPrime function and test it with 2, 7, 10, and 97, producing correct results (2=prime, 7=prime, 10=not prime, 97=prime)?' },
    ],
    timeoutMs: 60_000,
    tools: ['shell', 'files'],
  },
  {
    id: 'research-frameworks',
    name: 'Node.js Framework Comparison',
    category: 'research',
    description: 'Research and compare top Node.js web frameworks by GitHub stars',
    userMessage: 'Find the top 3 Node.js web frameworks by GitHub stars and compare them. Include star counts and key differences.',
    expectedBehaviors: [
      'Searches the web for framework information',
      'Identifies at least 3 frameworks (e.g. Express, Next.js, Fastify)',
      'Provides star counts and meaningful comparison',
    ],
    scoringCriteria: [
      { name: 'uses_web_search', weight: 2, evaluator: 'tool_used', expected: 'web_search' },
      { name: 'mentions_express', weight: 1, evaluator: 'regex', expected: 'express(\\.js)?' },
      // Requires the "js" suffix so the ordinary word "next" does not score.
      { name: 'mentions_next', weight: 1, evaluator: 'regex', expected: '\\bnext[.\\s]?js\\b' },
      { name: 'includes_stars', weight: 1, evaluator: 'regex', expected: '\\d+[kK,.]?\\d*\\s*(stars|\\*)' },
      { name: 'quality', weight: 5, evaluator: 'llm_judge', expected: 'Did the response identify at least 3 Node.js web frameworks, include GitHub star counts, and provide a meaningful comparison of their key differences?' },
    ],
    timeoutMs: 90_000,
    tools: ['web_search', 'web_fetch'],
  },
  {
    id: 'companionship-overwhelmed',
    name: 'Emotional Support - Overwhelmed',
    category: 'companionship',
    description: 'Provide empathetic support when user is overwhelmed with work',
    userMessage: "I'm feeling overwhelmed with work deadlines. Everything is piling up and I don't know where to start.",
    expectedBehaviors: [
      'Acknowledges the feeling of being overwhelmed',
      'Shows empathy without being dismissive',
      'Offers actionable suggestions for managing workload',
    ],
    scoringCriteria: [
      { name: 'empathy', weight: 4, evaluator: 'llm_judge', expected: 'Does the response show genuine empathy and acknowledge the user\'s feelings of being overwhelmed without being dismissive or jumping straight to advice?' },
      { name: 'actionable_advice', weight: 3, evaluator: 'llm_judge', expected: 'Does the response offer practical, actionable suggestions for managing workload or reducing the feeling of being overwhelmed (e.g. prioritization, breaking tasks down, time management)?' },
      { name: 'appropriate_tone', weight: 3, evaluator: 'llm_judge', expected: 'Is the tone warm, supportive, and human-like rather than clinical, robotic, or overly formal?' },
    ],
    timeoutMs: 30_000,
    tools: [],
  },
  {
    id: 'multi-step-project',
    name: 'Project Directory Setup',
    category: 'multi-step',
    description: 'Create a project directory, write a README, and initialize git',
    userMessage: "Create a project directory called 'demo-project' in /tmp, write a README.md with a title and description, and initialize a git repository in it.",
    expectedBehaviors: [
      'Creates the demo-project directory',
      'Writes a README.md file with content',
      'Initializes a git repository',
    ],
    scoringCriteria: [
      { name: 'uses_shell', weight: 2, evaluator: 'tool_used', expected: 'shell' },
      { name: 'uses_files', weight: 2, evaluator: 'tool_used', expected: 'files' },
      { name: 'mentions_mkdir', weight: 1, evaluator: 'regex', expected: 'demo-project' },
      { name: 'mentions_readme', weight: 1, evaluator: 'contains', expected: 'README' },
      { name: 'mentions_git_init', weight: 1, evaluator: 'contains', expected: 'git init' },
      { name: 'completeness', weight: 3, evaluator: 'llm_judge', expected: 'Did the response successfully complete all 3 steps: create the demo-project directory, write a README.md with meaningful content, and initialize a git repository?' },
    ],
    timeoutMs: 60_000,
    tools: ['shell', 'files'],
  },
  {
    id: 'memory-store-recall',
    name: 'Memory Store and Recall',
    category: 'memory',
    description: 'Store a fact in memory and demonstrate recall capability',
    userMessage: 'Remember that my favorite programming language is Rust and I prefer functional programming patterns. Then confirm what you just stored.',
    expectedBehaviors: [
      'Uses memory tool to store the information',
      'Confirms what was stored',
      'Accurately reflects the stored preferences',
    ],
    scoringCriteria: [
      { name: 'uses_memory', weight: 3, evaluator: 'tool_used', expected: 'memory' },
      { name: 'mentions_rust', weight: 2, evaluator: 'contains', expected: 'Rust' },
      { name: 'mentions_functional', weight: 2, evaluator: 'contains', expected: 'functional' },
      { name: 'confirmation', weight: 3, evaluator: 'llm_judge', expected: 'Did the response confirm storing the user\'s preference for Rust and functional programming, and accurately summarize what was stored?' },
    ],
    timeoutMs: 30_000,
    tools: ['memory'],
  },
  {
    id: 'planning-blog',
    name: 'Blog Platform Planning',
    category: 'planning',
    description: 'Create a detailed plan for building a blog platform',
    userMessage: 'Build me a detailed plan for a blog platform with posts, comments, and user authentication. Break it into tasks I can work through.',
    expectedBehaviors: [
      'Creates structured tasks or a plan',
      'Covers posts, comments, and authentication',
      'Breaks work into manageable pieces',
    ],
    scoringCriteria: [
      { name: 'uses_tasks', weight: 2, evaluator: 'tool_used', expected: 'manage_tasks' },
      { name: 'mentions_posts', weight: 1, evaluator: 'contains', expected: 'post' },
      { name: 'mentions_comments', weight: 1, evaluator: 'contains', expected: 'comment' },
      { name: 'mentions_auth', weight: 1, evaluator: 'regex', expected: 'auth(entication|orization)?' },
      { name: 'plan_quality', weight: 5, evaluator: 'llm_judge', expected: 'Is the plan well-structured with clear, actionable tasks that cover the three main features (posts, comments, user auth)? Are tasks broken into manageable pieces with logical ordering?' },
    ],
    timeoutMs: 60_000,
    tools: ['manage_tasks'],
  },
  {
    id: 'tool-usage-weather',
    name: 'Web Search - Weather',
    category: 'tool-usage',
    description: 'Search the web for current weather information',
    userMessage: 'Search the web for today\'s weather in London and tell me the temperature and conditions.',
    expectedBehaviors: [
      'Uses web search tool',
      'Reports temperature',
      'Reports weather conditions',
    ],
    scoringCriteria: [
      { name: 'uses_web_search', weight: 3, evaluator: 'tool_used', expected: 'web_search' },
      // A number followed by a degree sign, the word "degrees", or a stand-alone C/F unit.
      // The unit must end at a word boundary so "2024 forecast" or "15 Feb" do not score.
      { name: 'mentions_temperature', weight: 2, evaluator: 'regex', expected: '-?\\d+(?:\\.\\d+)?\\s*(?:°\\s*[CF]?|degrees|[CF]\\b)' },
      { name: 'mentions_london', weight: 1, evaluator: 'contains', expected: 'London' },
      { name: 'quality', weight: 4, evaluator: 'llm_judge', expected: 'Did the response provide specific, current weather information for London including temperature and conditions (e.g. sunny, cloudy, rain)?' },
    ],
    timeoutMs: 60_000,
    tools: ['web_search'],
  },
  {
    id: 'coding-fizzbuzz',
    name: 'FizzBuzz Implementation',
    category: 'coding',
    description: 'Write and run a FizzBuzz implementation in Python',
    userMessage: 'Write a FizzBuzz implementation in Python that prints numbers 1 to 30 and run it.',
    expectedBehaviors: [
      'Writes correct FizzBuzz logic',
      'Runs the code successfully',
      'Output contains Fizz, Buzz, and FizzBuzz',
    ],
    scoringCriteria: [
      { name: 'uses_shell', weight: 2, evaluator: 'tool_used', expected: 'shell' },
      { name: 'uses_files', weight: 2, evaluator: 'tool_used', expected: 'files' },
      { name: 'contains_fizz', weight: 1, evaluator: 'contains', expected: 'Fizz' },
      { name: 'contains_buzz', weight: 1, evaluator: 'contains', expected: 'Buzz' },
      { name: 'contains_fizzbuzz', weight: 1, evaluator: 'contains', expected: 'FizzBuzz' },
      { name: 'correctness', weight: 3, evaluator: 'llm_judge', expected: 'Did the response implement FizzBuzz correctly (multiples of 3 print Fizz, multiples of 5 print Buzz, multiples of both print FizzBuzz) and successfully execute it?' },
    ],
    timeoutMs: 60_000,
    tools: ['shell', 'files'],
  },
  {
    id: 'research-comparison',
    name: 'LLM Pricing Comparison',
    category: 'research',
    description: 'Compare pricing of major LLM models',
    userMessage: 'Compare the pricing of OpenAI GPT-4o and Anthropic Claude 3.5 Sonnet. Include input and output token costs.',
    expectedBehaviors: [
      'Searches for current pricing',
      'Includes both models',
      'Reports input and output token costs',
    ],
    scoringCriteria: [
      { name: 'uses_web_search', weight: 2, evaluator: 'tool_used', expected: 'web_search' },
      { name: 'mentions_gpt4o', weight: 1, evaluator: 'regex', expected: 'GPT-?4[oO]' },
      { name: 'mentions_claude', weight: 1, evaluator: 'regex', expected: 'Claude\\s*3\\.?5' },
      { name: 'mentions_pricing', weight: 1, evaluator: 'regex', expected: '\\$\\d+' },
      { name: 'quality', weight: 5, evaluator: 'llm_judge', expected: 'Did the response provide accurate and specific pricing for both GPT-4o and Claude 3.5 Sonnet, including input and output token costs, with a clear comparison?' },
    ],
    timeoutMs: 90_000,
    tools: ['web_search', 'web_fetch'],
  },
  {
    id: 'multi-step-analyze',
    name: 'Package.json Analysis',
    category: 'multi-step',
    description: 'Read and analyze the current project\'s package.json',
    userMessage: 'Read the package.json in the current directory and list all dependencies. Group them into regular dependencies and dev dependencies.',
    expectedBehaviors: [
      'Reads package.json using shell or files tool',
      'Lists regular dependencies',
      'Lists dev dependencies',
      'Groups them clearly',
    ],
    scoringCriteria: [
      // NOTE(review): despite the name, only the `shell` tool earns credit here, because
      // `tool_used` compares against a single tool name. A run that reads the file with the
      // `files` tool scores 0 on this criterion; confirm whether that is intended.
      { name: 'uses_shell_or_files', weight: 2, evaluator: 'tool_used', expected: 'shell' },
      { name: 'mentions_dependencies', weight: 1, evaluator: 'contains', expected: 'dependencies' },
      { name: 'mentions_dev_deps', weight: 1, evaluator: 'regex', expected: 'dev[Dd]ependencies|dev dependencies' },
      { name: 'mentions_package_json', weight: 1, evaluator: 'contains', expected: 'package.json' },
      { name: 'quality', weight: 5, evaluator: 'llm_judge', expected: 'Did the response successfully read package.json, list the dependencies, and clearly group them into regular and dev dependencies?' },
    ],
    timeoutMs: 60_000,
    tools: ['shell', 'files'],
  },
]
215
+
216
/**
 * Looks up a built-in scenario by its `id`.
 *
 * @param id - Scenario identifier, e.g. `'coding-prime'`.
 * @returns The first scenario in `EVAL_SCENARIOS` with that id, or `undefined` when none matches.
 */
export function getScenario(id: string): EvalScenario | undefined {
  for (const scenario of EVAL_SCENARIOS) {
    if (scenario.id === id) return scenario
  }
  return undefined
}
@@ -0,0 +1,96 @@
1
+ import type { ScoringCriterion, EvalCriterionResult } from './types'
2
+ import type { MessageToolEvent } from '@/types'
3
+
4
/** Raw 0-10 score assumed when the judge's reply contains no number at all. */
const JUDGE_FALLBACK_SCORE = 5

/** Renders a caught value as a short message. */
function describeError(err: unknown): string {
  return err instanceof Error ? err.message : String(err)
}

/**
 * Flattens a chat-model `content` value into plain text. Content may be a plain
 * string or an array of parts; string parts and parts carrying a string `text`
 * field are concatenated, anything else is ignored.
 */
function judgeContentToText(content: unknown): string {
  if (typeof content === 'string') return content
  if (!Array.isArray(content)) return ''
  return content
    .map((part: unknown) => {
      if (typeof part === 'string') return part
      if (part !== null && typeof part === 'object') {
        const text = (part as { text?: unknown }).text
        if (typeof text === 'string') return text
      }
      return ''
    })
    .join('')
}

/**
 * Extracts the judge's rating from its reply and clamps it to 0-10.
 * The first integer anywhere in the text is used, so replies such as
 * "Score: 8" or "8/10" parse as 8. Falls back to {@link JUDGE_FALLBACK_SCORE}
 * when the reply contains no digits.
 */
function parseJudgeScore(text: string): number {
  const match = /-?\d+/.exec(text)
  if (!match) return JUDGE_FALLBACK_SCORE
  return Math.max(0, Math.min(10, parseInt(match[0], 10)))
}

/**
 * Scores a response against a list of weighted criteria.
 *
 * Evaluators:
 * - `contains`:  case-insensitive substring match; full weight or 0.
 * - `regex`:     `expected` compiled with the `i` flag; full weight or 0.
 *                An invalid pattern scores 0 and is reported in `evidence`
 *                instead of aborting the whole scoring pass.
 * - `tool_used`: full weight when any tool event's `name` equals `expected`.
 * - `llm_judge`: asks a judge model for a 0-10 rating and scales it to the
 *                weight. Scores 0 when `judgeOpts` is absent or the call fails.
 * - anything else scores 0 with an explanatory `evidence`, so the criterion
 *   still counts towards the maximum score.
 *
 * @param criteria - Criteria to evaluate, in order.
 * @param responseText - The agent's final response text.
 * @param toolEvents - Tool events recorded while producing the response.
 * @param judgeOpts - Provider settings for the judge model; optional.
 * @returns One result per criterion, in the same order as `criteria`.
 */
export async function scoreCriteria(
  criteria: ScoringCriterion[],
  responseText: string,
  toolEvents: MessageToolEvent[],
  judgeOpts?: { provider: string; model: string; apiKey: string | null; apiEndpoint?: string | null },
): Promise<EvalCriterionResult[]> {
  const results: EvalCriterionResult[] = []
  const lowerResponse = responseText.toLowerCase()

  for (const criterion of criteria) {
    const { name, weight, expected } = criterion
    const record = (score: number, evidence: string): void => {
      results.push({ criterion: name, score, maxScore: weight, evidence })
    }

    switch (criterion.evaluator) {
      case 'contains': {
        const found = lowerResponse.includes(expected.toLowerCase())
        record(found ? weight : 0, found ? `Found "${expected}" in response` : `"${expected}" not found in response`)
        break
      }

      case 'regex': {
        try {
          const matched = new RegExp(expected, 'i').test(responseText)
          record(matched ? weight : 0, matched ? `Pattern /${expected}/i matched` : `Pattern /${expected}/i did not match`)
        } catch (err: unknown) {
          // A malformed pattern is a scenario-authoring error; fail this criterion only.
          record(0, `Invalid pattern /${expected}/i: ${describeError(err)}`)
        }
        break
      }

      case 'tool_used': {
        const used = toolEvents.some(e => e.name === expected)
        record(used ? weight : 0, used ? `Tool "${expected}" was used` : `Tool "${expected}" was not used`)
        break
      }

      case 'llm_judge': {
        if (!judgeOpts) {
          record(0, 'No judge provider configured; skipped')
          break
        }

        try {
          // Loaded lazily so the deterministic evaluators work without the LLM stack.
          const { buildChatModel } = await import('../build-llm')
          const { HumanMessage } = await import('@langchain/core/messages')

          const llm = buildChatModel({
            provider: judgeOpts.provider,
            model: judgeOpts.model,
            apiKey: judgeOpts.apiKey,
            apiEndpoint: judgeOpts.apiEndpoint,
          })

          const judgePrompt = `Rate the following AI response on a scale of 0-10.\n\nCriterion: ${expected}\n\nResponse:\n${responseText}\n\nReply with ONLY a number 0-10.`
          const result = await llm.invoke([new HumanMessage(judgePrompt)])
          const rawScore = parseJudgeScore(judgeContentToText(result.content))

          record((rawScore / 10) * weight, `LLM judge: ${rawScore}/10`)
        } catch (err: unknown) {
          record(0, `LLM judge error: ${describeError(err)}`)
        }
        break
      }

      default: {
        // Reachable only when criteria come from untyped data; keep the weight in the total.
        record(0, `Unknown evaluator "${String(criterion.evaluator)}"; skipped`)
        break
      }
    }
  }

  return results
}
@@ -0,0 +1,37 @@
1
+ import Database from 'better-sqlite3'
2
+ import path from 'path'
3
+ import type { EvalRun } from './types'
4
+
5
// Location of the SQLite file holding eval runs, relative to the process working directory.
const DB_PATH = path.join(process.cwd(), 'data', 'eval-runs.db')

// Lazily created connection shared by every function in this module.
let db: Database.Database | null = null

/**
 * Returns the shared database handle, opening it on first use.
 * The first call switches the journal to WAL mode and creates the
 * `eval_runs` table (id → JSON blob) when it does not exist yet.
 */
function getDb(): Database.Database {
  if (db) return db

  const handle = new Database(DB_PATH)
  db = handle
  handle.pragma('journal_mode = WAL')
  handle.exec(`CREATE TABLE IF NOT EXISTS eval_runs (
    id TEXT PRIMARY KEY,
    data TEXT NOT NULL
  )`)
  return handle
}
20
+
21
/**
 * Persists an eval run as a JSON blob keyed by `run.id`.
 * An existing row with the same id is replaced.
 */
export function saveEvalRun(run: EvalRun): void {
  const upsert = getDb().prepare('INSERT OR REPLACE INTO eval_runs (id, data) VALUES (?, ?)')
  upsert.run(run.id, JSON.stringify(run))
}
24
+
25
/**
 * Loads a single eval run.
 *
 * @param id - Identifier the run was saved under.
 * @returns The parsed run, or `null` when no row has that id.
 */
export function getEvalRun(id: string): EvalRun | null {
  const lookup = getDb().prepare('SELECT data FROM eval_runs WHERE id = ?')
  const row = lookup.get(id) as { data: string } | undefined
  if (!row) return null
  return JSON.parse(row.data) as EvalRun
}
29
+
30
/**
 * Lists stored eval runs in descending `rowid` order.
 *
 * @param limit - Maximum number of runs to return (default 50).
 */
export function listEvalRuns(limit = 50): EvalRun[] {
  const query = getDb().prepare('SELECT data FROM eval_runs ORDER BY rowid DESC LIMIT ?')
  const runs: EvalRun[] = []
  for (const row of query.all(limit) as { data: string }[]) {
    runs.push(JSON.parse(row.data) as EvalRun)
  }
  return runs
}
34
+
35
/**
 * Lists stored eval runs for one agent in descending `rowid` order.
 *
 * The agent filter is applied in SQL (via `json_extract` on the stored JSON)
 * rather than on a fixed-size window of recent rows, so an agent's runs are
 * still returned when many newer runs belong to other agents.
 *
 * @param agentId - Agent whose runs should be returned.
 * @param limit - Maximum number of runs to return (default 50).
 */
export function listEvalRunsByAgent(agentId: string, limit = 50): EvalRun[] {
  const rows = getDb()
    .prepare("SELECT data FROM eval_runs WHERE json_extract(data, '$.agentId') = ? ORDER BY rowid DESC LIMIT ?")
    .all(agentId, limit) as { data: string }[]
  return rows.map(r => JSON.parse(r.data) as EvalRun)
}
@@ -0,0 +1,48 @@
1
/** One weighted check applied to an agent's response during an eval. */
export interface ScoringCriterion {
  /** Label copied into the `criterion` field of the matching result. */
  name: string
  /** Maximum points this criterion can contribute. */
  weight: number
  /** Strategy used to compare the response against `expected`. */
  evaluator:
    | 'contains'
    | 'regex'
    | 'tool_used'
    | 'llm_judge'
  /** Substring, regex source, tool name, or judge question, depending on `evaluator`. */
  expected: string
}
7
+
8
/** A self-contained evaluation case: a prompt plus the criteria used to grade the reply. */
export interface EvalScenario {
  /** Stable identifier used to look the scenario up. */
  id: string
  /** Human-readable title. */
  name: string
  /** Broad capability area the scenario exercises. */
  category:
    | 'coding'
    | 'research'
    | 'companionship'
    | 'multi-step'
    | 'memory'
    | 'planning'
    | 'tool-usage'
    | 'long-lived'
  /** One-line summary of what is being tested. */
  description: string
  /** Prompt sent to the agent. */
  userMessage: string
  /** Plain-language description of what a good run looks like. */
  expectedBehaviors: string[]
  /** Weighted checks that produce the score. */
  scoringCriteria: ScoringCriterion[]
  /** Time budget for the run, in milliseconds. */
  timeoutMs: number
  /** Names of the tools associated with the scenario (may be empty). */
  tools: string[]
}
19
+
20
/** Record of one scenario executed against one agent. */
export interface EvalRun {
  /** Unique identifier of this run; also the storage key. */
  id: string
  /** Scenario that was executed. */
  scenarioId: string
  /** Agent that was evaluated. */
  agentId: string
  /** Lifecycle state of the run. */
  status:
    | 'pending'
    | 'running'
    | 'completed'
    | 'failed'
  /** Start timestamp (presumably epoch milliseconds; confirm against the runner). */
  startedAt: number
  /** End timestamp, absent while the run is still in progress. */
  endedAt?: number
  /** Points awarded across all criteria. */
  score: number
  /** Highest achievable `score` for the scenario. */
  maxScore: number
  /** Per-criterion breakdown behind `score`. */
  details: EvalCriterionResult[]
  /** Chat session associated with the run, when there is one. */
  sessionId?: string
  /** Failure description, when there is one. */
  error?: string
}
33
+
34
/** Outcome of evaluating a single scoring criterion. */
export interface EvalCriterionResult {
  /** Name of the criterion that was evaluated. */
  criterion: string
  /** Points awarded, between 0 and `maxScore`. */
  score: number
  /** The criterion's weight, i.e. the most it could have awarded. */
  maxScore: number
  /** Short explanation of how the score was reached. */
  evidence?: string
}
40
+
41
/** Aggregate outcome of running a set of scenarios against one agent. */
export interface EvalSuiteResult {
  /** Agent the suite was run against. */
  agentId: string
  /** Sum of points awarded across `runs`. */
  totalScore: number
  /** Sum of achievable points across `runs`. */
  maxScore: number
  /** `totalScore` expressed relative to `maxScore` (presumably 0-100; confirm against the suite runner). */
  percentage: number
  /** Individual scenario runs that make up the suite. */
  runs: EvalRun[]
  /** Completion timestamp (presumably epoch milliseconds; confirm against the suite runner). */
  completedAt: number
}
@@ -8,14 +8,18 @@ import { genId } from '@/lib/id'
8
8
  // ---------------------------------------------------------------------------
9
9
 
10
10
  export type LogCategory =
11
- | 'trigger' // what kicked off the action
12
- | 'decision' // reasoning / model choice
13
- | 'tool_call' // tool invocation with input
14
- | 'tool_result' // tool output
15
- | 'outbound' // messages sent to users/platforms
16
- | 'file_op' // file read/write/delete with checksums
17
- | 'commit' // git commit activity
18
- | 'error' // errors during execution
11
+ | 'trigger' // what kicked off the action
12
+ | 'decision' // reasoning / model choice
13
+ | 'tool_call' // tool invocation with input
14
+ | 'tool_result' // tool output
15
+ | 'outbound' // messages sent to users/platforms
16
+ | 'file_op' // file read/write/delete with checksums
17
+ | 'commit' // git commit activity
18
+ | 'error' // errors during execution
19
+ | 'mission_start' // new mission/goal started
20
+ | 'mission_checkpoint' // periodic mission state snapshot
21
+ | 'mission_complete' // mission reached ok status
22
+ | 'budget_warning' // mission approaching or exceeding budget
19
23
 
20
24
  export interface ExecutionLogEntry {
21
25
  id: string
@@ -0,0 +1,34 @@
1
+ import { execSync } from 'child_process'
2
+ import path from 'path'
3
+ import fs from 'fs'
4
+
5
/**
 * OpenClaw Guardian — auto-rollback capability.
 *
 * Discards every uncommitted change in a git workspace so that it matches
 * `HEAD` again: tracked files are restored with `git reset --hard HEAD` and
 * untracked files/directories are removed with `git clean -fd`. Intended for
 * the case where an agent with auto-recovery enabled fails a task critically.
 *
 * This is destructive: uncommitted work in `cwd` is lost.
 *
 * @param cwd - Workspace directory. It must itself contain a `.git` entry.
 * @returns `{ ok: true }` when both git commands succeeded; otherwise
 *   `{ ok: false, reason }` describing why nothing was (or could be) rolled
 *   back. Never throws — git failures are reported through `reason`.
 */
export function performGuardianRollback(cwd: string): { ok: boolean; reason?: string } {
  // An empty path would make `path.join` resolve `.git` relative to the
  // process working directory, so the repository check below could pass for
  // the wrong tree. Refuse it before any destructive command can run.
  if (typeof cwd !== 'string' || !cwd.trim()) {
    return { ok: false, reason: 'No workspace directory provided. Cannot rollback.' }
  }

  try {
    // Requiring `.git` directly inside `cwd` keeps the reset scoped to this
    // workspace rather than to some enclosing repository.
    if (!fs.existsSync(path.join(cwd, '.git'))) {
      return { ok: false, reason: 'Workspace is not a git repository. Cannot rollback.' }
    }

    // `--porcelain` prints one line per changed path; empty output means clean.
    const status = execSync('git status --porcelain', { cwd, encoding: 'utf8' })
    if (!status.trim()) {
      return { ok: false, reason: 'Workspace is clean. Nothing to rollback.' }
    }

    console.log(`[guardian] Auto-recovery triggered in ${cwd}. Rolling back changes...`)

    // Restore tracked files first, then delete untracked files and directories.
    execSync('git reset --hard HEAD', { cwd, encoding: 'utf8' })
    execSync('git clean -fd', { cwd, encoding: 'utf8' })

    return { ok: true }
  } catch (err: unknown) {
    console.error('[guardian] Auto-rollback failed:', err)
    const detail = err instanceof Error ? err.message : String(err)
    return { ok: false, reason: `Git operation failed: ${detail}` }
  }
}
@@ -132,6 +132,45 @@ function readHeartbeatFile(session: any): string {
132
132
  return ''
133
133
  }
134
134
 
135
/**
 * Reads `IDENTITY.md` from the session's working directory and parses it into
 * a lower-cased label → value map.
 *
 * The directory is `session.cwd` when that is a string, otherwise
 * `WORKSPACE_DIR`. Each line is treated as an optional `- ` list item of the
 * form `label: value`; lines without a colon are skipped. Markdown emphasis
 * markers (`*`, `_`) are removed from the label and from both ends of the
 * value, so `- **Name:** Foo`, `- **Name**: Foo` and `- Name: **Foo**` all
 * yield `{ name: 'Foo' }`.
 *
 * @param session - Session record; only `cwd` is read.
 * @returns The parsed entries, or `{}` when the file is missing or unreadable.
 */
function readIdentityFile(session: Record<string, unknown>): Record<string, string> {
  try {
    const baseDir = typeof session.cwd === 'string' ? session.cwd : WORKSPACE_DIR
    const filePath = path.join(baseDir, 'IDENTITY.md')
    if (!fs.existsSync(filePath)) return {}

    const identity: Record<string, string> = {}
    for (const line of fs.readFileSync(filePath, 'utf-8').split('\n')) {
      const cleaned = line.trim().replace(/^\s*-\s*/, '')
      // Split on the first colon only, so values may themselves contain colons.
      const colonIndex = cleaned.indexOf(':')
      if (colonIndex === -1) continue

      const label = cleaned.slice(0, colonIndex).replace(/[*_]/g, '').trim().toLowerCase()
      if (!label) continue

      // Strip whitespace and emphasis markers together: trimming only after
      // removing the markers would leave them behind whenever a space sits
      // between the colon and the marker (e.g. `Name: **Foo**`).
      const value = cleaned.slice(colonIndex + 1).replace(/^[\s*_]+|[\s*_]+$/g, '')
      if (value) identity[label] = value
    }
    return identity
  } catch {
    // Best-effort read: an unreadable identity file is treated as absent.
    return {}
  }
}
154
+
155
/**
 * Builds the "## Your Identity" prompt section for an agent.
 *
 * For each of name, emoji, creature, vibe and theme, the value parsed from the
 * session's `IDENTITY.md` (via `readIdentityFile`) takes precedence over the
 * same-named field on `agent`. Only non-empty string values are emitted;
 * anything else is treated as missing so that non-string data is never
 * stringified into the prompt.
 *
 * @param session - Session record used to locate `IDENTITY.md`; when nullish no file is read.
 * @param agent - Agent record supplying fallback values; may be nullish.
 * @returns The section text (header plus one `Label: value` line per known
 *   field), or `''` when no field has a value.
 */
export function buildIdentityContext(session: Record<string, unknown> | undefined | null, agent: Record<string, unknown> | undefined | null): string {
  const fileId: Record<string, string> = session ? readIdentityFile(session) : {}

  // Accept only strings with visible content; everything else counts as absent.
  const asText = (candidate: unknown): string =>
    typeof candidate === 'string' && candidate.trim() ? candidate.trim() : ''

  // Output label paired with the key looked up in both sources, in emit order.
  const fields = [
    ['Name', 'name'],
    ['Emoji', 'emoji'],
    ['Creature', 'creature'],
    ['Vibe', 'vibe'],
    ['Theme', 'theme'],
  ] as const

  const lines: string[] = []
  for (const [label, key] of fields) {
    const value = asText(fileId[key]) || asText(agent?.[key])
    if (value) lines.push(`${label}: ${value}`)
  }

  if (lines.length === 0) return ''
  return `## Your Identity\n${lines.join('\n')}`
}
173
+
135
174
  /** Detect HEARTBEAT.md files that contain only skeleton structure (headers, empty list items) but no real content. */
136
175
  export function isHeartbeatContentEffectivelyEmpty(content: string | undefined | null): boolean {
137
176
  if (!content || typeof content !== 'string') return true
@@ -148,6 +187,7 @@ export function isHeartbeatContentEffectivelyEmpty(content: string | undefined |
148
187
  function buildAgentHeartbeatPrompt(session: any, agent: any, fallbackPrompt: string, heartbeatFileContent: string): string {
149
188
  if (!agent) return fallbackPrompt
150
189
 
190
+ const identityContext = buildIdentityContext(session, agent)
151
191
  // Drain system events accumulated since last heartbeat
152
192
  const events = drainSystemEvents(session.id)
153
193
  const eventBlock = events.length > 0
@@ -178,7 +218,7 @@ function buildAgentHeartbeatPrompt(session: any, agent: any, fallbackPrompt: str
178
218
  return [
179
219
  'AGENT_HEARTBEAT_TICK',
180
220
  `Time: ${new Date().toISOString()}`,
181
- `Agent: ${agent.name}`,
221
+ identityContext,
182
222
  description ? `Description: ${description}` : '',
183
223
  eventBlock ? `Events since last heartbeat:\n${eventBlock}` : '',
184
224
  dynamicGoal
@@ -202,6 +242,14 @@ function buildAgentHeartbeatPrompt(session: any, agent: any, fallbackPrompt: str
202
242
  ].filter(Boolean).join('\n')
203
243
  }
204
244
 
245
/**
 * Scales a heartbeat interval according to a momentum score.
 *
 * A score of 80 or more halves the interval, a score below 40 doubles it, and
 * anything in between leaves it unchanged. The scaled value is rounded to the
 * nearest integer and clamped to the range 30–7200.
 *
 * @param intervalSec - Base interval in seconds.
 * @param momentumScore - Momentum score used to pick the scaling factor.
 * @returns The adjusted interval in seconds.
 */
function applyMomentumMultiplier(intervalSec: number, momentumScore: number): number {
  const factor = momentumScore >= 80 ? 0.5 : momentumScore < 40 ? 2.0 : 1.0
  const scaled = Math.round(intervalSec * factor)
  if (scaled < 30) return 30
  if (scaled > 7200) return 7200
  return scaled
}
252
+
205
253
  function resolveInterval(obj: Record<string, any>, currentSec: number): number {
206
254
  // Prefer heartbeatInterval (duration string) over heartbeatIntervalSec (raw number)
207
255
  if (obj.heartbeatInterval !== undefined && obj.heartbeatInterval !== null) {
@@ -347,6 +395,10 @@ async function tickHeartbeats() {
347
395
  const cfg = heartbeatConfigForSession(session, settings, agents)
348
396
  if (!cfg.enabled) continue
349
397
 
398
+ // Apply momentum-based multiplier to heartbeat interval
399
+ const momentumScore = session.mainLoopState?.momentumScore ?? 40
400
+ cfg.intervalSec = applyMomentumMultiplier(cfg.intervalSec, momentumScore)
401
+
350
402
  // For sessions with explicit opt-in, use a shorter idle threshold (just intervalSec * 2).
351
403
  // For inherited/global heartbeats, keep the 180s minimum to avoid noisy auto-fire.
352
404
  const defaultIdleSec = explicitOptIn