@swarmclawai/swarmclaw 0.6.7 → 0.6.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/README.md +24 -6
  2. package/package.json +1 -1
  3. package/src/app/api/agents/route.ts +1 -0
  4. package/src/app/api/chatrooms/[id]/chat/route.ts +4 -0
  5. package/src/app/api/eval/run/route.ts +37 -0
  6. package/src/app/api/eval/scenarios/route.ts +24 -0
  7. package/src/app/api/eval/suite/route.ts +29 -0
  8. package/src/app/api/memory/graph/route.ts +46 -0
  9. package/src/app/api/sessions/[id]/checkpoints/route.ts +31 -0
  10. package/src/app/api/sessions/[id]/restore/route.ts +36 -0
  11. package/src/app/api/souls/[id]/route.ts +65 -0
  12. package/src/app/api/souls/route.ts +70 -0
  13. package/src/app/api/tasks/[id]/route.ts +5 -0
  14. package/src/app/api/tasks/route.ts +2 -0
  15. package/src/app/api/usage/route.ts +9 -2
  16. package/src/cli/index.js +24 -0
  17. package/src/components/agents/agent-sheet.tsx +27 -6
  18. package/src/components/agents/soul-library-picker.tsx +84 -13
  19. package/src/components/chat/activity-moment.tsx +2 -0
  20. package/src/components/chat/checkpoint-timeline.tsx +112 -0
  21. package/src/components/chat/message-list.tsx +19 -3
  22. package/src/components/chat/session-debug-panel.tsx +106 -84
  23. package/src/components/chat/task-approval-card.tsx +78 -0
  24. package/src/components/chat/tool-call-bubble.tsx +3 -0
  25. package/src/components/connectors/connector-sheet.tsx +8 -1
  26. package/src/components/home/home-view.tsx +39 -15
  27. package/src/components/layout/app-layout.tsx +18 -2
  28. package/src/components/memory/memory-browser.tsx +73 -45
  29. package/src/components/memory/memory-graph-view.tsx +203 -0
  30. package/src/components/plugins/plugin-list.tsx +1 -1
  31. package/src/components/schedules/schedule-sheet.tsx +9 -2
  32. package/src/components/shared/hint-tip.tsx +31 -0
  33. package/src/components/shared/settings/section-runtime-loop.tsx +5 -4
  34. package/src/components/tasks/approvals-panel.tsx +120 -0
  35. package/src/components/usage/metrics-dashboard.tsx +25 -3
  36. package/src/lib/server/chat-execution.ts +96 -12
  37. package/src/lib/server/chatroom-helpers.ts +63 -5
  38. package/src/lib/server/chatroom-orchestration.ts +74 -0
  39. package/src/lib/server/context-manager.ts +132 -50
  40. package/src/lib/server/daemon-state.ts +70 -1
  41. package/src/lib/server/eval/runner.ts +126 -0
  42. package/src/lib/server/eval/scenarios.ts +218 -0
  43. package/src/lib/server/eval/scorer.ts +96 -0
  44. package/src/lib/server/eval/store.ts +37 -0
  45. package/src/lib/server/eval/types.ts +48 -0
  46. package/src/lib/server/execution-log.ts +12 -8
  47. package/src/lib/server/guardian.ts +34 -0
  48. package/src/lib/server/heartbeat-service.ts +53 -1
  49. package/src/lib/server/langgraph-checkpoint.ts +10 -0
  50. package/src/lib/server/link-understanding.ts +55 -0
  51. package/src/lib/server/main-agent-loop.ts +114 -15
  52. package/src/lib/server/memory-db.ts +18 -7
  53. package/src/lib/server/mmr.ts +73 -0
  54. package/src/lib/server/orchestrator-lg.ts +3 -0
  55. package/src/lib/server/plugins.ts +44 -22
  56. package/src/lib/server/query-expansion.ts +57 -0
  57. package/src/lib/server/queue.ts +27 -0
  58. package/src/lib/server/session-run-manager.ts +21 -1
  59. package/src/lib/server/session-tools/http.ts +19 -9
  60. package/src/lib/server/session-tools/index.ts +34 -0
  61. package/src/lib/server/session-tools/memory.ts +39 -11
  62. package/src/lib/server/session-tools/schedule.ts +43 -0
  63. package/src/lib/server/session-tools/web.ts +35 -11
  64. package/src/lib/server/storage.ts +12 -0
  65. package/src/lib/server/stream-agent-chat.ts +57 -8
  66. package/src/lib/server/tool-capability-policy.ts +1 -0
  67. package/src/lib/server/tool-retry.ts +62 -0
  68. package/src/lib/server/transcript-repair.ts +72 -0
  69. package/src/lib/setup-defaults.ts +1 -0
  70. package/src/lib/tool-definitions.ts +1 -0
  71. package/src/lib/validation/schemas.ts +1 -0
  72. package/src/lib/view-routes.ts +1 -0
  73. package/src/types/index.ts +34 -3
@@ -1,4 +1,4 @@
1
- import { loadQueue, loadSchedules, loadSessions, saveSessions, loadConnectors, saveConnectors, loadWebhookRetryQueue, upsertWebhookRetry, deleteWebhookRetry, loadWebhooks, loadAgents, appendWebhookLog, loadCredentials, decryptKey } from './storage'
1
+ import { loadQueue, loadSchedules, loadSessions, saveSessions, loadConnectors, saveConnectors, loadWebhookRetryQueue, upsertWebhookRetry, deleteWebhookRetry, loadWebhooks, loadAgents, loadSettings, appendWebhookLog, loadCredentials, decryptKey } from './storage'
2
2
  import { notify } from './ws-hub'
3
3
  import { processNext, cleanupFinishedTaskSessions, validateCompletedTasksQueue, recoverStalledRunningTasks } from './queue'
4
4
  import { startScheduler, stopScheduler } from './scheduler'
@@ -80,6 +80,7 @@ const ds: {
80
80
  healthIntervalId: ReturnType<typeof setInterval> | null
81
81
  memoryConsolidationTimeoutId: ReturnType<typeof setTimeout> | null
82
82
  memoryConsolidationIntervalId: ReturnType<typeof setInterval> | null
83
+ evalSchedulerIntervalId: ReturnType<typeof setInterval> | null
83
84
  /** Session IDs we've already alerted as stale (alert-once semantics). */
84
85
  staleSessionIds: Set<string>
85
86
  connectorRestartState: Map<string, { lastAttemptAt: number; failCount: number; wakeAttempts: number }>
@@ -97,6 +98,7 @@ const ds: {
97
98
  healthIntervalId: null,
98
99
  memoryConsolidationTimeoutId: null,
99
100
  memoryConsolidationIntervalId: null,
101
+ evalSchedulerIntervalId: null,
100
102
  staleSessionIds: new Set<string>(),
101
103
  connectorRestartState: new Map<string, { lastAttemptAt: number; failCount: number; wakeAttempts: number }>(),
102
104
  openclawDownAgentIds: new Set<string>(),
@@ -118,6 +120,7 @@ if (ds.healthIntervalId === undefined) ds.healthIntervalId = null
118
120
  if (ds.manualStopRequested === undefined) ds.manualStopRequested = false
119
121
  if (ds.memoryConsolidationTimeoutId === undefined) ds.memoryConsolidationTimeoutId = null
120
122
  if (ds.memoryConsolidationIntervalId === undefined) ds.memoryConsolidationIntervalId = null
123
+ if (ds.evalSchedulerIntervalId === undefined) ds.evalSchedulerIntervalId = null
121
124
 
122
125
  export function ensureDaemonStarted(source = 'unknown'): boolean {
123
126
  if (ds.running) return false
@@ -140,6 +143,7 @@ export function startDaemon(options?: { source?: string; manualStart?: boolean }
140
143
  startHealthMonitor()
141
144
  startHeartbeatService()
142
145
  startMemoryConsolidation()
146
+ startEvalScheduler()
143
147
  return
144
148
  }
145
149
  ds.running = true
@@ -155,6 +159,7 @@ export function startDaemon(options?: { source?: string; manualStart?: boolean }
155
159
  startHealthMonitor()
156
160
  startHeartbeatService()
157
161
  startMemoryConsolidation()
162
+ startEvalScheduler()
158
163
  } catch (err: unknown) {
159
164
  ds.running = false
160
165
  notify('daemon')
@@ -182,6 +187,7 @@ export function stopDaemon(options?: { source?: string; manualStop?: boolean })
182
187
  stopHealthMonitor()
183
188
  stopHeartbeatService()
184
189
  stopMemoryConsolidation()
190
+ stopEvalScheduler()
185
191
  stopAllConnectors().catch(() => {})
186
192
  }
187
193
 
@@ -785,6 +791,69 @@ function stopMemoryConsolidation() {
785
791
  }
786
792
  }
787
793
 
794
+ // --- Eval scheduler ---
795
+
796
+ const EVAL_DEFAULT_INTERVAL_MS = 24 * 3600_000 // 24 hours
797
+
798
+ function parseCronToMs(cron: string | null | undefined): number | null { // Rough cron-to-interval conversion: returns milliseconds, or null when no cron string is supplied.
799
+ if (!cron || typeof cron !== 'string') return null
800
+ // Simple heuristic: the first "*/N" step found anywhere in the expression is read as "every N hours" (e.g. "0 */6 * * *" -> 6h). The match is not tied to the hour field, so "*/15 * * * *" is also read as 15h.
801
+ const hourMatch = cron.match(/\*\/(\d+)/)
802
+ if (hourMatch) return parseInt(hourMatch[1], 10) * 3600_000 // "*/0" yields 0 here
803
+ return EVAL_DEFAULT_INTERVAL_MS // any expression without a "*/N" step falls back to the default interval
804
+ }
805
+
806
+ async function runEvalSchedulerTick() { // One scheduler tick: runs the eval suite for each heartbeat-enabled agent and reports the score. Never throws; errors are logged.
807
+ try {
808
+ const settings = loadSettings()
809
+ if (!settings.autonomyEvalEnabled) return // setting is re-read on every tick, so disabling it stops further runs
810
+
811
+ const { runEvalSuite } = await import('./eval/runner') // loaded lazily, only when a tick actually runs
812
+ const agents = loadAgents()
813
+ const heartbeatAgentIds = Object.keys(agents).filter(
814
+ (id) => agents[id].heartbeatEnabled === true,
815
+ )
816
+
817
+ for (const agentId of heartbeatAgentIds) { // agents are evaluated one after another, not in parallel
818
+ try {
819
+ const result = await runEvalSuite(agentId)
820
+ console.log(
821
+ `[daemon:eval] Agent ${agents[agentId].name}: ${result.percentage}% (${result.totalScore}/${result.maxScore})`,
822
+ )
823
+ createNotification({ // NOTE(review): createNotification is not among the imports visible in this diff - confirm it is in scope in this file
824
+ title: `Eval: ${agents[agentId].name} scored ${result.percentage}%`,
825
+ message: `${result.runs.length} scenarios, ${result.totalScore}/${result.maxScore} points`,
826
+ type: result.percentage >= 60 ? 'info' : 'warning', // below 60% is surfaced as a warning
827
+ })
828
+ } catch (err: unknown) { // a failure for one agent does not stop the remaining agents
829
+ console.error(`[daemon:eval] Failed for agent ${agentId}:`, err instanceof Error ? err.message : String(err))
830
+ }
831
+ }
832
+ } catch (err: unknown) {
833
+ console.error('[daemon:eval] Scheduler tick error:', err instanceof Error ? err.message : String(err))
834
+ }
835
+ }
836
+
837
+ function startEvalScheduler() { // Starts the periodic eval timer if evals are enabled; no-op when a timer is already registered.
838
+ if (ds.evalSchedulerIntervalId) return
839
+ try {
840
+ const settings = loadSettings()
841
+ if (!settings.autonomyEvalEnabled) return // when disabled at start time no timer is created at all
842
+ const intervalMs = parseCronToMs(settings.autonomyEvalCron) || EVAL_DEFAULT_INTERVAL_MS // null or 0 from the parser falls back to the default
843
+ ds.evalSchedulerIntervalId = setInterval(runEvalSchedulerTick, intervalMs) // interval is fixed here; the first tick fires only after intervalMs has elapsed
844
+ console.log(`[daemon:eval] Eval scheduler started (interval=${Math.round(intervalMs / 3600_000)}h)`)
845
+ } catch {
846
+ // Eval scheduling is optional — don't block daemon start
847
+ }
848
+ }
849
+
850
+ function stopEvalScheduler() { // Cancels the eval timer, if one is registered, and clears the stored handle.
851
+ if (ds.evalSchedulerIntervalId) {
852
+ clearInterval(ds.evalSchedulerIntervalId)
853
+ ds.evalSchedulerIntervalId = null // lets a later startEvalScheduler() register a new timer
854
+ }
855
+ }
856
+
788
857
  export async function runDaemonHealthCheckNow() {
789
858
  await runHealthChecks()
790
859
  }
@@ -0,0 +1,126 @@
1
+ import { genId } from '@/lib/id'
2
+ import type { EvalScenario, EvalRun, EvalSuiteResult } from './types'
3
+ import { getScenario, EVAL_SCENARIOS } from './scenarios'
4
+ import { scoreCriteria } from './scorer'
5
+ import { saveEvalRun } from './store'
6
+ import { loadSessions, saveSessions, loadAgents, loadCredentials, decryptKey } from '../storage'
7
+ import { executeSessionChatTurn } from '../chat-execution'
8
+ import type { Session } from '@/types'
9
+
10
+ export async function runEvalScenario(scenarioId: string, agentId: string): Promise<EvalRun> { // Runs one scenario against one agent in a temporary session, scores the reply, persists and returns the run. Throws only for an unknown scenario or agent.
11
+ const scenario = getScenario(scenarioId)
12
+ if (!scenario) throw new Error(`Unknown eval scenario: ${scenarioId}`)
13
+
14
+ const agents = loadAgents() as Record<string, Record<string, unknown>>
15
+ const agent = agents[agentId]
16
+ if (!agent) throw new Error(`Unknown agent: ${agentId}`)
17
+
18
+ const runId = genId()
19
+ const sessionId = `eval-${runId}`
20
+ const now = Date.now()
21
+
22
+ const run: EvalRun = {
23
+ id: runId,
24
+ scenarioId,
25
+ agentId,
26
+ status: 'running',
27
+ startedAt: now,
28
+ score: 0,
29
+ maxScore: scenario.scoringCriteria.reduce((sum, c) => sum + c.weight, 0), // maximum is the sum of all criterion weights
30
+ details: [],
31
+ sessionId,
32
+ }
33
+
34
+ // Create temporary eval session
35
+ const sessions = loadSessions() as Record<string, Session>
36
+ const evalSession: Session = {
37
+ id: sessionId,
38
+ name: `Eval: ${scenario.name}`,
39
+ cwd: process.cwd(),
40
+ user: 'eval-runner',
41
+ provider: (agent.provider as Session['provider']) ?? 'anthropic',
42
+ model: (agent.model as string) ?? '',
43
+ credentialId: (agent.credentialId as string | null) ?? null,
44
+ apiEndpoint: (agent.apiEndpoint as string | null) ?? null,
45
+ claudeSessionId: null,
46
+ agentId,
47
+ tools: scenario.tools, // the session gets exactly the tools listed by the scenario
48
+ messages: [],
49
+ createdAt: now,
50
+ lastActiveAt: now,
51
+ }
52
+ sessions[sessionId] = evalSession
53
+ saveSessions(sessions)
54
+
55
+ try {
56
+ const result = await executeSessionChatTurn({ // NOTE(review): scenario.timeoutMs is not referenced anywhere in this function - confirm whether a timeout is enforced elsewhere
57
+ sessionId,
58
+ message: scenario.userMessage,
59
+ internal: true,
60
+ source: 'eval',
61
+ })
62
+
63
+ const judgeProvider = typeof agent.provider === 'string' ? agent.provider : undefined // the judge reuses the evaluated agent's own provider, model and credential
64
+ const judgeModel = typeof agent.model === 'string' ? agent.model : undefined
65
+ let judgeApiKey: string | null = null
66
+ if (typeof agent.credentialId === 'string' && agent.credentialId) {
67
+ const creds = loadCredentials()
68
+ const cred = creds[agent.credentialId]
69
+ if (cred) {
70
+ try { judgeApiKey = decryptKey(cred.encryptedKey) } catch { /* skip undecryptable */ }
71
+ }
72
+ }
73
+ const judgeOpts = judgeProvider && judgeModel ? { // undefined when provider or model is missing or empty
74
+ provider: judgeProvider,
75
+ model: judgeModel,
76
+ apiKey: judgeApiKey,
77
+ apiEndpoint: typeof agent.apiEndpoint === 'string' ? agent.apiEndpoint : undefined,
78
+ } : undefined
79
+
80
+ run.details = await scoreCriteria(
81
+ scenario.scoringCriteria,
82
+ result.text,
83
+ result.toolEvents || [],
84
+ judgeOpts,
85
+ )
86
+ run.score = run.details.reduce((sum, d) => sum + d.score, 0)
87
+ run.status = 'completed'
88
+ run.endedAt = Date.now()
89
+ } catch (err: unknown) { // execution or scoring errors are recorded on the run instead of being rethrown
90
+ run.status = 'failed'
91
+ run.error = err instanceof Error ? err.message : String(err)
92
+ run.endedAt = Date.now()
93
+ } finally {
94
+ // Clean up eval session
95
+ const currentSessions = loadSessions() as Record<string, Session> // sessions are re-loaded rather than reusing the map read before the chat turn
96
+ delete currentSessions[sessionId]
97
+ saveSessions(currentSessions)
98
+ }
99
+
100
+ saveEvalRun(run) // persisted for both completed and failed runs
101
+ return run
102
+ }
103
+
104
+ export async function runEvalSuite(agentId: string, categories?: string[]): Promise<EvalSuiteResult> { // Runs every scenario (or only those in the given categories) for one agent and aggregates the scores.
104
+ const scenarios: EvalScenario[] = categories
105
+ ? EVAL_SCENARIOS.filter(s => categories.includes(s.category)) // an empty categories array selects no scenarios
106
+ : EVAL_SCENARIOS
107
+
108
+ const runs: EvalRun[] = []
109
+ for (const scenario of scenarios) { // scenarios run sequentially; an error thrown by runEvalScenario aborts the whole suite
110
+ const evalRun = await runEvalScenario(scenario.id, agentId)
111
+ runs.push(evalRun)
112
+ }
113
+
114
+ const totalScore = runs.reduce((sum, r) => sum + r.score, 0)
115
+ const maxScore = runs.reduce((sum, r) => sum + r.maxScore, 0)
116
+
117
+ return {
118
+ agentId,
119
+ totalScore,
120
+ maxScore,
121
+ percentage: maxScore > 0 ? Math.round((totalScore / maxScore) * 100) : 0, // rounded to a whole percent; 0 when nothing was scored
122
+ runs,
123
+ completedAt: Date.now(),
124
+ }
125
+ }
@@ -0,0 +1,218 @@
1
+ import type { EvalScenario } from './types'
2
+
3
+ export const EVAL_SCENARIOS: EvalScenario[] = [
4
+ {
5
+ id: 'coding-prime',
6
+ name: 'Prime Number Function',
7
+ category: 'coding',
8
+ description: 'Create and test a function that checks if a number is prime',
9
+ userMessage: 'Create a function that checks if a number is prime and test it with a few examples including 2, 7, 10, and 97.',
10
+ expectedBehaviors: [
11
+ 'Writes a correct isPrime function',
12
+ 'Tests with the specified numbers',
13
+ 'Returns correct results for each test case',
14
+ ],
15
+ scoringCriteria: [
16
+ { name: 'uses_shell', weight: 2, evaluator: 'tool_used', expected: 'shell' },
17
+ { name: 'uses_files', weight: 2, evaluator: 'tool_used', expected: 'files' },
18
+ { name: 'mentions_prime', weight: 1, evaluator: 'contains', expected: 'prime' },
19
+ { name: 'tests_number_2', weight: 1, evaluator: 'contains', expected: '2' },
20
+ { name: 'tests_number_97', weight: 1, evaluator: 'contains', expected: '97' },
21
+ { name: 'correctness', weight: 3, evaluator: 'llm_judge', expected: 'Did the response correctly implement an isPrime function and test it with 2, 7, 10, and 97, producing correct results (2=prime, 7=prime, 10=not prime, 97=prime)?' },
22
+ ],
23
+ timeoutMs: 60_000,
24
+ tools: ['shell', 'files'],
25
+ },
26
+ {
27
+ id: 'research-frameworks',
28
+ name: 'Node.js Framework Comparison',
29
+ category: 'research',
30
+ description: 'Research and compare top Node.js web frameworks by GitHub stars',
31
+ userMessage: 'Find the top 3 Node.js web frameworks by GitHub stars and compare them. Include star counts and key differences.',
32
+ expectedBehaviors: [
33
+ 'Searches the web for framework information',
34
+ 'Identifies at least 3 frameworks (e.g. Express, Next.js, Fastify)',
35
+ 'Provides star counts and meaningful comparison',
36
+ ],
37
+ scoringCriteria: [
38
+ { name: 'uses_web_search', weight: 2, evaluator: 'tool_used', expected: 'web_search' },
39
+ { name: 'mentions_express', weight: 1, evaluator: 'regex', expected: 'express(\\.js)?' },
40
+ { name: 'mentions_next', weight: 1, evaluator: 'regex', expected: 'next(\\.js)?' },
41
+ { name: 'includes_stars', weight: 1, evaluator: 'regex', expected: '\\d+[kK,.]?\\d*\\s*(stars|\\*)' },
42
+ { name: 'quality', weight: 5, evaluator: 'llm_judge', expected: 'Did the response identify at least 3 Node.js web frameworks, include GitHub star counts, and provide a meaningful comparison of their key differences?' },
43
+ ],
44
+ timeoutMs: 90_000,
45
+ tools: ['web_search', 'web_fetch'],
46
+ },
47
+ {
48
+ id: 'companionship-overwhelmed',
49
+ name: 'Emotional Support - Overwhelmed',
50
+ category: 'companionship',
51
+ description: 'Provide empathetic support when user is overwhelmed with work',
52
+ userMessage: "I'm feeling overwhelmed with work deadlines. Everything is piling up and I don't know where to start.",
53
+ expectedBehaviors: [
54
+ 'Acknowledges the feeling of being overwhelmed',
55
+ 'Shows empathy without being dismissive',
56
+ 'Offers actionable suggestions for managing workload',
57
+ ],
58
+ scoringCriteria: [
59
+ { name: 'empathy', weight: 4, evaluator: 'llm_judge', expected: 'Does the response show genuine empathy and acknowledge the user\'s feelings of being overwhelmed without being dismissive or jumping straight to advice?' },
60
+ { name: 'actionable_advice', weight: 3, evaluator: 'llm_judge', expected: 'Does the response offer practical, actionable suggestions for managing workload or reducing the feeling of being overwhelmed (e.g. prioritization, breaking tasks down, time management)?' },
61
+ { name: 'appropriate_tone', weight: 3, evaluator: 'llm_judge', expected: 'Is the tone warm, supportive, and human-like rather than clinical, robotic, or overly formal?' },
62
+ ],
63
+ timeoutMs: 30_000,
64
+ tools: [],
65
+ },
66
+ {
67
+ id: 'multi-step-project',
68
+ name: 'Project Directory Setup',
69
+ category: 'multi-step',
70
+ description: 'Create a project directory, write a README, and initialize git',
71
+ userMessage: "Create a project directory called 'demo-project' in /tmp, write a README.md with a title and description, and initialize a git repository in it.",
72
+ expectedBehaviors: [
73
+ 'Creates the demo-project directory',
74
+ 'Writes a README.md file with content',
75
+ 'Initializes a git repository',
76
+ ],
77
+ scoringCriteria: [
78
+ { name: 'uses_shell', weight: 2, evaluator: 'tool_used', expected: 'shell' },
79
+ { name: 'uses_files', weight: 2, evaluator: 'tool_used', expected: 'files' },
80
+ { name: 'mentions_mkdir', weight: 1, evaluator: 'regex', expected: 'demo-project' },
81
+ { name: 'mentions_readme', weight: 1, evaluator: 'contains', expected: 'README' },
82
+ { name: 'mentions_git_init', weight: 1, evaluator: 'contains', expected: 'git init' },
83
+ { name: 'completeness', weight: 3, evaluator: 'llm_judge', expected: 'Did the response successfully complete all 3 steps: create the demo-project directory, write a README.md with meaningful content, and initialize a git repository?' },
84
+ ],
85
+ timeoutMs: 60_000,
86
+ tools: ['shell', 'files'],
87
+ },
88
+ {
89
+ id: 'memory-store-recall',
90
+ name: 'Memory Store and Recall',
91
+ category: 'memory',
92
+ description: 'Store a fact in memory and demonstrate recall capability',
93
+ userMessage: 'Remember that my favorite programming language is Rust and I prefer functional programming patterns. Then confirm what you just stored.',
94
+ expectedBehaviors: [
95
+ 'Uses memory tool to store the information',
96
+ 'Confirms what was stored',
97
+ 'Accurately reflects the stored preferences',
98
+ ],
99
+ scoringCriteria: [
100
+ { name: 'uses_memory', weight: 3, evaluator: 'tool_used', expected: 'memory' },
101
+ { name: 'mentions_rust', weight: 2, evaluator: 'contains', expected: 'Rust' },
102
+ { name: 'mentions_functional', weight: 2, evaluator: 'contains', expected: 'functional' },
103
+ { name: 'confirmation', weight: 3, evaluator: 'llm_judge', expected: 'Did the response confirm storing the user\'s preference for Rust and functional programming, and accurately summarize what was stored?' },
104
+ ],
105
+ timeoutMs: 30_000,
106
+ tools: ['memory'],
107
+ },
108
+ {
109
+ id: 'planning-blog',
110
+ name: 'Blog Platform Planning',
111
+ category: 'planning',
112
+ description: 'Create a detailed plan for building a blog platform',
113
+ userMessage: 'Build me a detailed plan for a blog platform with posts, comments, and user authentication. Break it into tasks I can work through.',
114
+ expectedBehaviors: [
115
+ 'Creates structured tasks or a plan',
116
+ 'Covers posts, comments, and authentication',
117
+ 'Breaks work into manageable pieces',
118
+ ],
119
+ scoringCriteria: [
120
+ { name: 'uses_tasks', weight: 2, evaluator: 'tool_used', expected: 'manage_tasks' },
121
+ { name: 'mentions_posts', weight: 1, evaluator: 'contains', expected: 'post' },
122
+ { name: 'mentions_comments', weight: 1, evaluator: 'contains', expected: 'comment' },
123
+ { name: 'mentions_auth', weight: 1, evaluator: 'regex', expected: 'auth(entication|orization)?' },
124
+ { name: 'plan_quality', weight: 5, evaluator: 'llm_judge', expected: 'Is the plan well-structured with clear, actionable tasks that cover the three main features (posts, comments, user auth)? Are tasks broken into manageable pieces with logical ordering?' },
125
+ ],
126
+ timeoutMs: 60_000,
127
+ tools: ['manage_tasks'],
128
+ },
129
+ {
130
+ id: 'tool-usage-weather',
131
+ name: 'Web Search - Weather',
132
+ category: 'tool-usage',
133
+ description: 'Search the web for current weather information',
134
+ userMessage: 'Search the web for today\'s weather in London and tell me the temperature and conditions.',
135
+ expectedBehaviors: [
136
+ 'Uses web search tool',
137
+ 'Reports temperature',
138
+ 'Reports weather conditions',
139
+ ],
140
+ scoringCriteria: [
141
+ { name: 'uses_web_search', weight: 3, evaluator: 'tool_used', expected: 'web_search' },
142
+ { name: 'mentions_temperature', weight: 2, evaluator: 'regex', expected: '\\d+\\s*[°]?\\s*[CcFf]' },
143
+ { name: 'mentions_london', weight: 1, evaluator: 'contains', expected: 'London' },
144
+ { name: 'quality', weight: 4, evaluator: 'llm_judge', expected: 'Did the response provide specific, current weather information for London including temperature and conditions (e.g. sunny, cloudy, rain)?' },
145
+ ],
146
+ timeoutMs: 60_000,
147
+ tools: ['web_search'],
148
+ },
149
+ {
150
+ id: 'coding-fizzbuzz',
151
+ name: 'FizzBuzz Implementation',
152
+ category: 'coding',
153
+ description: 'Write and run a FizzBuzz implementation in Python',
154
+ userMessage: 'Write a FizzBuzz implementation in Python that prints numbers 1 to 30 and run it.',
155
+ expectedBehaviors: [
156
+ 'Writes correct FizzBuzz logic',
157
+ 'Runs the code successfully',
158
+ 'Output contains Fizz, Buzz, and FizzBuzz',
159
+ ],
160
+ scoringCriteria: [
161
+ { name: 'uses_shell', weight: 2, evaluator: 'tool_used', expected: 'shell' },
162
+ { name: 'uses_files', weight: 2, evaluator: 'tool_used', expected: 'files' },
163
+ { name: 'contains_fizz', weight: 1, evaluator: 'contains', expected: 'Fizz' },
164
+ { name: 'contains_buzz', weight: 1, evaluator: 'contains', expected: 'Buzz' },
165
+ { name: 'contains_fizzbuzz', weight: 1, evaluator: 'contains', expected: 'FizzBuzz' },
166
+ { name: 'correctness', weight: 3, evaluator: 'llm_judge', expected: 'Did the response implement FizzBuzz correctly (multiples of 3 print Fizz, multiples of 5 print Buzz, multiples of both print FizzBuzz) and successfully execute it?' },
167
+ ],
168
+ timeoutMs: 60_000,
169
+ tools: ['shell', 'files'],
170
+ },
171
+ {
172
+ id: 'research-comparison',
173
+ name: 'LLM Pricing Comparison',
174
+ category: 'research',
175
+ description: 'Compare pricing of major LLM models',
176
+ userMessage: 'Compare the pricing of OpenAI GPT-4o and Anthropic Claude 3.5 Sonnet. Include input and output token costs.',
177
+ expectedBehaviors: [
178
+ 'Searches for current pricing',
179
+ 'Includes both models',
180
+ 'Reports input and output token costs',
181
+ ],
182
+ scoringCriteria: [
183
+ { name: 'uses_web_search', weight: 2, evaluator: 'tool_used', expected: 'web_search' },
184
+ { name: 'mentions_gpt4o', weight: 1, evaluator: 'regex', expected: 'GPT-?4[oO]' },
185
+ { name: 'mentions_claude', weight: 1, evaluator: 'regex', expected: 'Claude\\s*3\\.?5' },
186
+ { name: 'mentions_pricing', weight: 1, evaluator: 'regex', expected: '\\$\\d+' },
187
+ { name: 'quality', weight: 5, evaluator: 'llm_judge', expected: 'Did the response provide accurate and specific pricing for both GPT-4o and Claude 3.5 Sonnet, including input and output token costs, with a clear comparison?' },
188
+ ],
189
+ timeoutMs: 90_000,
190
+ tools: ['web_search', 'web_fetch'],
191
+ },
192
+ {
193
+ id: 'multi-step-analyze',
194
+ name: 'Package.json Analysis',
195
+ category: 'multi-step',
196
+ description: 'Read and analyze the current project\'s package.json',
197
+ userMessage: 'Read the package.json in the current directory and list all dependencies. Group them into regular dependencies and dev dependencies.',
198
+ expectedBehaviors: [
199
+ 'Reads package.json using shell or files tool',
200
+ 'Lists regular dependencies',
201
+ 'Lists dev dependencies',
202
+ 'Groups them clearly',
203
+ ],
204
+ scoringCriteria: [
205
+ { name: 'uses_shell_or_files', weight: 2, evaluator: 'tool_used', expected: 'shell' },
206
+ { name: 'mentions_dependencies', weight: 1, evaluator: 'contains', expected: 'dependencies' },
207
+ { name: 'mentions_dev_deps', weight: 1, evaluator: 'regex', expected: 'dev[Dd]ependencies|dev dependencies' },
208
+ { name: 'mentions_package_json', weight: 1, evaluator: 'contains', expected: 'package.json' },
209
+ { name: 'quality', weight: 5, evaluator: 'llm_judge', expected: 'Did the response successfully read package.json, list the dependencies, and clearly group them into regular and dev dependencies?' },
210
+ ],
211
+ timeoutMs: 60_000,
212
+ tools: ['shell', 'files'],
213
+ },
214
+ ]
215
+
216
+ export function getScenario(id: string): EvalScenario | undefined { // Looks up a built-in scenario by its id; undefined when no scenario has that id.
217
+ return EVAL_SCENARIOS.find(s => s.id === id)
218
+ }
@@ -0,0 +1,96 @@
1
+ import type { ScoringCriterion, EvalCriterionResult } from './types'
2
+ import type { MessageToolEvent } from '@/types'
3
+
4
+ export async function scoreCriteria( // Scores a response against each criterion in order and returns one result per criterion.
5
+ criteria: ScoringCriterion[],
6
+ responseText: string,
7
+ toolEvents: MessageToolEvent[],
8
+ judgeOpts?: { provider: string; model: string; apiKey: string | null; apiEndpoint?: string | null }, // when omitted, llm_judge criteria score 0
9
+ ): Promise<EvalCriterionResult[]> {
10
+ const results: EvalCriterionResult[] = []
11
+
12
+ for (const criterion of criteria) {
13
+ switch (criterion.evaluator) {
14
+ case 'contains': {
15
+ const found = responseText.toLowerCase().includes(criterion.expected.toLowerCase()) // case-insensitive substring check; all-or-nothing score
16
+ results.push({
17
+ criterion: criterion.name,
18
+ score: found ? criterion.weight : 0,
19
+ maxScore: criterion.weight,
20
+ evidence: found ? `Found "${criterion.expected}" in response` : `"${criterion.expected}" not found in response`,
21
+ })
22
+ break
23
+ }
24
+
25
+ case 'regex': {
26
+ const regex = new RegExp(criterion.expected, 'i') // NOTE(review): not wrapped in try/catch, so an invalid pattern would throw out of scoreCriteria
27
+ const matched = regex.test(responseText)
28
+ results.push({
29
+ criterion: criterion.name,
30
+ score: matched ? criterion.weight : 0,
31
+ maxScore: criterion.weight,
32
+ evidence: matched ? `Pattern /${criterion.expected}/i matched` : `Pattern /${criterion.expected}/i did not match`,
33
+ })
34
+ break
35
+ }
36
+
37
+ case 'tool_used': {
38
+ const used = toolEvents.some(e => e.name === criterion.expected) // exact match on the tool event name
39
+ results.push({
40
+ criterion: criterion.name,
41
+ score: used ? criterion.weight : 0,
42
+ maxScore: criterion.weight,
43
+ evidence: used ? `Tool "${criterion.expected}" was used` : `Tool "${criterion.expected}" was not used`,
44
+ })
45
+ break
46
+ }
47
+
48
+ case 'llm_judge': {
49
+ if (!judgeOpts) {
50
+ results.push({
51
+ criterion: criterion.name,
52
+ score: 0,
53
+ maxScore: criterion.weight,
54
+ evidence: 'No judge provider configured; skipped',
55
+ })
56
+ break
57
+ }
58
+
59
+ try {
60
+ const { buildChatModel } = await import('../build-llm')
61
+ const { HumanMessage } = await import('@langchain/core/messages')
62
+
63
+ const llm = buildChatModel({ // a model is built for every llm_judge criterion
64
+ provider: judgeOpts.provider,
65
+ model: judgeOpts.model,
66
+ apiKey: judgeOpts.apiKey,
67
+ apiEndpoint: judgeOpts.apiEndpoint,
68
+ })
69
+
70
+ const judgePrompt = `Rate the following AI response on a scale of 0-10.\n\nCriterion: ${criterion.expected}\n\nResponse:\n${responseText}\n\nReply with ONLY a number 0-10.`
71
+ const result = await llm.invoke([new HumanMessage(judgePrompt)])
72
+ const scoreText = typeof result.content === 'string' ? result.content : '' // non-string content is treated as an empty reply
73
+ const parsed = parseInt(scoreText.trim(), 10)
74
+ const rawScore = Number.isFinite(parsed) ? Math.max(0, Math.min(10, parsed)) : 5 // clamped to 0..10; an unparseable reply is scored as 5
75
+
76
+ results.push({
77
+ criterion: criterion.name,
78
+ score: (rawScore / 10) * criterion.weight, // scaled to the criterion weight, so the score may be fractional
79
+ maxScore: criterion.weight,
80
+ evidence: `LLM judge: ${rawScore}/10`,
81
+ })
82
+ } catch (err: unknown) { // judge failures score 0 and record the error message as evidence
83
+ results.push({
84
+ criterion: criterion.name,
85
+ score: 0,
86
+ maxScore: criterion.weight,
87
+ evidence: `LLM judge error: ${err instanceof Error ? err.message : String(err)}`,
88
+ })
89
+ }
90
+ break
91
+ }
92
+ }
93
+ }
94
+
95
+ return results
96
+ }
@@ -0,0 +1,37 @@
1
+ import Database from 'better-sqlite3'
2
+ import path from 'path'
3
+ import type { EvalRun } from './types'
4
+
5
+ const DB_PATH = path.join(process.cwd(), 'data', 'eval-runs.db')
6
+
7
+ let db: Database.Database | null = null
8
+
9
+ function getDb(): Database.Database { // Returns the module-level database handle, opening it and creating the eval_runs table on first use.
10
+ if (!db) {
11
+ db = new Database(DB_PATH) // NOTE(review): nothing here creates the "data" directory - presumably it already exists; confirm
12
+ db.pragma('journal_mode = WAL')
13
+ db.exec(`CREATE TABLE IF NOT EXISTS eval_runs (
14
+ id TEXT PRIMARY KEY,
15
+ data TEXT NOT NULL
16
+ )`)
17
+ }
18
+ return db
19
+ }
20
+
21
+ export function saveEvalRun(run: EvalRun): void { // Stores the run as a JSON string keyed by run.id, replacing any existing row with the same id.
22
+ getDb().prepare('INSERT OR REPLACE INTO eval_runs (id, data) VALUES (?, ?)').run(run.id, JSON.stringify(run))
23
+ }
24
+
25
+ export function getEvalRun(id: string): EvalRun | null { // Loads one stored run by id; null when no row has that id.
26
+ const row = getDb().prepare('SELECT data FROM eval_runs WHERE id = ?').get(id) as { data: string } | undefined
27
+ return row ? JSON.parse(row.data) as EvalRun : null // stored JSON is parsed and cast without validation
28
+ }
29
+
30
+ export function listEvalRuns(limit = 50): EvalRun[] { // Returns up to `limit` stored runs, ordered by rowid descending.
31
+ const rows = getDb().prepare('SELECT data FROM eval_runs ORDER BY rowid DESC LIMIT ?').all(limit) as { data: string }[]
32
+ return rows.map(r => JSON.parse(r.data) as EvalRun)
33
+ }
34
+
35
+ export function listEvalRunsByAgent(agentId: string, limit = 50): EvalRun[] { // Returns up to `limit` runs for one agent, taken from the limit * 2 highest-rowid runs across all agents.
36
+ return listEvalRuns(limit * 2).filter(r => r.agentId === agentId).slice(0, limit) // filtering happens after the fetch, so fewer than `limit` runs may come back even if more exist for this agent further down
37
+ }
@@ -0,0 +1,48 @@
1
+ export interface ScoringCriterion { // One scored check applied to an eval response.
2
+ name: string // label reported as `criterion` in the result
3
+ weight: number // points awarded when the check passes; also the criterion's maximum score
4
+ evaluator: 'contains' | 'regex' | 'tool_used' | 'llm_judge' // how `expected` is checked
5
+ expected: string // substring, regex source, tool name, or judge question, depending on `evaluator`
6
+ }
7
+
8
+ export interface EvalScenario { // A built-in evaluation task: the prompt sent to the agent plus the criteria used to score the reply.
9
+ id: string // key used by getScenario()
10
+ name: string
11
+ category: 'coding' | 'research' | 'companionship' | 'multi-step' | 'memory' | 'planning' | 'tool-usage' | 'long-lived' // used to filter scenarios when running a suite
12
+ description: string
13
+ userMessage: string // sent as the user message of the eval chat turn
14
+ expectedBehaviors: string[] // descriptive only; the scorer shown in this diff does not read it
15
+ scoringCriteria: ScoringCriterion[]
16
+ timeoutMs: number // NOTE(review): not read by the runner code shown in this diff - confirm where it is enforced
17
+ tools: string[] // tools given to the temporary eval session
18
+ }
19
+
20
+ export interface EvalRun { // Record of one scenario executed against one agent; stored as JSON by the eval store.
21
+ id: string
22
+ scenarioId: string
23
+ agentId: string
24
+ status: 'pending' | 'running' | 'completed' | 'failed'
25
+ startedAt: number // Date.now() timestamp
26
+ endedAt?: number // set when the run completes or fails
27
+ score: number // sum of the per-criterion scores in `details`
28
+ maxScore: number // sum of the scenario's criterion weights
29
+ details: EvalCriterionResult[]
30
+ sessionId?: string // id of the temporary eval session, which the runner deletes after the run
31
+ error?: string // error message recorded when the run fails
32
+ }
33
+
34
+ export interface EvalCriterionResult { // Outcome of a single scoring criterion.
35
+ criterion: string // the criterion's name
36
+ score: number // points earned; may be fractional for llm_judge criteria
37
+ maxScore: number // the criterion's weight
38
+ evidence?: string // short human-readable explanation of the score
39
+ }
40
+
41
+ export interface EvalSuiteResult { // Aggregate result of running several scenarios for one agent.
42
+ agentId: string
43
+ totalScore: number // sum of run scores
44
+ maxScore: number // sum of run maximum scores
45
+ percentage: number // totalScore / maxScore rounded to a whole percent; 0 when maxScore is 0
46
+ runs: EvalRun[]
47
+ completedAt: number // Date.now() timestamp taken when the suite finished
48
+ }
@@ -8,14 +8,18 @@ import { genId } from '@/lib/id'
8
8
  // ---------------------------------------------------------------------------
9
9
 
10
10
  export type LogCategory =
11
- | 'trigger' // what kicked off the action
12
- | 'decision' // reasoning / model choice
13
- | 'tool_call' // tool invocation with input
14
- | 'tool_result' // tool output
15
- | 'outbound' // messages sent to users/platforms
16
- | 'file_op' // file read/write/delete with checksums
17
- | 'commit' // git commit activity
18
- | 'error' // errors during execution
11
+ | 'trigger' // what kicked off the action
12
+ | 'decision' // reasoning / model choice
13
+ | 'tool_call' // tool invocation with input
14
+ | 'tool_result' // tool output
15
+ | 'outbound' // messages sent to users/platforms
16
+ | 'file_op' // file read/write/delete with checksums
17
+ | 'commit' // git commit activity
18
+ | 'error' // errors during execution
19
+ | 'mission_start' // new mission/goal started
20
+ | 'mission_checkpoint' // periodic mission state snapshot
21
+ | 'mission_complete' // mission reached ok status
22
+ | 'budget_warning' // mission approaching or exceeding budget
19
23
 
20
24
  export interface ExecutionLogEntry {
21
25
  id: string