@swarmclawai/swarmclaw 0.6.7 → 0.6.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -6
- package/package.json +1 -1
- package/src/app/api/agents/route.ts +1 -0
- package/src/app/api/chatrooms/[id]/chat/route.ts +4 -0
- package/src/app/api/eval/run/route.ts +37 -0
- package/src/app/api/eval/scenarios/route.ts +24 -0
- package/src/app/api/eval/suite/route.ts +29 -0
- package/src/app/api/memory/graph/route.ts +46 -0
- package/src/app/api/sessions/[id]/checkpoints/route.ts +31 -0
- package/src/app/api/sessions/[id]/restore/route.ts +36 -0
- package/src/app/api/souls/[id]/route.ts +65 -0
- package/src/app/api/souls/route.ts +70 -0
- package/src/app/api/tasks/[id]/route.ts +5 -0
- package/src/app/api/tasks/route.ts +2 -0
- package/src/app/api/usage/route.ts +9 -2
- package/src/cli/index.js +24 -0
- package/src/components/agents/agent-sheet.tsx +27 -6
- package/src/components/agents/soul-library-picker.tsx +84 -13
- package/src/components/chat/activity-moment.tsx +2 -0
- package/src/components/chat/checkpoint-timeline.tsx +112 -0
- package/src/components/chat/message-list.tsx +19 -3
- package/src/components/chat/session-debug-panel.tsx +106 -84
- package/src/components/chat/task-approval-card.tsx +78 -0
- package/src/components/chat/tool-call-bubble.tsx +3 -0
- package/src/components/connectors/connector-sheet.tsx +8 -1
- package/src/components/home/home-view.tsx +39 -15
- package/src/components/layout/app-layout.tsx +18 -2
- package/src/components/memory/memory-browser.tsx +73 -45
- package/src/components/memory/memory-graph-view.tsx +203 -0
- package/src/components/plugins/plugin-list.tsx +1 -1
- package/src/components/schedules/schedule-sheet.tsx +9 -2
- package/src/components/shared/hint-tip.tsx +31 -0
- package/src/components/shared/settings/section-runtime-loop.tsx +5 -4
- package/src/components/tasks/approvals-panel.tsx +120 -0
- package/src/components/usage/metrics-dashboard.tsx +25 -3
- package/src/lib/server/chat-execution.ts +96 -12
- package/src/lib/server/chatroom-helpers.ts +63 -5
- package/src/lib/server/chatroom-orchestration.ts +74 -0
- package/src/lib/server/context-manager.ts +132 -50
- package/src/lib/server/daemon-state.ts +70 -1
- package/src/lib/server/eval/runner.ts +126 -0
- package/src/lib/server/eval/scenarios.ts +218 -0
- package/src/lib/server/eval/scorer.ts +96 -0
- package/src/lib/server/eval/store.ts +37 -0
- package/src/lib/server/eval/types.ts +48 -0
- package/src/lib/server/execution-log.ts +12 -8
- package/src/lib/server/guardian.ts +34 -0
- package/src/lib/server/heartbeat-service.ts +53 -1
- package/src/lib/server/langgraph-checkpoint.ts +10 -0
- package/src/lib/server/link-understanding.ts +55 -0
- package/src/lib/server/main-agent-loop.ts +114 -15
- package/src/lib/server/memory-db.ts +18 -7
- package/src/lib/server/mmr.ts +73 -0
- package/src/lib/server/orchestrator-lg.ts +3 -0
- package/src/lib/server/plugins.ts +44 -22
- package/src/lib/server/query-expansion.ts +57 -0
- package/src/lib/server/queue.ts +27 -0
- package/src/lib/server/session-run-manager.ts +21 -1
- package/src/lib/server/session-tools/http.ts +19 -9
- package/src/lib/server/session-tools/index.ts +34 -0
- package/src/lib/server/session-tools/memory.ts +39 -11
- package/src/lib/server/session-tools/schedule.ts +43 -0
- package/src/lib/server/session-tools/web.ts +35 -11
- package/src/lib/server/storage.ts +12 -0
- package/src/lib/server/stream-agent-chat.ts +57 -8
- package/src/lib/server/tool-capability-policy.ts +1 -0
- package/src/lib/server/tool-retry.ts +62 -0
- package/src/lib/server/transcript-repair.ts +72 -0
- package/src/lib/setup-defaults.ts +1 -0
- package/src/lib/tool-definitions.ts +1 -0
- package/src/lib/validation/schemas.ts +1 -0
- package/src/lib/view-routes.ts +1 -0
- package/src/types/index.ts +34 -3
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { loadQueue, loadSchedules, loadSessions, saveSessions, loadConnectors, saveConnectors, loadWebhookRetryQueue, upsertWebhookRetry, deleteWebhookRetry, loadWebhooks, loadAgents, appendWebhookLog, loadCredentials, decryptKey } from './storage'
|
|
1
|
+
import { loadQueue, loadSchedules, loadSessions, saveSessions, loadConnectors, saveConnectors, loadWebhookRetryQueue, upsertWebhookRetry, deleteWebhookRetry, loadWebhooks, loadAgents, loadSettings, appendWebhookLog, loadCredentials, decryptKey } from './storage'
|
|
2
2
|
import { notify } from './ws-hub'
|
|
3
3
|
import { processNext, cleanupFinishedTaskSessions, validateCompletedTasksQueue, recoverStalledRunningTasks } from './queue'
|
|
4
4
|
import { startScheduler, stopScheduler } from './scheduler'
|
|
@@ -80,6 +80,7 @@ const ds: {
|
|
|
80
80
|
healthIntervalId: ReturnType<typeof setInterval> | null
|
|
81
81
|
memoryConsolidationTimeoutId: ReturnType<typeof setTimeout> | null
|
|
82
82
|
memoryConsolidationIntervalId: ReturnType<typeof setInterval> | null
|
|
83
|
+
evalSchedulerIntervalId: ReturnType<typeof setInterval> | null
|
|
83
84
|
/** Session IDs we've already alerted as stale (alert-once semantics). */
|
|
84
85
|
staleSessionIds: Set<string>
|
|
85
86
|
connectorRestartState: Map<string, { lastAttemptAt: number; failCount: number; wakeAttempts: number }>
|
|
@@ -97,6 +98,7 @@ const ds: {
|
|
|
97
98
|
healthIntervalId: null,
|
|
98
99
|
memoryConsolidationTimeoutId: null,
|
|
99
100
|
memoryConsolidationIntervalId: null,
|
|
101
|
+
evalSchedulerIntervalId: null,
|
|
100
102
|
staleSessionIds: new Set<string>(),
|
|
101
103
|
connectorRestartState: new Map<string, { lastAttemptAt: number; failCount: number; wakeAttempts: number }>(),
|
|
102
104
|
openclawDownAgentIds: new Set<string>(),
|
|
@@ -118,6 +120,7 @@ if (ds.healthIntervalId === undefined) ds.healthIntervalId = null
|
|
|
118
120
|
if (ds.manualStopRequested === undefined) ds.manualStopRequested = false
|
|
119
121
|
if (ds.memoryConsolidationTimeoutId === undefined) ds.memoryConsolidationTimeoutId = null
|
|
120
122
|
if (ds.memoryConsolidationIntervalId === undefined) ds.memoryConsolidationIntervalId = null
|
|
123
|
+
if (ds.evalSchedulerIntervalId === undefined) ds.evalSchedulerIntervalId = null
|
|
121
124
|
|
|
122
125
|
export function ensureDaemonStarted(source = 'unknown'): boolean {
|
|
123
126
|
if (ds.running) return false
|
|
@@ -140,6 +143,7 @@ export function startDaemon(options?: { source?: string; manualStart?: boolean }
|
|
|
140
143
|
startHealthMonitor()
|
|
141
144
|
startHeartbeatService()
|
|
142
145
|
startMemoryConsolidation()
|
|
146
|
+
startEvalScheduler()
|
|
143
147
|
return
|
|
144
148
|
}
|
|
145
149
|
ds.running = true
|
|
@@ -155,6 +159,7 @@ export function startDaemon(options?: { source?: string; manualStart?: boolean }
|
|
|
155
159
|
startHealthMonitor()
|
|
156
160
|
startHeartbeatService()
|
|
157
161
|
startMemoryConsolidation()
|
|
162
|
+
startEvalScheduler()
|
|
158
163
|
} catch (err: unknown) {
|
|
159
164
|
ds.running = false
|
|
160
165
|
notify('daemon')
|
|
@@ -182,6 +187,7 @@ export function stopDaemon(options?: { source?: string; manualStop?: boolean })
|
|
|
182
187
|
stopHealthMonitor()
|
|
183
188
|
stopHeartbeatService()
|
|
184
189
|
stopMemoryConsolidation()
|
|
190
|
+
stopEvalScheduler()
|
|
185
191
|
stopAllConnectors().catch(() => {})
|
|
186
192
|
}
|
|
187
193
|
|
|
@@ -785,6 +791,69 @@ function stopMemoryConsolidation() {
|
|
|
785
791
|
}
|
|
786
792
|
}
|
|
787
793
|
|
|
794
|
+
// --- Eval scheduler ---
|
|
795
|
+
|
|
796
|
+
const EVAL_DEFAULT_INTERVAL_MS = 24 * 3600_000 // 24 hours

/** Largest delay Node timers accept; larger values are coerced to 1 ms. */
const EVAL_MAX_TIMER_DELAY_MS = 2_147_483_647

/**
 * Approximates a cron expression as a fixed repeat interval in milliseconds.
 *
 * Only simple step patterns are understood; this is a heuristic, not a cron parser:
 * - `0 *\/6 * * *`  (hour step)          -> every 6 hours
 * - `*\/30 * * * *` (minute step, any hour) -> every 30 minutes
 * - `0 0 *\/2 * *`  (day-of-month step)  -> every 2 days
 *
 * Anything else that is a non-empty string yields the 24-hour default.
 * Input that is not a five-field expression keeps the legacy behaviour of
 * treating the first `*\/N` as a number of hours.
 *
 * @param cron Cron expression from settings; may be absent.
 * @returns Interval in ms (never above the Node timer maximum), or `null`
 *          when `cron` is empty or not a string.
 */
function parseCronToMs(cron: string | null | undefined): number | null {
  if (!cron || typeof cron !== 'string') return null

  // Returns N for a field of the exact form "*/N" with N > 0, otherwise null.
  const stepOf = (field: string | undefined): number | null => {
    const m = field ? /^\*\/(\d+)$/.exec(field) : null
    if (!m) return null
    const n = parseInt(m[1], 10)
    return n > 0 ? n : null
  }
  // setInterval treats delays above 2^31-1 as 1 ms, so never exceed it.
  const clamp = (ms: number): number => Math.min(ms, EVAL_MAX_TIMER_DELAY_MS)

  const fields = cron.trim().split(/\s+/)
  if (fields.length === 5) {
    const [minute, hour, dayOfMonth] = fields

    const hourStep = stepOf(hour)
    if (hourStep !== null) return clamp(hourStep * 3600_000)

    // A minute step only describes a fixed period when it applies to every hour.
    const minuteStep = stepOf(minute)
    if (minuteStep !== null && hour === '*') return clamp(minuteStep * 60_000)

    const dayStep = stepOf(dayOfMonth)
    if (dayStep !== null) return clamp(dayStep * 24 * 3600_000)

    return EVAL_DEFAULT_INTERVAL_MS
  }

  // Legacy fallback for non-standard input: first "*/N" is read as hours.
  const legacy = /\*\/(\d+)/.exec(cron)
  const legacyHours = legacy ? parseInt(legacy[1], 10) : 0
  return legacyHours > 0 ? clamp(legacyHours * 3600_000) : EVAL_DEFAULT_INTERVAL_MS
}
|
|
805
|
+
|
|
806
|
+
/**
 * One pass of the eval scheduler.
 *
 * Does nothing unless `settings.autonomyEvalEnabled` is set. Otherwise runs
 * the eval suite, one agent at a time, for every agent whose
 * `heartbeatEnabled` flag is exactly `true`, logging each result and raising
 * a notification ('info' at 60% or above, 'warning' below).
 * A failure for one agent is logged and does not stop the remaining agents;
 * nothing is ever thrown to the caller.
 */
async function runEvalSchedulerTick() {
  const describeError = (err: unknown): string => (err instanceof Error ? err.message : String(err))

  try {
    if (!loadSettings().autonomyEvalEnabled) return

    // Loaded lazily so the eval runner is only pulled in when evals are enabled.
    const { runEvalSuite } = await import('./eval/runner')
    const agents = loadAgents()
    const targetIds = Object.keys(agents).filter((id) => agents[id].heartbeatEnabled === true)

    for (const agentId of targetIds) {
      try {
        const suite = await runEvalSuite(agentId)
        const agentName = agents[agentId].name
        console.log(
          `[daemon:eval] Agent ${agentName}: ${suite.percentage}% (${suite.totalScore}/${suite.maxScore})`,
        )
        createNotification({
          title: `Eval: ${agentName} scored ${suite.percentage}%`,
          message: `${suite.runs.length} scenarios, ${suite.totalScore}/${suite.maxScore} points`,
          type: suite.percentage >= 60 ? 'info' : 'warning',
        })
      } catch (err: unknown) {
        console.error(`[daemon:eval] Failed for agent ${agentId}:`, describeError(err))
      }
    }
  } catch (err: unknown) {
    console.error('[daemon:eval] Scheduler tick error:', describeError(err))
  }
}
|
|
836
|
+
|
|
837
|
+
/**
 * Starts the periodic eval scheduler if it is not already running.
 *
 * No-op when an interval is already registered or when
 * `settings.autonomyEvalEnabled` is falsy. The period comes from
 * `settings.autonomyEvalCron` via `parseCronToMs`, falling back to the
 * 24-hour default. Never throws: eval scheduling is optional and must not
 * block daemon start, so failures are logged as warnings.
 */
function startEvalScheduler() {
  if (ds.evalSchedulerIntervalId) return
  try {
    const settings = loadSettings()
    if (!settings.autonomyEvalEnabled) return

    // Node coerces delays above 2^31-1 ms to 1 ms, which would turn a very
    // long configured period into a tight loop — cap it instead.
    const maxTimerDelayMs = 2_147_483_647
    const requestedMs = parseCronToMs(settings.autonomyEvalCron) || EVAL_DEFAULT_INTERVAL_MS
    const intervalMs = Math.min(requestedMs, maxTimerDelayMs)

    ds.evalSchedulerIntervalId = setInterval(runEvalSchedulerTick, intervalMs)
    console.log(`[daemon:eval] Eval scheduler started (interval=${Math.round(intervalMs / 3600_000)}h)`)
  } catch (err: unknown) {
    // Eval scheduling is optional — don't block daemon start, but leave a trace.
    console.warn(
      '[daemon:eval] Eval scheduler not started:',
      err instanceof Error ? err.message : String(err),
    )
  }
}
|
|
849
|
+
|
|
850
|
+
/** Cancels the eval scheduler interval, if one is registered, and clears the stored handle. */
function stopEvalScheduler() {
  const handle = ds.evalSchedulerIntervalId
  if (!handle) return
  clearInterval(handle)
  ds.evalSchedulerIntervalId = null
}
|
|
856
|
+
|
|
788
857
|
/** Runs the daemon health checks immediately and resolves once they have finished. */
export async function runDaemonHealthCheckNow(): Promise<void> {
  await runHealthChecks()
}
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import { genId } from '@/lib/id'
|
|
2
|
+
import type { EvalScenario, EvalRun, EvalSuiteResult } from './types'
|
|
3
|
+
import { getScenario, EVAL_SCENARIOS } from './scenarios'
|
|
4
|
+
import { scoreCriteria } from './scorer'
|
|
5
|
+
import { saveEvalRun } from './store'
|
|
6
|
+
import { loadSessions, saveSessions, loadAgents, loadCredentials, decryptKey } from '../storage'
|
|
7
|
+
import { executeSessionChatTurn } from '../chat-execution'
|
|
8
|
+
import type { Session } from '@/types'
|
|
9
|
+
|
|
10
|
+
/**
 * Derives the LLM-judge configuration from an agent record.
 * Returns `undefined` when the agent has no string provider or model; the API
 * key stays `null` when the credential is missing or cannot be decrypted.
 */
function resolveJudgeOptions(
  agent: Record<string, unknown>,
): { provider: string; model: string; apiKey: string | null; apiEndpoint?: string } | undefined {
  const provider = typeof agent.provider === 'string' ? agent.provider : undefined
  const model = typeof agent.model === 'string' ? agent.model : undefined
  if (!provider || !model) return undefined

  let apiKey: string | null = null
  if (typeof agent.credentialId === 'string' && agent.credentialId) {
    const cred = loadCredentials()[agent.credentialId]
    if (cred) {
      try { apiKey = decryptKey(cred.encryptedKey) } catch { /* skip undecryptable */ }
    }
  }

  return {
    provider,
    model,
    apiKey,
    apiEndpoint: typeof agent.apiEndpoint === 'string' ? agent.apiEndpoint : undefined,
  }
}

/**
 * Runs a single eval scenario against an agent and persists the outcome.
 *
 * A temporary session (`eval-<runId>`) is created with the scenario's tools,
 * the scenario's user message is executed as one chat turn, and the reply is
 * scored against the scenario's criteria (using the agent's own
 * provider/model as the LLM judge when available). The temporary session is
 * removed afterwards whether the turn succeeded or not.
 *
 * Note: `scenario.timeoutMs` is not consulted here.
 *
 * @param scenarioId Id of a scenario known to `getScenario`.
 * @param agentId    Id of an agent present in `loadAgents()`.
 * @returns The finished run, with status `'completed'` or `'failed'`.
 * @throws Error when the scenario or the agent is unknown. Execution and
 *         scoring errors are captured in `run.error` rather than thrown.
 */
export async function runEvalScenario(scenarioId: string, agentId: string): Promise<EvalRun> {
  const scenario = getScenario(scenarioId)
  if (!scenario) throw new Error(`Unknown eval scenario: ${scenarioId}`)

  const agents = loadAgents() as Record<string, Record<string, unknown>>
  const agent = agents[agentId]
  if (!agent) throw new Error(`Unknown agent: ${agentId}`)

  const runId = genId()
  const sessionId = `eval-${runId}`
  const now = Date.now()

  const run: EvalRun = {
    id: runId,
    scenarioId,
    agentId,
    status: 'running',
    startedAt: now,
    score: 0,
    maxScore: scenario.scoringCriteria.reduce((sum, c) => sum + c.weight, 0),
    details: [],
    sessionId,
  }

  // Create temporary eval session
  const sessions = loadSessions() as Record<string, Session>
  const evalSession: Session = {
    id: sessionId,
    name: `Eval: ${scenario.name}`,
    cwd: process.cwd(),
    user: 'eval-runner',
    provider: (agent.provider as Session['provider']) ?? 'anthropic',
    model: (agent.model as string) ?? '',
    credentialId: (agent.credentialId as string | null) ?? null,
    apiEndpoint: (agent.apiEndpoint as string | null) ?? null,
    claudeSessionId: null,
    agentId,
    tools: scenario.tools,
    messages: [],
    createdAt: now,
    lastActiveAt: now,
  }
  sessions[sessionId] = evalSession
  saveSessions(sessions)

  try {
    const result = await executeSessionChatTurn({
      sessionId,
      message: scenario.userMessage,
      internal: true,
      source: 'eval',
    })

    run.details = await scoreCriteria(
      scenario.scoringCriteria,
      result.text,
      result.toolEvents || [],
      resolveJudgeOptions(agent),
    )
    run.score = run.details.reduce((sum, d) => sum + d.score, 0)
    run.status = 'completed'
    run.endedAt = Date.now()
  } catch (err: unknown) {
    run.status = 'failed'
    run.error = err instanceof Error ? err.message : String(err)
    run.endedAt = Date.now()
  } finally {
    // Clean up eval session. A failure here must not discard the run: an
    // exception escaping `finally` would skip saveEvalRun below.
    try {
      const currentSessions = loadSessions() as Record<string, Session>
      delete currentSessions[sessionId]
      saveSessions(currentSessions)
    } catch (cleanupErr: unknown) {
      console.error(
        `[eval] Failed to remove temporary session ${sessionId}:`,
        cleanupErr instanceof Error ? cleanupErr.message : String(cleanupErr),
      )
    }
  }

  saveEvalRun(run)
  return run
}
|
|
103
|
+
|
|
104
|
+
/**
 * Runs a set of eval scenarios sequentially for one agent and aggregates the scores.
 *
 * @param agentId    Agent to evaluate.
 * @param categories When provided, only scenarios whose `category` is listed
 *                   are run; when omitted, every scenario is run.
 * @returns Totals across all runs, the rounded percentage (0 when nothing was
 *          scorable), the individual runs, and the completion timestamp.
 */
export async function runEvalSuite(agentId: string, categories?: string[]): Promise<EvalSuiteResult> {
  let selected: EvalScenario[] = EVAL_SCENARIOS
  if (categories) {
    selected = EVAL_SCENARIOS.filter((candidate) => categories.includes(candidate.category))
  }

  const runs: EvalRun[] = []
  let totalScore = 0
  let maxScore = 0

  // Scenarios run one after another, never concurrently.
  for (const { id } of selected) {
    const outcome = await runEvalScenario(id, agentId)
    runs.push(outcome)
    totalScore += outcome.score
    maxScore += outcome.maxScore
  }

  let percentage = 0
  if (maxScore > 0) {
    percentage = Math.round((totalScore / maxScore) * 100)
  }

  return { agentId, totalScore, maxScore, percentage, runs, completedAt: Date.now() }
}
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
import type { EvalScenario } from './types'
|
|
2
|
+
|
|
3
|
+
/**
 * Built-in evaluation scenarios.
 *
 * Each scenario supplies the user message sent to the agent, the tools made
 * available to the eval session, and weighted scoring criteria. Criteria use
 * one of four evaluators: `contains` (substring), `regex` (pattern),
 * `tool_used` (tool name) or `llm_judge` (free-form question for a judge model).
 *
 * Text criteria are written so they cannot be satisfied by accident:
 * word boundaries are used where a bare substring would match unrelated text
 * (e.g. "2" inside "12", "Fizz" inside "FizzBuzz", "next" as an ordinary word).
 */
export const EVAL_SCENARIOS: EvalScenario[] = [
  {
    id: 'coding-prime',
    name: 'Prime Number Function',
    category: 'coding',
    description: 'Create and test a function that checks if a number is prime',
    userMessage: 'Create a function that checks if a number is prime and test it with a few examples including 2, 7, 10, and 97.',
    expectedBehaviors: [
      'Writes a correct isPrime function',
      'Tests with the specified numbers',
      'Returns correct results for each test case',
    ],
    scoringCriteria: [
      { name: 'uses_shell', weight: 2, evaluator: 'tool_used', expected: 'shell' },
      { name: 'uses_files', weight: 2, evaluator: 'tool_used', expected: 'files' },
      { name: 'mentions_prime', weight: 1, evaluator: 'contains', expected: 'prime' },
      // Standalone "2" only — a bare substring would also match "12", "2024", ...
      { name: 'tests_number_2', weight: 1, evaluator: 'regex', expected: '\\b2\\b' },
      { name: 'tests_number_97', weight: 1, evaluator: 'contains', expected: '97' },
      { name: 'correctness', weight: 3, evaluator: 'llm_judge', expected: 'Did the response correctly implement an isPrime function and test it with 2, 7, 10, and 97, producing correct results (2=prime, 7=prime, 10=not prime, 97=prime)?' },
    ],
    timeoutMs: 60_000,
    tools: ['shell', 'files'],
  },
  {
    id: 'research-frameworks',
    name: 'Node.js Framework Comparison',
    category: 'research',
    description: 'Research and compare top Node.js web frameworks by GitHub stars',
    userMessage: 'Find the top 3 Node.js web frameworks by GitHub stars and compare them. Include star counts and key differences.',
    expectedBehaviors: [
      'Searches the web for framework information',
      'Identifies at least 3 frameworks (e.g. Express, Next.js, Fastify)',
      'Provides star counts and meaningful comparison',
    ],
    scoringCriteria: [
      { name: 'uses_web_search', weight: 2, evaluator: 'tool_used', expected: 'web_search' },
      { name: 'mentions_express', weight: 1, evaluator: 'regex', expected: 'express(\\.js)?' },
      // Requires "Next.js"/"NextJS" — the plain word "next" is not evidence.
      { name: 'mentions_next', weight: 1, evaluator: 'regex', expected: 'next\\.?js' },
      { name: 'includes_stars', weight: 1, evaluator: 'regex', expected: '\\d+[kK,.]?\\d*\\s*(stars|\\*)' },
      { name: 'quality', weight: 5, evaluator: 'llm_judge', expected: 'Did the response identify at least 3 Node.js web frameworks, include GitHub star counts, and provide a meaningful comparison of their key differences?' },
    ],
    timeoutMs: 90_000,
    tools: ['web_search', 'web_fetch'],
  },
  {
    id: 'companionship-overwhelmed',
    name: 'Emotional Support - Overwhelmed',
    category: 'companionship',
    description: 'Provide empathetic support when user is overwhelmed with work',
    userMessage: "I'm feeling overwhelmed with work deadlines. Everything is piling up and I don't know where to start.",
    expectedBehaviors: [
      'Acknowledges the feeling of being overwhelmed',
      'Shows empathy without being dismissive',
      'Offers actionable suggestions for managing workload',
    ],
    scoringCriteria: [
      { name: 'empathy', weight: 4, evaluator: 'llm_judge', expected: 'Does the response show genuine empathy and acknowledge the user\'s feelings of being overwhelmed without being dismissive or jumping straight to advice?' },
      { name: 'actionable_advice', weight: 3, evaluator: 'llm_judge', expected: 'Does the response offer practical, actionable suggestions for managing workload or reducing the feeling of being overwhelmed (e.g. prioritization, breaking tasks down, time management)?' },
      { name: 'appropriate_tone', weight: 3, evaluator: 'llm_judge', expected: 'Is the tone warm, supportive, and human-like rather than clinical, robotic, or overly formal?' },
    ],
    timeoutMs: 30_000,
    tools: [],
  },
  {
    id: 'multi-step-project',
    name: 'Project Directory Setup',
    category: 'multi-step',
    description: 'Create a project directory, write a README, and initialize git',
    userMessage: "Create a project directory called 'demo-project' in /tmp, write a README.md with a title and description, and initialize a git repository in it.",
    expectedBehaviors: [
      'Creates the demo-project directory',
      'Writes a README.md file with content',
      'Initializes a git repository',
    ],
    scoringCriteria: [
      { name: 'uses_shell', weight: 2, evaluator: 'tool_used', expected: 'shell' },
      { name: 'uses_files', weight: 2, evaluator: 'tool_used', expected: 'files' },
      { name: 'mentions_mkdir', weight: 1, evaluator: 'regex', expected: 'demo-project' },
      { name: 'mentions_readme', weight: 1, evaluator: 'contains', expected: 'README' },
      { name: 'mentions_git_init', weight: 1, evaluator: 'contains', expected: 'git init' },
      { name: 'completeness', weight: 3, evaluator: 'llm_judge', expected: 'Did the response successfully complete all 3 steps: create the demo-project directory, write a README.md with meaningful content, and initialize a git repository?' },
    ],
    timeoutMs: 60_000,
    tools: ['shell', 'files'],
  },
  {
    id: 'memory-store-recall',
    name: 'Memory Store and Recall',
    category: 'memory',
    description: 'Store a fact in memory and demonstrate recall capability',
    userMessage: 'Remember that my favorite programming language is Rust and I prefer functional programming patterns. Then confirm what you just stored.',
    expectedBehaviors: [
      'Uses memory tool to store the information',
      'Confirms what was stored',
      'Accurately reflects the stored preferences',
    ],
    scoringCriteria: [
      { name: 'uses_memory', weight: 3, evaluator: 'tool_used', expected: 'memory' },
      { name: 'mentions_rust', weight: 2, evaluator: 'contains', expected: 'Rust' },
      { name: 'mentions_functional', weight: 2, evaluator: 'contains', expected: 'functional' },
      { name: 'confirmation', weight: 3, evaluator: 'llm_judge', expected: 'Did the response confirm storing the user\'s preference for Rust and functional programming, and accurately summarize what was stored?' },
    ],
    timeoutMs: 30_000,
    tools: ['memory'],
  },
  {
    id: 'planning-blog',
    name: 'Blog Platform Planning',
    category: 'planning',
    description: 'Create a detailed plan for building a blog platform',
    userMessage: 'Build me a detailed plan for a blog platform with posts, comments, and user authentication. Break it into tasks I can work through.',
    expectedBehaviors: [
      'Creates structured tasks or a plan',
      'Covers posts, comments, and authentication',
      'Breaks work into manageable pieces',
    ],
    scoringCriteria: [
      { name: 'uses_tasks', weight: 2, evaluator: 'tool_used', expected: 'manage_tasks' },
      { name: 'mentions_posts', weight: 1, evaluator: 'contains', expected: 'post' },
      { name: 'mentions_comments', weight: 1, evaluator: 'contains', expected: 'comment' },
      { name: 'mentions_auth', weight: 1, evaluator: 'regex', expected: 'auth(entication|orization)?' },
      { name: 'plan_quality', weight: 5, evaluator: 'llm_judge', expected: 'Is the plan well-structured with clear, actionable tasks that cover the three main features (posts, comments, user auth)? Are tasks broken into manageable pieces with logical ordering?' },
    ],
    timeoutMs: 60_000,
    tools: ['manage_tasks'],
  },
  {
    id: 'tool-usage-weather',
    name: 'Web Search - Weather',
    category: 'tool-usage',
    description: 'Search the web for current weather information',
    userMessage: 'Search the web for today\'s weather in London and tell me the temperature and conditions.',
    expectedBehaviors: [
      'Uses web search tool',
      'Reports temperature',
      'Reports weather conditions',
    ],
    scoringCriteria: [
      { name: 'uses_web_search', weight: 3, evaluator: 'tool_used', expected: 'web_search' },
      // A number followed by a degree sign, a standalone C/F, or "degrees";
      // a number followed by any word starting with c/f does not count.
      { name: 'mentions_temperature', weight: 2, evaluator: 'regex', expected: '\\d+\\s*(°\\s*[CF]?|[CF]\\b|degrees)' },
      { name: 'mentions_london', weight: 1, evaluator: 'contains', expected: 'London' },
      { name: 'quality', weight: 4, evaluator: 'llm_judge', expected: 'Did the response provide specific, current weather information for London including temperature and conditions (e.g. sunny, cloudy, rain)?' },
    ],
    timeoutMs: 60_000,
    tools: ['web_search'],
  },
  {
    id: 'coding-fizzbuzz',
    name: 'FizzBuzz Implementation',
    category: 'coding',
    description: 'Write and run a FizzBuzz implementation in Python',
    userMessage: 'Write a FizzBuzz implementation in Python that prints numbers 1 to 30 and run it.',
    expectedBehaviors: [
      'Writes correct FizzBuzz logic',
      'Runs the code successfully',
      'Output contains Fizz, Buzz, and FizzBuzz',
    ],
    scoringCriteria: [
      { name: 'uses_shell', weight: 2, evaluator: 'tool_used', expected: 'shell' },
      { name: 'uses_files', weight: 2, evaluator: 'tool_used', expected: 'files' },
      // Word boundaries so "FizzBuzz" alone does not satisfy all three checks.
      { name: 'contains_fizz', weight: 1, evaluator: 'regex', expected: '\\bFizz\\b' },
      { name: 'contains_buzz', weight: 1, evaluator: 'regex', expected: '\\bBuzz\\b' },
      { name: 'contains_fizzbuzz', weight: 1, evaluator: 'contains', expected: 'FizzBuzz' },
      { name: 'correctness', weight: 3, evaluator: 'llm_judge', expected: 'Did the response implement FizzBuzz correctly (multiples of 3 print Fizz, multiples of 5 print Buzz, multiples of both print FizzBuzz) and successfully execute it?' },
    ],
    timeoutMs: 60_000,
    tools: ['shell', 'files'],
  },
  {
    id: 'research-comparison',
    name: 'LLM Pricing Comparison',
    category: 'research',
    description: 'Compare pricing of major LLM models',
    userMessage: 'Compare the pricing of OpenAI GPT-4o and Anthropic Claude 3.5 Sonnet. Include input and output token costs.',
    expectedBehaviors: [
      'Searches for current pricing',
      'Includes both models',
      'Reports input and output token costs',
    ],
    scoringCriteria: [
      { name: 'uses_web_search', weight: 2, evaluator: 'tool_used', expected: 'web_search' },
      { name: 'mentions_gpt4o', weight: 1, evaluator: 'regex', expected: 'GPT-?4[oO]' },
      { name: 'mentions_claude', weight: 1, evaluator: 'regex', expected: 'Claude\\s*3\\.?5' },
      { name: 'mentions_pricing', weight: 1, evaluator: 'regex', expected: '\\$\\d+' },
      { name: 'quality', weight: 5, evaluator: 'llm_judge', expected: 'Did the response provide accurate and specific pricing for both GPT-4o and Claude 3.5 Sonnet, including input and output token costs, with a clear comparison?' },
    ],
    timeoutMs: 90_000,
    tools: ['web_search', 'web_fetch'],
  },
  {
    id: 'multi-step-analyze',
    name: 'Package.json Analysis',
    category: 'multi-step',
    description: 'Read and analyze the current project\'s package.json',
    userMessage: 'Read the package.json in the current directory and list all dependencies. Group them into regular dependencies and dev dependencies.',
    expectedBehaviors: [
      'Reads package.json using shell or files tool',
      'Lists regular dependencies',
      'Lists dev dependencies',
      'Groups them clearly',
    ],
    scoringCriteria: [
      // NOTE(review): despite the name, only the `shell` tool is credited here;
      // an agent that reads the file with `files` scores 0 on this criterion.
      { name: 'uses_shell_or_files', weight: 2, evaluator: 'tool_used', expected: 'shell' },
      { name: 'mentions_dependencies', weight: 1, evaluator: 'contains', expected: 'dependencies' },
      { name: 'mentions_dev_deps', weight: 1, evaluator: 'regex', expected: 'dev[Dd]ependencies|dev dependencies' },
      { name: 'mentions_package_json', weight: 1, evaluator: 'contains', expected: 'package.json' },
      { name: 'quality', weight: 5, evaluator: 'llm_judge', expected: 'Did the response successfully read package.json, list the dependencies, and clearly group them into regular and dev dependencies?' },
    ],
    timeoutMs: 60_000,
    tools: ['shell', 'files'],
  },
]
|
|
215
|
+
|
|
216
|
+
/**
 * Looks up a built-in eval scenario by its id.
 *
 * @param id Scenario id, e.g. `'coding-prime'`.
 * @returns The first scenario with that id, or `undefined` when none matches.
 */
export function getScenario(id: string): EvalScenario | undefined {
  for (const scenario of EVAL_SCENARIOS) {
    if (scenario.id === id) return scenario
  }
  return undefined
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import type { ScoringCriterion, EvalCriterionResult } from './types'
|
|
2
|
+
import type { MessageToolEvent } from '@/types'
|
|
3
|
+
|
|
4
|
+
/**
 * Flattens an LLM reply's `content` into plain text.
 * Accepts a string, or an array whose items are strings or objects with a string `text` field.
 */
function extractJudgeText(content: unknown): string {
  if (typeof content === 'string') return content
  if (!Array.isArray(content)) return ''
  return content
    .map((part: unknown) => {
      if (typeof part === 'string') return part
      if (part !== null && typeof part === 'object') {
        const text = (part as { text?: unknown }).text
        if (typeof text === 'string') return text
      }
      return ''
    })
    .join('')
}

/**
 * Pulls a 0-10 rating out of a judge reply.
 * Accepts a leading integer ("8", "8.5", "8 - good"), "N/10" / "N out of 10",
 * or "score: N" / "rating: N". Returns `null` when no rating is recognisable.
 */
function parseJudgeScore(text: string): number | null {
  const trimmed = text.trim()
  const leading = parseInt(trimmed, 10)
  if (Number.isFinite(leading)) return leading

  const labelled =
    /(\d+)(?:\.\d+)?\s*(?:\/|out of)\s*10\b/i.exec(trimmed) ??
    /\b(?:score|rating)\b\s*[:=]?\s*(\d+)/i.exec(trimmed)
  return labelled ? parseInt(labelled[1], 10) : null
}

/**
 * Scores an agent response against a list of weighted criteria.
 *
 * Evaluators:
 * - `contains`  — case-insensitive substring match on the response text.
 * - `regex`     — case-insensitive pattern match; an invalid pattern scores 0
 *                 for that criterion instead of aborting the whole pass.
 * - `tool_used` — full weight when any tool event has exactly that name.
 * - `llm_judge` — asks the judge model for a 0-10 rating and scales it to the
 *                 weight; scores 0 when no judge is configured or the call
 *                 fails, and 5/10 when the reply contains no recognisable rating.
 * - anything else scores 0, so every criterion yields exactly one result.
 *
 * @param criteria     Criteria to evaluate, in order.
 * @param responseText Final text produced by the agent.
 * @param toolEvents   Tool events recorded during the turn.
 * @param judgeOpts    Provider/model/credentials for `llm_judge` criteria.
 * @returns One result per criterion, in the same order as `criteria`.
 */
export async function scoreCriteria(
  criteria: ScoringCriterion[],
  responseText: string,
  toolEvents: MessageToolEvent[],
  judgeOpts?: { provider: string; model: string; apiKey: string | null; apiEndpoint?: string | null },
): Promise<EvalCriterionResult[]> {
  const results: EvalCriterionResult[] = []
  const describeError = (err: unknown): string => (err instanceof Error ? err.message : String(err))

  for (const criterion of criteria) {
    const { name, weight, expected } = criterion
    // Widened to string so the default branch can report unexpected values.
    const evaluator: string = criterion.evaluator
    const record = (score: number, evidence: string): void => {
      results.push({ criterion: name, score, maxScore: weight, evidence })
    }

    switch (evaluator) {
      case 'contains': {
        const found = responseText.toLowerCase().includes(expected.toLowerCase())
        record(
          found ? weight : 0,
          found ? `Found "${expected}" in response` : `"${expected}" not found in response`,
        )
        break
      }

      case 'regex': {
        let matched: boolean
        try {
          matched = new RegExp(expected, 'i').test(responseText)
        } catch (err: unknown) {
          // A malformed pattern is a scenario authoring error, not an agent failure.
          record(0, `Invalid pattern /${expected}/i: ${describeError(err)}`)
          break
        }
        record(
          matched ? weight : 0,
          matched ? `Pattern /${expected}/i matched` : `Pattern /${expected}/i did not match`,
        )
        break
      }

      case 'tool_used': {
        const used = toolEvents.some((e) => e.name === expected)
        record(
          used ? weight : 0,
          used ? `Tool "${expected}" was used` : `Tool "${expected}" was not used`,
        )
        break
      }

      case 'llm_judge': {
        if (!judgeOpts) {
          record(0, 'No judge provider configured; skipped')
          break
        }

        try {
          const { buildChatModel } = await import('../build-llm')
          const { HumanMessage } = await import('@langchain/core/messages')

          const llm = buildChatModel({
            provider: judgeOpts.provider,
            model: judgeOpts.model,
            apiKey: judgeOpts.apiKey,
            apiEndpoint: judgeOpts.apiEndpoint,
          })

          const judgePrompt = `Rate the following AI response on a scale of 0-10.\n\nCriterion: ${expected}\n\nResponse:\n${responseText}\n\nReply with ONLY a number 0-10.`
          const reply = await llm.invoke([new HumanMessage(judgePrompt)])
          const parsed = parseJudgeScore(extractJudgeText(reply.content))
          // Unrecognisable replies keep the neutral midpoint; others are clamped to 0-10.
          const rawScore = parsed === null ? 5 : Math.max(0, Math.min(10, parsed))

          record((rawScore / 10) * weight, `LLM judge: ${rawScore}/10`)
        } catch (err: unknown) {
          record(0, `LLM judge error: ${describeError(err)}`)
        }
        break
      }

      default: {
        // Keep details aligned with the criteria list (and with maxScore).
        record(0, `Unknown evaluator "${evaluator}"; skipped`)
        break
      }
    }
  }

  return results
}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import Database from 'better-sqlite3'
|
|
2
|
+
import path from 'path'
|
|
3
|
+
import type { EvalRun } from './types'
|
|
4
|
+
|
|
5
|
+
const DB_PATH = path.join(process.cwd(), 'data', 'eval-runs.db')
|
|
6
|
+
|
|
7
|
+
let db: Database.Database | null = null
|
|
8
|
+
|
|
9
|
+
/**
 * Returns the shared SQLite handle for eval runs, opening it on first use.
 *
 * The first successful call opens the database at `DB_PATH`, switches it to
 * WAL journaling and creates the `eval_runs` table if it does not exist yet.
 * The handle is cached in the module-level `db` only after that setup has
 * succeeded, so a failed initialization is retried on the next call instead
 * of leaving a cached connection without the expected schema.
 *
 * @returns The initialized, cached database handle.
 * @throws Rethrows any error raised while opening or initializing the database.
 */
function getDb(): Database.Database {
  if (db) return db

  const conn = new Database(DB_PATH)
  try {
    conn.pragma('journal_mode = WAL')
    conn.exec(`CREATE TABLE IF NOT EXISTS eval_runs (
      id TEXT PRIMARY KEY,
      data TEXT NOT NULL
    )`)
  } catch (err: unknown) {
    // Setup failed: release the handle rather than caching or leaking it.
    conn.close()
    throw err
  }

  db = conn
  return conn
}
|
|
20
|
+
|
|
21
|
+
/**
 * Stores an eval run as a JSON document keyed by `run.id`.
 *
 * Uses `INSERT OR REPLACE`, so saving a run whose id already exists
 * overwrites the previously stored row.
 *
 * @param run - The run to persist.
 */
export function saveEvalRun(run: EvalRun): void {
  const upsert = getDb().prepare('INSERT OR REPLACE INTO eval_runs (id, data) VALUES (?, ?)')
  const payload = JSON.stringify(run)
  upsert.run(run.id, payload)
}
|
|
24
|
+
|
|
25
|
+
/**
 * Loads a single eval run by id.
 *
 * @param id - Primary key of the stored run.
 * @returns The parsed run, or `null` when no row has that id.
 */
export function getEvalRun(id: string): EvalRun | null {
  const lookup = getDb().prepare('SELECT data FROM eval_runs WHERE id = ?')
  const hit = lookup.get(id) as { data: string } | undefined
  if (!hit) {
    return null
  }
  // The stored JSON is trusted to have the EvalRun shape; it is not validated here.
  return JSON.parse(hit.data) as EvalRun
}
|
|
29
|
+
|
|
30
|
+
/**
 * Lists stored eval runs in descending `rowid` order.
 *
 * @param limit - Maximum number of runs to return (defaults to 50).
 * @returns The parsed runs, highest `rowid` first.
 */
export function listEvalRuns(limit = 50): EvalRun[] {
  const query = getDb().prepare('SELECT data FROM eval_runs ORDER BY rowid DESC LIMIT ?')
  const stored = query.all(limit) as { data: string }[]

  const runs: EvalRun[] = []
  for (const row of stored) {
    runs.push(JSON.parse(row.data) as EvalRun)
  }
  return runs
}
|
|
34
|
+
|
|
35
|
+
/**
 * Lists stored eval runs that belong to one agent, in descending `rowid` order.
 *
 * The agent filter is applied in SQL (via `json_extract` on the stored JSON)
 * before `LIMIT`, so the result is the agent's own most recent runs. Filtering
 * in memory over a fixed window of recent rows would silently drop an agent's
 * runs whenever other agents had produced enough newer rows to fill the window.
 *
 * @param agentId - Value matched against the `agentId` field of each stored run.
 * @param limit - Maximum number of runs to return (defaults to 50).
 * @returns The matching parsed runs, highest `rowid` first.
 */
export function listEvalRunsByAgent(agentId: string, limit = 50): EvalRun[] {
  const rows = getDb()
    .prepare("SELECT data FROM eval_runs WHERE json_extract(data, '$.agentId') = ? ORDER BY rowid DESC LIMIT ?")
    .all(agentId, limit) as { data: string }[]
  return rows.map(r => JSON.parse(r.data) as EvalRun)
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/** One weighted check applied to an agent's response when scoring an eval scenario. */
export interface ScoringCriterion {
  /** Label used to identify this criterion in results. */
  name: string
  /** Points this criterion contributes to the maximum score. */
  weight: number
  /** Which evaluation strategy interprets `expected`. */
  evaluator:
    | 'contains'
    | 'regex'
    | 'tool_used'
    | 'llm_judge'
  /** Evaluator input; its meaning depends on `evaluator` (e.g. text to find, a pattern, a tool name). */
  expected: string
}
|
|
7
|
+
|
|
8
|
+
/** Definition of a single evaluation scenario: the prompt to send and how to score the reply. */
export interface EvalScenario {
  /** Unique scenario identifier. */
  id: string
  /** Human-readable scenario name. */
  name: string
  /** Capability area the scenario belongs to. */
  category:
    | 'coding'
    | 'research'
    | 'companionship'
    | 'multi-step'
    | 'memory'
    | 'planning'
    | 'tool-usage'
    | 'long-lived'
  /** Free-text description of the scenario. */
  description: string
  /** Message sent to the agent under test. */
  userMessage: string
  /** Descriptions of the behaviors the scenario expects. */
  expectedBehaviors: string[]
  /** Weighted checks used to score the agent's response. */
  scoringCriteria: ScoringCriterion[]
  /** Time budget for the scenario, in milliseconds (per the field name). */
  timeoutMs: number
  /** Tools associated with the scenario — presumably tool names; confirm against the runner. */
  tools: string[]
}
|
|
19
|
+
|
|
20
|
+
/** Record of one scenario executed against one agent, including its score breakdown. */
export interface EvalRun {
  /** Unique run identifier; used as the storage key. */
  id: string
  /** Identifier of the scenario that was run. */
  scenarioId: string
  /** Identifier of the agent that was evaluated. */
  agentId: string
  /** Lifecycle state of the run. */
  status:
    | 'pending'
    | 'running'
    | 'completed'
    | 'failed'
  /** Start timestamp — presumably epoch milliseconds; confirm against the runner. */
  startedAt: number
  /** End timestamp, absent until set by the producer of the run. */
  endedAt?: number
  /** Points earned. */
  score: number
  /** Points available. */
  maxScore: number
  /** Per-criterion results. */
  details: EvalCriterionResult[]
  /** Optional identifier of an associated session. */
  sessionId?: string
  /** Optional error message. */
  error?: string
}
|
|
33
|
+
|
|
34
|
+
/** Outcome of evaluating a single scoring criterion. */
export interface EvalCriterionResult {
  /** Name of the criterion that was evaluated. */
  criterion: string
  /** Points awarded for this criterion. */
  score: number
  /** Points available for this criterion. */
  maxScore: number
  /** Optional human-readable explanation of the awarded score. */
  evidence?: string
}
|
|
40
|
+
|
|
41
|
+
/** Aggregate result of a set of eval runs for one agent. */
export interface EvalSuiteResult {
  /** Identifier of the agent that was evaluated. */
  agentId: string
  /** Total points earned. */
  totalScore: number
  /** Total points available. */
  maxScore: number
  /** Score expressed as a percentage — scale (0-1 vs 0-100) is set by the producer; confirm there. */
  percentage: number
  /** The individual runs included in this result. */
  runs: EvalRun[]
  /** Completion timestamp — presumably epoch milliseconds; confirm against the producer. */
  completedAt: number
}
|
|
@@ -8,14 +8,18 @@ import { genId } from '@/lib/id'
|
|
|
8
8
|
// ---------------------------------------------------------------------------
|
|
9
9
|
|
|
10
10
|
/**
 * Category tag for an execution log entry.
 *
 * The first eight members describe individual execution events; the last four
 * describe mission lifecycle and budget events.
 */
export type LogCategory =
  // what kicked off the action
  | 'trigger'
  // reasoning / model choice
  | 'decision'
  // tool invocation with input
  | 'tool_call'
  // tool output
  | 'tool_result'
  // messages sent to users/platforms
  | 'outbound'
  // file read/write/delete with checksums
  | 'file_op'
  // git commit activity
  | 'commit'
  // errors during execution
  | 'error'
  // new mission/goal started
  | 'mission_start'
  // periodic mission state snapshot
  | 'mission_checkpoint'
  // mission reached ok status
  | 'mission_complete'
  // mission approaching or exceeding budget
  | 'budget_warning'
|
|
19
23
|
|
|
20
24
|
export interface ExecutionLogEntry {
|
|
21
25
|
id: string
|