modular-studio 1.0.5 → 1.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +122 -122
- package/dist/assets/{Badge-22Ai0eyi.js → Badge-DrUmDAXz.js} +1 -1
- package/dist/assets/{Input-Bgp734xs.js → Input-ndEGQSgx.js} +1 -1
- package/dist/assets/KnowledgeTab-CxlC76Rf.js +4 -0
- package/dist/assets/MemoryTab-CUScYWs9.js +16 -0
- package/dist/assets/QualificationTab-BqnWSQHm.js +1 -0
- package/dist/assets/ReviewTab-DKYl6cR9.js +103 -0
- package/dist/assets/{Section-DoJrmytO.js → Section-CgmwAj_2.js} +1 -1
- package/dist/assets/{TestTab-PDyMF8Fw.js → TestTab-iJ2vCf9l.js} +15 -15
- package/dist/assets/ToolsTab-C10Ulm8b.js +1 -0
- package/dist/assets/icons-MKpPNvV8.js +1 -0
- package/dist/assets/index-B_ip7Amg.css +1 -0
- package/dist/assets/index-gBy3427k.js +143 -0
- package/dist/assets/services-CTWXQK6j.js +356 -0
- package/dist/index.html +18 -18
- package/dist-server/server/index.d.ts.map +1 -1
- package/dist-server/server/index.js +4 -0
- package/dist-server/server/mcp/manager.d.ts.map +1 -1
- package/dist-server/server/mcp/manager.js +16 -2
- package/dist-server/server/routes/agents.d.ts.map +1 -1
- package/dist-server/server/routes/agents.js +27 -0
- package/dist-server/server/routes/cache.d.ts +3 -0
- package/dist-server/server/routes/cache.d.ts.map +1 -0
- package/dist-server/server/routes/cache.js +55 -0
- package/dist-server/server/routes/connectors.d.ts.map +1 -1
- package/dist-server/server/routes/connectors.js +47 -17
- package/dist-server/server/routes/lessons.d.ts +3 -0
- package/dist-server/server/routes/lessons.d.ts.map +1 -0
- package/dist-server/server/routes/lessons.js +46 -0
- package/dist-server/server/routes/memory.d.ts.map +1 -1
- package/dist-server/server/routes/memory.js +31 -0
- package/dist-server/server/routes/qualification.d.ts.map +1 -1
- package/dist-server/server/routes/qualification.js +292 -334
- package/dist-server/server/routes/repo-index.d.ts.map +1 -1
- package/dist-server/server/routes/repo-index.js +7 -0
- package/dist-server/server/routes/skills-search.d.ts.map +1 -1
- package/dist-server/server/routes/skills-search.js +182 -26
- package/dist-server/server/services/adapters/hindsightAdapter.d.ts +28 -0
- package/dist-server/server/services/adapters/hindsightAdapter.d.ts.map +1 -0
- package/dist-server/server/services/adapters/hindsightAdapter.js +63 -0
- package/dist-server/server/services/adapters/postgresAdapter.js +30 -30
- package/dist-server/server/services/adapters/sqliteAdapter.js +29 -29
- package/dist-server/server/services/agentStore.d.ts +2 -1
- package/dist-server/server/services/agentStore.d.ts.map +1 -1
- package/dist-server/server/services/agentStore.js +2 -1
- package/dist-server/server/services/correctionDetector.d.ts +22 -0
- package/dist-server/server/services/correctionDetector.d.ts.map +1 -0
- package/dist-server/server/services/correctionDetector.js +91 -0
- package/dist-server/server/services/hindsightClient.d.ts +15 -0
- package/dist-server/server/services/hindsightClient.d.ts.map +1 -0
- package/dist-server/server/services/hindsightClient.js +47 -0
- package/dist-server/server/services/lessonExtractor.d.ts +19 -0
- package/dist-server/server/services/lessonExtractor.d.ts.map +1 -0
- package/dist-server/server/services/lessonExtractor.js +87 -0
- package/dist-server/server/services/responseCache.d.ts +24 -0
- package/dist-server/server/services/responseCache.d.ts.map +1 -0
- package/dist-server/server/services/responseCache.js +163 -0
- package/dist-server/server/services/sqliteStore.d.ts +8 -0
- package/dist-server/server/services/sqliteStore.d.ts.map +1 -1
- package/dist-server/server/services/sqliteStore.js +53 -13
- package/dist-server/src/store/knowledgeBase.d.ts +1 -0
- package/dist-server/src/store/knowledgeBase.d.ts.map +1 -1
- package/dist-server/src/store/lessonStore.d.ts +26 -0
- package/dist-server/src/store/lessonStore.d.ts.map +1 -0
- package/dist-server/src/store/lessonStore.js +64 -0
- package/dist-server/src/store/memoryStore.d.ts +12 -1
- package/dist-server/src/store/memoryStore.d.ts.map +1 -1
- package/dist-server/src/store/memoryStore.js +9 -0
- package/dist-server/tsconfig.server.tsbuildinfo +1 -1
- package/package.json +105 -104
- package/dist/assets/KnowledgeTab-DABxirZh.js +0 -4
- package/dist/assets/MemoryTab-DZeYElIT.js +0 -16
- package/dist/assets/QualificationTab-Dfpy3J30.js +0 -1
- package/dist/assets/ReviewTab-SD8lQuCc.js +0 -103
- package/dist/assets/ToolsTab-B83qGCmG.js +0 -1
- package/dist/assets/icons-C2EV-le6.js +0 -1
- package/dist/assets/index-DkpMAxX7.css +0 -1
- package/dist/assets/index-q24ug5Qs.js +0 -143
- package/dist/assets/services-BaKotDf0.js +0 -343
|
@@ -1,363 +1,333 @@
|
|
|
1
1
|
import { Router } from 'express';
|
|
2
2
|
import { randomUUID } from 'node:crypto';
|
|
3
3
|
import { readConfig } from '../config.js';
|
|
4
|
+
import { loadAgent, saveAgent, createAgentVersion } from '../services/agentStore.js';
|
|
5
|
+
import { saveQualificationRun, getQualificationHistory } from '../services/sqliteStore.js';
|
|
4
6
|
const router = Router();
|
|
5
|
-
|
|
6
|
-
function
|
|
7
|
-
const
|
|
8
|
-
|
|
9
|
-
|
|
7
|
+
/* ── Provider helpers (mirrors server/routes/llm.ts logic) ── */
|
|
8
|
+
function normalizeBaseUrl(providerId, baseUrl) {
|
|
9
|
+
const trimmed = (baseUrl || '').trim().replace(/\/+$/, '');
|
|
10
|
+
if (!trimmed)
|
|
11
|
+
return trimmed;
|
|
12
|
+
const isOpenAi = providerId.includes('openai') || trimmed.includes('api.openai.com');
|
|
13
|
+
if (isOpenAi && !/\/v1$/i.test(trimmed))
|
|
14
|
+
return `${trimmed}/v1`;
|
|
15
|
+
return trimmed;
|
|
16
|
+
}
|
|
17
|
+
function inferType(providerId, baseUrl, configType) {
|
|
18
|
+
if (configType === 'anthropic' || providerId.includes('anthropic') || baseUrl.includes('anthropic.com')) {
|
|
19
|
+
return 'anthropic';
|
|
20
|
+
}
|
|
21
|
+
return configType || 'openai';
|
|
22
|
+
}
|
|
23
|
+
function buildLlmHeaders(resolved) {
|
|
24
|
+
if (resolved.type === 'anthropic') {
|
|
25
|
+
return { 'x-api-key': resolved.apiKey, 'anthropic-version': '2023-06-01', 'content-type': 'application/json' };
|
|
26
|
+
}
|
|
27
|
+
return { 'Authorization': `Bearer ${resolved.apiKey}`, 'Content-Type': 'application/json' };
|
|
28
|
+
}
|
|
29
|
+
function buildLlmBody(resolved, model, messages, maxTokens) {
|
|
30
|
+
if (resolved.type === 'anthropic') {
|
|
31
|
+
const system = messages.find(m => m.role === 'system')?.content;
|
|
32
|
+
const nonSystem = messages.filter(m => m.role !== 'system');
|
|
33
|
+
return JSON.stringify({ model, max_tokens: maxTokens, messages: nonSystem, ...(system && { system }) });
|
|
34
|
+
}
|
|
35
|
+
return JSON.stringify({ model, max_tokens: maxTokens, messages });
|
|
36
|
+
}
|
|
37
|
+
function buildLlmUrl(resolved) {
|
|
38
|
+
return resolved.type === 'anthropic'
|
|
39
|
+
? `${resolved.baseUrl}/messages`
|
|
40
|
+
: `${resolved.baseUrl}/chat/completions`;
|
|
10
41
|
}
|
|
11
42
|
function extractLlmContent(data, isAnthropic) {
|
|
12
43
|
if (typeof data !== 'object' || data === null)
|
|
13
44
|
return '';
|
|
14
45
|
const d = data;
|
|
15
|
-
if (isAnthropic && Array.isArray(d.content) && d.content.length > 0)
|
|
46
|
+
if (isAnthropic && Array.isArray(d.content) && d.content.length > 0)
|
|
16
47
|
return d.content[0]?.text ?? '';
|
|
17
|
-
|
|
18
|
-
if (!isAnthropic && Array.isArray(d.choices) && d.choices.length > 0) {
|
|
48
|
+
if (!isAnthropic && Array.isArray(d.choices) && d.choices.length > 0)
|
|
19
49
|
return d.choices[0]?.message?.content ?? '';
|
|
20
|
-
}
|
|
21
50
|
return '';
|
|
22
51
|
}
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
const
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
52
|
+
async function callLlm(resolved, model, messages, maxTokens = 4000) {
|
|
53
|
+
const url = buildLlmUrl(resolved);
|
|
54
|
+
const headers = buildLlmHeaders(resolved);
|
|
55
|
+
const body = buildLlmBody(resolved, model, messages, maxTokens);
|
|
56
|
+
const response = await fetch(url, { method: 'POST', headers, body });
|
|
57
|
+
if (!response.ok) {
|
|
58
|
+
const errText = await response.text();
|
|
59
|
+
throw new Error(`LLM API error ${response.status}: ${errText}`);
|
|
29
60
|
}
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
status: 'error',
|
|
37
|
-
error: 'No connected LLM provider found. Please configure a provider first.'
|
|
38
|
-
});
|
|
39
|
-
return;
|
|
40
|
-
}
|
|
41
|
-
// Build LLM prompt for test case generation
|
|
42
|
-
const prompt = `You are a qualification test case generator. Given an agent's mission brief, generate 5-10 test cases (mix of nominal, edge, and anti cases) and 3-5 scoring dimensions.
|
|
61
|
+
const data = await response.json();
|
|
62
|
+
return extractLlmContent(data, resolved.type === 'anthropic');
|
|
63
|
+
}
|
|
64
|
+
/* ── Prompt builders ── */
|
|
65
|
+
function buildGenerateSuitePrompt(body) {
|
|
66
|
+
return `You are a qualification test case generator. Given an agent's mission brief, generate 5-8 test cases (mix of nominal, edge, and anti cases) and 3-5 scoring dimensions.
|
|
43
67
|
|
|
44
68
|
Mission Brief: "${body.missionBrief}"
|
|
45
69
|
${body.persona ? `Persona: "${body.persona}"` : ''}
|
|
46
70
|
${body.constraints ? `Constraints: "${body.constraints}"` : ''}
|
|
47
71
|
${body.objectives ? `Objectives: "${body.objectives}"` : ''}
|
|
48
72
|
|
|
49
|
-
Generate test cases that
|
|
73
|
+
Generate test cases that evaluate accuracy, edge case handling, constraint compliance, and failure modes.
|
|
50
74
|
|
|
51
75
|
Return JSON in this exact format:
|
|
52
76
|
{
|
|
53
77
|
"testCases": [
|
|
54
|
-
{
|
|
55
|
-
"type": "nominal|edge|anti",
|
|
56
|
-
"label": "Brief description of test",
|
|
57
|
-
"input": "Input to send to the agent",
|
|
58
|
-
"expectedBehavior": "What the agent should do"
|
|
59
|
-
}
|
|
78
|
+
{ "type": "nominal|edge|anti", "label": "Brief description", "input": "Agent input", "expectedBehavior": "What the agent should do" }
|
|
60
79
|
],
|
|
61
80
|
"scoringDimensions": [
|
|
62
|
-
{
|
|
63
|
-
"name": "Dimension name",
|
|
64
|
-
"weight": 0.25
|
|
65
|
-
}
|
|
81
|
+
{ "name": "Dimension name", "weight": 0.25 }
|
|
66
82
|
]
|
|
67
83
|
}
|
|
68
84
|
|
|
69
|
-
Ensure weights sum to 1.0. Generate
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
85
|
+
Ensure weights sum to 1.0. Generate specific, realistic test inputs that challenge the agent.`;
|
|
86
|
+
}
|
|
87
|
+
function buildJudgePrompt(testCase, agentContent, dims) {
|
|
88
|
+
return `You are evaluating an AI agent's response for a qualification test.
|
|
89
|
+
|
|
90
|
+
TEST CASE:
|
|
91
|
+
Type: ${testCase.type}
|
|
92
|
+
Input: "${testCase.input}"
|
|
93
|
+
Expected Behavior: "${testCase.expectedBehavior}"
|
|
94
|
+
|
|
95
|
+
AGENT'S ACTUAL RESPONSE:
|
|
96
|
+
"${agentContent}"
|
|
97
|
+
|
|
98
|
+
SCORING DIMENSIONS:
|
|
99
|
+
${dims.map(d => `- ${d.name} (weight: ${Math.round(d.weight * 100)}%)`).join('\n')}
|
|
100
|
+
|
|
101
|
+
Score the agent's response on a scale of 0-100 for each dimension:
|
|
102
|
+
- accuracy: factual correctness and completeness
|
|
103
|
+
- tone: adherence to expected persona and communication style
|
|
104
|
+
- constraint compliance: does it stay within defined boundaries
|
|
105
|
+
- hallucination: does it avoid making up unsupported claims
|
|
106
|
+
|
|
107
|
+
For "anti" cases: a high score means the agent CORRECTLY refused the request.
|
|
108
|
+
For "nominal" cases: score how well the response matches expected behavior.
|
|
109
|
+
For "edge" cases: score how gracefully it handles ambiguity.
|
|
110
|
+
|
|
111
|
+
Return JSON:
|
|
112
|
+
{
|
|
113
|
+
"dimensionScores": { ${dims.map(d => `"${d.id}": 0`).join(', ')} },
|
|
114
|
+
"overallScore": 0,
|
|
115
|
+
"feedback": "Brief explanation"
|
|
116
|
+
}`;
|
|
117
|
+
}
|
|
118
|
+
function buildPatchPrompt(suite, failedTests) {
|
|
119
|
+
const failedSummary = failedTests.slice(0, 5).map(t => {
|
|
120
|
+
const tc = suite.testCases.find(c => c.id === t.testCaseId);
|
|
121
|
+
return `- [${tc?.type}] "${tc?.label}": score ${t.score}, feedback: ${t.feedback}`;
|
|
122
|
+
}).join('\n');
|
|
123
|
+
return `An AI agent scored below the pass threshold. Generate 2-3 targeted improvement patches.
|
|
124
|
+
|
|
125
|
+
Mission: "${suite.missionBrief}"
|
|
126
|
+
Failed tests:
|
|
127
|
+
${failedSummary}
|
|
128
|
+
|
|
129
|
+
Return JSON with patches that fix the specific failures:
|
|
130
|
+
{
|
|
131
|
+
"patches": [
|
|
132
|
+
{
|
|
133
|
+
"targetField": "instructionState.persona|constraints.customConstraints|instructionState.objectives",
|
|
134
|
+
"description": "What this fixes",
|
|
135
|
+
"diff": "+ Specific text to add to the field"
|
|
136
|
+
}
|
|
137
|
+
]
|
|
138
|
+
}`;
|
|
139
|
+
}
|
|
140
|
+
async function runSingleTestCase(resolved, model, systemPrompt, testCase, dims, passThreshold) {
|
|
141
|
+
const agentMessages = [
|
|
142
|
+
{ role: 'system', content: systemPrompt },
|
|
143
|
+
{ role: 'user', content: testCase.input },
|
|
144
|
+
];
|
|
145
|
+
const agentContent = await callLlm(resolved, model, agentMessages, 1000);
|
|
146
|
+
const judgeMessages = [{ role: 'user', content: buildJudgePrompt(testCase, agentContent, dims) }];
|
|
147
|
+
const judgeContent = await callLlm(resolved, model, judgeMessages, 1000);
|
|
148
|
+
return parseJudgeResponse(judgeContent, testCase.id, passThreshold, dims);
|
|
149
|
+
}
|
|
150
|
+
function parseJudgeResponse(content, testCaseId, passThreshold, dims) {
|
|
151
|
+
const match = content.match(/\{[\s\S]*\}/);
|
|
152
|
+
if (!match) {
|
|
153
|
+
return { testCaseId, score: 50, passed: false, feedback: 'Failed to parse judge response', dimensionScores: {} };
|
|
154
|
+
}
|
|
155
|
+
try {
|
|
156
|
+
const data = JSON.parse(match[0]);
|
|
157
|
+
const score = Math.max(0, Math.min(100, Math.round(data.overallScore ?? 50)));
|
|
158
|
+
const dimScores = {};
|
|
159
|
+
for (const dim of dims) {
|
|
160
|
+
dimScores[dim.id] = Math.max(0, Math.min(100, Math.round(data.dimensionScores?.[dim.id] ?? score)));
|
|
94
161
|
}
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
162
|
+
return { testCaseId, score, passed: score >= passThreshold, feedback: data.feedback ?? '', dimensionScores: dimScores };
|
|
163
|
+
}
|
|
164
|
+
catch {
|
|
165
|
+
return { testCaseId, score: 50, passed: false, feedback: 'Judge parse error', dimensionScores: {} };
|
|
166
|
+
}
|
|
167
|
+
}
|
|
168
|
+
async function generateLlmPatches(resolved, model, suite, results) {
|
|
169
|
+
const failedTests = results.filter(r => !r.passed);
|
|
170
|
+
if (failedTests.length === 0)
|
|
171
|
+
return [];
|
|
172
|
+
try {
|
|
173
|
+
const content = await callLlm(resolved, model, [{ role: 'user', content: buildPatchPrompt(suite, failedTests) }], 1000);
|
|
174
|
+
const match = content.match(/\{[\s\S]*\}/);
|
|
175
|
+
if (!match)
|
|
176
|
+
return [];
|
|
177
|
+
const data = JSON.parse(match[0]);
|
|
178
|
+
return (data.patches ?? []).map((p) => ({
|
|
179
|
+
id: randomUUID(),
|
|
180
|
+
targetField: p.targetField ?? 'instructionState.persona',
|
|
181
|
+
description: p.description ?? '',
|
|
182
|
+
diff: p.diff ?? '',
|
|
183
|
+
applied: false,
|
|
184
|
+
}));
|
|
185
|
+
}
|
|
186
|
+
catch {
|
|
187
|
+
return [];
|
|
188
|
+
}
|
|
189
|
+
}
|
|
190
|
+
/* ── POST /generate-suite ── */
|
|
191
|
+
router.post('/generate-suite', async (req, res) => {
|
|
192
|
+
const body = req.body;
|
|
193
|
+
if (!body.agentId || !body.missionBrief) {
|
|
194
|
+
res.status(400).json({ status: 'error', error: 'agentId and missionBrief are required' });
|
|
195
|
+
return;
|
|
196
|
+
}
|
|
197
|
+
try {
|
|
198
|
+
const config = readConfig();
|
|
199
|
+
const configProvider = body.providerId
|
|
200
|
+
? config.providers.find(p => p.id === body.providerId)
|
|
201
|
+
: config.providers.find(p => !!p.apiKey && !!p.baseUrl);
|
|
202
|
+
if (!configProvider?.apiKey) {
|
|
203
|
+
res.status(400).json({ status: 'error', error: 'No connected LLM provider found. Configure one in Settings → Providers.' });
|
|
106
204
|
return;
|
|
107
205
|
}
|
|
108
|
-
const
|
|
109
|
-
|
|
110
|
-
const
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
}
|
|
114
|
-
|
|
115
|
-
const jsonMatch = content.match(/\{[\s\S]*\}/);
|
|
116
|
-
if (!jsonMatch) {
|
|
206
|
+
const baseUrl = normalizeBaseUrl(configProvider.id, configProvider.baseUrl);
|
|
207
|
+
const type = inferType(configProvider.id, baseUrl, configProvider.type);
|
|
208
|
+
const model = body.model ?? (type === 'anthropic' ? 'claude-3-5-haiku-20241022' : 'gpt-4o-mini');
|
|
209
|
+
const resolved = { baseUrl, type, apiKey: configProvider.apiKey };
|
|
210
|
+
const content = await callLlm(resolved, model, [{ role: 'user', content: buildGenerateSuitePrompt(body) }]);
|
|
211
|
+
const match = content.match(/\{[\s\S]*\}/);
|
|
212
|
+
if (!match)
|
|
117
213
|
throw new Error('No JSON found in LLM response');
|
|
118
|
-
|
|
119
|
-
const
|
|
120
|
-
// Transform and validate the generated data
|
|
121
|
-
const testCases = (generatedData.testCases || []).map((tc) => ({
|
|
214
|
+
const generated = JSON.parse(match[0]);
|
|
215
|
+
const testCases = (generated.testCases ?? []).map((tc) => ({
|
|
122
216
|
id: randomUUID(),
|
|
123
|
-
type: tc.type
|
|
124
|
-
label: tc.label
|
|
125
|
-
input: tc.input
|
|
126
|
-
expectedBehavior: tc.expectedBehavior
|
|
217
|
+
type: tc.type ?? 'nominal',
|
|
218
|
+
label: tc.label ?? '',
|
|
219
|
+
input: tc.input ?? '',
|
|
220
|
+
expectedBehavior: tc.expectedBehavior ?? '',
|
|
127
221
|
}));
|
|
128
|
-
const
|
|
222
|
+
const rawDims = (generated.scoringDimensions ?? []).map((d) => ({
|
|
129
223
|
id: randomUUID(),
|
|
130
|
-
name:
|
|
131
|
-
weight:
|
|
224
|
+
name: d.name ?? 'Dimension',
|
|
225
|
+
weight: d.weight ?? 0.25,
|
|
132
226
|
}));
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
dim.weight = dim.weight / totalWeight;
|
|
138
|
-
});
|
|
139
|
-
}
|
|
140
|
-
const response = { testCases, scoringDimensions };
|
|
227
|
+
const totalWeight = rawDims.reduce((s, d) => s + d.weight, 0);
|
|
228
|
+
if (totalWeight > 0)
|
|
229
|
+
rawDims.forEach(d => { d.weight = d.weight / totalWeight; });
|
|
230
|
+
const response = { testCases, scoringDimensions: rawDims };
|
|
141
231
|
res.json({ status: 'ok', data: response });
|
|
142
232
|
}
|
|
143
233
|
catch (err) {
|
|
144
|
-
|
|
145
|
-
res.status(500).json({
|
|
146
|
-
status: 'error',
|
|
147
|
-
error: err instanceof Error ? err.message : String(err)
|
|
148
|
-
});
|
|
234
|
+
res.status(500).json({ status: 'error', error: err instanceof Error ? err.message : String(err) });
|
|
149
235
|
}
|
|
150
236
|
});
|
|
151
|
-
/* ── POST /run ── */
|
|
237
|
+
/* ── POST /run (SSE) ── */
|
|
152
238
|
router.post('/run', async (req, res) => {
|
|
153
239
|
const body = req.body;
|
|
154
240
|
if (!body.agentId || !body.providerId || !body.model || !body.suite) {
|
|
155
241
|
res.status(400).json({ status: 'error', error: 'agentId, providerId, model, and suite are required' });
|
|
156
242
|
return;
|
|
157
243
|
}
|
|
244
|
+
const config = readConfig();
|
|
245
|
+
const provider = config.providers.find(p => p.id === body.providerId);
|
|
246
|
+
if (!provider?.apiKey) {
|
|
247
|
+
res.status(400).json({ status: 'error', error: `Provider ${body.providerId} not found or not configured` });
|
|
248
|
+
return;
|
|
249
|
+
}
|
|
250
|
+
const baseUrl = normalizeBaseUrl(provider.id, provider.baseUrl);
|
|
251
|
+
const type = inferType(provider.id, baseUrl, provider.type);
|
|
252
|
+
const resolved = { baseUrl, type, apiKey: provider.apiKey };
|
|
253
|
+
res.setHeader('Content-Type', 'text/event-stream');
|
|
254
|
+
res.setHeader('Cache-Control', 'no-cache');
|
|
255
|
+
res.setHeader('Connection', 'keep-alive');
|
|
256
|
+
const emit = (data) => res.write(`data: ${JSON.stringify(data)}\n\n`);
|
|
257
|
+
const runId = randomUUID();
|
|
258
|
+
const { suite } = body;
|
|
259
|
+
emit({ type: 'start', runId, totalCases: suite.testCases.length });
|
|
260
|
+
// Load agent state to build a proper system prompt
|
|
261
|
+
const agentState = loadAgent(body.agentId);
|
|
262
|
+
const persona = agentState?.instructionState?.['persona'] ?? '';
|
|
263
|
+
const systemPrompt = [
|
|
264
|
+
`You are an AI assistant. Mission: ${suite.missionBrief}`,
|
|
265
|
+
persona ? `Persona: ${persona}` : '',
|
|
266
|
+
'Stay within your defined mission. Refuse out-of-scope requests politely.',
|
|
267
|
+
].filter(Boolean).join('\n\n');
|
|
268
|
+
const testResults = [];
|
|
269
|
+
const dimAccum = {};
|
|
158
270
|
try {
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
res.status(400).json({
|
|
164
|
-
status: 'error',
|
|
165
|
-
error: `Provider ${body.providerId} not found or not configured`
|
|
166
|
-
});
|
|
167
|
-
return;
|
|
168
|
-
}
|
|
169
|
-
const runId = randomUUID();
|
|
170
|
-
// Build the agent's system prompt from mission brief
|
|
171
|
-
const systemPrompt = `You are an AI assistant. Your mission: ${body.suite.missionBrief}
|
|
172
|
-
|
|
173
|
-
You must stay within the scope of this mission and follow these guidelines:
|
|
174
|
-
- Be helpful and accurate
|
|
175
|
-
- Stay within your defined role
|
|
176
|
-
- If asked to do something outside your mission, politely decline
|
|
177
|
-
- Be consistent with your persona and constraints`;
|
|
178
|
-
const baseUrl = provider.baseUrl.replace(/\/+$/, '');
|
|
179
|
-
const isAnthropic = provider.id.includes('anthropic') || baseUrl.includes('anthropic.com');
|
|
180
|
-
// Process each test case
|
|
181
|
-
const testResults = [];
|
|
182
|
-
for (const testCase of body.suite.testCases) {
|
|
271
|
+
for (let i = 0; i < suite.testCases.length; i++) {
|
|
272
|
+
const tc = suite.testCases[i];
|
|
273
|
+
emit({ type: 'case_start', testCaseId: tc.id, label: tc.label, index: i + 1 });
|
|
274
|
+
let result;
|
|
183
275
|
try {
|
|
184
|
-
|
|
185
|
-
const agentMessages = [
|
|
186
|
-
{ role: 'system', content: systemPrompt },
|
|
187
|
-
{ role: 'user', content: testCase.input }
|
|
188
|
-
];
|
|
189
|
-
const agentRequestBody = isAnthropic ? {
|
|
190
|
-
model: body.model,
|
|
191
|
-
max_tokens: 1000,
|
|
192
|
-
messages: agentMessages.filter(m => m.role !== 'system'),
|
|
193
|
-
system: systemPrompt
|
|
194
|
-
} : {
|
|
195
|
-
model: body.model,
|
|
196
|
-
max_tokens: 1000,
|
|
197
|
-
messages: agentMessages
|
|
198
|
-
};
|
|
199
|
-
const headers = {
|
|
200
|
-
'Content-Type': 'application/json'
|
|
201
|
-
};
|
|
202
|
-
if (isAnthropic) {
|
|
203
|
-
headers['x-api-key'] = provider.apiKey;
|
|
204
|
-
headers['anthropic-version'] = '2023-06-01';
|
|
205
|
-
}
|
|
206
|
-
else {
|
|
207
|
-
headers['Authorization'] = `Bearer ${provider.apiKey}`;
|
|
208
|
-
}
|
|
209
|
-
const agentResponse = await fetch(`${baseUrl}/messages`, {
|
|
210
|
-
method: 'POST',
|
|
211
|
-
headers,
|
|
212
|
-
body: JSON.stringify(agentRequestBody)
|
|
213
|
-
});
|
|
214
|
-
if (!agentResponse.ok) {
|
|
215
|
-
testResults.push({
|
|
216
|
-
testCaseId: testCase.id,
|
|
217
|
-
score: 0,
|
|
218
|
-
passed: false,
|
|
219
|
-
feedback: `Failed to get agent response: ${agentResponse.status}`
|
|
220
|
-
});
|
|
221
|
-
continue;
|
|
222
|
-
}
|
|
223
|
-
const agentData = await agentResponse.json();
|
|
224
|
-
// Extract agent's response content
|
|
225
|
-
const agentContent = extractLlmContent(agentData, isAnthropic);
|
|
226
|
-
// 2. Use LLM as judge to score the response
|
|
227
|
-
const judgePrompt = `You are evaluating an AI agent's response for a qualification test.
|
|
228
|
-
|
|
229
|
-
TEST CASE:
|
|
230
|
-
Type: ${testCase.type}
|
|
231
|
-
Input: "${testCase.input}"
|
|
232
|
-
Expected Behavior: "${testCase.expectedBehavior}"
|
|
233
|
-
|
|
234
|
-
AGENT'S ACTUAL RESPONSE:
|
|
235
|
-
"${agentContent}"
|
|
236
|
-
|
|
237
|
-
SCORING DIMENSIONS:
|
|
238
|
-
${body.suite.scoringDimensions.map(d => `- ${d.name} (${Math.round(d.weight * 100)}%)`).join('\n')}
|
|
239
|
-
|
|
240
|
-
Rate the agent's response on a scale of 0-100 for each dimension. Consider:
|
|
241
|
-
- How well did it match the expected behavior?
|
|
242
|
-
- For "nominal" cases: Is it accurate and helpful?
|
|
243
|
-
- For "edge" cases: Does it handle ambiguity appropriately?
|
|
244
|
-
- For "anti" cases: Does it refuse inappropriate requests and stay in scope?
|
|
245
|
-
|
|
246
|
-
Return JSON in this exact format:
|
|
247
|
-
{
|
|
248
|
-
"dimensionScores": {
|
|
249
|
-
${body.suite.scoringDimensions.map(d => `"${d.id}": <score 0-100>`).join(',\n ')}
|
|
250
|
-
},
|
|
251
|
-
"overallScore": <weighted average 0-100>,
|
|
252
|
-
"feedback": "<brief explanation of the score>"
|
|
253
|
-
}`;
|
|
254
|
-
const judgeMessages = [
|
|
255
|
-
{ role: 'user', content: judgePrompt }
|
|
256
|
-
];
|
|
257
|
-
const judgeRequestBody = isAnthropic ? {
|
|
258
|
-
model: body.model,
|
|
259
|
-
max_tokens: 1000,
|
|
260
|
-
messages: judgeMessages
|
|
261
|
-
} : {
|
|
262
|
-
model: body.model,
|
|
263
|
-
max_tokens: 1000,
|
|
264
|
-
messages: judgeMessages
|
|
265
|
-
};
|
|
266
|
-
const judgeResponse = await fetch(`${baseUrl}/messages`, {
|
|
267
|
-
method: 'POST',
|
|
268
|
-
headers,
|
|
269
|
-
body: JSON.stringify(judgeRequestBody)
|
|
270
|
-
});
|
|
271
|
-
if (!judgeResponse.ok) {
|
|
272
|
-
testResults.push({
|
|
273
|
-
testCaseId: testCase.id,
|
|
274
|
-
score: 50,
|
|
275
|
-
passed: false,
|
|
276
|
-
feedback: `Failed to score response: ${judgeResponse.status}`
|
|
277
|
-
});
|
|
278
|
-
continue;
|
|
279
|
-
}
|
|
280
|
-
const judgeData = await judgeResponse.json();
|
|
281
|
-
// Extract judge's scoring
|
|
282
|
-
const judgeContent = extractLlmContent(judgeData, isAnthropic);
|
|
283
|
-
// Parse scoring JSON
|
|
284
|
-
const jsonMatch = judgeContent.match(/\{[\s\S]*\}/);
|
|
285
|
-
let score = 50;
|
|
286
|
-
let feedback = 'Default scoring due to parsing error';
|
|
287
|
-
if (jsonMatch) {
|
|
288
|
-
try {
|
|
289
|
-
const scoreData = JSON.parse(jsonMatch[0]);
|
|
290
|
-
score = Math.round(scoreData.overallScore || 50);
|
|
291
|
-
feedback = scoreData.feedback || 'No feedback provided';
|
|
292
|
-
}
|
|
293
|
-
catch {
|
|
294
|
-
// Use default values
|
|
295
|
-
}
|
|
296
|
-
}
|
|
297
|
-
testResults.push({
|
|
298
|
-
testCaseId: testCase.id,
|
|
299
|
-
score: Math.max(0, Math.min(100, score)),
|
|
300
|
-
passed: score >= body.suite.passThreshold,
|
|
301
|
-
feedback
|
|
302
|
-
});
|
|
276
|
+
result = await runSingleTestCase(resolved, body.model, systemPrompt, tc, suite.scoringDimensions, suite.passThreshold);
|
|
303
277
|
}
|
|
304
278
|
catch (err) {
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
feedback: `Error: ${err instanceof Error ? err.message : String(err)}`
|
|
311
|
-
});
|
|
279
|
+
result = {
|
|
280
|
+
testCaseId: tc.id, score: 0, passed: false,
|
|
281
|
+
feedback: err instanceof Error ? err.message : String(err),
|
|
282
|
+
dimensionScores: {},
|
|
283
|
+
};
|
|
312
284
|
}
|
|
285
|
+
testResults.push({ testCaseId: result.testCaseId, score: result.score, passed: result.passed, feedback: result.feedback });
|
|
286
|
+
for (const [dimId, score] of Object.entries(result.dimensionScores)) {
|
|
287
|
+
dimAccum[dimId] = dimAccum[dimId] ?? [];
|
|
288
|
+
dimAccum[dimId].push(score);
|
|
289
|
+
}
|
|
290
|
+
emit({ type: 'case_done', testCaseId: tc.id, score: result.score, passed: result.passed, feedback: result.feedback });
|
|
313
291
|
}
|
|
314
|
-
// Calculate dimension scores (simplified - average from test results)
|
|
315
292
|
const dimensionScores = {};
|
|
316
|
-
for (const dim of
|
|
317
|
-
const
|
|
318
|
-
dimensionScores[dim.id] =
|
|
293
|
+
for (const dim of suite.scoringDimensions) {
|
|
294
|
+
const scores = dimAccum[dim.id] ?? [];
|
|
295
|
+
dimensionScores[dim.id] = scores.length > 0
|
|
296
|
+
? Math.round(scores.reduce((s, v) => s + v, 0) / scores.length)
|
|
297
|
+
: Math.round(testResults.reduce((s, r) => s + r.score, 0) / (testResults.length || 1));
|
|
319
298
|
}
|
|
320
|
-
|
|
321
|
-
const
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
if (globalScore < body.suite.passThreshold) {
|
|
327
|
-
const failedTests = testResults.filter(t => !t.passed);
|
|
328
|
-
const hasAntiFailures = failedTests.some(t => body.suite.testCases.find(tc => tc.id === t.testCaseId)?.type === 'anti');
|
|
329
|
-
if (hasAntiFailures) {
|
|
330
|
-
patches.push({
|
|
331
|
-
id: randomUUID(),
|
|
332
|
-
targetField: 'constraints.customConstraints',
|
|
333
|
-
description: 'Add explicit scope boundary to prevent out-of-scope responses',
|
|
334
|
-
diff: '+ Always refuse requests outside the defined mission brief.',
|
|
335
|
-
applied: false,
|
|
336
|
-
});
|
|
337
|
-
}
|
|
338
|
-
if (failedTests.length > body.suite.testCases.length / 2) {
|
|
339
|
-
patches.push({
|
|
340
|
-
id: randomUUID(),
|
|
341
|
-
targetField: 'instructionState.persona',
|
|
342
|
-
description: 'Enhance persona clarity and instructions',
|
|
343
|
-
diff: '+ Be more explicit about your role and capabilities.',
|
|
344
|
-
applied: false,
|
|
345
|
-
});
|
|
346
|
-
}
|
|
347
|
-
}
|
|
348
|
-
const response = { runId, globalScore, dimensionScores, testResults, patches };
|
|
349
|
-
pushHistory(body.agentId, { runId, timestamp: Date.now(), globalScore, passThreshold: body.suite.passThreshold });
|
|
350
|
-
res.json({ status: 'ok', data: response });
|
|
299
|
+
const globalScore = Math.round(suite.scoringDimensions.reduce((sum, dim) => sum + (dimensionScores[dim.id] ?? 0) * dim.weight, 0));
|
|
300
|
+
const patches = globalScore < suite.passThreshold
|
|
301
|
+
? await generateLlmPatches(resolved, body.model, suite, testResults)
|
|
302
|
+
: [];
|
|
303
|
+
await saveQualificationRun(body.agentId, { runId, timestamp: Date.now(), globalScore, passThreshold: suite.passThreshold });
|
|
304
|
+
emit({ type: 'done', runId, globalScore, dimensionScores, testResults, patches });
|
|
351
305
|
}
|
|
352
306
|
catch (err) {
|
|
353
|
-
|
|
354
|
-
res.status(500).json({
|
|
355
|
-
status: 'error',
|
|
356
|
-
error: err instanceof Error ? err.message : String(err)
|
|
357
|
-
});
|
|
307
|
+
emit({ type: 'error', message: err instanceof Error ? err.message : String(err) });
|
|
358
308
|
}
|
|
309
|
+
res.end();
|
|
359
310
|
});
|
|
360
311
|
/* ── POST /apply-patches ── */
|
|
312
|
+
function setNestedValue(obj, path, value) {
|
|
313
|
+
const parts = path.split('.');
|
|
314
|
+
const last = parts.pop();
|
|
315
|
+
if (!last)
|
|
316
|
+
return;
|
|
317
|
+
let cur = obj;
|
|
318
|
+
for (const part of parts) {
|
|
319
|
+
if (typeof cur[part] !== 'object' || cur[part] === null)
|
|
320
|
+
cur[part] = {};
|
|
321
|
+
cur = cur[part];
|
|
322
|
+
}
|
|
323
|
+
cur[last] = value;
|
|
324
|
+
}
|
|
325
|
+
function extractPatchContent(diff) {
|
|
326
|
+
return diff.split('\n')
|
|
327
|
+
.filter(line => line.startsWith('+ '))
|
|
328
|
+
.map(line => line.slice(2).trim())
|
|
329
|
+
.join('\n');
|
|
330
|
+
}
|
|
361
331
|
router.post('/apply-patches', async (req, res) => {
|
|
362
332
|
const body = req.body;
|
|
363
333
|
if (!body.agentId || !body.runId || !body.patchIds?.length) {
|
|
@@ -365,49 +335,37 @@ router.post('/apply-patches', async (req, res) => {
|
|
|
365
335
|
return;
|
|
366
336
|
}
|
|
367
337
|
try {
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
const appliedPatches = [];
|
|
338
|
+
const agentState = loadAgent(body.agentId);
|
|
339
|
+
if (!agentState) {
|
|
340
|
+
res.status(404).json({ status: 'error', error: `Agent ${body.agentId} not found` });
|
|
341
|
+
return;
|
|
342
|
+
}
|
|
343
|
+
const toApply = (body.patches ?? []).filter(p => body.patchIds.includes(p.id));
|
|
375
344
|
const configUpdates = {};
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
// }
|
|
345
|
+
for (const patch of toApply) {
|
|
346
|
+
const newContent = extractPatchContent(patch.diff);
|
|
347
|
+
if (!newContent)
|
|
348
|
+
continue;
|
|
349
|
+
const path = patch.targetField.startsWith('instructionState.')
|
|
350
|
+
? patch.targetField.slice('instructionState.'.length)
|
|
351
|
+
: patch.targetField;
|
|
352
|
+
const current = agentState.instructionState[path];
|
|
353
|
+
const updated = typeof current === 'string' && current ? `${current}\n${newContent}` : newContent;
|
|
354
|
+
setNestedValue(agentState.instructionState, path, updated);
|
|
355
|
+
configUpdates[patch.targetField] = updated;
|
|
388
356
|
}
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
applied: appliedPatches,
|
|
393
|
-
configUpdates,
|
|
394
|
-
message: `Applied ${appliedPatches.length} patch(es) to agent ${body.agentId}`,
|
|
395
|
-
note: 'Patch application is currently simulated. In production, this would modify the actual agent configuration.',
|
|
396
|
-
},
|
|
397
|
-
});
|
|
357
|
+
createAgentVersion(body.agentId, agentState.version, `qual-patch-${body.runId.slice(0, 8)}`);
|
|
358
|
+
saveAgent(body.agentId, agentState);
|
|
359
|
+
res.json({ status: 'ok', data: { applied: body.patchIds, configUpdates, message: `Applied ${body.patchIds.length} patch(es) to agent ${body.agentId}` } });
|
|
398
360
|
}
|
|
399
361
|
catch (err) {
|
|
400
|
-
|
|
401
|
-
res.status(500).json({
|
|
402
|
-
status: 'error',
|
|
403
|
-
error: err instanceof Error ? err.message : String(err)
|
|
404
|
-
});
|
|
362
|
+
res.status(500).json({ status: 'error', error: err instanceof Error ? err.message : String(err) });
|
|
405
363
|
}
|
|
406
364
|
});
|
|
407
365
|
/* ── GET /:agentId/history ── */
|
|
408
|
-
router.get('/:agentId/history', (req, res) => {
|
|
366
|
+
router.get('/:agentId/history', async (req, res) => {
|
|
409
367
|
const agentId = String(req.params['agentId'] ?? '');
|
|
410
|
-
const history =
|
|
368
|
+
const history = await getQualificationHistory(agentId);
|
|
411
369
|
res.json({ status: 'ok', data: history });
|
|
412
370
|
});
|
|
413
371
|
export default router;
|