@elizaos/cli 1.4.4 → 1.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/BrowserWebSocketTransport-5YQPVDV7.js +7 -0
- package/dist/EnhancedEvaluationEngine-APOQ6INN.js +473 -0
- package/dist/EvaluationEngine-Y7ZQJBRC.js +9 -0
- package/dist/LocalEnvironmentProvider-JWFGG4IN.js +15 -0
- package/dist/NodeWebSocketTransport-PUO724EY.js +8 -0
- package/dist/ScreenRecorder-YK246DNJ.js +10 -0
- package/dist/agent-start-6QJQAMKA.js +13 -0
- package/dist/bidi-2SVNH6F7.js +15309 -0
- package/dist/{bun-exec-ULMPAIQC.js → bun-exec-NH4UCUY4.js} +1 -1
- package/dist/chunk-2ESYSVXG.js +48 -0
- package/dist/chunk-3AEYIKBZ.js +432 -0
- package/dist/chunk-5IWKEMEF.js +239 -0
- package/dist/chunk-5WZO2HMM.js +2644 -0
- package/dist/chunk-ABGBVB74.js +3501 -0
- package/dist/{chunk-NSNXXD3I.js → chunk-BCO32GR6.js} +2 -2
- package/dist/chunk-CGXTFHQP.js +25 -0
- package/dist/chunk-EXUFDTUD.js +3948 -0
- package/dist/chunk-FGGNHEXZ.js +211860 -0
- package/dist/chunk-FWYHSCLF.js +243 -0
- package/dist/chunk-I57T3WPO.js +165 -0
- package/dist/chunk-LBZLMFFF.js +221 -0
- package/dist/chunk-LG7YDBMV.js +401 -0
- package/dist/chunk-NHKLUXNE.js +166 -0
- package/dist/chunk-PUZHCSGF.js +828 -0
- package/dist/chunk-PWDR7CPA.js +7828 -0
- package/dist/{chunk-N5G5XSGP.js → chunk-Q6M2K53X.js} +3 -3
- package/dist/chunk-SVHCNBHM.js +289 -0
- package/dist/{chunk-HOC6B3QV.js → chunk-VFFOOPYS.js} +4 -238
- package/dist/chunk-WX37MM4G.js +292 -0
- package/dist/chunk-XFJIHUT3.js +6 -0
- package/dist/chunk-XPPESCCM.js +787 -0
- package/dist/chunk-YBDC5OZO.js +40 -0
- package/dist/commands/agent/actions/index.js +2 -2
- package/dist/commands/agent/index.js +2 -2
- package/dist/commands/create/actions/index.js +4 -3
- package/dist/commands/create/index.js +5 -4
- package/dist/commands/shared/index.js +1 -1
- package/dist/index.js +66796 -4986
- package/dist/js-yaml-KADNMPWR.js +35 -0
- package/dist/matrix-orchestrator-3WLRK7GG.js +1070 -0
- package/dist/matrix-runner-KDPETCKQ.js +160 -0
- package/dist/matrix-schema-PCO2KGJY.js +102 -0
- package/dist/parameter-override-ALOPPXCE.js +487 -0
- package/dist/{plugin-creator-TCUFII32.js → plugin-creator-J7GNPMPG.js} +1 -1
- package/dist/process-manager-IU2A3BTQ.js +9 -0
- package/dist/{registry-ELONUC44.js → registry-65KMEA7N.js} +2 -2
- package/dist/resource-monitor-EHZSH2P6.js +15 -0
- package/dist/run-isolation-PGLZ37Y7.js +29 -0
- package/dist/runtime-factory-Q4U5YBNV.js +22 -0
- package/dist/schema-C25LVPEK.js +17 -0
- package/dist/src/commands/report/src/assets/report_template.html +1704 -0
- package/dist/src-EJG4ILDC.js +5 -0
- package/dist/templates/plugin-quick-starter/package.json +2 -2
- package/dist/templates/plugin-starter/package.json +2 -2
- package/dist/templates/project-starter/package.json +4 -4
- package/dist/templates/project-tee-starter/package.json +4 -4
- package/dist/typescript-ZF3IK2DJ.js +5 -0
- package/dist/{utils-X6UXPLKD.js → utils-QFD2PW4X.js} +2 -2
- package/package.json +14 -8
- package/templates/plugin-quick-starter/package.json +2 -2
- package/templates/plugin-starter/package.json +2 -2
- package/templates/project-starter/package.json +4 -4
- package/templates/project-tee-starter/package.json +4 -4
- package/dist/chunk-3RG5ZIWI.js +0 -10
|
@@ -0,0 +1,473 @@
|
|
|
1
|
+
import "./chunk-2ESYSVXG.js";
|
|
2
|
+
|
|
3
|
+
// src/commands/scenario/src/EnhancedEvaluationEngine.ts
|
|
4
|
+
import { ModelType } from "@elizaos/core";
|
|
5
|
+
import { z } from "zod";
|
|
6
|
+
/**
 * Dispatches scenario evaluations to registered "enhanced" evaluators and
 * collects one structured result object per evaluation.
 */
var EnhancedEvaluationEngine = class {
  // Evaluator registry keyed by evaluation type. Class fields are initialised
  // before the constructor body runs, so register() can be used in there.
  enhancedEvaluators = /* @__PURE__ */ new Map();

  /**
   * @param runtime - Handed to every evaluator as its third argument.
   */
  constructor(runtime) {
    this.runtime = runtime;
    const builtIns = [
      ["string_contains", EnhancedStringContainsEvaluator],
      ["regex_match", EnhancedRegexMatchEvaluator],
      ["file_exists", EnhancedFileExistsEvaluator],
      ["trajectory_contains_action", EnhancedTrajectoryContainsActionEvaluator],
      ["llm_judge", EnhancedLLMJudgeEvaluator],
      ["execution_time", EnhancedExecutionTimeEvaluator]
    ];
    for (const [type, Evaluator] of builtIns) {
      this.register(type, new Evaluator());
    }
  }

  /**
   * Adds (or replaces) the evaluator used for the given evaluation type.
   *
   * @param type - Value matched against `evaluation.type`.
   * @param evaluator - Object exposing `evaluateEnhanced(evaluation, runResult, runtime)`.
   */
  register(type, evaluator) {
    this.enhancedEvaluators.set(type, evaluator);
  }

  /**
   * Runs the evaluations one after another and returns their structured
   * results in the same order.
   *
   * An unknown type or an evaluator that throws does not abort the run: each
   * is reported as a failed result entry instead.
   *
   * @param evaluations - Iterable of evaluation configs, each carrying a `type`.
   * @param runResult - Run outcome forwarded untouched to every evaluator.
   * @returns Array with one result object per evaluation.
   */
  async runEnhancedEvaluations(evaluations, runResult) {
    const outcomes = [];
    for (const spec of evaluations) {
      const handler = this.enhancedEvaluators.get(spec.type);
      if (handler) {
        try {
          const outcome = await handler.evaluateEnhanced(spec, runResult, this.runtime);
          outcomes.push(outcome);
        } catch (error) {
          // Non-Error throwables are stringified so the report always has text.
          const reason = error instanceof Error ? error.message : String(error);
          outcomes.push({
            evaluator_type: spec.type,
            success: false,
            summary: `Evaluator '${spec.type}' failed with error: ${reason}`,
            details: {
              error: "evaluator_execution_failed",
              error_message: reason,
              evaluation_config: spec
            }
          });
        }
        continue;
      }
      outcomes.push({
        evaluator_type: spec.type,
        success: false,
        summary: `Unknown evaluator type: '${spec.type}'`,
        details: {
          error: "evaluator_not_found",
          requested_type: spec.type,
          available_types: Array.from(this.enhancedEvaluators.keys())
        }
      });
    }
    return outcomes;
  }
};
|
|
59
|
+
/**
 * Evaluator for `string_contains` checks: passes when the run's stdout
 * contains the configured substring.
 */
var EnhancedStringContainsEvaluator = class {
  /**
   * @param params - Evaluation config; reads `type`, `value` and the optional
   *   `case_sensitive` flag (treated as false when null/undefined).
   * @param runResult - Run outcome; only `stdout` is read.
   * @returns Structured result with the verdict and the strings compared.
   * @throws {Error} When `params.type` is not "string_contains".
   */
  async evaluateEnhanced(params, runResult) {
    if (params.type !== "string_contains") throw new Error("Mismatched evaluator");
    const { value: expectedValue } = params;
    const { stdout: actualOutput } = runResult;
    const caseSensitive = params.case_sensitive ?? false;

    // Case-insensitive comparison is done by lower-casing both sides.
    let haystack = actualOutput;
    let needle = expectedValue;
    if (!caseSensitive) {
      haystack = actualOutput.toLowerCase();
      needle = expectedValue.toLowerCase();
    }

    const success = haystack.includes(needle);

    let summary;
    if (success) {
      summary = `Assertion PASSED: Agent response contained the expected substring "${expectedValue}".`;
    } else {
      summary = `Assertion FAILED: Agent response did not contain the expected substring "${expectedValue}".`;
    }

    // Only the first 100 characters of the searched text are echoed back.
    const preview = haystack.substring(0, 100);
    const ellipsis = haystack.length > 100 ? "..." : "";

    return {
      evaluator_type: "string_contains",
      success,
      summary,
      details: {
        expected_value: expectedValue,
        actual_output: actualOutput,
        case_sensitive: caseSensitive,
        search_performed: `Looking for "${needle}" in "${preview}${ellipsis}"`
      }
    };
  }
};
|
|
81
|
+
/**
 * Evaluator for `regex_match` checks: passes when the configured pattern
 * matches somewhere in the run's stdout (always case-insensitively).
 */
var EnhancedRegexMatchEvaluator = class {
  /**
   * @param params - Evaluation config; reads `type` and `pattern`.
   * @param runResult - Run outcome; only `stdout` is read.
   * @returns Structured result; `details.match_found` carries the raw
   *   `RegExp.prototype.exec` result (`null` when nothing matched).
   * @throws {Error} When `params.type` is not "regex_match". An invalid
   *   pattern makes the `RegExp` constructor throw as well.
   */
  async evaluateEnhanced(params, runResult) {
    if (params.type !== "regex_match") throw new Error("Mismatched evaluator");
    const { pattern } = params;
    const { stdout: actualOutput } = runResult;

    const flags = "i";
    const hit = new RegExp(pattern, flags).exec(actualOutput);
    const success = hit !== null;

    let summary;
    if (success) {
      summary = `Regex PASSED: Pattern "${pattern}" matched in agent output.`;
    } else {
      summary = `Regex FAILED: Pattern "${pattern}" did not match in agent output.`;
    }

    return {
      evaluator_type: "regex_match",
      success,
      summary,
      details: {
        pattern,
        regex_flags: flags,
        actual_output: actualOutput,
        match_found: hit,
        match_index: hit?.index,
        matched_text: hit?.[0]
      }
    };
  }
};
|
|
104
|
+
/**
 * Evaluator for `file_exists` checks: passes when the expected path is among
 * the keys of `runResult.files`, tolerating a leading "./" on either side.
 */
var EnhancedFileExistsEvaluator = class {
  /**
   * @param params - Evaluation config; reads `type` and `path`.
   * @param runResult - Run outcome; the keys of `files` are the paths checked.
   * @returns Structured result listing every path seen and the first one
   *   that matched (`undefined` when none did).
   * @throws {Error} When `params.type` is not "file_exists".
   */
  async evaluateEnhanced(params, runResult) {
    if (params.type !== "file_exists") throw new Error("Mismatched evaluator");
    const expectedPath = params.path;
    const createdFiles = Object.keys(runResult.files);

    // The three spellings accepted for the same file: as given, with a
    // "./" prefix added, and with a leading "./" stripped.
    const acceptedSpellings = [
      expectedPath,
      `./${expectedPath}`,
      expectedPath.replace(/^\.\//, "")
    ];
    const matchingPath = createdFiles.find((candidate) => acceptedSpellings.includes(candidate));
    const success = matchingPath !== undefined;

    let summary;
    if (success) {
      summary = `File check PASSED: File "${expectedPath}" was created by the agent.`;
    } else {
      summary = `File check FAILED: File "${expectedPath}" was not created by the agent.`;
    }

    return {
      evaluator_type: "file_exists",
      success,
      summary,
      details: {
        expected_path: expectedPath,
        created_files: createdFiles,
        matching_path: matchingPath,
        total_files_created: createdFiles.length
      }
    };
  }
};
|
|
126
|
+
/**
 * Evaluator for `execution_time` checks: compares how long a run took with
 * the configured `max_duration_ms` / `min_duration_ms` bounds.
 */
var EnhancedExecutionTimeEvaluator = class {
  /**
   * The duration is taken from `runResult.durationMs` when present, otherwise
   * derived as `endedAtMs - startedAtMs`. When neither source is available
   * the check fails with a `no_timing_data` result instead of guessing.
   *
   * @param params - Evaluation config; reads `type`, `max_duration_ms`,
   *   `min_duration_ms` and `target_duration_ms`.
   * @param runResult - Run outcome; reads `durationMs`, `startedAtMs`, `endedAtMs`.
   * @returns Structured result with the measured duration and the bounds used.
   * @throws {Error} When `params.type` is not "execution_time".
   */
  async evaluateEnhanced(params, runResult) {
    if (params.type !== "execution_time") throw new Error("Mismatched evaluator");

    // `!= null` treats null and undefined alike: a timing field that was
    // serialised as null is missing data, not a timestamp of 0.
    const hasDuration = runResult.durationMs != null;
    const hasInterval = runResult.startedAtMs != null && runResult.endedAtMs != null;

    let duration = Number.NaN;
    if (hasDuration) {
      duration = runResult.durationMs;
    } else if (hasInterval) {
      duration = runResult.endedAtMs - runResult.startedAtMs;
    }

    if (Number.isNaN(duration)) {
      return {
        evaluator_type: "execution_time",
        success: false,
        summary: "Timing check FAILED: No timing information available for this step.",
        details: {
          error: "no_timing_data",
          runResult_timing: {
            durationMs: runResult.durationMs,
            startedAtMs: runResult.startedAtMs,
            endedAtMs: runResult.endedAtMs
          },
          constraints: params
        }
      };
    }

    // A missing bound never fails the check.
    const tooSlow = params.max_duration_ms != null && duration > params.max_duration_ms;
    const tooFast = params.min_duration_ms != null && duration < params.min_duration_ms;
    const success = !tooSlow && !tooFast;

    let summary;
    if (success) {
      summary = `Timing check PASSED: Execution took ${duration}ms (within expected range).`;
    } else if (tooSlow) {
      summary = `Timing check FAILED: Execution took ${duration}ms (exceeded maximum of ${params.max_duration_ms}ms).`;
    } else {
      summary = `Timing check FAILED: Execution took ${duration}ms (below minimum of ${params.min_duration_ms}ms).`;
    }

    return {
      evaluator_type: "execution_time",
      success,
      summary,
      details: {
        actual_duration_ms: duration,
        max_duration_ms: params.max_duration_ms,
        min_duration_ms: params.min_duration_ms,
        target_duration_ms: params.target_duration_ms,
        // Relative distance from the target (0 = exactly on target); null
        // when no non-zero target was configured.
        performance_rating: params.target_duration_ms ? Math.abs(duration - params.target_duration_ms) / params.target_duration_ms : null,
        timing_breakdown: {
          started_at: runResult.startedAtMs,
          ended_at: runResult.endedAtMs,
          calculated_duration: runResult.durationMs
        }
      }
    };
  }
};
|
|
176
|
+
/**
 * Evaluator for `trajectory_contains_action` checks: passes when an action
 * result with the expected action name appears among the agent's recent
 * message memories.
 */
var EnhancedTrajectoryContainsActionEvaluator = class {
  /**
   * Names are compared case-insensitively with underscores removed. The
   * check passes as soon as the action is found, whatever its own status;
   * that status is only reported in the summary and details.
   *
   * @param params - Evaluation config; reads `type` and `action`.
   * @param runResult - Unused by this evaluator.
   * @param runtime - Must expose `getMemories(...)` and `agentId`.
   * @returns Structured result; any error raised while reading memories is
   *   reported as a failed result rather than thrown.
   * @throws {Error} When `params.type` is not "trajectory_contains_action".
   */
  async evaluateEnhanced(params, runResult, runtime) {
    if (params.type !== "trajectory_contains_action") throw new Error("Mismatched evaluator");
    const actionName = params.action;

    // Canonical form used for comparison; non-strings collapse to "".
    const canonical = (name) => {
      const text = typeof name === "string" ? name : "";
      return text.toLowerCase().replaceAll("_", "");
    };

    try {
      const recentMemories = await runtime.getMemories({
        tableName: "messages",
        agentId: runtime.agentId,
        count: 50,
        unique: false
      });

      const actionResults = [];
      for (const memory of recentMemories) {
        if (memory?.type === "messages" && memory.content?.type === "action_result") {
          actionResults.push(memory);
        }
      }

      const target = canonical(actionName);
      const matchingAction = actionResults.find(
        (memory) => canonical(memory.content?.actionName ?? "") === target
      );
      const allActionNames = actionResults.map((memory) => memory.content?.actionName || "unknown");

      if (matchingAction) {
        const actionStatus = matchingAction.content?.actionStatus || "unknown";
        const actionSucceeded = actionStatus === "completed";
        let summary;
        if (actionSucceeded) {
          summary = `Action check PASSED: Action '${actionName}' was executed successfully.`;
        } else {
          summary = `Action check PASSED: Action '${actionName}' was found but failed execution.`;
        }
        return {
          evaluator_type: "trajectory_contains_action",
          // Success means the action was found (regardless of its outcome)
          success: true,
          summary,
          details: {
            expected_action: actionName,
            found_action: matchingAction.content?.actionName,
            action_status: actionStatus,
            action_succeeded: actionSucceeded,
            action_error: matchingAction.content?.error,
            action_result: matchingAction.content?.result,
            memory_id: matchingAction.id,
            all_actions_in_trajectory: allActionNames
          }
        };
      }

      return {
        evaluator_type: "trajectory_contains_action",
        success: false,
        summary: `Action check FAILED: Action '${actionName}' was not found in the execution trajectory.`,
        details: {
          expected_action: actionName,
          normalized_expected: target,
          actions_found: allActionNames,
          total_actions_in_trajectory: actionResults.length,
          search_method: "case_insensitive_with_underscore_normalization"
        }
      };
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      return {
        evaluator_type: "trajectory_contains_action",
        success: false,
        summary: `Action check FAILED: Error while checking trajectory: ${reason}`,
        details: {
          expected_action: actionName,
          error: "trajectory_access_failed",
          error_message: reason,
          runtime_available: !!runtime,
          agent_id: runtime?.agentId
        }
      };
    }
  }
};
|
|
244
|
+
/**
 * Evaluator for `llm_judge` checks: asks a model for a structured assessment
 * of a run and compares that judgment with the expected outcome.
 */
var EnhancedLLMJudgeEvaluator = class {
  /**
   * Builds the judging prompt, calls the model under a timeout and turns the
   * structured answer into an evaluation result.
   *
   * Model, timeout and response-shape problems are reported as failed results
   * (`no_model_available`, `llm_timeout`, `llm_error`, `llm_parse_error`).
   *
   * @param params - Evaluation config; reads `type`, `prompt`, `expected`,
   *   and the optional `temperature` (default 0.1) and `capabilities`.
   * @param runResult - Run outcome embedded in the prompt (`exitCode`,
   *   `stdout`, `stderr`, keys of `files`).
   * @param runtime - Must expose `getModel(type)` and `useModel(type, params)`.
   * @returns Structured result for the judgment.
   * @throws {Error} When `params.type` is not "llm_judge", or when
   *   `capabilities` is given but is not a non-empty array of strings.
   */
  async evaluateEnhanced(params, runResult, runtime) {
    if (params.type !== "llm_judge") throw new Error("Mismatched evaluator");
    const prompt = params.prompt;
    const expected = params.expected;
    const candidateModels = [ModelType.OBJECT_SMALL, ModelType.TEXT_LARGE, ModelType.TEXT_SMALL];
    // `??` rather than `||`: an explicit temperature of 0 is a valid setting
    // and must not be silently replaced by the default.
    const temperature = params.temperature ?? 0.1;
    // A missing, non-numeric or non-positive LLM_JUDGE_TIMEOUT_MS falls back
    // to 15s; passing NaN to setTimeout would fire almost immediately.
    const requestedTimeout = Number(process.env.LLM_JUDGE_TIMEOUT_MS);
    const timeoutMs = Number.isFinite(requestedTimeout) && requestedTimeout > 0 ? requestedTimeout : 15e3;
    // First candidate the runtime has a handler for; TEXT_LARGE otherwise.
    const modelType = candidateModels.find((m) => runtime.getModel?.(m)) ?? ModelType.TEXT_LARGE;
    const capabilities = params.capabilities;
    if (capabilities !== void 0) {
      try {
        const capabilitiesSchema = z.array(z.string()).min(1, "Capabilities array must not be empty");
        capabilitiesSchema.parse(capabilities);
      } catch (error) {
        throw new Error(`Invalid capabilities: ${error.message}`);
      }
    }
    const hasCustomCapabilities = !!(capabilities && capabilities.length > 0);
    const structuredPrompt = this.createStructuredPrompt(runResult, prompt, expected, capabilities);
    const jsonSchema = this.getStructuredJudgmentSchema();
    try {
      const modelHandler = runtime.getModel(modelType);
      if (!modelHandler) {
        return {
          evaluator_type: "llm_judge",
          success: false,
          summary: `LLM Judge FAILED: No available model handler found.`,
          details: {
            error: "no_model_available",
            attempted_models: candidateModels,
            models_available: Object.keys(runtime.models || {}),
            prompt,
            expected
          }
        };
      }
      const objectParams = {
        prompt: structuredPrompt,
        schema: jsonSchema,
        temperature,
        output: "object"
      };
      let timer;
      let response;
      try {
        const pending = runtime.useModel(modelType, objectParams);
        const deadline = new Promise((_, reject) => {
          timer = setTimeout(() => reject(new Error(`LLM judge timeout after ${timeoutMs}ms`)), timeoutMs);
        });
        response = await Promise.race([pending, deadline]);
      } finally {
        // Always release the timer: left armed, it keeps the event loop (and
        // therefore the process) alive until it fires.
        clearTimeout(timer);
      }
      let parsedResponse;
      try {
        parsedResponse = this.validateStructuredResponse(response, jsonSchema);
      } catch (parseError) {
        return {
          evaluator_type: "llm_judge",
          success: false,
          summary: `LLM Judge FAILED: Invalid LLM response - ${parseError.message}`,
          details: {
            error: "llm_parse_error",
            error_type: "llm_parse_error",
            error_message: parseError.message,
            model_used: modelType,
            prompt,
            expected,
            raw_llm_response: response,
            custom_capabilities_provided: hasCustomCapabilities,
            capabilities_count: capabilities ? capabilities.length : 0
          }
        };
      }
      const success = this.compareWithExpected(parsedResponse, expected);
      const fullSummary = parsedResponse.qualitative_summary;
      return {
        evaluator_type: "llm_judge",
        success,
        summary: `LLM Judge ${success ? "PASSED" : "FAILED"}: ${fullSummary.substring(0, 150)}${fullSummary.length > 150 ? "..." : ""}`,
        details: {
          llm_judge_result: {
            qualitative_summary: parsedResponse.qualitative_summary,
            capability_checklist: parsedResponse.capability_checklist
          },
          custom_capabilities_provided: hasCustomCapabilities,
          capabilities_count: capabilities ? capabilities.length : 5,
          // Default capabilities count
          judgment_confidence: parsedResponse.confidence,
          expected_outcome: expected,
          model_used: modelType,
          prompt_used: prompt,
          raw_llm_response: response
        }
      };
    } catch (error) {
      const msg = error?.message || String(error);
      const isTimeout = msg.toLowerCase().includes("timeout");
      return {
        evaluator_type: "llm_judge",
        success: false,
        summary: isTimeout ? `LLM Judge FAILED: Timed out after ${timeoutMs}ms.` : `LLM Judge FAILED: ${msg}`,
        details: {
          error: isTimeout ? "llm_timeout" : "llm_error",
          error_type: isTimeout ? "llm_timeout" : "llm_error",
          error_message: msg,
          timeout_ms: timeoutMs,
          model_attempted: modelType,
          prompt,
          expected,
          custom_capabilities_provided: hasCustomCapabilities,
          capabilities_count: capabilities ? capabilities.length : 0
        }
      };
    }
  }

  /**
   * Renders the judging prompt: execution context, the evaluation question,
   * the expected outcome and the numbered capability list to assess.
   *
   * @param runResult - Supplies `exitCode`, `stdout`, `stderr` and `files`.
   * @param userPrompt - Evaluation question shown to the judge.
   * @param expected - Expected outcome shown to the judge.
   * @param capabilities - Optional custom capability names; five built-in
   *   defaults are used when missing or empty.
   * @returns The complete prompt text.
   */
  createStructuredPrompt(runResult, userPrompt, expected, capabilities) {
    const defaultCapabilities = [
      "Task Completion",
      "Response Quality",
      "User Intent Understanding",
      "Error Handling",
      "Appropriate Response Format"
    ];
    const capabilitiesToUse = capabilities && capabilities.length > 0 ? capabilities : defaultCapabilities;
    const capabilitiesSection = capabilitiesToUse.map((capability, index) => `${index + 1}. ${capability}`).join("\n");
    return `You are an expert evaluator analyzing an AI agent's performance. Provide a comprehensive, structured assessment.

## Execution Context
- Exit Code: ${runResult.exitCode}
- Standard Output: ${runResult.stdout}
- Standard Error: ${runResult.stderr}
- Files Created: ${Object.keys(runResult.files).join(", ") || "None"}

## Evaluation Question
${userPrompt}

## Expected Outcome
${expected}

## Instructions
Analyze the agent's performance and provide a detailed assessment. You must evaluate the agent against the following specific capabilities:

${capabilitiesSection}

For each capability listed above, you must assess whether the agent achieved it and provide detailed reasoning. Your response should include:

1. **Qualitative Summary**: A comprehensive paragraph summarizing overall performance
2. **Capability Checklist**: For each capability above, provide:
- capability: The exact capability name from the list
- achieved: Boolean indicating if the capability was demonstrated
- reasoning: Detailed explanation of your assessment

Provide your assessment as a structured JSON response with detailed reasoning for each capability.`;
  }

  /**
   * @returns The JSON-schema-style description of the judgment object that
   *   is sent to the model as the `schema` parameter.
   */
  getStructuredJudgmentSchema() {
    return {
      type: "object",
      properties: {
        qualitative_summary: {
          type: "string",
          description: "A comprehensive paragraph summarizing the agent's performance, reasoning, and notable successes or failures"
        },
        capability_checklist: {
          type: "array",
          items: {
            type: "object",
            properties: {
              capability: { type: "string" },
              achieved: { type: "boolean" },
              reasoning: { type: "string" }
            },
            required: ["capability", "achieved", "reasoning"]
          }
        },
        confidence: {
          type: "number",
          minimum: 0,
          maximum: 1,
          description: "Confidence level in the assessment (0-1)"
        },
        overall_success: {
          type: "boolean",
          description: "Whether the agent successfully met the expected outcome"
        }
      },
      required: ["qualitative_summary", "capability_checklist", "confidence", "overall_success"]
    };
  }

  /**
   * Checks the model's answer for the fields the evaluator relies on and
   * fills in defaults for the optional ones.
   *
   * The input is never modified; defaults are applied to a shallow copy so
   * the caller can still report the response exactly as the model sent it.
   *
   * @param response - Model output, either an object or a JSON string.
   * @param schema - Accepted for interface compatibility; not consulted here.
   * @returns Normalised judgment with `qualitative_summary`,
   *   `capability_checklist`, `confidence` and `overall_success` all set.
   * @throws {Error} When the response is not an object, lacks a required
   *   field, or a required field has the wrong type. `JSON.parse` errors for
   *   string responses propagate unchanged.
   */
  validateStructuredResponse(response, schema) {
    const parsed = typeof response === "string" ? JSON.parse(response) : response;
    if (parsed === null || typeof parsed !== "object") {
      throw new Error("Invalid LLM response: expected a JSON object");
    }
    if (!parsed.qualitative_summary || !parsed.capability_checklist) {
      throw new Error("Invalid LLM response: missing required fields");
    }
    if (typeof parsed.qualitative_summary !== "string") {
      throw new Error("Invalid LLM response: qualitative_summary must be a string");
    }
    if (!Array.isArray(parsed.capability_checklist)) {
      throw new Error("Invalid LLM response: capability_checklist must be an array");
    }
    const normalized = { ...parsed };
    if (normalized.capability_checklist.length === 0) {
      normalized.capability_checklist = [
        {
          capability: "Task Completion",
          achieved: normalized.overall_success || false,
          reasoning: "Default capability assessment based on overall success"
        }
      ];
    }
    if (normalized.confidence === void 0) {
      normalized.confidence = 0.8;
    }
    if (normalized.overall_success === void 0) {
      // Without an explicit verdict, succeed only if every capability did.
      normalized.overall_success = normalized.capability_checklist.every((cap) => cap.achieved === true);
    }
    return normalized;
  }

  /**
   * Decides whether the judgment satisfies the expectation.
   *
   * - "yes" / "no" (any case): must agree with `overall_success`.
   * - A value containing "+", e.g. "0.8+": `confidence` must reach the number.
   * - Anything else: the result is `overall_success` itself.
   *
   * @param parsedResponse - Normalised judgment from `validateStructuredResponse`.
   * @param expected - Expectation string from the evaluation config.
   * @returns Whether the judgment meets the expectation.
   */
  compareWithExpected(parsedResponse, expected) {
    const overallSuccess = parsedResponse.overall_success;
    const confidence = parsedResponse.confidence || 0;
    const expectedLower = expected.toLowerCase();
    if (expectedLower === "yes" || expectedLower === "no") {
      return expectedLower === "yes" === overallSuccess;
    }
    if (expectedLower.includes("+")) {
      const threshold = parseFloat(expectedLower.replace("+", ""));
      return confidence >= threshold;
    }
    return overallSuccess;
  }
};
|
|
471
|
+
export {
|
|
472
|
+
EnhancedEvaluationEngine
|
|
473
|
+
};
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import {
|
|
2
|
+
LocalEnvironmentProvider
|
|
3
|
+
} from "./chunk-LG7YDBMV.js";
|
|
4
|
+
import "./chunk-PUZHCSGF.js";
|
|
5
|
+
import "./chunk-WX37MM4G.js";
|
|
6
|
+
import "./chunk-I57T3WPO.js";
|
|
7
|
+
import "./chunk-5IWKEMEF.js";
|
|
8
|
+
import "./chunk-Q6M2K53X.js";
|
|
9
|
+
import "./chunk-FQYWRHLX.js";
|
|
10
|
+
import "./chunk-GXWWPFBO.js";
|
|
11
|
+
import "./chunk-I4L4T7QX.js";
|
|
12
|
+
import "./chunk-2ESYSVXG.js";
|
|
13
|
+
export {
|
|
14
|
+
LocalEnvironmentProvider
|
|
15
|
+
};
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import {
|
|
2
|
+
startAgent,
|
|
3
|
+
stopAgent
|
|
4
|
+
} from "./chunk-WX37MM4G.js";
|
|
5
|
+
import "./chunk-Q6M2K53X.js";
|
|
6
|
+
import "./chunk-FQYWRHLX.js";
|
|
7
|
+
import "./chunk-GXWWPFBO.js";
|
|
8
|
+
import "./chunk-I4L4T7QX.js";
|
|
9
|
+
import "./chunk-2ESYSVXG.js";
|
|
10
|
+
export {
|
|
11
|
+
startAgent,
|
|
12
|
+
stopAgent
|
|
13
|
+
};
|