@huydao/karrot 0.1.6 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +496 -243
- package/dist/executors/adapters/ag-ui-post.d.ts +10 -0
- package/dist/executors/adapters/ag-ui-post.js +174 -21
- package/dist/executors/adapters/ag-ui.js +5 -3
- package/dist/executors/execute.js +1 -0
- package/dist/executors/executor.js +2 -1
- package/dist/executors/run-result.d.ts +3 -0
- package/dist/reports/report.js +20 -0
- package/dist/scenarios/scenario.d.ts +1 -0
- package/dist/utils/config.d.ts +9 -0
- package/package.json +5 -2
- package/site/assets/app.js +201 -0
- package/site/assets/karrot-mark.svg +10 -0
- package/site/assets/styles.css +698 -0
- package/site/check.js +43 -0
- package/site/docs/index.html +505 -0
- package/site/index.html +162 -0
- package/site/serve.js +50 -0
|
@@ -6,6 +6,7 @@ export type RunAgUiPostMessageOptions = {
|
|
|
6
6
|
processTimeoutMs?: number;
|
|
7
7
|
injectMessage?: boolean;
|
|
8
8
|
injectRunMetadata?: boolean;
|
|
9
|
+
textEvents?: AgUiPostTextEventConfig[];
|
|
9
10
|
run?: {
|
|
10
11
|
url: string;
|
|
11
12
|
headers?: Record<string, string>;
|
|
@@ -52,4 +53,13 @@ export type RunAgUiPostMessageOptions = {
|
|
|
52
53
|
timeoutMs?: number;
|
|
53
54
|
};
|
|
54
55
|
};
|
|
56
|
+
/**
 * Describes one extra (non-standard) event shape that should be treated as
 * assistant text by the AG-UI POST adapter.
 */
export type AgUiPostTextEventConfig = {
    /** Compared for strict equality with the incoming event's `type`. */
    type: string;
    /** When set, the incoming event's `name` must be strictly equal to it. */
    name?: string;
    /** When set, the incoming event's `role` must be strictly equal to it. */
    role?: string;
    /**
     * Path to a string inside the event, consulted only after `contentPath`
     * and `deltaPath` produced nothing; `mode` decides how it is interpreted.
     */
    textPath?: string;
    /** Path to a string that is trimmed and used as full message content. */
    contentPath?: string;
    /** Path to a string that is used verbatim as a streamed fragment. */
    deltaPath?: string;
    /**
     * Interpretation of the `textPath` value: `'content'` yields trimmed full
     * content, anything else (including omitted) yields a delta fragment.
     */
    mode?: 'content' | 'delta';
};
|
|
55
65
|
export declare function runAgUiPostMessage(options: RunAgUiPostMessageOptions): Promise<MessageRunResult>;
|
|
@@ -56,6 +56,13 @@ function getStringAtPath(payload, pathExpression) {
|
|
|
56
56
|
const rawValue = getValueByPath(payload, pathExpression);
|
|
57
57
|
return typeof rawValue === 'string' ? rawValue.trim() : '';
|
|
58
58
|
}
|
|
59
|
+
/**
 * Looks up `pathExpression` inside `payload` via `getValueByPath` and returns
 * the value untouched (no trimming) when it is a string.
 *
 * @param payload - Object passed through to `getValueByPath`.
 * @param pathExpression - Path to resolve; a falsy path skips the lookup.
 * @returns The string found at the path, or `''` when the path is falsy or
 *   the resolved value is not a string.
 */
function getRawStringAtPath(payload, pathExpression) {
    if (pathExpression) {
        const found = getValueByPath(payload, pathExpression);
        if (typeof found === 'string') {
            return found;
        }
    }
    return '';
}
|
|
59
66
|
function getArrayAtPath(payload, pathExpression) {
|
|
60
67
|
const rawValue = getValueByPath(payload, pathExpression);
|
|
61
68
|
return Array.isArray(rawValue) ? rawValue : [];
|
|
@@ -258,6 +265,100 @@ function parseSseBlock(block) {
|
|
|
258
265
|
data: dataLines.join('\n'),
|
|
259
266
|
};
|
|
260
267
|
}
|
|
268
|
+
/**
 * Converts an event timestamp of unknown shape into a single numeric form.
 *
 * - Finite numbers (and strings that `Number()` parses to a finite number)
 *   greater than 10_000_000_000 are returned unchanged; smaller ones are
 *   multiplied by 1000 (presumably seconds -> milliseconds; confirm against
 *   the event producers).
 * - Other non-blank strings are handed to `Date.parse`.
 *
 * @param value - Raw timestamp taken from an event.
 * @returns The normalized number, or `undefined` when the value is not a
 *   finite number, is a blank string, or cannot be parsed as a date.
 */
function normalizeEventTimestamp(value) {
    // Shared threshold rule for both the numeric and the numeric-string case.
    const scale = (n) => (n > 10_000_000_000 ? n : n * 1000);
    if (typeof value === 'number') {
        return Number.isFinite(value) ? scale(value) : undefined;
    }
    if (typeof value !== 'string' || !value.trim()) {
        return undefined;
    }
    const asNumber = Number(value);
    if (Number.isFinite(asNumber)) {
        return scale(asNumber);
    }
    const asDate = Date.parse(value);
    return Number.isFinite(asDate) ? asDate : undefined;
}
|
|
282
|
+
/**
 * Computes the elapsed time between two millisecond timestamps, in seconds,
 * rounded to one decimal place.
 *
 * @param startTimeMs - Start timestamp in milliseconds.
 * @param endTimeMs - End timestamp in milliseconds.
 * @returns The rounded duration in seconds, or `undefined` when either input
 *   is not a finite number or the end precedes the start.
 */
function roundSeconds(startTimeMs, endTimeMs) {
    // Number.isFinite rejects non-numbers as well as NaN/Infinity, so a bad
    // timestamp can never surface as a NaN or Infinity metric.
    if (!Number.isFinite(startTimeMs) || !Number.isFinite(endTimeMs)) {
        return undefined;
    }
    if (endTimeMs < startTimeMs) {
        return undefined;
    }
    return Number(((endTimeMs - startTimeMs) / 1000).toFixed(1));
}
|
|
288
|
+
/**
 * Tells whether an event uses one of the two built-in text event types.
 *
 * @param event - Parsed event object; only its `type` is inspected.
 * @returns `true` for `TEXT_MESSAGE_CONTENT` or `TEXT_MESSAGE_CHUNK`.
 */
function isStandardAssistantTextEvent(event) {
    switch (event.type) {
        case 'TEXT_MESSAGE_CONTENT':
        case 'TEXT_MESSAGE_CHUNK':
            return true;
        default:
            return false;
    }
}
|
|
292
|
+
/**
 * Tells whether an event marks the start of a tool call.
 *
 * @param event - Parsed event object; `type` and `name` are inspected.
 * @returns `true` for a `TOOL_CALL_START` event, or for a `CUSTOM` event
 *   named `super-testing-agent.tool_started`.
 */
function isToolStartEvent(event) {
    if (event.type === 'TOOL_CALL_START') {
        return true;
    }
    return event.type === 'CUSTOM' && event.name === 'super-testing-agent.tool_started';
}
|
|
296
|
+
/**
 * Checks whether an event satisfies one configured text-event matcher.
 *
 * @param event - Parsed event object (`type`, `name`, `role` are inspected).
 * @param config - Matcher; `type` is mandatory, while `name` and `role` are
 *   only enforced when they are neither `null` nor `undefined`.
 * @returns `true` when the type matches and every provided constraint holds.
 */
function matchesConfiguredTextEvent(event, config) {
    const nameOk = config.name == null || event.name === config.name;
    const roleOk = config.role == null || event.role === config.role;
    return event.type === config.type && nameOk && roleOk;
}
|
|
308
|
+
/**
 * Extracts assistant text from an event using the user-supplied text-event
 * configurations.
 *
 * Configs are tried in order. For each config that matches the event, the
 * lookup order is: `contentPath` (trimmed, reported as `content`), then
 * `deltaPath` (verbatim, reported as `delta`), then `textPath` (reported as
 * trimmed `content` when `mode === 'content'`, otherwise as verbatim `delta`).
 *
 * A matching config that yields no text no longer ends the search: the
 * remaining configs are still consulted, so several configs may target the
 * same event type with different paths.
 *
 * @param event - Parsed event object.
 * @param textEvents - Optional list of text-event configurations.
 * @returns `{ content }`, `{ delta }`, or `{}` when nothing could be extracted.
 */
function getConfiguredTextEventValue(event, textEvents) {
    for (const config of textEvents ?? []) {
        if (!matchesConfiguredTextEvent(event, config)) {
            continue;
        }
        const content = getRawStringAtPath(event, config.contentPath);
        if (content.trim()) {
            return { content: content.trim() };
        }
        const delta = getRawStringAtPath(event, config.deltaPath);
        if (delta) {
            return { delta };
        }
        const text = getRawStringAtPath(event, config.textPath);
        if (!text) {
            // Nothing usable under this config; fall through to the next one
            // instead of giving up on the whole list.
            continue;
        }
        return config.mode === 'content' ? { content: text.trim() } : { delta: text };
    }
    return {};
}
|
|
329
|
+
/**
 * Tells whether an event should be treated as assistant text, either because
 * it is a standard text event or because a configured matcher accepts it.
 *
 * @param event - Parsed event object.
 * @param textEvents - Optional list of text-event configurations.
 * @returns `true` when the event is a standard or configured text event.
 */
function isAssistantTextEvent(event, textEvents) {
    if (isStandardAssistantTextEvent(event)) {
        return true;
    }
    const configs = textEvents ?? [];
    return configs.some((config) => matchesConfiguredTextEvent(event, config));
}
|
|
333
|
+
/**
 * Resolves the full assistant message content carried by an event.
 *
 * Content produced by the configured text events wins; otherwise the first
 * non-blank string among `event.content`, `event.value.content` and
 * `event.value.output` is returned, trimmed.
 *
 * @param event - Parsed event object.
 * @param textEvents - Optional list of text-event configurations.
 * @returns The content string, or `undefined` when none is present.
 */
function getAssistantContent(event, textEvents) {
    const configured = getConfiguredTextEventValue(event, textEvents);
    if (configured.content) {
        return configured.content;
    }
    const candidates = [event.content, event.value?.content, event.value?.output];
    for (const candidate of candidates) {
        if (typeof candidate !== 'string') {
            continue;
        }
        const trimmed = candidate.trim();
        if (trimmed) {
            return trimmed;
        }
    }
    return undefined;
}
|
|
349
|
+
/**
 * Resolves the streamed text fragment carried by an event.
 *
 * A delta produced by the configured text events wins; otherwise the first
 * non-empty string among `event.delta` and `event.value.delta` is returned
 * verbatim (no trimming).
 *
 * @param event - Parsed event object.
 * @param textEvents - Optional list of text-event configurations.
 * @returns The fragment string, or `undefined` when none is present.
 */
function getAssistantDelta(event, textEvents) {
    const configured = getConfiguredTextEventValue(event, textEvents);
    if (configured.delta) {
        return configured.delta;
    }
    const candidates = [event.delta, event.value?.delta];
    for (const candidate of candidates) {
        if (typeof candidate === 'string' && candidate) {
            return candidate;
        }
    }
    return undefined;
}
|
|
261
362
|
function createConnectCollector(options) {
|
|
262
363
|
const assistantFragments = [];
|
|
263
364
|
const toolCalls = [];
|
|
@@ -266,6 +367,11 @@ function createConnectCollector(options) {
|
|
|
266
367
|
let started = false;
|
|
267
368
|
let finished = false;
|
|
268
369
|
let sawAnyEvent = false;
|
|
370
|
+
let runStartedAt;
|
|
371
|
+
let firstTextAt;
|
|
372
|
+
let firstToolAt;
|
|
373
|
+
let runFinishedAt;
|
|
374
|
+
const eventTime = (event) => normalizeEventTimestamp(event.timestamp) ?? Date.now();
|
|
269
375
|
const consume = (sseEvent) => {
|
|
270
376
|
if (!sseEvent.data) {
|
|
271
377
|
return false;
|
|
@@ -286,6 +392,7 @@ function createConnectCollector(options) {
|
|
|
286
392
|
}
|
|
287
393
|
if (parsed.type === 'RUN_STARTED' && parsed.runId === options.targetRunId) {
|
|
288
394
|
started = true;
|
|
395
|
+
runStartedAt = eventTime(parsed);
|
|
289
396
|
return false;
|
|
290
397
|
}
|
|
291
398
|
if (!started) {
|
|
@@ -300,19 +407,26 @@ function createConnectCollector(options) {
|
|
|
300
407
|
});
|
|
301
408
|
}
|
|
302
409
|
const isAssistantMessage = parsed.role === 'assistant' || typeof parsed.role !== 'string';
|
|
303
|
-
if ((parsed
|
|
304
|
-
|
|
305
|
-
|
|
410
|
+
if (isAssistantTextEvent(parsed, options.textEvents) && isAssistantMessage) {
|
|
411
|
+
firstTextAt ??= eventTime(parsed);
|
|
412
|
+
const content = getAssistantContent(parsed, options.textEvents);
|
|
413
|
+
const delta = getAssistantDelta(parsed, options.textEvents);
|
|
414
|
+
if (content) {
|
|
415
|
+
latestAssistantContent = content;
|
|
306
416
|
}
|
|
307
|
-
else if (
|
|
308
|
-
assistantFragments.push(
|
|
417
|
+
else if (delta) {
|
|
418
|
+
assistantFragments.push(delta);
|
|
309
419
|
}
|
|
310
420
|
}
|
|
311
|
-
if (parsed
|
|
312
|
-
|
|
421
|
+
if (isToolStartEvent(parsed)) {
|
|
422
|
+
firstToolAt ??= eventTime(parsed);
|
|
423
|
+
if (typeof parsed.toolCallName === 'string' && parsed.toolCallName.trim()) {
|
|
424
|
+
toolCalls.push(parsed.toolCallName.trim());
|
|
425
|
+
}
|
|
313
426
|
}
|
|
314
427
|
if (parsed.type === 'RUN_FINISHED' && (!parsed.runId || parsed.runId === options.targetRunId)) {
|
|
315
428
|
finished = true;
|
|
429
|
+
runFinishedAt = eventTime(parsed);
|
|
316
430
|
return true;
|
|
317
431
|
}
|
|
318
432
|
return false;
|
|
@@ -320,12 +434,19 @@ function createConnectCollector(options) {
|
|
|
320
434
|
return {
|
|
321
435
|
consume,
|
|
322
436
|
getResult() {
|
|
437
|
+
const turnCompleteSeconds = roundSeconds(runStartedAt, runFinishedAt);
|
|
323
438
|
return {
|
|
324
439
|
output: latestAssistantContent?.trim() || assistantFragments.join('').trim(),
|
|
325
440
|
toolCalls: [...toolCalls],
|
|
326
441
|
threadId: resolvedThreadId,
|
|
327
442
|
finished,
|
|
328
443
|
sawAnyEvent,
|
|
444
|
+
metrics: {
|
|
445
|
+
ttfTextSeconds: roundSeconds(runStartedAt, firstTextAt),
|
|
446
|
+
ttfToolSeconds: roundSeconds(runStartedAt, firstToolAt),
|
|
447
|
+
turnCompleteSeconds,
|
|
448
|
+
totalSeconds: turnCompleteSeconds,
|
|
449
|
+
},
|
|
329
450
|
};
|
|
330
451
|
},
|
|
331
452
|
};
|
|
@@ -337,6 +458,7 @@ async function postAndCaptureResponse(options) {
|
|
|
337
458
|
: undefined;
|
|
338
459
|
try {
|
|
339
460
|
await promises_1.default.writeFile(options.outputPath, '', 'utf8');
|
|
461
|
+
const startedAtMs = Date.now();
|
|
340
462
|
const response = await fetch(options.url, {
|
|
341
463
|
method: 'POST',
|
|
342
464
|
headers: {
|
|
@@ -369,6 +491,8 @@ async function postAndCaptureResponse(options) {
|
|
|
369
491
|
return {
|
|
370
492
|
status: response.status,
|
|
371
493
|
rawContent,
|
|
494
|
+
startedAtMs,
|
|
495
|
+
finishedAtMs: Date.now(),
|
|
372
496
|
};
|
|
373
497
|
}
|
|
374
498
|
catch (error) {
|
|
@@ -419,17 +543,19 @@ function startConnectStream(options) {
|
|
|
419
543
|
const collector = createConnectCollector({
|
|
420
544
|
targetRunId: options.targetRunId,
|
|
421
545
|
fallbackThreadId: options.fallbackThreadId,
|
|
546
|
+
textEvents: options.textEvents,
|
|
422
547
|
});
|
|
423
548
|
if (!response.body) {
|
|
424
549
|
const rawContent = await response.text();
|
|
425
550
|
await promises_1.default.writeFile(options.outputPath, rawContent, 'utf8');
|
|
426
|
-
const parsed = extractResultFromSse(rawContent, options.fallbackThreadId);
|
|
551
|
+
const parsed = extractResultFromSse(rawContent, options.fallbackThreadId, {}, options.textEvents);
|
|
427
552
|
return {
|
|
428
553
|
output: parsed.output,
|
|
429
554
|
toolCalls: parsed.toolCalls,
|
|
430
555
|
threadId: parsed.threadId,
|
|
431
556
|
finished: parsed.finished,
|
|
432
557
|
sawAnyEvent: parsed.sawAnyEvent,
|
|
558
|
+
metrics: parsed.metrics,
|
|
433
559
|
};
|
|
434
560
|
}
|
|
435
561
|
const reader = response.body.getReader();
|
|
@@ -495,13 +621,17 @@ function startConnectStream(options) {
|
|
|
495
621
|
result,
|
|
496
622
|
};
|
|
497
623
|
}
|
|
498
|
-
function extractResultFromSse(rawContent, fallbackThreadId) {
|
|
624
|
+
function extractResultFromSse(rawContent, fallbackThreadId, fallbackMetrics = {}, textEvents) {
|
|
499
625
|
const fragments = [];
|
|
500
626
|
let latestContent;
|
|
501
627
|
let resolvedThreadId = fallbackThreadId;
|
|
502
628
|
const toolCalls = [];
|
|
503
629
|
let finished = false;
|
|
504
630
|
let sawAnyEvent = false;
|
|
631
|
+
let runStartedAt;
|
|
632
|
+
let firstTextAt;
|
|
633
|
+
let firstToolAt;
|
|
634
|
+
let runFinishedAt;
|
|
505
635
|
for (const sseEvent of parseSseEvents(rawContent)) {
|
|
506
636
|
if (!sseEvent.data) {
|
|
507
637
|
continue;
|
|
@@ -515,19 +645,29 @@ function extractResultFromSse(rawContent, fallbackThreadId) {
|
|
|
515
645
|
else if (typeof parsed.conversationId === 'string' && parsed.conversationId.trim()) {
|
|
516
646
|
resolvedThreadId = parsed.conversationId.trim();
|
|
517
647
|
}
|
|
518
|
-
if (parsed.type === '
|
|
519
|
-
|
|
520
|
-
|
|
648
|
+
if (parsed.type === 'RUN_STARTED' && typeof runStartedAt !== 'number') {
|
|
649
|
+
runStartedAt = normalizeEventTimestamp(parsed.timestamp);
|
|
650
|
+
}
|
|
651
|
+
if (isAssistantTextEvent(parsed, textEvents)) {
|
|
652
|
+
firstTextAt ??= normalizeEventTimestamp(parsed.timestamp);
|
|
653
|
+
const content = getAssistantContent(parsed, textEvents);
|
|
654
|
+
const delta = getAssistantDelta(parsed, textEvents);
|
|
655
|
+
if (content) {
|
|
656
|
+
latestContent = content;
|
|
521
657
|
}
|
|
522
|
-
else if (
|
|
523
|
-
fragments.push(
|
|
658
|
+
else if (delta) {
|
|
659
|
+
fragments.push(delta);
|
|
524
660
|
}
|
|
525
661
|
}
|
|
526
|
-
if (parsed
|
|
527
|
-
|
|
662
|
+
if (isToolStartEvent(parsed)) {
|
|
663
|
+
firstToolAt ??= normalizeEventTimestamp(parsed.timestamp);
|
|
664
|
+
if (typeof parsed.toolCallName === 'string' && parsed.toolCallName.trim()) {
|
|
665
|
+
toolCalls.push(parsed.toolCallName.trim());
|
|
666
|
+
}
|
|
528
667
|
}
|
|
529
668
|
if (parsed.type === 'RUN_FINISHED') {
|
|
530
669
|
finished = true;
|
|
670
|
+
runFinishedAt = normalizeEventTimestamp(parsed.timestamp);
|
|
531
671
|
}
|
|
532
672
|
if (parsed.type === 'RUN_ERROR') {
|
|
533
673
|
throw new run_result_1.MessageRunError(parsed.error?.trim() || 'Agent run failed.', {
|
|
@@ -544,12 +684,21 @@ function extractResultFromSse(rawContent, fallbackThreadId) {
|
|
|
544
684
|
}
|
|
545
685
|
}
|
|
546
686
|
}
|
|
687
|
+
const effectiveStartedAt = runStartedAt ?? fallbackMetrics.startTimeMs;
|
|
688
|
+
const effectiveFinishedAt = runFinishedAt ?? fallbackMetrics.finishedTimeMs;
|
|
689
|
+
const turnCompleteSeconds = roundSeconds(effectiveStartedAt, effectiveFinishedAt);
|
|
547
690
|
return {
|
|
548
691
|
output: latestContent?.trim() || fragments.join('').trim(),
|
|
549
692
|
toolCalls,
|
|
550
693
|
threadId: resolvedThreadId,
|
|
551
694
|
finished,
|
|
552
695
|
sawAnyEvent,
|
|
696
|
+
metrics: {
|
|
697
|
+
ttfTextSeconds: roundSeconds(runStartedAt, firstTextAt),
|
|
698
|
+
ttfToolSeconds: roundSeconds(runStartedAt, firstToolAt),
|
|
699
|
+
turnCompleteSeconds,
|
|
700
|
+
totalSeconds: turnCompleteSeconds,
|
|
701
|
+
},
|
|
553
702
|
};
|
|
554
703
|
}
|
|
555
704
|
async function runAgUiPostMessage(options) {
|
|
@@ -591,6 +740,7 @@ async function runAgUiPostMessage(options) {
|
|
|
591
740
|
targetRunId: runId,
|
|
592
741
|
fallbackThreadId: resolvedThreadId,
|
|
593
742
|
processTimeoutMs: options.connect.processTimeoutMs ?? options.processTimeoutMs,
|
|
743
|
+
textEvents: options.textEvents,
|
|
594
744
|
});
|
|
595
745
|
await connectStream.ready;
|
|
596
746
|
const runResponse = await postAndCaptureResponse({
|
|
@@ -625,7 +775,7 @@ async function runAgUiPostMessage(options) {
|
|
|
625
775
|
note: `Run log: ${node_path_1.default.basename(runOutputPath)}`,
|
|
626
776
|
toolCallCount: connected.toolCalls.length,
|
|
627
777
|
toolCalls: connected.toolCalls,
|
|
628
|
-
metrics:
|
|
778
|
+
metrics: connected.metrics,
|
|
629
779
|
};
|
|
630
780
|
}
|
|
631
781
|
const runResponse = await postAndCaptureResponse({
|
|
@@ -642,7 +792,10 @@ async function runAgUiPostMessage(options) {
|
|
|
642
792
|
output: runResponse.rawContent,
|
|
643
793
|
});
|
|
644
794
|
}
|
|
645
|
-
const parsed = extractResultFromSse(runResponse.rawContent, resolvedThreadId
|
|
795
|
+
const parsed = extractResultFromSse(runResponse.rawContent, resolvedThreadId, {
|
|
796
|
+
startTimeMs: runResponse.startedAtMs,
|
|
797
|
+
finishedTimeMs: runResponse.finishedAtMs,
|
|
798
|
+
}, options.textEvents);
|
|
646
799
|
if (options.observe) {
|
|
647
800
|
await promises_1.default.writeFile(observePath, '', 'utf8');
|
|
648
801
|
const observed = await waitForObservedCompletion({
|
|
@@ -659,7 +812,7 @@ async function runAgUiPostMessage(options) {
|
|
|
659
812
|
note: `Observe status: ${observed.status}. Observe log: ${node_path_1.default.basename(observePath)}`,
|
|
660
813
|
toolCallCount: parsed.toolCalls.length,
|
|
661
814
|
toolCalls: parsed.toolCalls,
|
|
662
|
-
metrics:
|
|
815
|
+
metrics: parsed.metrics,
|
|
663
816
|
};
|
|
664
817
|
}
|
|
665
818
|
if (options.completionCheck) {
|
|
@@ -671,7 +824,7 @@ async function runAgUiPostMessage(options) {
|
|
|
671
824
|
note: `Completion status: ${completion.status}`,
|
|
672
825
|
toolCallCount: parsed.toolCalls.length,
|
|
673
826
|
toolCalls: parsed.toolCalls,
|
|
674
|
-
metrics:
|
|
827
|
+
metrics: parsed.metrics,
|
|
675
828
|
};
|
|
676
829
|
}
|
|
677
830
|
if (!parsed.finished && !parsed.output) {
|
|
@@ -691,7 +844,7 @@ async function runAgUiPostMessage(options) {
|
|
|
691
844
|
outputPath,
|
|
692
845
|
toolCallCount: parsed.toolCalls.length,
|
|
693
846
|
toolCalls: parsed.toolCalls,
|
|
694
|
-
metrics:
|
|
847
|
+
metrics: parsed.metrics,
|
|
695
848
|
};
|
|
696
849
|
}
|
|
697
850
|
catch (error) {
|
|
@@ -9,8 +9,8 @@ exports.extractAppendedLog = extractAppendedLog;
|
|
|
9
9
|
exports.runAgUiMessage = runAgUiMessage;
|
|
10
10
|
const promises_1 = __importDefault(require("node:fs/promises"));
|
|
11
11
|
const node_path_1 = __importDefault(require("node:path"));
|
|
12
|
+
const node_crypto_1 = require("node:crypto");
|
|
12
13
|
const stompjs_1 = require("@stomp/stompjs");
|
|
13
|
-
const uuid_1 = require("uuid");
|
|
14
14
|
const ws_1 = __importDefault(require("ws"));
|
|
15
15
|
const run_result_1 = require("../run-result");
|
|
16
16
|
Object.assign(globalThis, { WebSocket: ws_1.default });
|
|
@@ -169,6 +169,7 @@ function computeMetrics(state) {
|
|
|
169
169
|
return {
|
|
170
170
|
ttfToolSeconds,
|
|
171
171
|
ttfTextSeconds,
|
|
172
|
+
turnCompleteSeconds: totalSeconds,
|
|
172
173
|
totalSeconds,
|
|
173
174
|
protocolUsedKb,
|
|
174
175
|
protocolTotalKb,
|
|
@@ -179,6 +180,7 @@ function writeMetricsToStdout(metrics) {
|
|
|
179
180
|
const parts = [
|
|
180
181
|
metrics.ttfToolSeconds != null ? `TTF-Tool: ${metrics.ttfToolSeconds.toFixed(1)}s` : undefined,
|
|
181
182
|
metrics.ttfTextSeconds != null ? `TTF-Text: ${metrics.ttfTextSeconds.toFixed(1)}s` : undefined,
|
|
183
|
+
metrics.turnCompleteSeconds != null ? `Turn complete: ${metrics.turnCompleteSeconds.toFixed(1)}s` : undefined,
|
|
182
184
|
metrics.totalSeconds != null ? `Total: ${metrics.totalSeconds.toFixed(1)}s` : undefined,
|
|
183
185
|
metrics.protocolUsedKb != null && metrics.protocolTotalKb != null && metrics.efficiencyPercent != null
|
|
184
186
|
? `Protocol efficiency: ${metrics.protocolUsedKb.toFixed(1)}KB/${metrics.protocolTotalKb.toFixed(1)}KB (${metrics.efficiencyPercent}%)`
|
|
@@ -430,8 +432,8 @@ async function connectAndRun(options) {
|
|
|
430
432
|
async function runAgUiMessage(options) {
|
|
431
433
|
await promises_1.default.mkdir(options.outputDirectory, { recursive: true });
|
|
432
434
|
const config = parseAgUiEnv(options.env);
|
|
433
|
-
const threadId = options.threadId ?? options.threadIdFallback ?? (0,
|
|
434
|
-
const runId = (0,
|
|
435
|
+
const threadId = options.threadId ?? options.threadIdFallback ?? (0, node_crypto_1.randomUUID)();
|
|
436
|
+
const runId = (0, node_crypto_1.randomUUID)();
|
|
435
437
|
const logPath = node_path_1.default.join(options.outputDirectory, `${threadId}.jsonl`);
|
|
436
438
|
const previousLogContent = await readJsonl(logPath);
|
|
437
439
|
const state = await connectAndRun({
|
|
@@ -48,6 +48,7 @@ function createAgUiPostRunner(config) {
|
|
|
48
48
|
processTimeoutMs: processTimeoutMs ?? transport.processTimeoutMs,
|
|
49
49
|
injectMessage: transport.injectMessage,
|
|
50
50
|
injectRunMetadata: transport.injectRunMetadata,
|
|
51
|
+
textEvents: transport.textEvents,
|
|
51
52
|
run: transport.run ?? transport.request,
|
|
52
53
|
connect: transport.connect,
|
|
53
54
|
observe: transport.observe,
|
|
@@ -135,7 +135,8 @@ async function runSingleScenario(scenario, context, env, outputDirectory, deadli
|
|
|
135
135
|
if (assertionFailureNote) {
|
|
136
136
|
result.status = 'FAIL';
|
|
137
137
|
result.note = [result.note, assertionFailureNote].filter(Boolean).join(' ') || undefined;
|
|
138
|
-
|
|
138
|
+
const shouldContinueOnAssertionFailure = turn.continueOnAssertionFailure ?? scenario.continueOnAssertionFailure ?? false;
|
|
139
|
+
if (!shouldContinueOnAssertionFailure) {
|
|
139
140
|
throw new Error(assertionFailureNote);
|
|
140
141
|
}
|
|
141
142
|
}
|
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
export type TimingMetrics = {
|
|
2
2
|
ttfToolSeconds?: number;
|
|
3
3
|
ttfTextSeconds?: number;
|
|
4
|
+
turnCompleteSeconds?: number;
|
|
4
5
|
totalSeconds?: number;
|
|
6
|
+
averageTtfTextSeconds?: number;
|
|
7
|
+
averageTurnCompleteSeconds?: number;
|
|
5
8
|
protocolUsedKb?: number;
|
|
6
9
|
protocolTotalKb?: number;
|
|
7
10
|
efficiencyPercent?: number;
|
package/dist/reports/report.js
CHANGED
|
@@ -26,6 +26,16 @@ function sumNumbers(values) {
|
|
|
26
26
|
}
|
|
27
27
|
return Number(definedValues.reduce((total, value) => total + value, 0).toFixed(1));
|
|
28
28
|
}
|
|
29
|
+
/**
 * Averages the numeric entries of a list, ignoring everything else.
 *
 * @param values - List whose entries may be numbers or other values
 *   (for example `undefined` for turns without a metric).
 * @returns The mean of the numeric entries rounded to one decimal place, or
 *   `undefined` when the list contains no numbers.
 */
function averageNumbers(values) {
    let sum = 0;
    let count = 0;
    for (const value of values) {
        if (typeof value === 'number') {
            sum += value;
            count += 1;
        }
    }
    if (count === 0) {
        return undefined;
    }
    return Number((sum / count).toFixed(1));
}
|
|
36
|
+
/**
 * Picks the turn-completion duration from a metrics object, falling back to
 * `totalSeconds` when `turnCompleteSeconds` is `null` or `undefined`.
 *
 * @param metrics - Metrics object for a turn or scenario.
 * @returns `metrics.turnCompleteSeconds` when set, otherwise `metrics.totalSeconds`.
 */
function turnCompleteSeconds(metrics) {
    const { turnCompleteSeconds: explicit, totalSeconds } = metrics;
    return explicit ?? totalSeconds;
}
|
|
29
39
|
function summarizeScenarioMetrics(turns) {
|
|
30
40
|
const protocolUsedKb = sumNumbers(turns.map((turn) => turn.metrics.protocolUsedKb));
|
|
31
41
|
const protocolTotalKb = sumNumbers(turns.map((turn) => turn.metrics.protocolTotalKb));
|
|
@@ -35,7 +45,10 @@ function summarizeScenarioMetrics(turns) {
|
|
|
35
45
|
return {
|
|
36
46
|
ttfToolSeconds: sumNumbers(turns.map((turn) => turn.metrics.ttfToolSeconds)),
|
|
37
47
|
ttfTextSeconds: sumNumbers(turns.map((turn) => turn.metrics.ttfTextSeconds)),
|
|
48
|
+
turnCompleteSeconds: sumNumbers(turns.map((turn) => turnCompleteSeconds(turn.metrics))),
|
|
38
49
|
totalSeconds: sumNumbers(turns.map((turn) => turn.metrics.totalSeconds)),
|
|
50
|
+
averageTtfTextSeconds: averageNumbers(turns.map((turn) => turn.metrics.ttfTextSeconds)),
|
|
51
|
+
averageTurnCompleteSeconds: averageNumbers(turns.map((turn) => turnCompleteSeconds(turn.metrics))),
|
|
39
52
|
protocolUsedKb,
|
|
40
53
|
protocolTotalKb,
|
|
41
54
|
efficiencyPercent,
|
|
@@ -112,6 +125,8 @@ function buildScenarioRunSummary(results) {
|
|
|
112
125
|
failedAssertions: assertions.filter((assertion) => !assertion.passed).length,
|
|
113
126
|
totalToolCalls: turns.reduce((total, turn) => total + turn.toolCallCount, 0),
|
|
114
127
|
totalEvaluations: evaluations.length,
|
|
128
|
+
averageTtfTextSeconds: averageNumbers(turns.map((turn) => turn.metrics.ttfTextSeconds)),
|
|
129
|
+
averageTurnCompleteSeconds: averageNumbers(turns.map((turn) => turnCompleteSeconds(turn.metrics))),
|
|
115
130
|
averageScoresByDimension,
|
|
116
131
|
requestedEvalDimensions,
|
|
117
132
|
};
|
|
@@ -155,7 +170,10 @@ function renderMetrics(metrics) {
|
|
|
155
170
|
return [
|
|
156
171
|
`TTF Tool: ${formatSeconds(metrics.ttfToolSeconds)}`,
|
|
157
172
|
`TTF Text: ${formatSeconds(metrics.ttfTextSeconds)}`,
|
|
173
|
+
`Turn Complete: ${formatSeconds(turnCompleteSeconds(metrics))}`,
|
|
158
174
|
`Total: ${formatSeconds(metrics.totalSeconds)}`,
|
|
175
|
+
`Avg TTF Text: ${formatSeconds(metrics.averageTtfTextSeconds)}`,
|
|
176
|
+
`Avg Complete: ${formatSeconds(metrics.averageTurnCompleteSeconds)}`,
|
|
159
177
|
`Efficiency: ${formatPercent(metrics.efficiencyPercent)}`,
|
|
160
178
|
].join(' | ');
|
|
161
179
|
}
|
|
@@ -356,6 +374,8 @@ function buildScenarioRunHtml(payload) {
|
|
|
356
374
|
'<section class="summary-grid">',
|
|
357
375
|
`<article class="summary-card"><span class="label">Scenarios</span><div class="value">${payload.summary.totalScenarios}</div><div class="sub">${payload.summary.passedScenarios} pass / ${payload.summary.failedScenarios} fail / ${payload.summary.skippedScenarios} skip</div></article>`,
|
|
358
376
|
`<article class="summary-card"><span class="label">Turns</span><div class="value">${payload.summary.totalTurns}</div><div class="sub">${payload.summary.totalToolCalls} tool calls total</div></article>`,
|
|
377
|
+
`<article class="summary-card"><span class="label">Avg TTF Text</span><div class="value">${formatSeconds(payload.summary.averageTtfTextSeconds)}</div><div class="sub">First assistant text per turn</div></article>`,
|
|
378
|
+
`<article class="summary-card"><span class="label">Avg Turn Complete</span><div class="value">${formatSeconds(payload.summary.averageTurnCompleteSeconds)}</div><div class="sub">Run started to run finished</div></article>`,
|
|
359
379
|
`<article class="summary-card"><span class="label">Assertions</span><div class="value">${payload.summary.totalAssertions}</div><div class="sub">${payload.summary.passedAssertions} pass / ${payload.summary.failedAssertions} fail</div></article>`,
|
|
360
380
|
`<article class="summary-card"><span class="label">Evaluations</span><div class="value">${payload.summary.totalEvaluations}</div><div class="sub">LLM-scored dimensions</div></article>`,
|
|
361
381
|
'</section>',
|
|
@@ -44,6 +44,7 @@ export type AiTurn<TContext extends BaseAiScenarioContext = BaseAiScenarioContex
|
|
|
44
44
|
message: AiTurnMessage<TContext>;
|
|
45
45
|
idleTimeoutMs?: number;
|
|
46
46
|
processTimeoutMs?: number;
|
|
47
|
+
continueOnAssertionFailure?: boolean;
|
|
47
48
|
assertions?: AiTurnAssertion[];
|
|
48
49
|
eval?: AiTurnEvalDefinition[];
|
|
49
50
|
onComplete?: (args: AiTurnCompletionArgs<TContext>) => void | Promise<void>;
|
package/dist/utils/config.d.ts
CHANGED
|
@@ -22,6 +22,15 @@ export type KarrotConfig = {
|
|
|
22
22
|
type: 'ag-ui-post';
|
|
23
23
|
injectMessage?: boolean;
|
|
24
24
|
injectRunMetadata?: boolean;
|
|
25
|
+
textEvents?: Array<{
|
|
26
|
+
type: string;
|
|
27
|
+
name?: string;
|
|
28
|
+
role?: string;
|
|
29
|
+
textPath?: string;
|
|
30
|
+
contentPath?: string;
|
|
31
|
+
deltaPath?: string;
|
|
32
|
+
mode?: 'content' | 'delta';
|
|
33
|
+
}>;
|
|
25
34
|
run?: {
|
|
26
35
|
url: string;
|
|
27
36
|
headers?: Record<string, string>;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@huydao/karrot",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.8",
|
|
4
4
|
"description": "Reusable AI scenario execution, assertion, evaluation, and reporting toolkit",
|
|
5
5
|
"license": "ISC",
|
|
6
6
|
"type": "commonjs",
|
|
@@ -126,12 +126,15 @@
|
|
|
126
126
|
},
|
|
127
127
|
"files": [
|
|
128
128
|
"dist",
|
|
129
|
+
"site",
|
|
129
130
|
"README.md",
|
|
130
131
|
"GUIDE.md"
|
|
131
132
|
],
|
|
132
133
|
"scripts": {
|
|
133
134
|
"build": "rm -rf dist && tsc -p tsconfig.json && mkdir -p dist/prompts && cp prompts/*.md dist/prompts/",
|
|
134
|
-
"prepack": "npm run build"
|
|
135
|
+
"prepack": "npm run build",
|
|
136
|
+
"site:serve": "node site/serve.js",
|
|
137
|
+
"site:check": "node site/check.js"
|
|
135
138
|
},
|
|
136
139
|
"dependencies": {
|
|
137
140
|
"@stomp/stompjs": "^7.3.0",
|