@huydao/karrot 0.1.6 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +496 -243
- package/dist/executors/adapters/ag-ui-post.js +87 -12
- package/dist/executors/adapters/ag-ui.js +5 -3
- package/dist/executors/executor.js +2 -1
- package/dist/executors/run-result.d.ts +3 -0
- package/dist/reports/report.js +20 -0
- package/dist/scenarios/scenario.d.ts +1 -0
- package/package.json +5 -2
- package/site/assets/app.js +201 -0
- package/site/assets/karrot-mark.svg +10 -0
- package/site/assets/styles.css +698 -0
- package/site/check.js +43 -0
- package/site/docs/index.html +505 -0
- package/site/index.html +162 -0
- package/site/serve.js +50 -0
|
@@ -258,6 +258,35 @@ function parseSseBlock(block) {
|
|
|
258
258
|
data: dataLines.join('\n'),
|
|
259
259
|
};
|
|
260
260
|
}
|
|
261
|
+
/**
 * Convert an event timestamp into epoch milliseconds.
 *
 * Accepts a finite number, a numeric string, or a date string understood by
 * `Date.parse`. Numeric values at or below 10_000_000_000 are treated as
 * seconds and multiplied by 1000; larger values are returned unchanged.
 *
 * @param {unknown} value - Raw timestamp taken from an event.
 * @returns {number | undefined} Milliseconds, or `undefined` when the value
 *   cannot be interpreted.
 */
function normalizeEventTimestamp(value) {
    // Shared seconds-vs-milliseconds heuristic for both numeric paths.
    const toMillis = (amount) => (amount > 10_000_000_000 ? amount : amount * 1000);
    switch (typeof value) {
        case 'number':
            return Number.isFinite(value) ? toMillis(value) : undefined;
        case 'string': {
            if (!value.trim()) {
                return undefined;
            }
            const asNumber = Number(value);
            if (Number.isFinite(asNumber)) {
                return toMillis(asNumber);
            }
            // Not numeric: fall back to date-string parsing.
            const asDate = Date.parse(value);
            return Number.isFinite(asDate) ? asDate : undefined;
        }
        default:
            return undefined;
    }
}
|
|
275
|
+
/**
 * Elapsed time between two millisecond timestamps, in seconds rounded to one
 * decimal place.
 *
 * @param {number | undefined} startTimeMs - Start timestamp in milliseconds.
 * @param {number | undefined} endTimeMs - End timestamp in milliseconds.
 * @returns {number | undefined} Rounded seconds, or `undefined` when either
 *   input is not a finite number or the end precedes the start.
 */
function roundSeconds(startTimeMs, endTimeMs) {
    // Number.isFinite rejects non-numbers as well as NaN/Infinity, so the
    // result can never be NaN or Infinity.
    if (!Number.isFinite(startTimeMs) || !Number.isFinite(endTimeMs) || endTimeMs < startTimeMs) {
        return undefined;
    }
    return Number(((endTimeMs - startTimeMs) / 1000).toFixed(1));
}
|
|
281
|
+
/**
 * Tell whether an event carries assistant text: a `TEXT_MESSAGE_CONTENT` or
 * `TEXT_MESSAGE_CHUNK` event, or a `CUSTOM` event named
 * `super-testing-agent.model_stream_chunk`.
 *
 * @param {{ type?: string, name?: string }} event - Parsed stream event.
 * @returns {boolean} `true` when the event is one of the text-bearing kinds.
 */
function isAssistantTextEvent(event) {
    const kind = event.type;
    if (kind === 'TEXT_MESSAGE_CONTENT' || kind === 'TEXT_MESSAGE_CHUNK') {
        return true;
    }
    return kind === 'CUSTOM' && event.name === 'super-testing-agent.model_stream_chunk';
}
|
|
286
|
+
/**
 * Tell whether an event marks the start of a tool call: a `TOOL_CALL_START`
 * event, or a `CUSTOM` event named `super-testing-agent.tool_started`.
 *
 * @param {{ type?: string, name?: string }} event - Parsed stream event.
 * @returns {boolean} `true` when the event is a tool-start kind.
 */
function isToolStartEvent(event) {
    if (event.type === 'TOOL_CALL_START') {
        return true;
    }
    if (event.type !== 'CUSTOM') {
        return false;
    }
    return event.name === 'super-testing-agent.tool_started';
}
|
|
261
290
|
function createConnectCollector(options) {
|
|
262
291
|
const assistantFragments = [];
|
|
263
292
|
const toolCalls = [];
|
|
@@ -266,6 +295,11 @@ function createConnectCollector(options) {
|
|
|
266
295
|
let started = false;
|
|
267
296
|
let finished = false;
|
|
268
297
|
let sawAnyEvent = false;
|
|
298
|
+
let runStartedAt;
|
|
299
|
+
let firstTextAt;
|
|
300
|
+
let firstToolAt;
|
|
301
|
+
let runFinishedAt;
|
|
302
|
+
const eventTime = (event) => normalizeEventTimestamp(event.timestamp) ?? Date.now();
|
|
269
303
|
const consume = (sseEvent) => {
|
|
270
304
|
if (!sseEvent.data) {
|
|
271
305
|
return false;
|
|
@@ -286,6 +320,7 @@ function createConnectCollector(options) {
|
|
|
286
320
|
}
|
|
287
321
|
if (parsed.type === 'RUN_STARTED' && parsed.runId === options.targetRunId) {
|
|
288
322
|
started = true;
|
|
323
|
+
runStartedAt = eventTime(parsed);
|
|
289
324
|
return false;
|
|
290
325
|
}
|
|
291
326
|
if (!started) {
|
|
@@ -300,7 +335,8 @@ function createConnectCollector(options) {
|
|
|
300
335
|
});
|
|
301
336
|
}
|
|
302
337
|
const isAssistantMessage = parsed.role === 'assistant' || typeof parsed.role !== 'string';
|
|
303
|
-
if ((parsed
|
|
338
|
+
if (isAssistantTextEvent(parsed) && isAssistantMessage) {
|
|
339
|
+
firstTextAt ??= eventTime(parsed);
|
|
304
340
|
if (typeof parsed.content === 'string' && parsed.content.trim()) {
|
|
305
341
|
latestAssistantContent = parsed.content.trim();
|
|
306
342
|
}
|
|
@@ -308,11 +344,15 @@ function createConnectCollector(options) {
|
|
|
308
344
|
assistantFragments.push(parsed.delta);
|
|
309
345
|
}
|
|
310
346
|
}
|
|
311
|
-
if (parsed
|
|
312
|
-
|
|
347
|
+
if (isToolStartEvent(parsed)) {
|
|
348
|
+
firstToolAt ??= eventTime(parsed);
|
|
349
|
+
if (typeof parsed.toolCallName === 'string' && parsed.toolCallName.trim()) {
|
|
350
|
+
toolCalls.push(parsed.toolCallName.trim());
|
|
351
|
+
}
|
|
313
352
|
}
|
|
314
353
|
if (parsed.type === 'RUN_FINISHED' && (!parsed.runId || parsed.runId === options.targetRunId)) {
|
|
315
354
|
finished = true;
|
|
355
|
+
runFinishedAt = eventTime(parsed);
|
|
316
356
|
return true;
|
|
317
357
|
}
|
|
318
358
|
return false;
|
|
@@ -320,12 +360,19 @@ function createConnectCollector(options) {
|
|
|
320
360
|
return {
|
|
321
361
|
consume,
|
|
322
362
|
getResult() {
|
|
363
|
+
const turnCompleteSeconds = roundSeconds(runStartedAt, runFinishedAt);
|
|
323
364
|
return {
|
|
324
365
|
output: latestAssistantContent?.trim() || assistantFragments.join('').trim(),
|
|
325
366
|
toolCalls: [...toolCalls],
|
|
326
367
|
threadId: resolvedThreadId,
|
|
327
368
|
finished,
|
|
328
369
|
sawAnyEvent,
|
|
370
|
+
metrics: {
|
|
371
|
+
ttfTextSeconds: roundSeconds(runStartedAt, firstTextAt),
|
|
372
|
+
ttfToolSeconds: roundSeconds(runStartedAt, firstToolAt),
|
|
373
|
+
turnCompleteSeconds,
|
|
374
|
+
totalSeconds: turnCompleteSeconds,
|
|
375
|
+
},
|
|
329
376
|
};
|
|
330
377
|
},
|
|
331
378
|
};
|
|
@@ -337,6 +384,7 @@ async function postAndCaptureResponse(options) {
|
|
|
337
384
|
: undefined;
|
|
338
385
|
try {
|
|
339
386
|
await promises_1.default.writeFile(options.outputPath, '', 'utf8');
|
|
387
|
+
const startedAtMs = Date.now();
|
|
340
388
|
const response = await fetch(options.url, {
|
|
341
389
|
method: 'POST',
|
|
342
390
|
headers: {
|
|
@@ -369,6 +417,8 @@ async function postAndCaptureResponse(options) {
|
|
|
369
417
|
return {
|
|
370
418
|
status: response.status,
|
|
371
419
|
rawContent,
|
|
420
|
+
startedAtMs,
|
|
421
|
+
finishedAtMs: Date.now(),
|
|
372
422
|
};
|
|
373
423
|
}
|
|
374
424
|
catch (error) {
|
|
@@ -430,6 +480,7 @@ function startConnectStream(options) {
|
|
|
430
480
|
threadId: parsed.threadId,
|
|
431
481
|
finished: parsed.finished,
|
|
432
482
|
sawAnyEvent: parsed.sawAnyEvent,
|
|
483
|
+
metrics: parsed.metrics,
|
|
433
484
|
};
|
|
434
485
|
}
|
|
435
486
|
const reader = response.body.getReader();
|
|
@@ -495,13 +546,17 @@ function startConnectStream(options) {
|
|
|
495
546
|
result,
|
|
496
547
|
};
|
|
497
548
|
}
|
|
498
|
-
function extractResultFromSse(rawContent, fallbackThreadId) {
|
|
549
|
+
function extractResultFromSse(rawContent, fallbackThreadId, fallbackMetrics = {}) {
|
|
499
550
|
const fragments = [];
|
|
500
551
|
let latestContent;
|
|
501
552
|
let resolvedThreadId = fallbackThreadId;
|
|
502
553
|
const toolCalls = [];
|
|
503
554
|
let finished = false;
|
|
504
555
|
let sawAnyEvent = false;
|
|
556
|
+
let runStartedAt;
|
|
557
|
+
let firstTextAt;
|
|
558
|
+
let firstToolAt;
|
|
559
|
+
let runFinishedAt;
|
|
505
560
|
for (const sseEvent of parseSseEvents(rawContent)) {
|
|
506
561
|
if (!sseEvent.data) {
|
|
507
562
|
continue;
|
|
@@ -515,7 +570,11 @@ function extractResultFromSse(rawContent, fallbackThreadId) {
|
|
|
515
570
|
else if (typeof parsed.conversationId === 'string' && parsed.conversationId.trim()) {
|
|
516
571
|
resolvedThreadId = parsed.conversationId.trim();
|
|
517
572
|
}
|
|
518
|
-
if (parsed.type === '
|
|
573
|
+
if (parsed.type === 'RUN_STARTED' && typeof runStartedAt !== 'number') {
|
|
574
|
+
runStartedAt = normalizeEventTimestamp(parsed.timestamp);
|
|
575
|
+
}
|
|
576
|
+
if (isAssistantTextEvent(parsed)) {
|
|
577
|
+
firstTextAt ??= normalizeEventTimestamp(parsed.timestamp);
|
|
519
578
|
if (typeof parsed.content === 'string' && parsed.content.trim()) {
|
|
520
579
|
latestContent = parsed.content.trim();
|
|
521
580
|
}
|
|
@@ -523,11 +582,15 @@ function extractResultFromSse(rawContent, fallbackThreadId) {
|
|
|
523
582
|
fragments.push(parsed.delta);
|
|
524
583
|
}
|
|
525
584
|
}
|
|
526
|
-
if (parsed
|
|
527
|
-
|
|
585
|
+
if (isToolStartEvent(parsed)) {
|
|
586
|
+
firstToolAt ??= normalizeEventTimestamp(parsed.timestamp);
|
|
587
|
+
if (typeof parsed.toolCallName === 'string' && parsed.toolCallName.trim()) {
|
|
588
|
+
toolCalls.push(parsed.toolCallName.trim());
|
|
589
|
+
}
|
|
528
590
|
}
|
|
529
591
|
if (parsed.type === 'RUN_FINISHED') {
|
|
530
592
|
finished = true;
|
|
593
|
+
runFinishedAt = normalizeEventTimestamp(parsed.timestamp);
|
|
531
594
|
}
|
|
532
595
|
if (parsed.type === 'RUN_ERROR') {
|
|
533
596
|
throw new run_result_1.MessageRunError(parsed.error?.trim() || 'Agent run failed.', {
|
|
@@ -544,12 +607,21 @@ function extractResultFromSse(rawContent, fallbackThreadId) {
|
|
|
544
607
|
}
|
|
545
608
|
}
|
|
546
609
|
}
|
|
610
|
+
const effectiveStartedAt = runStartedAt ?? fallbackMetrics.startTimeMs;
|
|
611
|
+
const effectiveFinishedAt = runFinishedAt ?? fallbackMetrics.finishedTimeMs;
|
|
612
|
+
const turnCompleteSeconds = roundSeconds(effectiveStartedAt, effectiveFinishedAt);
|
|
547
613
|
return {
|
|
548
614
|
output: latestContent?.trim() || fragments.join('').trim(),
|
|
549
615
|
toolCalls,
|
|
550
616
|
threadId: resolvedThreadId,
|
|
551
617
|
finished,
|
|
552
618
|
sawAnyEvent,
|
|
619
|
+
metrics: {
|
|
620
|
+
ttfTextSeconds: roundSeconds(runStartedAt, firstTextAt),
|
|
621
|
+
ttfToolSeconds: roundSeconds(runStartedAt, firstToolAt),
|
|
622
|
+
turnCompleteSeconds,
|
|
623
|
+
totalSeconds: turnCompleteSeconds,
|
|
624
|
+
},
|
|
553
625
|
};
|
|
554
626
|
}
|
|
555
627
|
async function runAgUiPostMessage(options) {
|
|
@@ -625,7 +697,7 @@ async function runAgUiPostMessage(options) {
|
|
|
625
697
|
note: `Run log: ${node_path_1.default.basename(runOutputPath)}`,
|
|
626
698
|
toolCallCount: connected.toolCalls.length,
|
|
627
699
|
toolCalls: connected.toolCalls,
|
|
628
|
-
metrics:
|
|
700
|
+
metrics: connected.metrics,
|
|
629
701
|
};
|
|
630
702
|
}
|
|
631
703
|
const runResponse = await postAndCaptureResponse({
|
|
@@ -642,7 +714,10 @@ async function runAgUiPostMessage(options) {
|
|
|
642
714
|
output: runResponse.rawContent,
|
|
643
715
|
});
|
|
644
716
|
}
|
|
645
|
-
const parsed = extractResultFromSse(runResponse.rawContent, resolvedThreadId
|
|
717
|
+
const parsed = extractResultFromSse(runResponse.rawContent, resolvedThreadId, {
|
|
718
|
+
startTimeMs: runResponse.startedAtMs,
|
|
719
|
+
finishedTimeMs: runResponse.finishedAtMs,
|
|
720
|
+
});
|
|
646
721
|
if (options.observe) {
|
|
647
722
|
await promises_1.default.writeFile(observePath, '', 'utf8');
|
|
648
723
|
const observed = await waitForObservedCompletion({
|
|
@@ -659,7 +734,7 @@ async function runAgUiPostMessage(options) {
|
|
|
659
734
|
note: `Observe status: ${observed.status}. Observe log: ${node_path_1.default.basename(observePath)}`,
|
|
660
735
|
toolCallCount: parsed.toolCalls.length,
|
|
661
736
|
toolCalls: parsed.toolCalls,
|
|
662
|
-
metrics:
|
|
737
|
+
metrics: parsed.metrics,
|
|
663
738
|
};
|
|
664
739
|
}
|
|
665
740
|
if (options.completionCheck) {
|
|
@@ -671,7 +746,7 @@ async function runAgUiPostMessage(options) {
|
|
|
671
746
|
note: `Completion status: ${completion.status}`,
|
|
672
747
|
toolCallCount: parsed.toolCalls.length,
|
|
673
748
|
toolCalls: parsed.toolCalls,
|
|
674
|
-
metrics:
|
|
749
|
+
metrics: parsed.metrics,
|
|
675
750
|
};
|
|
676
751
|
}
|
|
677
752
|
if (!parsed.finished && !parsed.output) {
|
|
@@ -691,7 +766,7 @@ async function runAgUiPostMessage(options) {
|
|
|
691
766
|
outputPath,
|
|
692
767
|
toolCallCount: parsed.toolCalls.length,
|
|
693
768
|
toolCalls: parsed.toolCalls,
|
|
694
|
-
metrics:
|
|
769
|
+
metrics: parsed.metrics,
|
|
695
770
|
};
|
|
696
771
|
}
|
|
697
772
|
catch (error) {
|
|
@@ -9,8 +9,8 @@ exports.extractAppendedLog = extractAppendedLog;
|
|
|
9
9
|
exports.runAgUiMessage = runAgUiMessage;
|
|
10
10
|
const promises_1 = __importDefault(require("node:fs/promises"));
|
|
11
11
|
const node_path_1 = __importDefault(require("node:path"));
|
|
12
|
+
const node_crypto_1 = require("node:crypto");
|
|
12
13
|
const stompjs_1 = require("@stomp/stompjs");
|
|
13
|
-
const uuid_1 = require("uuid");
|
|
14
14
|
const ws_1 = __importDefault(require("ws"));
|
|
15
15
|
const run_result_1 = require("../run-result");
|
|
16
16
|
Object.assign(globalThis, { WebSocket: ws_1.default });
|
|
@@ -169,6 +169,7 @@ function computeMetrics(state) {
|
|
|
169
169
|
return {
|
|
170
170
|
ttfToolSeconds,
|
|
171
171
|
ttfTextSeconds,
|
|
172
|
+
turnCompleteSeconds: totalSeconds,
|
|
172
173
|
totalSeconds,
|
|
173
174
|
protocolUsedKb,
|
|
174
175
|
protocolTotalKb,
|
|
@@ -179,6 +180,7 @@ function writeMetricsToStdout(metrics) {
|
|
|
179
180
|
const parts = [
|
|
180
181
|
metrics.ttfToolSeconds != null ? `TTF-Tool: ${metrics.ttfToolSeconds.toFixed(1)}s` : undefined,
|
|
181
182
|
metrics.ttfTextSeconds != null ? `TTF-Text: ${metrics.ttfTextSeconds.toFixed(1)}s` : undefined,
|
|
183
|
+
metrics.turnCompleteSeconds != null ? `Turn complete: ${metrics.turnCompleteSeconds.toFixed(1)}s` : undefined,
|
|
182
184
|
metrics.totalSeconds != null ? `Total: ${metrics.totalSeconds.toFixed(1)}s` : undefined,
|
|
183
185
|
metrics.protocolUsedKb != null && metrics.protocolTotalKb != null && metrics.efficiencyPercent != null
|
|
184
186
|
? `Protocol efficiency: ${metrics.protocolUsedKb.toFixed(1)}KB/${metrics.protocolTotalKb.toFixed(1)}KB (${metrics.efficiencyPercent}%)`
|
|
@@ -430,8 +432,8 @@ async function connectAndRun(options) {
|
|
|
430
432
|
async function runAgUiMessage(options) {
|
|
431
433
|
await promises_1.default.mkdir(options.outputDirectory, { recursive: true });
|
|
432
434
|
const config = parseAgUiEnv(options.env);
|
|
433
|
-
const threadId = options.threadId ?? options.threadIdFallback ?? (0,
|
|
434
|
-
const runId = (0,
|
|
435
|
+
const threadId = options.threadId ?? options.threadIdFallback ?? (0, node_crypto_1.randomUUID)();
|
|
436
|
+
const runId = (0, node_crypto_1.randomUUID)();
|
|
435
437
|
const logPath = node_path_1.default.join(options.outputDirectory, `${threadId}.jsonl`);
|
|
436
438
|
const previousLogContent = await readJsonl(logPath);
|
|
437
439
|
const state = await connectAndRun({
|
|
@@ -135,7 +135,8 @@ async function runSingleScenario(scenario, context, env, outputDirectory, deadli
|
|
|
135
135
|
if (assertionFailureNote) {
|
|
136
136
|
result.status = 'FAIL';
|
|
137
137
|
result.note = [result.note, assertionFailureNote].filter(Boolean).join(' ') || undefined;
|
|
138
|
-
|
|
138
|
+
const shouldContinueOnAssertionFailure = turn.continueOnAssertionFailure ?? scenario.continueOnAssertionFailure ?? false;
|
|
139
|
+
if (!shouldContinueOnAssertionFailure) {
|
|
139
140
|
throw new Error(assertionFailureNote);
|
|
140
141
|
}
|
|
141
142
|
}
|
|
@@ -1,7 +1,10 @@
|
|
|
1
1
|
export type TimingMetrics = {
|
|
2
2
|
ttfToolSeconds?: number;
|
|
3
3
|
ttfTextSeconds?: number;
|
|
4
|
+
turnCompleteSeconds?: number;
|
|
4
5
|
totalSeconds?: number;
|
|
6
|
+
averageTtfTextSeconds?: number;
|
|
7
|
+
averageTurnCompleteSeconds?: number;
|
|
5
8
|
protocolUsedKb?: number;
|
|
6
9
|
protocolTotalKb?: number;
|
|
7
10
|
efficiencyPercent?: number;
|
package/dist/reports/report.js
CHANGED
|
@@ -26,6 +26,16 @@ function sumNumbers(values) {
|
|
|
26
26
|
}
|
|
27
27
|
return Number(definedValues.reduce((total, value) => total + value, 0).toFixed(1));
|
|
28
28
|
}
|
|
29
|
+
/**
 * Arithmetic mean of the numeric entries in a list, rounded to one decimal.
 * Entries whose `typeof` is not `'number'` are ignored.
 *
 * @param {Array<number | undefined>} values - Candidate values.
 * @returns {number | undefined} The rounded mean, or `undefined` when the
 *   list holds no numbers.
 */
function averageNumbers(values) {
    let runningTotal = 0;
    let numericCount = 0;
    for (const candidate of values) {
        if (typeof candidate === 'number') {
            runningTotal += candidate;
            numericCount += 1;
        }
    }
    if (numericCount === 0) {
        return undefined;
    }
    return Number((runningTotal / numericCount).toFixed(1));
}
|
|
36
|
+
/**
 * Pick the turn-completion time from a metrics object, using `totalSeconds`
 * when `turnCompleteSeconds` is null or undefined.
 *
 * @param {{ turnCompleteSeconds?: number, totalSeconds?: number }} metrics
 * @returns {number | undefined} The chosen duration in seconds.
 */
function turnCompleteSeconds(metrics) {
    const explicitValue = metrics.turnCompleteSeconds;
    if (explicitValue !== undefined && explicitValue !== null) {
        return explicitValue;
    }
    return metrics.totalSeconds;
}
|
|
29
39
|
function summarizeScenarioMetrics(turns) {
|
|
30
40
|
const protocolUsedKb = sumNumbers(turns.map((turn) => turn.metrics.protocolUsedKb));
|
|
31
41
|
const protocolTotalKb = sumNumbers(turns.map((turn) => turn.metrics.protocolTotalKb));
|
|
@@ -35,7 +45,10 @@ function summarizeScenarioMetrics(turns) {
|
|
|
35
45
|
return {
|
|
36
46
|
ttfToolSeconds: sumNumbers(turns.map((turn) => turn.metrics.ttfToolSeconds)),
|
|
37
47
|
ttfTextSeconds: sumNumbers(turns.map((turn) => turn.metrics.ttfTextSeconds)),
|
|
48
|
+
turnCompleteSeconds: sumNumbers(turns.map((turn) => turnCompleteSeconds(turn.metrics))),
|
|
38
49
|
totalSeconds: sumNumbers(turns.map((turn) => turn.metrics.totalSeconds)),
|
|
50
|
+
averageTtfTextSeconds: averageNumbers(turns.map((turn) => turn.metrics.ttfTextSeconds)),
|
|
51
|
+
averageTurnCompleteSeconds: averageNumbers(turns.map((turn) => turnCompleteSeconds(turn.metrics))),
|
|
39
52
|
protocolUsedKb,
|
|
40
53
|
protocolTotalKb,
|
|
41
54
|
efficiencyPercent,
|
|
@@ -112,6 +125,8 @@ function buildScenarioRunSummary(results) {
|
|
|
112
125
|
failedAssertions: assertions.filter((assertion) => !assertion.passed).length,
|
|
113
126
|
totalToolCalls: turns.reduce((total, turn) => total + turn.toolCallCount, 0),
|
|
114
127
|
totalEvaluations: evaluations.length,
|
|
128
|
+
averageTtfTextSeconds: averageNumbers(turns.map((turn) => turn.metrics.ttfTextSeconds)),
|
|
129
|
+
averageTurnCompleteSeconds: averageNumbers(turns.map((turn) => turnCompleteSeconds(turn.metrics))),
|
|
115
130
|
averageScoresByDimension,
|
|
116
131
|
requestedEvalDimensions,
|
|
117
132
|
};
|
|
@@ -155,7 +170,10 @@ function renderMetrics(metrics) {
|
|
|
155
170
|
return [
|
|
156
171
|
`TTF Tool: ${formatSeconds(metrics.ttfToolSeconds)}`,
|
|
157
172
|
`TTF Text: ${formatSeconds(metrics.ttfTextSeconds)}`,
|
|
173
|
+
`Turn Complete: ${formatSeconds(turnCompleteSeconds(metrics))}`,
|
|
158
174
|
`Total: ${formatSeconds(metrics.totalSeconds)}`,
|
|
175
|
+
`Avg TTF Text: ${formatSeconds(metrics.averageTtfTextSeconds)}`,
|
|
176
|
+
`Avg Complete: ${formatSeconds(metrics.averageTurnCompleteSeconds)}`,
|
|
159
177
|
`Efficiency: ${formatPercent(metrics.efficiencyPercent)}`,
|
|
160
178
|
].join(' | ');
|
|
161
179
|
}
|
|
@@ -356,6 +374,8 @@ function buildScenarioRunHtml(payload) {
|
|
|
356
374
|
'<section class="summary-grid">',
|
|
357
375
|
`<article class="summary-card"><span class="label">Scenarios</span><div class="value">${payload.summary.totalScenarios}</div><div class="sub">${payload.summary.passedScenarios} pass / ${payload.summary.failedScenarios} fail / ${payload.summary.skippedScenarios} skip</div></article>`,
|
|
358
376
|
`<article class="summary-card"><span class="label">Turns</span><div class="value">${payload.summary.totalTurns}</div><div class="sub">${payload.summary.totalToolCalls} tool calls total</div></article>`,
|
|
377
|
+
`<article class="summary-card"><span class="label">Avg TTF Text</span><div class="value">${formatSeconds(payload.summary.averageTtfTextSeconds)}</div><div class="sub">First assistant text per turn</div></article>`,
|
|
378
|
+
`<article class="summary-card"><span class="label">Avg Turn Complete</span><div class="value">${formatSeconds(payload.summary.averageTurnCompleteSeconds)}</div><div class="sub">Run started to run finished</div></article>`,
|
|
359
379
|
`<article class="summary-card"><span class="label">Assertions</span><div class="value">${payload.summary.totalAssertions}</div><div class="sub">${payload.summary.passedAssertions} pass / ${payload.summary.failedAssertions} fail</div></article>`,
|
|
360
380
|
`<article class="summary-card"><span class="label">Evaluations</span><div class="value">${payload.summary.totalEvaluations}</div><div class="sub">LLM-scored dimensions</div></article>`,
|
|
361
381
|
'</section>',
|
|
@@ -44,6 +44,7 @@ export type AiTurn<TContext extends BaseAiScenarioContext = BaseAiScenarioContex
|
|
|
44
44
|
message: AiTurnMessage<TContext>;
|
|
45
45
|
idleTimeoutMs?: number;
|
|
46
46
|
processTimeoutMs?: number;
|
|
47
|
+
continueOnAssertionFailure?: boolean;
|
|
47
48
|
assertions?: AiTurnAssertion[];
|
|
48
49
|
eval?: AiTurnEvalDefinition[];
|
|
49
50
|
onComplete?: (args: AiTurnCompletionArgs<TContext>) => void | Promise<void>;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@huydao/karrot",
|
|
3
|
-
"version": "0.1.6",
|
|
3
|
+
"version": "0.1.7",
|
|
4
4
|
"description": "Reusable AI scenario execution, assertion, evaluation, and reporting toolkit",
|
|
5
5
|
"license": "ISC",
|
|
6
6
|
"type": "commonjs",
|
|
@@ -126,12 +126,15 @@
|
|
|
126
126
|
},
|
|
127
127
|
"files": [
|
|
128
128
|
"dist",
|
|
129
|
+
"site",
|
|
129
130
|
"README.md",
|
|
130
131
|
"GUIDE.md"
|
|
131
132
|
],
|
|
132
133
|
"scripts": {
|
|
133
134
|
"build": "rm -rf dist && tsc -p tsconfig.json && mkdir -p dist/prompts && cp prompts/*.md dist/prompts/",
|
|
134
|
-
"prepack": "npm run build"
|
|
135
|
+
"prepack": "npm run build",
|
|
136
|
+
"site:serve": "node site/serve.js",
|
|
137
|
+
"site:check": "node site/check.js"
|
|
135
138
|
},
|
|
136
139
|
"dependencies": {
|
|
137
140
|
"@stomp/stompjs": "^7.3.0",
|
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
(function () {
|
|
2
|
+
const doc = document;
|
|
3
|
+
const body = doc.body;
|
|
4
|
+
|
|
5
|
+
/**
 * Copy the text of the `<code>` element that shares a `.code-block` wrapper
 * with `button`, and show the outcome on the button label.
 *
 * On success the label reads "Copied" for 1400 ms and then returns to the
 * button's idle label. On failure, or when the Clipboard API is unavailable,
 * the label reads "Select".
 *
 * @param {HTMLButtonElement} button - The copy button that was activated.
 * @returns {void}
 */
function copyCode(button) {
  const block = button.closest('.code-block');
  const code = block ? block.querySelector('code') : null;

  if (!code) {
    return;
  }

  // Capture the idle label only once. Reading it on every click would pick up
  // the transient "Copied" text on a quick second click and restore that.
  if (button.dataset.copyLabel === undefined) {
    button.dataset.copyLabel = button.textContent || 'Copy';
  }
  const idleLabel = button.dataset.copyLabel;

  // Drop a reset scheduled by an earlier click so it cannot fire mid-feedback.
  const cancelPendingReset = () => {
    if (button.dataset.copyTimer) {
      window.clearTimeout(Number(button.dataset.copyTimer));
      delete button.dataset.copyTimer;
    }
  };

  const showCopied = () => {
    cancelPendingReset();
    button.textContent = 'Copied';
    button.setAttribute('aria-live', 'polite');
    const timerId = window.setTimeout(() => {
      button.textContent = idleLabel;
      delete button.dataset.copyTimer;
    }, 1400);
    button.dataset.copyTimer = String(timerId);
  };

  const showManualFallback = () => {
    cancelPendingReset();
    button.textContent = 'Select';
  };

  // navigator.clipboard is undefined outside secure contexts; calling
  // writeText on it would throw synchronously and skip the .catch handler.
  const clipboard = navigator.clipboard;
  if (!clipboard || typeof clipboard.writeText !== 'function') {
    showManualFallback();
    return;
  }

  clipboard.writeText(code.innerText).then(showCopied).catch(showManualFallback);
}
|
|
24
|
+
|
|
25
|
+
/**
 * Wrap every `<pre>` that is not already inside a `.code-block` element in a
 * new `div.code-block`, placing a "Copy" button before it that calls
 * `copyCode` when clicked.
 *
 * @returns {void}
 */
function enhanceCodeBlocks() {
  for (const pre of doc.querySelectorAll('pre')) {
    const currentParent = pre.parentElement;

    // Already wrapped: leave it alone.
    if (currentParent && currentParent.classList.contains('code-block')) {
      continue;
    }

    const container = doc.createElement('div');
    container.className = 'code-block';

    const copyButton = doc.createElement('button');
    copyButton.type = 'button';
    copyButton.className = 'copy-button';
    copyButton.textContent = 'Copy';
    copyButton.addEventListener('click', () => copyCode(copyButton));

    // Insert the wrapper where the <pre> was, then move the <pre> inside it.
    pre.parentNode.insertBefore(container, pre);
    container.appendChild(copyButton);
    container.appendChild(pre);
  }
}
|
|
44
|
+
|
|
45
|
+
/**
 * Wire the mobile navigation. The `[data-nav-toggle]` control toggles the
 * `nav-open` class on the body and mirrors the state in `aria-expanded`.
 * Clicking a sidebar link, clicking the optional backdrop, or pressing Escape
 * closes the navigation. Does nothing when the toggle or sidebar is missing.
 *
 * @returns {void}
 */
function setupMobileNav() {
  const navToggle = doc.querySelector('[data-nav-toggle]');
  const navSidebar = doc.querySelector('[data-sidebar]');
  const navBackdrop = doc.querySelector('[data-sidebar-backdrop]');

  if (!(navToggle && navSidebar)) {
    return;
  }

  const closeNav = () => {
    body.classList.remove('nav-open');
    navToggle.setAttribute('aria-expanded', 'false');
  };

  navToggle.addEventListener('click', () => {
    const isOpen = body.classList.toggle('nav-open');
    navToggle.setAttribute('aria-expanded', String(isOpen));
  });

  for (const link of navSidebar.querySelectorAll('a')) {
    link.addEventListener('click', closeNav);
  }

  // The backdrop is optional in the markup.
  navBackdrop?.addEventListener('click', closeNav);

  doc.addEventListener('keydown', (event) => {
    if (event.key !== 'Escape') {
      return;
    }
    closeNav();
  });
}
|
|
78
|
+
|
|
79
|
+
/**
 * Append a "#" permalink anchor to each section `h2` and each `h3` with an
 * id inside `main`. Headings that already contain a `.heading-anchor`, or for
 * which no target id can be found, are skipped.
 *
 * @returns {void}
 */
function setupHeadingLinks() {
  const headings = doc.querySelectorAll('main section[id] > h2, main h3[id]');

  for (const heading of headings) {
    if (heading.querySelector('.heading-anchor')) {
      continue;
    }

    // Use the heading's own id, otherwise the id of its enclosing section.
    const targetId = heading.id || heading.closest('section[id]')?.id;

    if (!targetId) {
      continue;
    }

    const permalink = doc.createElement('a');
    permalink.className = 'heading-anchor';
    permalink.href = `#${targetId}`;
    permalink.setAttribute('aria-label', `Link to ${heading.textContent}`);
    permalink.textContent = '#';
    heading.appendChild(permalink);
  }
}
|
|
99
|
+
|
|
100
|
+
/**
 * Build the table of contents inside the `[data-toc]` element, with one link
 * per `main section[id]` that contains an `h2`.
 *
 * @returns {Element[]} The sections found. Returns an empty array when there
 *   is no `[data-toc]` element or no sections; in the latter case the
 *   `[data-toc]` element is removed.
 */
function buildToc() {
  const toc = doc.querySelector('[data-toc]');

  if (!toc) {
    return [];
  }

  const sections = Array.from(doc.querySelectorAll('main section[id]'));

  if (!sections.length) {
    toc.remove();
    return [];
  }

  const list = doc.createElement('ul');
  list.className = 'toc-list';

  sections.forEach((section) => {
    const heading = section.querySelector('h2');

    if (!heading) {
      return;
    }

    // Build the label from the heading's text and element children, leaving
    // out the ".heading-anchor" permalink. Stripping the first "#" from
    // textContent would damage headings such as "C# support" and leave the
    // anchor's own "#" in place.
    const label = Array.from(heading.childNodes)
      .filter((node) => {
        if (node.nodeType === 3) {
          return true; // text node
        }
        return node.nodeType === 1 && !node.classList.contains('heading-anchor');
      })
      .map((node) => node.textContent)
      .join('')
      .trim();

    const item = doc.createElement('li');
    const link = doc.createElement('a');
    link.href = `#${section.id}`;
    link.textContent = label;
    link.dataset.tocLink = section.id;
    item.appendChild(link);
    list.appendChild(item);
  });

  toc.appendChild(list);
  return sections;
}
|
|
136
|
+
|
|
137
|
+
/**
 * Keep the `is-active` class on `[data-section-link]` and `[data-toc-link]`
 * links in sync with the current section. The active section is updated on
 * link click, on scroll, on `hashchange`, and once shortly after setup.
 * Does nothing when there are no sections or no links of either kind.
 *
 * @param {Element[]} sections - Sections to track, in document order.
 * @returns {void}
 */
function setupActiveLinks(sections) {
  const sectionLinks = Array.from(doc.querySelectorAll('[data-section-link]'));
  const tocLinks = Array.from(doc.querySelectorAll('[data-toc-link]'));

  if (!(sections.length && (sectionLinks.length || tocLinks.length))) {
    return;
  }

  const setActive = (id) => {
    const wantedHref = `#${id}`;
    for (const link of sectionLinks) {
      link.classList.toggle('is-active', link.getAttribute('href') === wantedHref);
    }
    for (const link of tocLinks) {
      link.classList.toggle('is-active', link.dataset.tocLink === id);
    }
  };

  // The active section is the last one whose top edge is at or above the
  // 120px offset; the first section is used when none qualifies.
  const updateFromScroll = () => {
    const offset = 120;
    const current = sections.reduce(
      (chosen, section) => (section.getBoundingClientRect().top <= offset ? section : chosen),
      sections[0],
    );

    if (current?.id) {
      setActive(current.id);
    }
  };

  // Activate the link matching the URL hash; report whether a hash was present.
  const activateFromHash = () => {
    const { hash } = window.location;
    if (!hash) {
      return false;
    }
    setActive(hash.slice(1));
    return true;
  };

  for (const link of [...sectionLinks, ...tocLinks]) {
    link.addEventListener('click', () => {
      const targetId = link.getAttribute('href')?.slice(1);

      if (targetId) {
        setActive(targetId);
      }
    });
  }

  window.addEventListener('scroll', updateFromScroll, { passive: true });
  window.addEventListener('hashchange', () => {
    activateFromHash();
  });

  if (!activateFromHash() && sections[0]) {
    setActive(sections[0].id);
  }

  window.setTimeout(updateFromScroll, 150);
}
|
|
194
|
+
|
|
195
|
+
doc.addEventListener('DOMContentLoaded', () => {
|
|
196
|
+
enhanceCodeBlocks();
|
|
197
|
+
setupMobileNav();
|
|
198
|
+
setupHeadingLinks();
|
|
199
|
+
setupActiveLinks(buildToc());
|
|
200
|
+
});
|
|
201
|
+
})();
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
<svg width="64" height="64" viewBox="0 0 64 64" fill="none" xmlns="http://www.w3.org/2000/svg" role="img" aria-labelledby="title desc">
|
|
2
|
+
<title id="title">Karrot mark</title>
|
|
3
|
+
<desc id="desc">A simple carrot-shaped mark for Karrot documentation.</desc>
|
|
4
|
+
<rect width="64" height="64" rx="16" fill="#12352B"/>
|
|
5
|
+
<path d="M33.7 16.2c2.8-5.5 8.2-6.9 13.4-5.9-1.1 5.4-4.2 9.8-10.3 10.5 3.7-2.6 5.6-5.8 6.4-8.2-3.2.4-6.3 1.9-8.2 5.8l-1.3-2.2Z" fill="#62C370"/>
|
|
6
|
+
<path d="M28.9 18.2c-1.5-4.5-5.7-7-10.7-7.5.4 5.1 3.5 9.4 8.5 11l2.2-3.5Z" fill="#8ED97E"/>
|
|
7
|
+
<path d="M20.2 23.5c-2.1 3-2 7.1.1 10.1l13 18.9c1.6 2.3 5 2.2 6.5-.2l11.7-19.7c1.9-3.2 1.4-7.3-1.2-10-7.7-7.9-23.3-8.1-30.1.9Z" fill="#F28C38"/>
|
|
8
|
+
<path d="M24.7 27.5c4.7-3.9 14.3-3.8 20.2.2" stroke="#FFE2C0" stroke-width="3" stroke-linecap="round"/>
|
|
9
|
+
<path d="M30.3 39.2c2.8 1.8 6.9 1.8 9.5-.1" stroke="#9D471C" stroke-width="3" stroke-linecap="round"/>
|
|
10
|
+
</svg>
|