@huydao/karrot 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -258,6 +258,35 @@ function parseSseBlock(block) {
258
258
  data: dataLines.join('\n'),
259
259
  };
260
260
  }
261
/**
 * Scales an epoch value to milliseconds.
 *
 * Values at or below the threshold are treated as epoch seconds and multiplied
 * by 1000; larger values are returned unchanged.
 *
 * @param {number} epoch Finite epoch value in seconds or milliseconds.
 * @returns {number} The value expressed in milliseconds.
 */
function epochToMilliseconds(epoch) {
    // Heuristic cut-off between "seconds" and "milliseconds" magnitudes.
    const epochSecondsCeiling = 10_000_000_000;
    return epoch > epochSecondsCeiling ? epoch : epoch * 1000;
}
/**
 * Normalizes an event timestamp to epoch milliseconds.
 *
 * Accepted inputs:
 * - a finite number (scaled via the seconds/milliseconds heuristic);
 * - a non-blank string that `Number()` converts to a finite number (same scaling);
 * - any other non-blank string that `Date.parse` understands.
 *
 * @param {unknown} value Raw timestamp taken from an event payload.
 * @returns {number | undefined} Epoch milliseconds, or `undefined` when the
 *   value is missing, blank, non-finite, or unparseable.
 */
function normalizeEventTimestamp(value) {
    if (typeof value === 'number') {
        return Number.isFinite(value) ? epochToMilliseconds(value) : undefined;
    }
    if (typeof value !== 'string' || !value.trim()) {
        return undefined;
    }
    // Numeric strings take precedence over date parsing.
    const numeric = Number(value);
    if (Number.isFinite(numeric)) {
        return epochToMilliseconds(numeric);
    }
    const parsed = Date.parse(value);
    return Number.isFinite(parsed) ? parsed : undefined;
}
275
/**
 * Computes the elapsed time between two instants in seconds, rounded to one
 * decimal place.
 *
 * @param {number | undefined} startTimeMs Start instant in milliseconds.
 * @param {number | undefined} endTimeMs End instant in milliseconds.
 * @returns {number | undefined} Elapsed seconds, or `undefined` when either
 *   bound is missing or not a finite number, or when the end precedes the start.
 */
function roundSeconds(startTimeMs, endTimeMs) {
    // Number.isFinite rejects non-numbers as well as NaN and ±Infinity, so a
    // bad timestamp yields `undefined` instead of leaking NaN into the metrics.
    if (!Number.isFinite(startTimeMs) || !Number.isFinite(endTimeMs) || endTimeMs < startTimeMs) {
        return undefined;
    }
    return Number(((endTimeMs - startTimeMs) / 1000).toFixed(1));
}
281
/**
 * Reports whether an event is one of the recognised text events.
 *
 * Matches `TEXT_MESSAGE_CONTENT`, `TEXT_MESSAGE_CHUNK`, and `CUSTOM` events
 * named `super-testing-agent.model_stream_chunk`.
 *
 * @param {{ type?: unknown, name?: unknown } | null | undefined} event Parsed event payload.
 * @returns {boolean} `true` for a recognised text event; `false` otherwise,
 *   including for `null`/`undefined` payloads.
 */
function isAssistantTextEvent(event) {
    // Parsed JSON may legitimately be null; treat that as "not a text event".
    const type = event?.type;
    if (type === 'TEXT_MESSAGE_CONTENT' || type === 'TEXT_MESSAGE_CHUNK') {
        return true;
    }
    return type === 'CUSTOM' && event.name === 'super-testing-agent.model_stream_chunk';
}
286
/**
 * Reports whether an event is one of the recognised tool-start events.
 *
 * Matches `TOOL_CALL_START` and `CUSTOM` events named
 * `super-testing-agent.tool_started`.
 *
 * @param {{ type?: unknown, name?: unknown } | null | undefined} event Parsed event payload.
 * @returns {boolean} `true` for a recognised tool-start event; `false`
 *   otherwise, including for `null`/`undefined` payloads.
 */
function isToolStartEvent(event) {
    // Parsed JSON may legitimately be null; treat that as "not a tool start".
    const type = event?.type;
    if (type === 'TOOL_CALL_START') {
        return true;
    }
    return type === 'CUSTOM' && event.name === 'super-testing-agent.tool_started';
}
261
290
  function createConnectCollector(options) {
262
291
  const assistantFragments = [];
263
292
  const toolCalls = [];
@@ -266,6 +295,11 @@ function createConnectCollector(options) {
266
295
  let started = false;
267
296
  let finished = false;
268
297
  let sawAnyEvent = false;
298
+ let runStartedAt;
299
+ let firstTextAt;
300
+ let firstToolAt;
301
+ let runFinishedAt;
302
+ const eventTime = (event) => normalizeEventTimestamp(event.timestamp) ?? Date.now();
269
303
  const consume = (sseEvent) => {
270
304
  if (!sseEvent.data) {
271
305
  return false;
@@ -286,6 +320,7 @@ function createConnectCollector(options) {
286
320
  }
287
321
  if (parsed.type === 'RUN_STARTED' && parsed.runId === options.targetRunId) {
288
322
  started = true;
323
+ runStartedAt = eventTime(parsed);
289
324
  return false;
290
325
  }
291
326
  if (!started) {
@@ -300,7 +335,8 @@ function createConnectCollector(options) {
300
335
  });
301
336
  }
302
337
  const isAssistantMessage = parsed.role === 'assistant' || typeof parsed.role !== 'string';
303
- if ((parsed.type === 'TEXT_MESSAGE_CHUNK' || parsed.type === 'TEXT_MESSAGE_CONTENT') && isAssistantMessage) {
338
+ if (isAssistantTextEvent(parsed) && isAssistantMessage) {
339
+ firstTextAt ??= eventTime(parsed);
304
340
  if (typeof parsed.content === 'string' && parsed.content.trim()) {
305
341
  latestAssistantContent = parsed.content.trim();
306
342
  }
@@ -308,11 +344,15 @@ function createConnectCollector(options) {
308
344
  assistantFragments.push(parsed.delta);
309
345
  }
310
346
  }
311
- if (parsed.type === 'TOOL_CALL_START' && typeof parsed.toolCallName === 'string' && parsed.toolCallName.trim()) {
312
- toolCalls.push(parsed.toolCallName.trim());
347
+ if (isToolStartEvent(parsed)) {
348
+ firstToolAt ??= eventTime(parsed);
349
+ if (typeof parsed.toolCallName === 'string' && parsed.toolCallName.trim()) {
350
+ toolCalls.push(parsed.toolCallName.trim());
351
+ }
313
352
  }
314
353
  if (parsed.type === 'RUN_FINISHED' && (!parsed.runId || parsed.runId === options.targetRunId)) {
315
354
  finished = true;
355
+ runFinishedAt = eventTime(parsed);
316
356
  return true;
317
357
  }
318
358
  return false;
@@ -320,12 +360,19 @@ function createConnectCollector(options) {
320
360
  return {
321
361
  consume,
322
362
  getResult() {
363
+ const turnCompleteSeconds = roundSeconds(runStartedAt, runFinishedAt);
323
364
  return {
324
365
  output: latestAssistantContent?.trim() || assistantFragments.join('').trim(),
325
366
  toolCalls: [...toolCalls],
326
367
  threadId: resolvedThreadId,
327
368
  finished,
328
369
  sawAnyEvent,
370
+ metrics: {
371
+ ttfTextSeconds: roundSeconds(runStartedAt, firstTextAt),
372
+ ttfToolSeconds: roundSeconds(runStartedAt, firstToolAt),
373
+ turnCompleteSeconds,
374
+ totalSeconds: turnCompleteSeconds,
375
+ },
329
376
  };
330
377
  },
331
378
  };
@@ -337,6 +384,7 @@ async function postAndCaptureResponse(options) {
337
384
  : undefined;
338
385
  try {
339
386
  await promises_1.default.writeFile(options.outputPath, '', 'utf8');
387
+ const startedAtMs = Date.now();
340
388
  const response = await fetch(options.url, {
341
389
  method: 'POST',
342
390
  headers: {
@@ -369,6 +417,8 @@ async function postAndCaptureResponse(options) {
369
417
  return {
370
418
  status: response.status,
371
419
  rawContent,
420
+ startedAtMs,
421
+ finishedAtMs: Date.now(),
372
422
  };
373
423
  }
374
424
  catch (error) {
@@ -430,6 +480,7 @@ function startConnectStream(options) {
430
480
  threadId: parsed.threadId,
431
481
  finished: parsed.finished,
432
482
  sawAnyEvent: parsed.sawAnyEvent,
483
+ metrics: parsed.metrics,
433
484
  };
434
485
  }
435
486
  const reader = response.body.getReader();
@@ -495,13 +546,17 @@ function startConnectStream(options) {
495
546
  result,
496
547
  };
497
548
  }
498
- function extractResultFromSse(rawContent, fallbackThreadId) {
549
+ function extractResultFromSse(rawContent, fallbackThreadId, fallbackMetrics = {}) {
499
550
  const fragments = [];
500
551
  let latestContent;
501
552
  let resolvedThreadId = fallbackThreadId;
502
553
  const toolCalls = [];
503
554
  let finished = false;
504
555
  let sawAnyEvent = false;
556
+ let runStartedAt;
557
+ let firstTextAt;
558
+ let firstToolAt;
559
+ let runFinishedAt;
505
560
  for (const sseEvent of parseSseEvents(rawContent)) {
506
561
  if (!sseEvent.data) {
507
562
  continue;
@@ -515,7 +570,11 @@ function extractResultFromSse(rawContent, fallbackThreadId) {
515
570
  else if (typeof parsed.conversationId === 'string' && parsed.conversationId.trim()) {
516
571
  resolvedThreadId = parsed.conversationId.trim();
517
572
  }
518
- if (parsed.type === 'TEXT_MESSAGE_CONTENT') {
573
+ if (parsed.type === 'RUN_STARTED' && typeof runStartedAt !== 'number') {
574
+ runStartedAt = normalizeEventTimestamp(parsed.timestamp);
575
+ }
576
+ if (isAssistantTextEvent(parsed)) {
577
+ firstTextAt ??= normalizeEventTimestamp(parsed.timestamp);
519
578
  if (typeof parsed.content === 'string' && parsed.content.trim()) {
520
579
  latestContent = parsed.content.trim();
521
580
  }
@@ -523,11 +582,15 @@ function extractResultFromSse(rawContent, fallbackThreadId) {
523
582
  fragments.push(parsed.delta);
524
583
  }
525
584
  }
526
- if (parsed.type === 'TOOL_CALL_START' && typeof parsed.toolCallName === 'string' && parsed.toolCallName.trim()) {
527
- toolCalls.push(parsed.toolCallName.trim());
585
+ if (isToolStartEvent(parsed)) {
586
+ firstToolAt ??= normalizeEventTimestamp(parsed.timestamp);
587
+ if (typeof parsed.toolCallName === 'string' && parsed.toolCallName.trim()) {
588
+ toolCalls.push(parsed.toolCallName.trim());
589
+ }
528
590
  }
529
591
  if (parsed.type === 'RUN_FINISHED') {
530
592
  finished = true;
593
+ runFinishedAt = normalizeEventTimestamp(parsed.timestamp);
531
594
  }
532
595
  if (parsed.type === 'RUN_ERROR') {
533
596
  throw new run_result_1.MessageRunError(parsed.error?.trim() || 'Agent run failed.', {
@@ -544,12 +607,21 @@ function extractResultFromSse(rawContent, fallbackThreadId) {
544
607
  }
545
608
  }
546
609
  }
610
+ const effectiveStartedAt = runStartedAt ?? fallbackMetrics.startTimeMs;
611
+ const effectiveFinishedAt = runFinishedAt ?? fallbackMetrics.finishedTimeMs;
612
+ const turnCompleteSeconds = roundSeconds(effectiveStartedAt, effectiveFinishedAt);
547
613
  return {
548
614
  output: latestContent?.trim() || fragments.join('').trim(),
549
615
  toolCalls,
550
616
  threadId: resolvedThreadId,
551
617
  finished,
552
618
  sawAnyEvent,
619
+ metrics: {
620
+ ttfTextSeconds: roundSeconds(runStartedAt, firstTextAt),
621
+ ttfToolSeconds: roundSeconds(runStartedAt, firstToolAt),
622
+ turnCompleteSeconds,
623
+ totalSeconds: turnCompleteSeconds,
624
+ },
553
625
  };
554
626
  }
555
627
  async function runAgUiPostMessage(options) {
@@ -625,7 +697,7 @@ async function runAgUiPostMessage(options) {
625
697
  note: `Run log: ${node_path_1.default.basename(runOutputPath)}`,
626
698
  toolCallCount: connected.toolCalls.length,
627
699
  toolCalls: connected.toolCalls,
628
- metrics: {},
700
+ metrics: connected.metrics,
629
701
  };
630
702
  }
631
703
  const runResponse = await postAndCaptureResponse({
@@ -642,7 +714,10 @@ async function runAgUiPostMessage(options) {
642
714
  output: runResponse.rawContent,
643
715
  });
644
716
  }
645
- const parsed = extractResultFromSse(runResponse.rawContent, resolvedThreadId);
717
+ const parsed = extractResultFromSse(runResponse.rawContent, resolvedThreadId, {
718
+ startTimeMs: runResponse.startedAtMs,
719
+ finishedTimeMs: runResponse.finishedAtMs,
720
+ });
646
721
  if (options.observe) {
647
722
  await promises_1.default.writeFile(observePath, '', 'utf8');
648
723
  const observed = await waitForObservedCompletion({
@@ -659,7 +734,7 @@ async function runAgUiPostMessage(options) {
659
734
  note: `Observe status: ${observed.status}. Observe log: ${node_path_1.default.basename(observePath)}`,
660
735
  toolCallCount: parsed.toolCalls.length,
661
736
  toolCalls: parsed.toolCalls,
662
- metrics: {},
737
+ metrics: parsed.metrics,
663
738
  };
664
739
  }
665
740
  if (options.completionCheck) {
@@ -671,7 +746,7 @@ async function runAgUiPostMessage(options) {
671
746
  note: `Completion status: ${completion.status}`,
672
747
  toolCallCount: parsed.toolCalls.length,
673
748
  toolCalls: parsed.toolCalls,
674
- metrics: {},
749
+ metrics: parsed.metrics,
675
750
  };
676
751
  }
677
752
  if (!parsed.finished && !parsed.output) {
@@ -691,7 +766,7 @@ async function runAgUiPostMessage(options) {
691
766
  outputPath,
692
767
  toolCallCount: parsed.toolCalls.length,
693
768
  toolCalls: parsed.toolCalls,
694
- metrics: {},
769
+ metrics: parsed.metrics,
695
770
  };
696
771
  }
697
772
  catch (error) {
@@ -9,8 +9,8 @@ exports.extractAppendedLog = extractAppendedLog;
9
9
  exports.runAgUiMessage = runAgUiMessage;
10
10
  const promises_1 = __importDefault(require("node:fs/promises"));
11
11
  const node_path_1 = __importDefault(require("node:path"));
12
+ const node_crypto_1 = require("node:crypto");
12
13
  const stompjs_1 = require("@stomp/stompjs");
13
- const uuid_1 = require("uuid");
14
14
  const ws_1 = __importDefault(require("ws"));
15
15
  const run_result_1 = require("../run-result");
16
16
  Object.assign(globalThis, { WebSocket: ws_1.default });
@@ -169,6 +169,7 @@ function computeMetrics(state) {
169
169
  return {
170
170
  ttfToolSeconds,
171
171
  ttfTextSeconds,
172
+ turnCompleteSeconds: totalSeconds,
172
173
  totalSeconds,
173
174
  protocolUsedKb,
174
175
  protocolTotalKb,
@@ -179,6 +180,7 @@ function writeMetricsToStdout(metrics) {
179
180
  const parts = [
180
181
  metrics.ttfToolSeconds != null ? `TTF-Tool: ${metrics.ttfToolSeconds.toFixed(1)}s` : undefined,
181
182
  metrics.ttfTextSeconds != null ? `TTF-Text: ${metrics.ttfTextSeconds.toFixed(1)}s` : undefined,
183
+ metrics.turnCompleteSeconds != null ? `Turn complete: ${metrics.turnCompleteSeconds.toFixed(1)}s` : undefined,
182
184
  metrics.totalSeconds != null ? `Total: ${metrics.totalSeconds.toFixed(1)}s` : undefined,
183
185
  metrics.protocolUsedKb != null && metrics.protocolTotalKb != null && metrics.efficiencyPercent != null
184
186
  ? `Protocol efficiency: ${metrics.protocolUsedKb.toFixed(1)}KB/${metrics.protocolTotalKb.toFixed(1)}KB (${metrics.efficiencyPercent}%)`
@@ -430,8 +432,8 @@ async function connectAndRun(options) {
430
432
  async function runAgUiMessage(options) {
431
433
  await promises_1.default.mkdir(options.outputDirectory, { recursive: true });
432
434
  const config = parseAgUiEnv(options.env);
433
- const threadId = options.threadId ?? options.threadIdFallback ?? (0, uuid_1.v7)();
434
- const runId = (0, uuid_1.v7)();
435
+ const threadId = options.threadId ?? options.threadIdFallback ?? (0, node_crypto_1.randomUUID)();
436
+ const runId = (0, node_crypto_1.randomUUID)();
435
437
  const logPath = node_path_1.default.join(options.outputDirectory, `${threadId}.jsonl`);
436
438
  const previousLogContent = await readJsonl(logPath);
437
439
  const state = await connectAndRun({
@@ -135,7 +135,8 @@ async function runSingleScenario(scenario, context, env, outputDirectory, deadli
135
135
  if (assertionFailureNote) {
136
136
  result.status = 'FAIL';
137
137
  result.note = [result.note, assertionFailureNote].filter(Boolean).join(' ') || undefined;
138
- if (!scenario.continueOnAssertionFailure) {
138
+ const shouldContinueOnAssertionFailure = turn.continueOnAssertionFailure ?? scenario.continueOnAssertionFailure ?? false;
139
+ if (!shouldContinueOnAssertionFailure) {
139
140
  throw new Error(assertionFailureNote);
140
141
  }
141
142
  }
@@ -1,7 +1,10 @@
1
1
  export type TimingMetrics = {
2
2
  ttfToolSeconds?: number;
3
3
  ttfTextSeconds?: number;
4
+ turnCompleteSeconds?: number;
4
5
  totalSeconds?: number;
6
+ averageTtfTextSeconds?: number;
7
+ averageTurnCompleteSeconds?: number;
5
8
  protocolUsedKb?: number;
6
9
  protocolTotalKb?: number;
7
10
  efficiencyPercent?: number;
@@ -26,6 +26,16 @@ function sumNumbers(values) {
26
26
  }
27
27
  return Number(definedValues.reduce((total, value) => total + value, 0).toFixed(1));
28
28
  }
29
/**
 * Computes the arithmetic mean of the finite numbers in a list, rounded to one
 * decimal place.
 *
 * Entries that are not finite numbers (`undefined`, `null`, `NaN`, ±Infinity,
 * non-numbers) are ignored so a single bad sample cannot turn the average into
 * `NaN`.
 *
 * @param {Array<number | undefined>} values Samples to average.
 * @returns {number | undefined} The rounded mean, or `undefined` when no
 *   finite sample is present.
 */
function averageNumbers(values) {
    const finiteValues = values.filter((value) => Number.isFinite(value));
    if (finiteValues.length === 0) {
        return undefined;
    }
    const total = finiteValues.reduce((sum, value) => sum + value, 0);
    return Number((total / finiteValues.length).toFixed(1));
}
36
/**
 * Picks the turn-completion duration from a metrics object.
 *
 * @param {{ turnCompleteSeconds?: number, totalSeconds?: number }} metrics Metrics record.
 * @returns {number | undefined} `metrics.turnCompleteSeconds` when it is
 *   neither `null` nor `undefined`; otherwise `metrics.totalSeconds`.
 */
function turnCompleteSeconds(metrics) {
    const explicitSeconds = metrics.turnCompleteSeconds;
    if (explicitSeconds !== undefined && explicitSeconds !== null) {
        return explicitSeconds;
    }
    return metrics.totalSeconds;
}
29
39
  function summarizeScenarioMetrics(turns) {
30
40
  const protocolUsedKb = sumNumbers(turns.map((turn) => turn.metrics.protocolUsedKb));
31
41
  const protocolTotalKb = sumNumbers(turns.map((turn) => turn.metrics.protocolTotalKb));
@@ -35,7 +45,10 @@ function summarizeScenarioMetrics(turns) {
35
45
  return {
36
46
  ttfToolSeconds: sumNumbers(turns.map((turn) => turn.metrics.ttfToolSeconds)),
37
47
  ttfTextSeconds: sumNumbers(turns.map((turn) => turn.metrics.ttfTextSeconds)),
48
+ turnCompleteSeconds: sumNumbers(turns.map((turn) => turnCompleteSeconds(turn.metrics))),
38
49
  totalSeconds: sumNumbers(turns.map((turn) => turn.metrics.totalSeconds)),
50
+ averageTtfTextSeconds: averageNumbers(turns.map((turn) => turn.metrics.ttfTextSeconds)),
51
+ averageTurnCompleteSeconds: averageNumbers(turns.map((turn) => turnCompleteSeconds(turn.metrics))),
39
52
  protocolUsedKb,
40
53
  protocolTotalKb,
41
54
  efficiencyPercent,
@@ -112,6 +125,8 @@ function buildScenarioRunSummary(results) {
112
125
  failedAssertions: assertions.filter((assertion) => !assertion.passed).length,
113
126
  totalToolCalls: turns.reduce((total, turn) => total + turn.toolCallCount, 0),
114
127
  totalEvaluations: evaluations.length,
128
+ averageTtfTextSeconds: averageNumbers(turns.map((turn) => turn.metrics.ttfTextSeconds)),
129
+ averageTurnCompleteSeconds: averageNumbers(turns.map((turn) => turnCompleteSeconds(turn.metrics))),
115
130
  averageScoresByDimension,
116
131
  requestedEvalDimensions,
117
132
  };
@@ -155,7 +170,10 @@ function renderMetrics(metrics) {
155
170
  return [
156
171
  `TTF Tool: ${formatSeconds(metrics.ttfToolSeconds)}`,
157
172
  `TTF Text: ${formatSeconds(metrics.ttfTextSeconds)}`,
173
+ `Turn Complete: ${formatSeconds(turnCompleteSeconds(metrics))}`,
158
174
  `Total: ${formatSeconds(metrics.totalSeconds)}`,
175
+ `Avg TTF Text: ${formatSeconds(metrics.averageTtfTextSeconds)}`,
176
+ `Avg Complete: ${formatSeconds(metrics.averageTurnCompleteSeconds)}`,
159
177
  `Efficiency: ${formatPercent(metrics.efficiencyPercent)}`,
160
178
  ].join(' | ');
161
179
  }
@@ -279,60 +297,70 @@ function buildScenarioRunHtml(payload) {
279
297
  '<meta name="viewport" content="width=device-width, initial-scale=1" />',
280
298
  `<title>${escapeHtml(`${payload.projectName} AI Scenario Report`)}</title>`,
281
299
  '<style>',
282
- 'body{margin:0;font-family:ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,"Segoe UI",sans-serif;background:#f4f7fb;color:#142033;}',
283
- '.page{max-width:1280px;margin:0 auto;padding:32px 24px 64px;}',
284
- '.hero{background:linear-gradient(135deg,#0f172a,#1d4ed8);color:#fff;padding:28px;border-radius:20px;box-shadow:0 20px 50px rgba(15,23,42,.18);}',
285
- '.hero h1{margin:0 0 8px;font-size:32px;}',
286
- '.hero p{margin:4px 0;color:rgba(255,255,255,.88);}',
287
- '.summary-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(170px,1fr));gap:14px;margin:22px 0 28px;}',
288
- '.summary-card,.panel,.turn-card,.scenario-card{background:#fff;border:1px solid #dbe4f0;border-radius:18px;box-shadow:0 10px 30px rgba(15,23,42,.06);}',
289
- '.summary-card{padding:18px;}',
290
- '.summary-card .label{display:block;font-size:12px;text-transform:uppercase;letter-spacing:.08em;color:#5b6b84;margin-bottom:8px;}',
291
- '.summary-card .value{font-size:28px;font-weight:700;}',
292
- '.summary-card .sub{font-size:13px;color:#61728d;}',
293
- '.panels{display:grid;grid-template-columns:repeat(auto-fit,minmax(280px,1fr));gap:16px;margin-bottom:28px;}',
294
- '.panel{padding:18px;}',
295
- '.panel h2{margin:0 0 12px;font-size:18px;}',
296
- '.panel pre{margin:0;white-space:pre-wrap;word-break:break-word;background:#f8fbff;border-radius:12px;padding:14px;font-size:13px;}',
300
+ ':root{--ink:#0b1220;--text:#273247;--muted:#6f7b91;--line:#e3ebf5;--panel:#fff;--panel-soft:#f8fafd;--page:#eaf1f8;--blue:#2f63e5;--blue-dark:#153f9f;--green:#3f9a8f;--red:#dc3d4d;--amber:#b7791f;--shadow:0 14px 34px rgba(21,45,85,.08);}',
301
+ '*{box-sizing:border-box;}',
302
+ 'body{margin:0;font-family:"Avenir Next","Nunito Sans",ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,"Segoe UI",sans-serif;background:radial-gradient(circle at 8% -10%,rgba(47,99,229,.13),transparent 26rem),linear-gradient(180deg,#edf4fb 0%,var(--page) 100%);color:var(--text);font-size:15px;line-height:1.55;}',
303
+ '.page{max-width:1360px;margin:0 auto;padding:24px 18px 56px;}',
304
+ '.hero,.summary-card,.panel,.turn-card,.scenario-card{position:relative;background:var(--panel);border:1px solid #d8e3f0;border-radius:10px;box-shadow:var(--shadow);overflow:hidden;}',
305
+ '.hero::before,.summary-card::before,.panel::before,.turn-card::before,.scenario-card::before{content:"";position:absolute;inset:0 0 auto;height:4px;background:linear-gradient(90deg,#071326 0%,var(--blue) 78%,#4b7cff 100%);}',
306
+ '.hero{display:grid;grid-template-columns:minmax(0,1fr) auto;gap:18px;padding:26px 30px 24px;margin-bottom:22px;}',
307
+ '.hero h1{grid-column:1/-1;margin:0 0 2px;color:var(--ink);font-size:29px;line-height:1.16;font-weight:700;letter-spacing:-.02em;}',
308
+ '.hero p{margin:0;color:var(--muted);font-weight:700;}',
309
+ '.hero strong{color:var(--ink);font-weight:700;}',
310
+ '.summary-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(190px,1fr));gap:16px;margin:0 0 18px;}',
311
+ '.summary-card{padding:22px 22px 18px;min-height:124px;}',
312
+ '.summary-card .label,.score-card .label,.meta-label{display:block;font-size:11px;text-transform:uppercase;letter-spacing:.06em;color:var(--muted);font-weight:700;margin-bottom:8px;}',
313
+ '.summary-card .value{font-size:31px;line-height:1.06;font-weight:700;color:var(--ink);letter-spacing:-.025em;}',
314
+ '.summary-card .sub{margin-top:8px;font-size:14px;color:var(--muted);font-weight:700;}',
315
+ '.panels{display:grid;grid-template-columns:repeat(auto-fit,minmax(300px,1fr));gap:16px;margin-bottom:18px;}',
316
+ '.panel{padding:22px;}',
317
+ '.panel h2{margin:0 0 14px;color:var(--ink);font-size:17px;line-height:1.22;font-weight:700;letter-spacing:-.01em;}',
318
+ '.panel pre{margin:0;white-space:pre-wrap;word-break:break-word;background:var(--panel-soft);border:1px solid #edf2f8;border-radius:6px;padding:14px;font-size:13px;color:#334155;}',
297
319
  '.scenario-list{display:grid;gap:18px;}',
298
- '.scenario-card summary{list-style:none;display:flex;gap:12px;align-items:center;justify-content:space-between;padding:18px 20px;cursor:pointer;}',
320
+ '.scenario-card summary{list-style:none;display:grid;grid-template-columns:minmax(0,1fr) auto minmax(160px,36%);gap:14px;align-items:center;padding:20px 24px 18px;cursor:pointer;}',
299
321
  '.scenario-card summary::-webkit-details-marker{display:none;}',
300
- '.scenario-title{font-weight:700;font-size:18px;flex:1;}',
301
- '.summary-note{color:#61728d;font-size:14px;max-width:40%;text-align:right;}',
302
- '.scenario-body{padding:0 20px 20px;}',
303
- '.meta-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(220px,1fr));gap:12px;margin:8px 0 18px;}',
304
- '.meta-grid>div{background:#f8fbff;border-radius:12px;padding:12px;}',
305
- '.meta-label{display:block;font-size:12px;color:#61728d;text-transform:uppercase;letter-spacing:.08em;margin-bottom:6px;}',
306
- '.turn-card{padding:18px;margin-top:16px;}',
307
- '.turn-card h4,.content-block h5{margin:0 0 10px;}',
322
+ '.scenario-title{font-weight:700;font-size:17px;color:var(--ink);letter-spacing:-.01em;overflow-wrap:anywhere;}',
323
+ '.summary-note{color:var(--muted);font-size:14px;font-weight:400;text-align:right;overflow-wrap:anywhere;}',
324
+ '.scenario-body{padding:0 24px 24px;}',
325
+ '.meta-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(230px,1fr));gap:12px;margin:8px 0 18px;}',
326
+ '.meta-grid>div{background:var(--panel-soft);border:1px solid #edf2f8;border-radius:6px;padding:13px 14px;}',
327
+ '.turn-card{padding:22px;margin-top:16px;border-radius:8px;box-shadow:0 10px 24px rgba(21,45,85,.06);}',
328
+ '.turn-card h4{margin:0 0 14px;color:var(--ink);font-size:16px;font-weight:700;letter-spacing:0;}',
308
329
  '.content-block{margin-top:14px;}',
309
- '.content-block pre{margin:0;white-space:pre-wrap;word-break:break-word;background:#f8fbff;border-radius:12px;padding:14px;max-height:420px;overflow:auto;}',
310
- '.content-block p{margin:0;background:#f8fbff;border-radius:12px;padding:14px;}',
311
- '.assertions,.evaluations{width:100%;border-collapse:collapse;font-size:14px;}',
330
+ '.content-block h5{margin:0 0 8px;color:var(--ink);font-size:12px;text-transform:uppercase;letter-spacing:.06em;font-weight:700;}',
331
+ '.content-block pre,.content-block p{margin:0;background:var(--panel-soft);border:1px solid #edf2f8;border-radius:6px;padding:14px;}',
332
+ '.content-block pre{white-space:pre-wrap;word-break:break-word;max-height:420px;overflow:auto;color:#29364b;}',
333
+ '.assertions,.evaluations{width:100%;border-collapse:separate;border-spacing:0 8px;font-size:14px;}',
312
334
  '.assertions{table-layout:fixed;}',
313
- '.assertions th,.assertions td,.evaluations th,.evaluations td{padding:10px 12px;border-bottom:1px solid #e5edf7;vertical-align:top;text-align:left;}',
314
- '.assertions th,.evaluations th{font-size:12px;text-transform:uppercase;letter-spacing:.08em;color:#61728d;}',
335
+ '.assertions th,.assertions td,.evaluations th,.evaluations td{padding:10px 12px;vertical-align:top;text-align:left;}',
336
+ '.assertions th,.evaluations th{font-size:11px;text-transform:uppercase;letter-spacing:.06em;color:var(--muted);font-weight:700;}',
337
+ '.assertions tbody tr,.evaluations tbody tr{background:var(--panel-soft);}',
338
+ '.assertions tbody td,.evaluations tbody td{border-top:1px solid #edf2f8;border-bottom:1px solid #edf2f8;}',
339
+ '.assertions tbody td:first-child,.evaluations tbody td:first-child{border-left:1px solid #edf2f8;border-radius:6px 0 0 6px;}',
340
+ '.assertions tbody td:last-child,.evaluations tbody td:last-child{border-right:1px solid #edf2f8;border-radius:0 6px 6px 0;}',
315
341
  '.assertions td{word-break:break-word;overflow-wrap:anywhere;}',
316
342
  '.assertions th:nth-child(1),.assertions td:nth-child(1){width:8%;}',
317
343
  '.assertions th:nth-child(2),.assertions td:nth-child(2){width:12%;}',
318
344
  '.assertions th:nth-child(3),.assertions td:nth-child(3){width:34%;}',
319
345
  '.assertions th:nth-child(4),.assertions td:nth-child(4){width:16%;}',
320
346
  '.assertions th:nth-child(5),.assertions td:nth-child(5){width:30%;}',
321
- '.assertions .assertion-expected-object{margin:0;white-space:pre-wrap;word-break:break-word;overflow-wrap:anywhere;background:#f8fbff;border-radius:12px;padding:12px;font-size:12px;line-height:1.45;max-height:none;overflow:visible;}',
322
- '.badge{display:inline-flex;align-items:center;justify-content:center;border-radius:999px;padding:5px 10px;font-size:12px;font-weight:700;min-width:56px;}',
323
- '.badge.pass{background:#dcfce7;color:#166534;}',
324
- '.badge.fail{background:#fee2e2;color:#991b1b;}',
325
- '.badge.skip{background:#e2e8f0;color:#334155;}',
326
- '.dimension-chip{display:inline-flex;align-items:center;justify-content:center;border-radius:999px;padding:4px 10px;margin:0 6px 6px 0;background:#eef2ff;color:#3730a3;font-size:12px;font-weight:600;}',
347
+ '.assertions .assertion-expected-object{margin:0;white-space:pre-wrap;word-break:break-word;overflow-wrap:anywhere;background:#fff;border:1px solid #e5edf7;border-radius:6px;padding:12px;font-size:12px;line-height:1.45;max-height:none;overflow:visible;}',
348
+ '.badge{display:inline-flex;align-items:center;justify-content:center;border-radius:999px;padding:5px 11px;font-size:12px;font-weight:900;min-width:58px;letter-spacing:.02em;}',
349
+ '.badge.pass{background:#e4f8ef;color:#157347;}',
350
+ '.badge.fail{background:#ffe8eb;color:#b42332;}',
351
+ '.badge.skip{background:#edf2f7;color:#4a5568;}',
352
+ '.scenario-card.pass::before{background:linear-gradient(90deg,#071326 0%,#22a06b 100%);}',
353
+ '.scenario-card.fail::before{background:linear-gradient(90deg,#071326 0%,var(--red) 100%);}',
354
+ '.scenario-card.skip::before{background:linear-gradient(90deg,#071326 0%,#8a94a6 100%);}',
355
+ '.dimension-chip{display:inline-flex;align-items:center;justify-content:center;border-radius:999px;padding:5px 11px;margin:0 6px 6px 0;background:#eef4ff;color:var(--blue);font-size:12px;font-weight:900;}',
327
356
  '.score-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(160px,1fr));gap:12px;}',
328
- '.score-card{background:#f8fbff;border-radius:12px;padding:14px;}',
329
- '.score-card .label{display:block;font-size:12px;color:#61728d;text-transform:uppercase;letter-spacing:.08em;margin-bottom:8px;}',
330
- '.score-card .value{font-size:24px;font-weight:700;}',
331
- '.score-pill{display:inline-flex;align-items:center;justify-content:center;border-radius:999px;padding:4px 10px;background:#dbeafe;color:#1d4ed8;font-weight:700;min-width:58px;}',
332
- '.muted{color:#7c8ba1;}',
357
+ '.score-card{background:var(--panel-soft);border:1px solid #edf2f8;border-radius:6px;padding:14px;}',
358
+ '.score-card .value{font-size:23px;font-weight:700;color:var(--ink);letter-spacing:-.015em;}',
359
+ '.score-pill{display:inline-flex;align-items:center;justify-content:center;border-radius:999px;padding:5px 11px;background:#e8f0ff;color:var(--blue);font-weight:900;min-width:58px;}',
360
+ '.muted{color:#8a95a8;}',
333
361
  'code{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono",monospace;font-size:12px;word-break:break-all;}',
334
- '.footer{margin-top:28px;color:#61728d;font-size:13px;}',
335
- '@media (max-width:900px){.scenario-card summary{flex-direction:column;align-items:flex-start;}.summary-note{max-width:none;text-align:left;}}',
362
+ '.footer{margin:24px 4px 0;color:var(--muted);font-size:13px;font-weight:700;}',
363
+ '@media (max-width:900px){.hero{display:block;padding:24px 20px;}.hero p{margin-top:6px;}.scenario-card summary{grid-template-columns:1fr;align-items:start;}.summary-note{text-align:left;}.page{padding:14px 10px 40px;}.assertions,.evaluations{display:block;overflow-x:auto;white-space:normal;}.summary-card .value{font-size:30px;}}',
336
364
  '</style>',
337
365
  '</head>',
338
366
  '<body>',
@@ -346,6 +374,8 @@ function buildScenarioRunHtml(payload) {
346
374
  '<section class="summary-grid">',
347
375
  `<article class="summary-card"><span class="label">Scenarios</span><div class="value">${payload.summary.totalScenarios}</div><div class="sub">${payload.summary.passedScenarios} pass / ${payload.summary.failedScenarios} fail / ${payload.summary.skippedScenarios} skip</div></article>`,
348
376
  `<article class="summary-card"><span class="label">Turns</span><div class="value">${payload.summary.totalTurns}</div><div class="sub">${payload.summary.totalToolCalls} tool calls total</div></article>`,
377
+ `<article class="summary-card"><span class="label">Avg TTF Text</span><div class="value">${formatSeconds(payload.summary.averageTtfTextSeconds)}</div><div class="sub">First assistant text per turn</div></article>`,
378
+ `<article class="summary-card"><span class="label">Avg Turn Complete</span><div class="value">${formatSeconds(payload.summary.averageTurnCompleteSeconds)}</div><div class="sub">Run started to run finished</div></article>`,
349
379
  `<article class="summary-card"><span class="label">Assertions</span><div class="value">${payload.summary.totalAssertions}</div><div class="sub">${payload.summary.passedAssertions} pass / ${payload.summary.failedAssertions} fail</div></article>`,
350
380
  `<article class="summary-card"><span class="label">Evaluations</span><div class="value">${payload.summary.totalEvaluations}</div><div class="sub">LLM-scored dimensions</div></article>`,
351
381
  '</section>',
@@ -44,6 +44,7 @@ export type AiTurn<TContext extends BaseAiScenarioContext = BaseAiScenarioContex
44
44
  message: AiTurnMessage<TContext>;
45
45
  idleTimeoutMs?: number;
46
46
  processTimeoutMs?: number;
47
+ continueOnAssertionFailure?: boolean;
47
48
  assertions?: AiTurnAssertion[];
48
49
  eval?: AiTurnEvalDefinition[];
49
50
  onComplete?: (args: AiTurnCompletionArgs<TContext>) => void | Promise<void>;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@huydao/karrot",
3
- "version": "0.1.5",
3
+ "version": "0.1.7",
4
4
  "description": "Reusable AI scenario execution, assertion, evaluation, and reporting toolkit",
5
5
  "license": "ISC",
6
6
  "type": "commonjs",
@@ -126,12 +126,15 @@
126
126
  },
127
127
  "files": [
128
128
  "dist",
129
+ "site",
129
130
  "README.md",
130
131
  "GUIDE.md"
131
132
  ],
132
133
  "scripts": {
133
134
  "build": "rm -rf dist && tsc -p tsconfig.json && mkdir -p dist/prompts && cp prompts/*.md dist/prompts/",
134
- "prepack": "npm run build"
135
+ "prepack": "npm run build",
136
+ "site:serve": "node site/serve.js",
137
+ "site:check": "node site/check.js"
135
138
  },
136
139
  "dependencies": {
137
140
  "@stomp/stompjs": "^7.3.0",