@huydao/karrot 0.1.4 → 0.1.5

This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -225,7 +225,7 @@ function evaluateToolCallWithContentAssertion(assertion, logContent) {
225
225
  .filter((value) => typeof value === 'string')
226
226
  .map((value) => value.trim())
227
227
  .filter(Boolean);
228
- const matchingToolCallIds = new Set();
228
+ const toolCallNamesById = new Map();
229
229
  const eventPayloadsByToolCallId = new Map();
230
230
  const parsedArgumentsByToolCallId = new Map();
231
231
  for (const rawLine of logContent.split('\n')) {
@@ -238,25 +238,23 @@ function evaluateToolCallWithContentAssertion(assertion, logContent) {
238
238
  if (!event.toolCallId) {
239
239
  continue;
240
240
  }
241
- if (event.toolCallName === expected.name) {
242
- matchingToolCallIds.add(event.toolCallId);
241
+ eventPayloadsByToolCallId.set(event.toolCallId, [
242
+ ...(eventPayloadsByToolCallId.get(event.toolCallId) ?? []),
243
+ line,
244
+ ]);
245
+ if (typeof event.toolCallName === 'string' && event.toolCallName.trim()) {
246
+ toolCallNamesById.set(event.toolCallId, event.toolCallName.trim());
243
247
  }
244
- if (matchingToolCallIds.has(event.toolCallId)) {
245
- eventPayloadsByToolCallId.set(event.toolCallId, [
246
- ...(eventPayloadsByToolCallId.get(event.toolCallId) ?? []),
247
- line,
248
- ]);
249
- if (typeof event.arguments === 'string' && event.arguments.trim()) {
250
- try {
251
- const parsedArguments = JSON.parse(event.arguments);
252
- parsedArgumentsByToolCallId.set(event.toolCallId, [
253
- ...(parsedArgumentsByToolCallId.get(event.toolCallId) ?? []),
254
- parsedArguments,
255
- ]);
256
- }
257
- catch {
258
- // Ignore unparsable arguments and fall back to raw text matching.
259
- }
248
+ if (typeof event.arguments === 'string' && event.arguments.trim()) {
249
+ try {
250
+ const parsedArguments = JSON.parse(event.arguments);
251
+ parsedArgumentsByToolCallId.set(event.toolCallId, [
252
+ ...(parsedArgumentsByToolCallId.get(event.toolCallId) ?? []),
253
+ parsedArguments,
254
+ ]);
255
+ }
256
+ catch {
257
+ // Ignore unparsable arguments and fall back to raw text matching.
260
258
  }
261
259
  }
262
260
  }
@@ -264,7 +262,10 @@ function evaluateToolCallWithContentAssertion(assertion, logContent) {
264
262
  continue;
265
263
  }
266
264
  }
267
- if (matchingToolCallIds.size === 0) {
265
+ const matchingToolCallIds = [...toolCallNamesById.entries()]
266
+ .filter(([, toolCallName]) => toolCallName === expected.name)
267
+ .map(([toolCallId]) => toolCallId);
268
+ if (matchingToolCallIds.length === 0) {
268
269
  return {
269
270
  kind: assertion.kind,
270
271
  matcher: assertion.matcher,
@@ -274,12 +275,12 @@ function evaluateToolCallWithContentAssertion(assertion, logContent) {
274
275
  reason: `${buildNormalizedAssertionReason(assertion)} failed. Tool call "${expected.name}" was not found in the run log.`,
275
276
  };
276
277
  }
277
- const combinedPayload = [...matchingToolCallIds]
278
+ const combinedPayload = matchingToolCallIds
278
279
  .flatMap((toolCallId) => eventPayloadsByToolCallId.get(toolCallId) ?? [])
279
280
  .join('\n');
280
281
  const missingTexts = expectedTexts.filter((text) => !combinedPayload.includes(text));
281
282
  const hasPropertiesMatch = expected.hasProperties === undefined ||
282
- [...matchingToolCallIds].some((toolCallId) => (parsedArgumentsByToolCallId.get(toolCallId) ?? []).some((parsedArguments) => matchesExpectedProperties(parsedArguments, expected.hasProperties)));
283
+ matchingToolCallIds.some((toolCallId) => (parsedArgumentsByToolCallId.get(toolCallId) ?? []).some((parsedArguments) => matchesExpectedProperties(parsedArguments, expected.hasProperties)));
283
284
  const passed = missingTexts.length === 0 && hasPropertiesMatch;
284
285
  return {
285
286
  kind: assertion.kind,
@@ -3,6 +3,9 @@ import { type ScenarioRunResult } from '../reports/report';
3
3
  type ExecuteOptions = {
4
4
  variables?: Record<string, unknown>;
5
5
  scenario: KarrotScenarioSelection;
6
+ execution?: {
7
+ concurrency?: number;
8
+ };
6
9
  };
7
10
  type ExecuteResult = {
8
11
  outputDirectory: string;
@@ -122,6 +122,7 @@ async function execute(configOrPath, options) {
122
122
  ? createAgUiRunner(resolvedConfig)
123
123
  : createAgUiPostRunner(resolvedConfig),
124
124
  stopOnFailure: resolvedConfig.execution?.stopOnFailure ?? false,
125
+ concurrency: options.execution?.concurrency ?? resolvedConfig.execution?.concurrency,
125
126
  });
126
127
  let reportPaths;
127
128
  if (resolvedConfig.report?.enabled !== false && resolvedConfig.report) {
@@ -5,6 +5,7 @@ type BaseScenarioExecutionOptions = {
5
5
  env: NodeJS.ProcessEnv;
6
6
  outputDirectory: string;
7
7
  stopOnFailure?: boolean;
8
+ concurrency?: number;
8
9
  maxDurationMs?: number;
9
10
  initialThreadId?: string;
10
11
  };
@@ -10,6 +10,12 @@ function readPositiveTimeoutMs(value) {
10
10
  const parsed = Number(value);
11
11
  return Number.isFinite(parsed) && parsed > 0 ? parsed : undefined;
12
12
  }
13
+ function normalizeConcurrency(value, totalScenarios) {
14
+ if (!Number.isFinite(value) || (value ?? 0) <= 1) {
15
+ return 1;
16
+ }
17
+ return Math.min(Math.floor(value), Math.max(1, totalScenarios));
18
+ }
13
19
  function resolveTurnProcessTimeoutMs(options) {
14
20
  const envOverrideMs = readPositiveTimeoutMs(options.env.AI_TURN_TIMEOUT_MS);
15
21
  const requestedMs = typeof envOverrideMs === 'number'
@@ -95,6 +101,7 @@ async function runSingleScenario(scenario, context, env, outputDirectory, deadli
95
101
  toolCalls: run.toolCalls,
96
102
  env: turnEnv,
97
103
  outputDirectory,
104
+ outputPath: run.outputPath,
98
105
  });
99
106
  const failedAssertions = assertionResults.filter((assertion) => !assertion.passed);
100
107
  const assertionFailureNote = failedAssertions.length > 0
@@ -126,7 +133,11 @@ async function runSingleScenario(scenario, context, env, outputDirectory, deadli
126
133
  result.turns.push(turnResult);
127
134
  turnRecorded = true;
128
135
  if (assertionFailureNote) {
129
- throw new Error(assertionFailureNote);
136
+ result.status = 'FAIL';
137
+ result.note = [result.note, assertionFailureNote].filter(Boolean).join(' ') || undefined;
138
+ if (!scenario.continueOnAssertionFailure) {
139
+ throw new Error(assertionFailureNote);
140
+ }
130
141
  }
131
142
  }
132
143
  catch (error) {
@@ -173,31 +184,61 @@ async function runSingleScenario(scenario, context, env, outputDirectory, deadli
173
184
  }
174
185
  async function runScenario(scenario, options) {
175
186
  const scenarios = Array.isArray(scenario) ? scenario : [scenario];
176
- const results = [];
177
187
  const shouldStopOnFailure = options.stopOnFailure ?? true;
178
188
  const deadlineAt = typeof options.maxDurationMs === 'number' ? Date.now() + options.maxDurationMs : undefined;
179
- for (const currentScenario of scenarios) {
189
+ const concurrency = normalizeConcurrency(options.concurrency, scenarios.length);
190
+ const runScenarioAtIndex = async (currentScenario) => {
180
191
  try {
181
- results.push(await runSingleScenario(currentScenario, options.context, options.env, options.outputDirectory, deadlineAt, options.messageRunner, options.initialThreadId));
192
+ return await runSingleScenario(currentScenario, options.context, options.env, options.outputDirectory, deadlineAt, options.messageRunner, concurrency === 1 ? options.initialThreadId : undefined);
182
193
  }
183
194
  catch (error) {
184
195
  if (error instanceof report_1.ScenarioExecutionError) {
185
- results.push(error.result);
186
- }
187
- else {
188
- results.push({
189
- id: currentScenario.id,
190
- name: currentScenario.name,
191
- status: 'FAIL',
192
- note: error instanceof Error ? error.message : String(error),
193
- turns: [],
194
- metrics: {},
195
- });
196
+ return error.result;
196
197
  }
197
- if (shouldStopOnFailure) {
198
+ return {
199
+ id: currentScenario.id,
200
+ name: currentScenario.name,
201
+ status: 'FAIL',
202
+ note: error instanceof Error ? error.message : String(error),
203
+ turns: [],
204
+ metrics: {},
205
+ };
206
+ }
207
+ };
208
+ if (concurrency === 1) {
209
+ const results = [];
210
+ for (const currentScenario of scenarios) {
211
+ const result = await runScenarioAtIndex(currentScenario);
212
+ results.push(result);
213
+ if (shouldStopOnFailure && result.status === 'FAIL') {
198
214
  break;
199
215
  }
200
216
  }
217
+ return Array.isArray(scenario) ? results : results[0];
201
218
  }
202
- return Array.isArray(scenario) ? results : results[0];
219
+ const results = new Array(scenarios.length);
220
+ let nextIndex = 0;
221
+ let stopScheduling = false;
222
+ const worker = async () => {
223
+ while (true) {
224
+ if (shouldStopOnFailure && stopScheduling) {
225
+ return;
226
+ }
227
+ const currentIndex = nextIndex;
228
+ nextIndex += 1;
229
+ if (currentIndex >= scenarios.length) {
230
+ return;
231
+ }
232
+ const result = await runScenarioAtIndex(scenarios[currentIndex]);
233
+ results[currentIndex] = result;
234
+ if (shouldStopOnFailure && result.status === 'FAIL') {
235
+ stopScheduling = true;
236
+ }
237
+ }
238
+ };
239
+ await Promise.all(Array.from({ length: concurrency }, async () => {
240
+ await worker();
241
+ }));
242
+ const completedResults = results.filter((result) => result != null);
243
+ return Array.isArray(scenario) ? completedResults : completedResults[0];
203
244
  }
@@ -144,7 +144,7 @@ function renderAssertionExpected(expected) {
144
144
  return expected.length > 0 ? escapeHtml(expected.join(', ')) : '<span class="muted">[]</span>';
145
145
  }
146
146
  if (typeof expected === 'object' && expected !== null) {
147
- return `<pre>${escapeHtml(stringifyAssertionExpected(expected))}</pre>`;
147
+ return `<pre class="assertion-expected-object">${escapeHtml(stringifyAssertionExpected(expected))}</pre>`;
148
148
  }
149
149
  return escapeHtml(expected);
150
150
  }
@@ -312,11 +312,13 @@ function buildScenarioRunHtml(payload) {
312
312
  '.assertions{table-layout:fixed;}',
313
313
  '.assertions th,.assertions td,.evaluations th,.evaluations td{padding:10px 12px;border-bottom:1px solid #e5edf7;vertical-align:top;text-align:left;}',
314
314
  '.assertions th,.evaluations th{font-size:12px;text-transform:uppercase;letter-spacing:.08em;color:#61728d;}',
315
+ '.assertions td{word-break:break-word;overflow-wrap:anywhere;}',
315
316
  '.assertions th:nth-child(1),.assertions td:nth-child(1){width:8%;}',
316
317
  '.assertions th:nth-child(2),.assertions td:nth-child(2){width:12%;}',
317
318
  '.assertions th:nth-child(3),.assertions td:nth-child(3){width:34%;}',
318
319
  '.assertions th:nth-child(4),.assertions td:nth-child(4){width:16%;}',
319
320
  '.assertions th:nth-child(5),.assertions td:nth-child(5){width:30%;}',
321
+ '.assertions .assertion-expected-object{margin:0;white-space:pre-wrap;word-break:break-word;overflow-wrap:anywhere;background:#f8fbff;border-radius:12px;padding:12px;font-size:12px;line-height:1.45;max-height:none;overflow:visible;}',
320
322
  '.badge{display:inline-flex;align-items:center;justify-content:center;border-radius:999px;padding:5px 10px;font-size:12px;font-weight:700;min-width:56px;}',
321
323
  '.badge.pass{background:#dcfce7;color:#166534;}',
322
324
  '.badge.fail{background:#fee2e2;color:#991b1b;}',
@@ -51,6 +51,7 @@ export type AiTurn<TContext extends BaseAiScenarioContext = BaseAiScenarioContex
51
51
  export type AiScenario<TContext extends BaseAiScenarioContext = BaseAiScenarioContext> = {
52
52
  id: string;
53
53
  name: string;
54
+ continueOnAssertionFailure?: boolean;
54
55
  turns: AiTurn<TContext>[];
55
56
  };
56
57
  export declare class AiScenarioSet<TContext extends BaseAiScenarioContext = BaseAiScenarioContext> {
@@ -79,6 +79,7 @@ export type KarrotConfig = {
79
79
  };
80
80
  execution?: {
81
81
  stopOnFailure?: boolean;
82
+ concurrency?: number;
82
83
  };
83
84
  evaluation?: {
84
85
  systemPromptPath?: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@huydao/karrot",
3
- "version": "0.1.4",
3
+ "version": "0.1.5",
4
4
  "description": "Reusable AI scenario execution, assertion, evaluation, and reporting toolkit",
5
5
  "license": "ISC",
6
6
  "type": "commonjs",