npm - @huydao/karrot - Versions diffs - 0.1.1 → 0.1.5 - Mend

@huydao/karrot 0.1.1 → 0.1.5

This diff shows the differences between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions exactly as they appear in their respective public registries.

Files changed (20)

package/GUIDE.md +3 -3
package/README.md +192 -38
package/dist/assertions/assertion.d.ts +7 -2
package/dist/assertions/assertion.js +143 -1
package/dist/executors/adapters/ag-ui.d.ts +2 -2
package/dist/executors/adapters/ag-ui.js +379 -186
package/dist/executors/execute.d.ts +3 -0
package/dist/executors/execute.js +6 -4
package/dist/executors/executor.d.ts +1 -0
package/dist/executors/executor.js +58 -17
package/dist/prompts/turn-eval-system-prompt.md +31 -1
package/dist/prompts/turn-message-gen-system-prompt.md +4 -2
package/dist/reports/report.d.ts +1 -1
package/dist/reports/report.js +20 -0
package/dist/scenarios/generated-message.js +2 -0
package/dist/scenarios/scenario-loader.d.ts +1 -1
package/dist/scenarios/scenario-loader.js +41 -3
package/dist/scenarios/scenario.d.ts +12 -7
package/dist/utils/config.d.ts +3 -4
package/package.json +12 -5

package/GUIDE.md CHANGED Viewed

@@ -70,7 +70,7 @@ await execute('./karrot.config.yml', {
 A scenario module must export:
 - `scenarioSet`
-- `buildScenarioContext(projectId)`
+- `buildScenarioContext(baseContext)`
 Example:
@@ -96,9 +96,9 @@ const scenarios: AiScenario<DemoContext>[] = [
 export const scenarioSet = new AiScenarioSet(scenarios);
-export function buildScenarioContext(projectId: string): DemoContext {
+export function buildScenarioContext(baseContext: BaseAiScenarioContext): DemoContext {
   return {
-    projectId,
+    ...baseContext,
     projectLabel: 'RA Sample Project',
   };
 }

package/README.md CHANGED Viewed

@@ -38,7 +38,7 @@ import { execute } from '@huydao/karrot';
 await execute('./karrot.config.yml', {
   variables: {
-    PROJECT_ID: '3422056',
+    PROJECT_ID: process.env.PROJECT_ID,
     JWT: process.env.JWT,
     ACCOUNT_ID: process.env.ACCOUNT_ID,
     WS_URL: process.env.WS_URL,
@@ -58,11 +58,102 @@ await execute('./karrot.config.yml', {
 5. run selected scenarios
 6. write JSON and HTML reports
-## Scenario Authoring
+## Recommended Setup Flow
+The normal setup path is:
+1. create a YAML config file for the WSS transport
+2. create a scenario module that exports `scenarioSet` and `buildScenarioContext`
+3. create a small run script that calls `execute()`
+### 1. WSS config in YAML
+Use one config file to describe transport, evaluation prompt settings, artifacts, and reporting.
+```yml
+version: 1
+transport:
+  type: ag-ui-wss
+  env:
+    JWT: ${JWT}
+    ACCOUNT_ID: ${ACCOUNT_ID}
+    PROJECT_ID: ${PROJECT_ID}
+    AGENT_URL: ${AGENT_URL}
+    AGENT_ID: ${AGENT_ID}
+    WS_URL: ${WS_URL}
+    WS_TOPIC: ${WS_TOPIC}
+    WS_STOMP_HEADERS: Authorization:${JWT}
+    WS_HEADERS: Origin:${WS_ORIGIN},User-Agent:Mozilla/5.0
+  processTimeoutMs: 120000
+artifacts:
+  directory: ./artifacts
+execution:
+  stopOnFailure: false
+evaluation:
+  systemPromptPath: ./prompts/turn-eval-system-prompt.md
+  promptDirectory: ./prompts/eval
+context:
+  projectId: ${PROJECT_ID}
+report:
+  enabled: true
+  environment: prod
+  projectName: Demo Project
+  runtime:
+    agentUrl: ${AGENT_URL}
+    agentId: ${AGENT_ID}
+    wsUrl: ${WS_URL}
+    wsTopic: ${WS_TOPIC}
+    accountId: ${ACCOUNT_ID}
+    projectId: ${PROJECT_ID}
+    appBaseUrl: ${APP_BASE_URL}
+```
+What this does:
+- `transport`: tells Karrot how to talk to the assistant
+- `evaluation`: points to the turn-eval rubric and any extra project-specific dimension prompts
+- `context`: makes resolved values available to scenarios
+- `report`: controls run metadata written into reports
+### 2. Scenario module
+A scenario module defines the multi-turn tests that Karrot will run.
+### 3. Run script
+Use a small script to resolve variables and point Karrot at the scenario file.
+```ts
+import { execute } from '@huydao/karrot';
+await execute('./karrot.config.yml', {
+  variables: {
+    PROJECT_ID: process.env.PROJECT_ID,
+    JWT: process.env.JWT,
+    ACCOUNT_ID: process.env.ACCOUNT_ID,
+    AGENT_URL: process.env.AGENT_URL,
+    AGENT_ID: process.env.AGENT_ID,
+    WS_URL: process.env.WS_URL,
+    WS_TOPIC: process.env.WS_TOPIC,
+    WS_ORIGIN: process.env.WS_ORIGIN,
+    APP_BASE_URL: process.env.APP_BASE_URL,
+  },
+  scenario: {
+    file: './src/scenarios/basic-two-turn-demo.ts',
+    ids: ['BASIC-2T'],
+  },
+});
+```
+## Scenario Structure
 A scenario module exports:
 - `scenarioSet`
-- `buildScenarioContext(projectId)`
+- `buildScenarioContext(baseContext)`
 Minimal example:
@@ -88,8 +179,68 @@ const scenarios: AiScenario<BaseAiScenarioContext>[] = [
 export const scenarioSet = new AiScenarioSet(scenarios);
-export function buildScenarioContext(projectId: string): BaseAiScenarioContext {
-  return { projectId };
+export function buildScenarioContext(baseContext: BaseAiScenarioContext): BaseAiScenarioContext {
+  return { ...baseContext };
+}
+```
+### Scenario shape
+Each scenario typically contains:
+- `id`: stable scenario identifier
+- `name`: human-readable scenario name
+- `turns`: ordered list of user turns to execute
+Each turn supports:
+- `label`: display label in reports
+- `message`: the user message to send
+- `idleTimeoutMs`: optional wait limit for message inactivity
+- `processTimeoutMs`: optional hard timeout for the turn
+- `assertions`: pass/fail checks for the turn output
+- `eval`: quality scoring dimensions for the turn output
+- `onComplete`: optional callback for turn-level post-processing
+### Message options
+`message` can be:
+- a function `(context) => string`
+- `aiGen.fromPreviousContext()`
+- `aiGen.fromGuidance(guidance)`
+- `aiGen.fromContent(content)`
+This gives you a few common scenario authoring patterns:
+- fixed prompts for deterministic tests
+- context-aware prompts that use scenario data
+- generated user prompts for more adaptive multi-turn flows
+Example with assertions and eval on a turn:
+```ts
+import { AiScenarioSet, aiGen, type AiScenario, type BaseAiScenarioContext } from '@huydao/karrot';
+const scenarios: AiScenario<BaseAiScenarioContext>[] = [
+  {
+    id: 'FOLLOW-UP-1',
+    name: 'Follow-up prompt generation',
+    turns: [
+      {
+        label: 'Ask for next prompts',
+        message: aiGen.fromGuidance(
+          'Ask for 3 concise follow-up prompts the user can send next based on the previous answer.',
+        ),
+        assertions: [
+          { assert: { hasText: 'prompt' } },
+        ],
+        eval: ['correctness', 'helpfulness', 'relevance'],
+      },
+    ],
+  },
+];
+export const scenarioSet = new AiScenarioSet(scenarios);
+export function buildScenarioContext(baseContext: BaseAiScenarioContext): BaseAiScenarioContext {
+  return { ...baseContext };
 }
 ```
@@ -97,6 +248,8 @@ export function buildScenarioContext(projectId: string): BaseAiScenarioContext {
 Karrot supports two assertion styles.
+Use assertions for pass/fail requirements. If a turn must contain or avoid something specific, assertions are the right tool.
 Direct assertions:
 ```ts
@@ -115,9 +268,15 @@ assertions: [
 ]
 ```
+Assertion guidance:
+- Use direct assertions when the expected output is deterministic enough to check literally.
+- Use AI assertions when the requirement is semantic and cannot be captured safely with exact string matching.
+- Use assertions to decide whether the turn satisfied a contract, not to measure answer quality.
 ## Evaluations
 Turn evals score the assistant response for named dimensions.
+Karrot applies a CheckEval-inspired evaluation rubric: broad dimensions are decomposed into concrete checklist-style checks before assigning a final score, which improves consistency and makes explanations more traceable.
 ```ts
 eval: ['correctness', 'coverage', 'helpfulness']
@@ -135,12 +294,37 @@ eval: [
 ]
 ```
+Use eval when you want a quality score rather than a hard pass/fail rule.
+Built-in dimensions commonly used by Karrot:
+- `correctness`
+- `coverage`
+- `helpfulness`
+- `clarity`
+- `completeness`
+- `conciseness`
+- `relevance`
+- `actionability`
+- `structure`
+- `consistency`
+- `safety`
 Project-level eval prompts can be configured through:
 - `evaluation.systemPromptPath`
 - `evaluation.promptDirectory`
 That lets the project define rubric files without repeating inline guidance in every scenario.
+Use:
+- `systemPromptPath` when you want to replace the whole turn-eval rubric
+- `promptDirectory` when you want to add custom project-specific dimensions
+Eval guidance:
+- Use assertions for required behavior.
+- Use eval for quality measurement across dimensions.
+- Prefer a small number of dimensions that reflect the goal of the turn.
+- Because Karrot applies CheckEval-style scoring, dimensions like `relevance` and `consistency` are judged through concrete sub-checks instead of a vague overall impression.
 ## AI-Generated User Messages
 Karrot can generate a user turn message before sending it to the target assistant.
@@ -173,39 +357,9 @@ Karrot config currently supports:
 - `context`
 - `report`
-Example `ag-ui-wss` config:
-```yml
-version: 1
-transport:
-  type: ag-ui-wss
-  env:
-    JWT: ${JWT}
-    ACCOUNT_ID: ${ACCOUNT_ID}
-    PROJECT_ID: ${PROJECT_ID}
-    AGENT_URL: ${AGENT_URL}
-    AGENT_ID: ${AGENT_ID}
-    WS_URL: ${WS_URL}
-    WS_TOPIC: ${WS_TOPIC}
-    WS_STOMP_HEADERS: Authorization:${JWT}
-    WS_HEADERS: Origin:${WS_ORIGIN},User-Agent:Mozilla/5.0
-context:
-  projectId: ${PROJECT_ID}
-report:
-  environment: prod
-  projectName: Demo Project
-  runtime:
-    agentUrl: ${AGENT_URL}
-    agentId: ${AGENT_ID}
-    wsUrl: ${WS_URL}
-    wsTopic: ${WS_TOPIC}
-    accountId: ${ACCOUNT_ID}
-    projectId: ${PROJECT_ID}
-    appBaseUrl: ${APP_BASE_URL}
-```
+Important design choice:
+- config and scenario are separate
+- one transport config can be reused across many scenario files
 ## Reports and Artifacts

package/dist/assertions/assertion.d.ts CHANGED Viewed

@@ -1,8 +1,12 @@
 import { type AiTurnAssertion } from '../scenarios/scenario';
 export type AssertionEvaluationResult = {
     kind: 'assert' | 'aiAssert';
-    matcher: 'hasText' | 'toolcall' | 'hasContent' | 'notHasContent';
-    expected: string | string[];
+    matcher: 'hasText' | 'toolcall' | 'toolcallWithContent' | 'hasContent' | 'notHasContent';
+    expected: string | string[] | {
+        name: string;
+        hasText?: string | string[];
+        hasProperties?: Record<string, unknown>;
+    };
     description?: string;
     passed: boolean;
     reason: string;
@@ -13,6 +17,7 @@ type EvaluateTurnAssertionsOptions = {
     toolCalls?: string[];
     env?: NodeJS.ProcessEnv;
     outputDirectory?: string;
+    outputPath?: string;
 };
 export declare function evaluateTurnAssertions(options: EvaluateTurnAssertionsOptions): Promise<AssertionEvaluationResult[]>;
 export {};

package/dist/assertions/assertion.js CHANGED Viewed

@@ -12,6 +12,14 @@ function normalizeAssertion(assertion) {
                 description: assertion.description,
             };
         }
+        if ('toolcallWithContent' in assertion.assert) {
+            return {
+                kind: 'assert',
+                matcher: 'toolcallWithContent',
+                expected: assertion.assert.toolcallWithContent,
+                description: assertion.description,
+            };
+        }
         return {
             kind: 'assert',
             matcher: 'toolcall',
@@ -35,11 +43,32 @@ function normalizeAssertion(assertion) {
     };
 }
 function formatExpectedValue(expected) {
+    if (typeof expected === 'object' && expected !== null && !Array.isArray(expected)) {
+        return JSON.stringify(expected);
+    }
     if (Array.isArray(expected)) {
         return `[${expected.join(', ')}]`;
     }
     return `"${expected}"`;
 }
+function isPlainObject(value) {
+    return typeof value === 'object' && value !== null && !Array.isArray(value);
+}
+function matchesExpectedProperties(actual, expected) {
+    if (Array.isArray(expected)) {
+        if (!Array.isArray(actual) || actual.length < expected.length) {
+            return false;
+        }
+        return expected.every((expectedItem, index) => matchesExpectedProperties(actual[index], expectedItem));
+    }
+    if (isPlainObject(expected)) {
+        if (!isPlainObject(actual)) {
+            return false;
+        }
+        return Object.entries(expected).every(([key, value]) => matchesExpectedProperties(actual[key], value));
+    }
+    return Object.is(actual, expected);
+}
 function buildAssertionReason(result) {
     const prefix = result.description ? `${result.description}: ` : '';
     return `${prefix}${result.matcher}(${formatExpectedValue(result.expected)})`;
@@ -130,6 +159,18 @@ async function evaluateAiAssertion(assertion, output, env) {
         description: assertion.description,
     });
 }
+async function readOutputLog(outputPath) {
+    if (!outputPath) {
+        return '';
+    }
+    try {
+        const { readFile } = await import('node:fs/promises');
+        return await readFile(outputPath, 'utf8');
+    }
+    catch {
+        return '';
+    }
+}
 function evaluateToolCallAssertion(assertion, toolCalls) {
     const expectedToolCalls = (Array.isArray(assertion.expected) ? assertion.expected : [])
         .map((toolCall) => toolCall.trim())
@@ -166,11 +207,100 @@ function evaluateToolCallAssertion(assertion, toolCalls) {
             : `${buildNormalizedAssertionReason(normalizedToolCallAssertion)} failed. Missing: ${formatExpectedValue(missingToolCalls)}. Observed: ${formatExpectedValue(actualToolCalls)}.`,
     };
 }
+function evaluateToolCallWithContentAssertion(assertion, logContent) {
+    const expected = typeof assertion.expected === 'object' && assertion.expected !== null && !Array.isArray(assertion.expected)
+        ? assertion.expected
+        : undefined;
+    if (!expected) {
+        return {
+            kind: assertion.kind,
+            matcher: assertion.matcher,
+            expected: assertion.expected,
+            description: assertion.description,
+            passed: false,
+            reason: `${buildNormalizedAssertionReason(assertion)} failed. Invalid toolcallWithContent expectation.`,
+        };
+    }
+    const expectedTexts = (Array.isArray(expected.hasText) ? expected.hasText : [expected.hasText])
+        .filter((value) => typeof value === 'string')
+        .map((value) => value.trim())
+        .filter(Boolean);
+    const toolCallNamesById = new Map();
+    const eventPayloadsByToolCallId = new Map();
+    const parsedArgumentsByToolCallId = new Map();
+    for (const rawLine of logContent.split('\n')) {
+        const line = rawLine.trim();
+        if (!line) {
+            continue;
+        }
+        try {
+            const event = JSON.parse(line);
+            if (!event.toolCallId) {
+                continue;
+            }
+            eventPayloadsByToolCallId.set(event.toolCallId, [
+                ...(eventPayloadsByToolCallId.get(event.toolCallId) ?? []),
+                line,
+            ]);
+            if (typeof event.toolCallName === 'string' && event.toolCallName.trim()) {
+                toolCallNamesById.set(event.toolCallId, event.toolCallName.trim());
+            }
+            if (typeof event.arguments === 'string' && event.arguments.trim()) {
+                try {
+                    const parsedArguments = JSON.parse(event.arguments);
+                    parsedArgumentsByToolCallId.set(event.toolCallId, [
+                        ...(parsedArgumentsByToolCallId.get(event.toolCallId) ?? []),
+                        parsedArguments,
+                    ]);
+                }
+                catch {
+                    // Ignore unparsable arguments and fall back to raw text matching.
+                }
+            }
+        }
+        catch {
+            continue;
+        }
+    }
+    const matchingToolCallIds = [...toolCallNamesById.entries()]
+        .filter(([, toolCallName]) => toolCallName === expected.name)
+        .map(([toolCallId]) => toolCallId);
+    if (matchingToolCallIds.length === 0) {
+        return {
+            kind: assertion.kind,
+            matcher: assertion.matcher,
+            expected,
+            description: assertion.description,
+            passed: false,
+            reason: `${buildNormalizedAssertionReason(assertion)} failed. Tool call "${expected.name}" was not found in the run log.`,
+        };
+    }
+    const combinedPayload = matchingToolCallIds
+        .flatMap((toolCallId) => eventPayloadsByToolCallId.get(toolCallId) ?? [])
+        .join('\n');
+    const missingTexts = expectedTexts.filter((text) => !combinedPayload.includes(text));
+    const hasPropertiesMatch = expected.hasProperties === undefined ||
+        matchingToolCallIds.some((toolCallId) => (parsedArgumentsByToolCallId.get(toolCallId) ?? []).some((parsedArguments) => matchesExpectedProperties(parsedArguments, expected.hasProperties)));
+    const passed = missingTexts.length === 0 && hasPropertiesMatch;
+    return {
+        kind: assertion.kind,
+        matcher: assertion.matcher,
+        expected,
+        description: assertion.description,
+        passed,
+        reason: passed
+            ? `${buildNormalizedAssertionReason(assertion)} passed.`
+            : !hasPropertiesMatch
+                ? `${buildNormalizedAssertionReason(assertion)} failed. Missing properties: ${JSON.stringify(expected.hasProperties ?? {})}.`
+                : `${buildNormalizedAssertionReason(assertion)} failed. Missing text: ${formatExpectedValue(missingTexts)}.`,
+    };
+}
 async function evaluateTurnAssertions(options) {
     if (!options.assertions?.length) {
         return [];
     }
     const results = [];
+    let outputLogContent;
     for (const rawAssertion of options.assertions) {
         const assertion = normalizeAssertion(rawAssertion);
         if (assertion.kind === 'assert') {
@@ -178,7 +308,19 @@ async function evaluateTurnAssertions(options) {
                 results.push(evaluateToolCallAssertion(assertion, options.toolCalls));
                 continue;
             }
-            const expected = typeof assertion.expected === 'string' ? assertion.expected : assertion.expected.join(', ');
+            if (assertion.matcher === 'toolcallWithContent') {
+                outputLogContent ??= await readOutputLog(options.outputPath);
+                results.push(evaluateToolCallWithContentAssertion(assertion, outputLogContent));
+                continue;
+            }
+            if (assertion.matcher !== 'hasText') {
+                continue;
+            }
+            const expected = typeof assertion.expected === 'string'
+                ? assertion.expected
+                : Array.isArray(assertion.expected)
+                    ? assertion.expected.join(', ')
+                    : JSON.stringify(assertion.expected);
             const passed = options.output.includes(expected);
             results.push({
                 kind: assertion.kind,

package/dist/executors/adapters/ag-ui.d.ts CHANGED Viewed

@@ -8,8 +8,8 @@ type RunAgUiMessageOptions = {
     allowIdleTimeoutWithAssistantText?: boolean;
     processTimeoutMs?: number;
 };
-export declare function parseExecutionTestResultId(output: string): string | undefined;
+declare function parseExecutionTestResultId(output: string): string | undefined;
+export { parseExecutionTestResultId };
 export declare function extractToolCallNames(logContent: string): string[];
 export declare function extractAppendedLog(previousLogContent: string, latestLogContent: string): string;
 export declare function runAgUiMessage(options: RunAgUiMessageOptions): Promise<MessageRunResult>;
-export {};