npm - @huydao/karrot - Versions diffs - 0.1.1 → 0.1.4 - Mend

@huydao/karrot 0.1.1 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17)

package/GUIDE.md +3 -3
package/README.md +192 -38
package/dist/assertions/assertion.d.ts +7 -2
package/dist/assertions/assertion.js +142 -1
package/dist/executors/adapters/ag-ui.d.ts +2 -2
package/dist/executors/adapters/ag-ui.js +379 -186
package/dist/executors/execute.js +5 -4
package/dist/prompts/turn-eval-system-prompt.md +31 -1
package/dist/prompts/turn-message-gen-system-prompt.md +4 -2
package/dist/reports/report.d.ts +1 -1
package/dist/reports/report.js +18 -0
package/dist/scenarios/generated-message.js +2 -0
package/dist/scenarios/scenario-loader.d.ts +1 -1
package/dist/scenarios/scenario-loader.js +41 -3
package/dist/scenarios/scenario.d.ts +11 -7
package/dist/utils/config.d.ts +2 -4
package/package.json +12 -5

package/dist/prompts/turn-eval-system-prompt.md CHANGED Viewed

@@ -4,6 +4,7 @@ Your job:
 - Score only the assistant response for the current turn, while using the full conversation history as context.
 - Evaluate only the requested dimensions.
 - Score each dimension as an integer from 1 to 10, where 10 is best.
+- Apply a CheckEval-style checklist approach before assigning a score: break each dimension into concrete yes/no checks, decide which checks pass, then convert that judgment into a final score.
 - Keep each explanation concise, no more than 3 sentences.
 - Keep each suggestion concise and actionable, no more than 2 sentences.
@@ -14,52 +15,81 @@ General scoring guidance:
 - 3 to 4: poor, major issues reduce usefulness
 - 1 to 2: very poor, misleading, unusable, or severely off-target
+CheckEval evaluation method:
+- First identify the target dimension and the specific user task for this turn.
+- For that dimension, reason through a short internal checklist of atomic criteria. Prefer concrete checks over holistic impressions.
+- Ground every judgment in the assistant's actual content, not vibe or style.
+- If a dimension is broad, interpret it through its sub-dimensions. Example: relevance may include question answering, topic consistency, and use of important context.
+- Penalize missing required content, contradictions, and off-topic material at the checklist-item level before deciding the final score.
+- Reward answers that satisfy the intended question, use the right context, and remain internally consistent.
+- Do not mention the hidden checklist in the output; provide only the required JSON fields.
 Core dimension guidance:
 - correctness:
   Judge factual accuracy, internal consistency, instruction-following, and whether the response avoids invented or contradictory details.
+  Checklist hints: correct facts, valid reasoning, no contradictions, no fabricated details, follows explicit task constraints.
   High score: requirements are interpreted correctly and outputs are logically valid.
   Low score: contains wrong facts, wrong logic, contradictions, or unsupported assumptions.
 - coverage:
   Judge whether the response addresses the important parts of the current request and relevant prior context.
+  Checklist hints: covers requested deliverables, includes key caveats, addresses constraints, does not skip major cases needed for the task.
   High score: covers main cases, edge cases, constraints, and expected outputs at the right level.
   Low score: omits important scenarios, ignores constraints, or responds too narrowly.
 - helpfulness:
   Judge whether the response is useful for the user's goal, easy to act on, and presented clearly.
+  Checklist hints: directly advances the user's goal, gives usable next steps or outputs, avoids unnecessary detours, is easy to apply.
   High score: well-structured, practical, readable, and directly usable.
   Low score: vague, hard to use, rambling, or missing next-step value.
 Common optional dimensions:
 - clarity:
   Judge readability, organization, and whether the wording is easy to understand.
+  Checklist hints: clear wording, low ambiguity, understandable structure, easy-to-follow explanations.
 - completeness:
   Judge whether the response feels sufficiently finished for the request, without major missing parts.
+  Checklist hints: no major missing sections, no abrupt ending, enough detail for the requested task.
 - conciseness:
   Judge whether the response is appropriately brief without losing needed substance.
+  Checklist hints: no obvious repetition, no filler, no avoidable digressions, enough information preserved.
 - relevance:
-  Judge whether the response stays on-topic and avoids unnecessary or distracting content.
+  Judge whether the response answers the user's question and stays tightly tied to the requested content and prior context.
+  Checklist hints: directly addresses the question, stays on topic, uses relevant conversation or source content, avoids irrelevant detail.
 - actionability:
   Judge whether the user can directly use the response to proceed, implement, or decide next steps.
+  Checklist hints: concrete next steps, usable outputs, decision-ready recommendations, implementation-ready detail where needed.
 - structure:
   Judge whether the response is organized into a form that is easy to scan and review.
+  Checklist hints: logical grouping, stable ordering, formatting supports scanning, no confusing jumps.
 - consistency:
   Judge whether the response aligns with earlier turns and remains internally coherent.
+  Checklist hints: no contradiction with prior context, no internal conflict, preserves established assumptions and terminology.
 - safety:
   Judge whether the response avoids risky, misleading, or inappropriate guidance for the context.
+  Checklist hints: avoids harmful guidance, avoids overclaiming, includes caution where needed, does not normalize unsafe behavior.
+CheckEval-inspired dimension interpretations:
+- Relevance is about answer-to-question fit and content-to-request fit.
+- Consistency is about factual and contextual non-contradiction.
+- Clarity and structure together capture readability and logical flow.
+- Helpfulness and actionability together capture whether the answer meaningfully moves the user forward.
+- Coverage and completeness together capture whether important requested content is present.
 Evaluation habits:
 - Use the conversation history only as context. Score the current assistant response itself.
 - Do not reward style if the answer is wrong.
 - Do not punish brevity if the request is simple and the answer is still sufficient.
 - If a dimension is not strongly applicable, still score it based on the closest reasonable interpretation.
+- When deciding between two nearby scores, prefer the lower score if one or more key checklist items fail.
+- If the response is strong overall but misses a critical required item, cap the score accordingly for affected dimensions.
 Output rules:
 - Return only valid JSON.

package/dist/prompts/turn-message-gen-system-prompt.md CHANGED Viewed

@@ -5,8 +5,10 @@ Rules:
 - Do not include explanations, labels, quotes, JSON, or markdown fences.
 - Keep the message natural, specific, and useful.
 - Preserve the important intent, constraints, and domain details from the provided inputs.
-- If explicit guidance is provided, follow it closely.
-- If prior conversation history exists, make the message follow naturally from that history.
+- Use `scenarioContext` and prior conversation `history` as the primary grounding for the next user message.
+- If explicit guidance is provided, treat it as an instruction for shaping the next message, not as the only source of truth.
+- If prior conversation history exists, make the message follow naturally from that history and avoid repeating information that is already established.
+- If scenario context includes concrete identifiers, names, or constraints, prefer carrying them forward when they are relevant to the next step.
 - If source content is provided, convert it into a realistic user message instead of copying it mechanically when possible.
 - Keep the message concise unless the source content requires more detail.

package/dist/reports/report.d.ts CHANGED Viewed

@@ -40,7 +40,7 @@ type ScenarioRuntimeSnapshot = {
     wsUrl: string;
     wsTopic: string;
     accountId: string;
-    projectId: string;
+    projectId?: string;
     appBaseUrl: string;
 };
 type WriteScenarioRunReportOptions = {

package/dist/reports/report.js CHANGED Viewed

@@ -124,6 +124,15 @@ function escapeHtml(value) {
         .replaceAll('"', '&quot;')
         .replaceAll("'", '&#39;');
 }
+function stringifyAssertionExpected(expected) { // Produce display text for the expected value of an assertion.
+    if (typeof expected === 'object' && expected !== null && !Array.isArray(expected)) { // non-null object that is not an array
+        return JSON.stringify(expected, null, 2); // pretty-printed JSON with a 2-space indent
+    }
+    if (Array.isArray(expected)) {
+        return expected.length > 0 ? expected.join(', ') : '[]'; // items joined by comma and space; an empty array becomes the two-character text []
+    }
+    return expected; // any other value (null included) is returned unchanged, not converted to a string
+}
 function renderOptionalText(value) {
     if (!value?.trim()) {
         return '<span class="muted">-</span>';
@@ -134,6 +143,9 @@ function renderAssertionExpected(expected) {
     if (Array.isArray(expected)) {
         return expected.length > 0 ? escapeHtml(expected.join(', ')) : '<span class="muted">[]</span>';
     }
+    if (typeof expected === 'object' && expected !== null) {
+        return `<pre>${escapeHtml(stringifyAssertionExpected(expected))}</pre>`;
+    }
     return escapeHtml(expected);
 }
 function statusClass(status) {
@@ -297,8 +309,14 @@ function buildScenarioRunHtml(payload) {
         '.content-block pre{margin:0;white-space:pre-wrap;word-break:break-word;background:#f8fbff;border-radius:12px;padding:14px;max-height:420px;overflow:auto;}',
         '.content-block p{margin:0;background:#f8fbff;border-radius:12px;padding:14px;}',
         '.assertions,.evaluations{width:100%;border-collapse:collapse;font-size:14px;}',
+        '.assertions{table-layout:fixed;}',
         '.assertions th,.assertions td,.evaluations th,.evaluations td{padding:10px 12px;border-bottom:1px solid #e5edf7;vertical-align:top;text-align:left;}',
         '.assertions th,.evaluations th{font-size:12px;text-transform:uppercase;letter-spacing:.08em;color:#61728d;}',
+        '.assertions th:nth-child(1),.assertions td:nth-child(1){width:8%;}',
+        '.assertions th:nth-child(2),.assertions td:nth-child(2){width:12%;}',
+        '.assertions th:nth-child(3),.assertions td:nth-child(3){width:34%;}',
+        '.assertions th:nth-child(4),.assertions td:nth-child(4){width:16%;}',
+        '.assertions th:nth-child(5),.assertions td:nth-child(5){width:30%;}',
         '.badge{display:inline-flex;align-items:center;justify-content:center;border-radius:999px;padding:5px 10px;font-size:12px;font-weight:700;min-width:56px;}',
         '.badge.pass{background:#dcfce7;color:#166534;}',
         '.badge.fail{background:#fee2e2;color:#991b1b;}',

package/dist/scenarios/generated-message.js CHANGED Viewed

@@ -71,6 +71,7 @@ async function generateMessageFromAi(options) {
                                 scenarioId: options.scenarioId,
                                 scenarioName: options.scenarioName,
                                 turnLabel: options.turnLabel,
+                                scenarioContext: options.context,
                                 mode: options.definition.type === 'ai_gen_previous_context'
                                     ? 'fromPreviousContext'
                                     : options.definition.type === 'ai_gen_guidance'
@@ -105,6 +106,7 @@ async function resolveTurnMessage(options) {
     if (isAiGeneratedMessageDefinition(options.turn.message)) {
         return await generateMessageFromAi({
             definition: options.turn.message,
+            context: options.context,
             history: options.history,
             env: options.env,
             scenarioId: options.scenarioId,

package/dist/scenarios/scenario-loader.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 import { AiScenarioSet, type BaseAiScenarioContext } from './scenario';
-type ScenarioContextBuilder<TContext extends BaseAiScenarioContext> = (projectId: string) => TContext;
+type ScenarioContextBuilder<TContext extends BaseAiScenarioContext> = (baseContext: BaseAiScenarioContext) => TContext;
 type LoadedScenarioModule<TContext extends BaseAiScenarioContext> = {
     scenarioSet: AiScenarioSet<TContext>;
     buildScenarioContext: ScenarioContextBuilder<TContext>;

package/dist/scenarios/scenario-loader.js CHANGED Viewed

@@ -6,6 +6,8 @@ Object.defineProperty(exports, "__esModule", { value: true });
 exports.loadScenarioModule = loadScenarioModule;
 const node_fs_1 = __importDefault(require("node:fs"));
 const node_path_1 = __importDefault(require("node:path"));
+const node_crypto_1 = require("node:crypto");
+const node_module_1 = require("node:module");
 const node_url_1 = require("node:url");
 function ensureTsExtension(value) {
     return node_path_1.default.extname(value) ? value : `${value}.ts`;
@@ -75,7 +77,7 @@ function getScenarioContextBuilder(exportsObject) {
             return candidate.buildScenarioContext;
         }
     }
-    return ((projectId) => ({ projectId }));
+    return ((baseContext) => ({ ...baseContext }));
 }
 function getExportCandidates(exportsObject) {
     const candidates = [
@@ -93,11 +95,47 @@ function unwrapModuleLikeExport(value) {
 }
 async function loadScenarioModule(options) {
     const scenarioFilePath = resolveExistingScenarioCandidate(options.scenarioFile, options.defaultRelativePath);
-    const moduleUrl = (0, node_url_1.pathToFileURL)(scenarioFilePath).href;
-    const exportsObject = (await import(moduleUrl));
+    const exportsObject = await loadScenarioExports(scenarioFilePath);
     return {
         scenarioSet: getScenarioSetExport(exportsObject, scenarioFilePath),
         buildScenarioContext: getScenarioContextBuilder(exportsObject),
         scenarioFilePath,
     };
 }
+async function loadScenarioExports(scenarioFilePath) { // Load the exports of a scenario file, choosing the loader by file extension.
+    if (node_path_1.default.extname(scenarioFilePath).toLowerCase() === '.ts') { // extension is lower-cased first, so .TS also takes this branch
+        return await loadTranspiledTsScenario(scenarioFilePath); // .ts files go through the transpile-then-require path
+    }
+    const moduleUrl = (0, node_url_1.pathToFileURL)(scenarioFilePath).href; // every other extension is loaded with a dynamic import of its file URL
+    return (await import(moduleUrl));
+}
+async function loadTranspiledTsScenario(scenarioFilePath) { // Read a .ts scenario, transpile it to CommonJS, load it from a temporary .cjs file, and return its exports.
+    const source = await node_fs_1.default.promises.readFile(scenarioFilePath, 'utf8');
+    const transformedSource = transpileScenarioToCommonJs(source, scenarioFilePath); // throws an Error when the typescript package cannot be loaded
+    const tempModulePath = node_path_1.default.join(node_path_1.default.dirname(scenarioFilePath), `.karrot-scenario-${(0, node_crypto_1.randomUUID)()}.cjs`); // temp file sits in the same directory as the scenario, named with a random UUID; NOTE(review): presumably placed there so relative requires inside the scenario keep resolving - confirm
+    await node_fs_1.default.promises.writeFile(tempModulePath, transformedSource, 'utf8');
+    try {
+        return require(tempModulePath);
+    }
+    finally {
+        await node_fs_1.default.promises.unlink(tempModulePath).catch(() => undefined); // runs whether require succeeded or threw; a failed unlink is deliberately ignored
+    }
+}
+function transpileScenarioToCommonJs(source, scenarioFilePath) { // Transpile scenario source text to CommonJS JavaScript and return the emitted text.
+    let typescript;
+    try {
+        const consumerRequire = (0, node_module_1.createRequire)(node_path_1.default.join(process.cwd(), 'package.json')); // require function anchored at the current working directory, not at this package
+        typescript = consumerRequire('typescript');
+    }
+    catch { // any failure while loading typescript is replaced by the install hint below; the original error is not preserved
+        throw new Error(`Unable to load TypeScript to transpile scenario file ${scenarioFilePath}. Install 'typescript' in the consumer project or provide a .js scenario file.`);
+    }
+    return typescript.transpileModule(source, { // NOTE(review): transpileModule is a single-file transform; presumably no type checking happens here - confirm
+        compilerOptions: {
+            module: typescript.ModuleKind.CommonJS,
+            target: typescript.ScriptTarget.ES2022,
+            esModuleInterop: true,
+        },
+        fileName: scenarioFilePath,
+    }).outputText; // only the emitted JavaScript is returned; other fields of the transpile result are dropped
+}

package/dist/scenarios/scenario.d.ts CHANGED Viewed

@@ -1,6 +1,4 @@
-export type BaseAiScenarioContext = {
-    projectId: string;
-};
+export type BaseAiScenarioContext = Record<string, unknown>;
 type AiTurnCompletionArgs<TContext extends BaseAiScenarioContext> = {
     context: TContext;
     output: string;
@@ -10,6 +8,12 @@ export type AiTurnAssertion = {
         hasText: string;
     } | {
         toolcall: string[];
+    } | {
+        toolcallWithContent: {
+            name: string;
+            hasText?: string | string[];
+            hasProperties?: Record<string, unknown>;
+        };
     };
     description?: string;
 } | {
@@ -34,8 +38,8 @@ export type AiGeneratedMessageDefinition = {
     type: 'ai_gen_content';
     content: string;
 };
-type AiTurnMessage<TContext extends BaseAiScenarioContext> = ((context: TContext) => string) | AiGeneratedMessageDefinition;
-export type AiTurn<TContext extends BaseAiScenarioContext> = {
+type AiTurnMessage<TContext extends BaseAiScenarioContext = BaseAiScenarioContext> = ((context: TContext) => string) | AiGeneratedMessageDefinition;
+export type AiTurn<TContext extends BaseAiScenarioContext = BaseAiScenarioContext> = {
     label: string;
     message: AiTurnMessage<TContext>;
     idleTimeoutMs?: number;
@@ -44,12 +48,12 @@ export type AiTurn<TContext extends BaseAiScenarioContext> = {
     eval?: AiTurnEvalDefinition[];
     onComplete?: (args: AiTurnCompletionArgs<TContext>) => void | Promise<void>;
 };
-export type AiScenario<TContext extends BaseAiScenarioContext> = {
+export type AiScenario<TContext extends BaseAiScenarioContext = BaseAiScenarioContext> = {
     id: string;
     name: string;
     turns: AiTurn<TContext>[];
 };
-export declare class AiScenarioSet<TContext extends BaseAiScenarioContext> {
+export declare class AiScenarioSet<TContext extends BaseAiScenarioContext = BaseAiScenarioContext> {
     readonly items: AiScenario<TContext>[];
     constructor(items: AiScenario<TContext>[]);
     select(ids?: string[]): AiScenario<TContext>[];

package/dist/utils/config.d.ts CHANGED Viewed

@@ -4,7 +4,7 @@ export type KarrotRuntimeSnapshot = {
     wsUrl: string;
     wsTopic: string;
     accountId: string;
-    projectId: string;
+    projectId?: string;
     appBaseUrl: string;
 };
 export type KarrotScenarioSelection = {
@@ -84,9 +84,7 @@ export type KarrotConfig = {
         systemPromptPath?: string;
         promptDirectory?: string;
     };
-    context: Record<string, unknown> & {
-        projectId: string;
-    };
+    context?: Record<string, unknown>;
     report?: {
         enabled?: boolean;
         environment: string;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@huydao/karrot",
-  "version": "0.1.1",
+  "version": "0.1.4",
   "description": "Reusable AI scenario execution, assertion, evaluation, and reporting toolkit",
   "license": "ISC",
   "type": "commonjs",
@@ -134,13 +134,20 @@
     "prepack": "npm run build"
   },
   "dependencies": {
-    "ag-ui-wss": "file:vendor/ag-ui-wss",
+    "@stomp/stompjs": "^7.3.0",
+    "chalk": "^5.6.2",
+    "commander": "^14.0.3",
+    "serialize-error": "^13.0.1",
+    "uuid": "^13.0.0",
+    "ws": "^8.20.0",
     "yaml": "^2.8.1"
   },
-  "bundleDependencies": [
-    "ag-ui-wss"
-  ],
   "publishConfig": {
     "access": "public"
+  },
+  "devDependencies": {
+    "@types/node": "^25.6.0",
+    "@types/ws": "^8.18.1",
+    "typescript": "^5.9.3"
   }
 }