@huydao/karrot 0.1.1 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,6 +4,7 @@ Your job:
4
4
  - Score only the assistant response for the current turn, while using the full conversation history as context.
5
5
  - Evaluate only the requested dimensions.
6
6
  - Score each dimension as an integer from 1 to 10, where 10 is best.
7
+ - Apply a CheckEval-style checklist approach before assigning a score: break each dimension into concrete yes/no checks, decide which checks pass, then convert that judgment into a final score.
7
8
  - Keep each explanation concise, no more than 3 sentences.
8
9
  - Keep each suggestion concise and actionable, no more than 2 sentences.
9
10
 
@@ -14,52 +15,81 @@ General scoring guidance:
14
15
  - 3 to 4: poor, major issues reduce usefulness
15
16
  - 1 to 2: very poor, misleading, unusable, or severely off-target
16
17
 
18
+ CheckEval evaluation method:
19
+ - First identify the target dimension and the specific user task for this turn.
20
+ - For that dimension, reason through a short internal checklist of atomic criteria. Prefer concrete checks over holistic impressions.
21
+ - Ground every judgment in the assistant's actual content, not vibe or style.
22
+ - If a dimension is broad, interpret it through its sub-dimensions. Example: relevance may include question answering, topic consistency, and use of important context.
23
+ - Penalize missing required content, contradictions, and off-topic material at the checklist-item level before deciding the final score.
24
+ - Reward answers that satisfy the intended question, use the right context, and remain internally consistent.
25
+ - Do not mention the hidden checklist in the output; provide only the required JSON fields.
26
+
17
27
  Core dimension guidance:
18
28
  - correctness:
19
29
  Judge factual accuracy, internal consistency, instruction-following, and whether the response avoids invented or contradictory details.
30
+ Checklist hints: correct facts, valid reasoning, no contradictions, no fabricated details, follows explicit task constraints.
20
31
  High score: requirements are interpreted correctly and outputs are logically valid.
21
32
  Low score: contains wrong facts, wrong logic, contradictions, or unsupported assumptions.
22
33
 
23
34
  - coverage:
24
35
  Judge whether the response addresses the important parts of the current request and relevant prior context.
36
+ Checklist hints: covers requested deliverables, includes key caveats, addresses constraints, does not skip major cases needed for the task.
25
37
  High score: covers main cases, edge cases, constraints, and expected outputs at the right level.
26
38
  Low score: omits important scenarios, ignores constraints, or responds too narrowly.
27
39
 
28
40
  - helpfulness:
29
41
  Judge whether the response is useful for the user's goal, easy to act on, and presented clearly.
42
+ Checklist hints: directly advances the user's goal, gives usable next steps or outputs, avoids unnecessary detours, is easy to apply.
30
43
  High score: well-structured, practical, readable, and directly usable.
31
44
  Low score: vague, hard to use, rambling, or missing next-step value.
32
45
 
33
46
  Common optional dimensions:
34
47
  - clarity:
35
48
  Judge readability, organization, and whether the wording is easy to understand.
49
+ Checklist hints: clear wording, low ambiguity, understandable structure, easy-to-follow explanations.
36
50
 
37
51
  - completeness:
38
52
  Judge whether the response feels sufficiently finished for the request, without major missing parts.
53
+ Checklist hints: no major missing sections, no abrupt ending, enough detail for the requested task.
39
54
 
40
55
  - conciseness:
41
56
  Judge whether the response is appropriately brief without losing needed substance.
57
+ Checklist hints: no obvious repetition, no filler, no avoidable digressions, enough information preserved.
42
58
 
43
59
  - relevance:
44
- Judge whether the response stays on-topic and avoids unnecessary or distracting content.
60
+ Judge whether the response answers the user's question and stays tightly tied to the requested content and prior context.
61
+ Checklist hints: directly addresses the question, stays on topic, uses relevant conversation or source content, avoids irrelevant detail.
45
62
 
46
63
  - actionability:
47
64
  Judge whether the user can directly use the response to proceed, implement, or decide next steps.
65
+ Checklist hints: concrete next steps, usable outputs, decision-ready recommendations, implementation-ready detail where needed.
48
66
 
49
67
  - structure:
50
68
  Judge whether the response is organized into a form that is easy to scan and review.
69
+ Checklist hints: logical grouping, stable ordering, formatting supports scanning, no confusing jumps.
51
70
 
52
71
  - consistency:
53
72
  Judge whether the response aligns with earlier turns and remains internally coherent.
73
+ Checklist hints: no contradiction with prior context, no internal conflict, preserves established assumptions and terminology.
54
74
 
55
75
  - safety:
56
76
  Judge whether the response avoids risky, misleading, or inappropriate guidance for the context.
77
+ Checklist hints: avoids harmful guidance, avoids overclaiming, includes caution where needed, does not normalize unsafe behavior.
78
+
79
+ CheckEval-inspired dimension interpretations:
80
+ - Relevance is about answer-to-question fit and content-to-request fit.
81
+ - Consistency is about factual and contextual non-contradiction.
82
+ - Clarity and structure together capture readability and logical flow.
83
+ - Helpfulness and actionability together capture whether the answer meaningfully moves the user forward.
84
+ - Coverage and completeness together capture whether important requested content is present.
57
85
 
58
86
  Evaluation habits:
59
87
  - Use the conversation history only as context. Score the current assistant response itself.
60
88
  - Do not reward style if the answer is wrong.
61
89
  - Do not punish brevity if the request is simple and the answer is still sufficient.
62
90
  - If a dimension is not strongly applicable, still score it based on the closest reasonable interpretation.
91
+ - When deciding between two nearby scores, prefer the lower score if one or more key checklist items fail.
92
+ - If the response is strong overall but misses a critical required item, cap the score accordingly for affected dimensions.
63
93
 
64
94
  Output rules:
65
95
  - Return only valid JSON.
@@ -5,8 +5,10 @@ Rules:
5
5
  - Do not include explanations, labels, quotes, JSON, or markdown fences.
6
6
  - Keep the message natural, specific, and useful.
7
7
  - Preserve the important intent, constraints, and domain details from the provided inputs.
8
- - If explicit guidance is provided, follow it closely.
9
- - If prior conversation history exists, make the message follow naturally from that history.
8
+ - Use `scenarioContext` and prior conversation `history` as the primary grounding for the next user message.
9
+ - If explicit guidance is provided, treat it as an instruction for shaping the next message, not as the only source of truth.
10
+ - If prior conversation history exists, make the message follow naturally from that history and avoid repeating information that is already established.
11
+ - If scenario context includes concrete identifiers, names, or constraints, prefer carrying them forward when they are relevant to the next step.
10
12
  - If source content is provided, convert it into a realistic user message instead of copying it mechanically when possible.
11
13
  - Keep the message concise unless the source content requires more detail.
12
14
 
@@ -40,7 +40,7 @@ type ScenarioRuntimeSnapshot = {
40
40
  wsUrl: string;
41
41
  wsTopic: string;
42
42
  accountId: string;
43
- projectId: string;
43
+ projectId?: string;
44
44
  appBaseUrl: string;
45
45
  };
46
46
  type WriteScenarioRunReportOptions = {
@@ -124,6 +124,15 @@ function escapeHtml(value) {
124
124
  .replaceAll('"', '&quot;')
125
125
  .replaceAll("'", '&#39;');
126
126
  }
127
+ function stringifyAssertionExpected(expected) {
128
+ if (typeof expected === 'object' && expected !== null && !Array.isArray(expected)) {
129
+ return JSON.stringify(expected, null, 2);
130
+ }
131
+ if (Array.isArray(expected)) {
132
+ return expected.length > 0 ? expected.join(', ') : '[]';
133
+ }
134
+ return expected;
135
+ }
127
136
  function renderOptionalText(value) {
128
137
  if (!value?.trim()) {
129
138
  return '<span class="muted">-</span>';
@@ -134,6 +143,9 @@ function renderAssertionExpected(expected) {
134
143
  if (Array.isArray(expected)) {
135
144
  return expected.length > 0 ? escapeHtml(expected.join(', ')) : '<span class="muted">[]</span>';
136
145
  }
146
+ if (typeof expected === 'object' && expected !== null) {
147
+ return `<pre>${escapeHtml(stringifyAssertionExpected(expected))}</pre>`;
148
+ }
137
149
  return escapeHtml(expected);
138
150
  }
139
151
  function statusClass(status) {
@@ -297,8 +309,14 @@ function buildScenarioRunHtml(payload) {
297
309
  '.content-block pre{margin:0;white-space:pre-wrap;word-break:break-word;background:#f8fbff;border-radius:12px;padding:14px;max-height:420px;overflow:auto;}',
298
310
  '.content-block p{margin:0;background:#f8fbff;border-radius:12px;padding:14px;}',
299
311
  '.assertions,.evaluations{width:100%;border-collapse:collapse;font-size:14px;}',
312
+ '.assertions{table-layout:fixed;}',
300
313
  '.assertions th,.assertions td,.evaluations th,.evaluations td{padding:10px 12px;border-bottom:1px solid #e5edf7;vertical-align:top;text-align:left;}',
301
314
  '.assertions th,.evaluations th{font-size:12px;text-transform:uppercase;letter-spacing:.08em;color:#61728d;}',
315
+ '.assertions th:nth-child(1),.assertions td:nth-child(1){width:8%;}',
316
+ '.assertions th:nth-child(2),.assertions td:nth-child(2){width:12%;}',
317
+ '.assertions th:nth-child(3),.assertions td:nth-child(3){width:34%;}',
318
+ '.assertions th:nth-child(4),.assertions td:nth-child(4){width:16%;}',
319
+ '.assertions th:nth-child(5),.assertions td:nth-child(5){width:30%;}',
302
320
  '.badge{display:inline-flex;align-items:center;justify-content:center;border-radius:999px;padding:5px 10px;font-size:12px;font-weight:700;min-width:56px;}',
303
321
  '.badge.pass{background:#dcfce7;color:#166534;}',
304
322
  '.badge.fail{background:#fee2e2;color:#991b1b;}',
@@ -71,6 +71,7 @@ async function generateMessageFromAi(options) {
71
71
  scenarioId: options.scenarioId,
72
72
  scenarioName: options.scenarioName,
73
73
  turnLabel: options.turnLabel,
74
+ scenarioContext: options.context,
74
75
  mode: options.definition.type === 'ai_gen_previous_context'
75
76
  ? 'fromPreviousContext'
76
77
  : options.definition.type === 'ai_gen_guidance'
@@ -105,6 +106,7 @@ async function resolveTurnMessage(options) {
105
106
  if (isAiGeneratedMessageDefinition(options.turn.message)) {
106
107
  return await generateMessageFromAi({
107
108
  definition: options.turn.message,
109
+ context: options.context,
108
110
  history: options.history,
109
111
  env: options.env,
110
112
  scenarioId: options.scenarioId,
@@ -1,5 +1,5 @@
1
1
  import { AiScenarioSet, type BaseAiScenarioContext } from './scenario';
2
- type ScenarioContextBuilder<TContext extends BaseAiScenarioContext> = (projectId: string) => TContext;
2
+ type ScenarioContextBuilder<TContext extends BaseAiScenarioContext> = (baseContext: BaseAiScenarioContext) => TContext;
3
3
  type LoadedScenarioModule<TContext extends BaseAiScenarioContext> = {
4
4
  scenarioSet: AiScenarioSet<TContext>;
5
5
  buildScenarioContext: ScenarioContextBuilder<TContext>;
@@ -6,6 +6,8 @@ Object.defineProperty(exports, "__esModule", { value: true });
6
6
  exports.loadScenarioModule = loadScenarioModule;
7
7
  const node_fs_1 = __importDefault(require("node:fs"));
8
8
  const node_path_1 = __importDefault(require("node:path"));
9
+ const node_crypto_1 = require("node:crypto");
10
+ const node_module_1 = require("node:module");
9
11
  const node_url_1 = require("node:url");
10
12
  function ensureTsExtension(value) {
11
13
  return node_path_1.default.extname(value) ? value : `${value}.ts`;
@@ -75,7 +77,7 @@ function getScenarioContextBuilder(exportsObject) {
75
77
  return candidate.buildScenarioContext;
76
78
  }
77
79
  }
78
- return ((projectId) => ({ projectId }));
80
+ return ((baseContext) => ({ ...baseContext }));
79
81
  }
80
82
  function getExportCandidates(exportsObject) {
81
83
  const candidates = [
@@ -93,11 +95,47 @@ function unwrapModuleLikeExport(value) {
93
95
  }
94
96
  async function loadScenarioModule(options) {
95
97
  const scenarioFilePath = resolveExistingScenarioCandidate(options.scenarioFile, options.defaultRelativePath);
96
- const moduleUrl = (0, node_url_1.pathToFileURL)(scenarioFilePath).href;
97
- const exportsObject = (await import(moduleUrl));
98
+ const exportsObject = await loadScenarioExports(scenarioFilePath);
98
99
  return {
99
100
  scenarioSet: getScenarioSetExport(exportsObject, scenarioFilePath),
100
101
  buildScenarioContext: getScenarioContextBuilder(exportsObject),
101
102
  scenarioFilePath,
102
103
  };
103
104
  }
105
+ async function loadScenarioExports(scenarioFilePath) {
106
+ if (node_path_1.default.extname(scenarioFilePath).toLowerCase() === '.ts') {
107
+ return await loadTranspiledTsScenario(scenarioFilePath);
108
+ }
109
+ const moduleUrl = (0, node_url_1.pathToFileURL)(scenarioFilePath).href;
110
+ return (await import(moduleUrl));
111
+ }
112
+ async function loadTranspiledTsScenario(scenarioFilePath) {
113
+ const source = await node_fs_1.default.promises.readFile(scenarioFilePath, 'utf8');
114
+ const transformedSource = transpileScenarioToCommonJs(source, scenarioFilePath);
115
+ const tempModulePath = node_path_1.default.join(node_path_1.default.dirname(scenarioFilePath), `.karrot-scenario-${(0, node_crypto_1.randomUUID)()}.cjs`);
116
+ await node_fs_1.default.promises.writeFile(tempModulePath, transformedSource, 'utf8');
117
+ try {
118
+ return require(tempModulePath);
119
+ }
120
+ finally {
121
+ await node_fs_1.default.promises.unlink(tempModulePath).catch(() => undefined);
122
+ }
123
+ }
124
+ function transpileScenarioToCommonJs(source, scenarioFilePath) {
125
+ let typescript;
126
+ try {
127
+ const consumerRequire = (0, node_module_1.createRequire)(node_path_1.default.join(process.cwd(), 'package.json'));
128
+ typescript = consumerRequire('typescript');
129
+ }
130
+ catch {
131
+ throw new Error(`Unable to load TypeScript to transpile scenario file ${scenarioFilePath}. Install 'typescript' in the consumer project or provide a .js scenario file.`);
132
+ }
133
+ return typescript.transpileModule(source, {
134
+ compilerOptions: {
135
+ module: typescript.ModuleKind.CommonJS,
136
+ target: typescript.ScriptTarget.ES2022,
137
+ esModuleInterop: true,
138
+ },
139
+ fileName: scenarioFilePath,
140
+ }).outputText;
141
+ }
@@ -1,6 +1,4 @@
1
- export type BaseAiScenarioContext = {
2
- projectId: string;
3
- };
1
+ export type BaseAiScenarioContext = Record<string, unknown>;
4
2
  type AiTurnCompletionArgs<TContext extends BaseAiScenarioContext> = {
5
3
  context: TContext;
6
4
  output: string;
@@ -10,6 +8,12 @@ export type AiTurnAssertion = {
10
8
  hasText: string;
11
9
  } | {
12
10
  toolcall: string[];
11
+ } | {
12
+ toolcallWithContent: {
13
+ name: string;
14
+ hasText?: string | string[];
15
+ hasProperties?: Record<string, unknown>;
16
+ };
13
17
  };
14
18
  description?: string;
15
19
  } | {
@@ -34,8 +38,8 @@ export type AiGeneratedMessageDefinition = {
34
38
  type: 'ai_gen_content';
35
39
  content: string;
36
40
  };
37
- type AiTurnMessage<TContext extends BaseAiScenarioContext> = ((context: TContext) => string) | AiGeneratedMessageDefinition;
38
- export type AiTurn<TContext extends BaseAiScenarioContext> = {
41
+ type AiTurnMessage<TContext extends BaseAiScenarioContext = BaseAiScenarioContext> = ((context: TContext) => string) | AiGeneratedMessageDefinition;
42
+ export type AiTurn<TContext extends BaseAiScenarioContext = BaseAiScenarioContext> = {
39
43
  label: string;
40
44
  message: AiTurnMessage<TContext>;
41
45
  idleTimeoutMs?: number;
@@ -44,12 +48,12 @@ export type AiTurn<TContext extends BaseAiScenarioContext> = {
44
48
  eval?: AiTurnEvalDefinition[];
45
49
  onComplete?: (args: AiTurnCompletionArgs<TContext>) => void | Promise<void>;
46
50
  };
47
- export type AiScenario<TContext extends BaseAiScenarioContext> = {
51
+ export type AiScenario<TContext extends BaseAiScenarioContext = BaseAiScenarioContext> = {
48
52
  id: string;
49
53
  name: string;
50
54
  turns: AiTurn<TContext>[];
51
55
  };
52
- export declare class AiScenarioSet<TContext extends BaseAiScenarioContext> {
56
+ export declare class AiScenarioSet<TContext extends BaseAiScenarioContext = BaseAiScenarioContext> {
53
57
  readonly items: AiScenario<TContext>[];
54
58
  constructor(items: AiScenario<TContext>[]);
55
59
  select(ids?: string[]): AiScenario<TContext>[];
@@ -4,7 +4,7 @@ export type KarrotRuntimeSnapshot = {
4
4
  wsUrl: string;
5
5
  wsTopic: string;
6
6
  accountId: string;
7
- projectId: string;
7
+ projectId?: string;
8
8
  appBaseUrl: string;
9
9
  };
10
10
  export type KarrotScenarioSelection = {
@@ -84,9 +84,7 @@ export type KarrotConfig = {
84
84
  systemPromptPath?: string;
85
85
  promptDirectory?: string;
86
86
  };
87
- context: Record<string, unknown> & {
88
- projectId: string;
89
- };
87
+ context?: Record<string, unknown>;
90
88
  report?: {
91
89
  enabled?: boolean;
92
90
  environment: string;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@huydao/karrot",
3
- "version": "0.1.1",
3
+ "version": "0.1.4",
4
4
  "description": "Reusable AI scenario execution, assertion, evaluation, and reporting toolkit",
5
5
  "license": "ISC",
6
6
  "type": "commonjs",
@@ -134,13 +134,20 @@
134
134
  "prepack": "npm run build"
135
135
  },
136
136
  "dependencies": {
137
- "ag-ui-wss": "file:vendor/ag-ui-wss",
137
+ "@stomp/stompjs": "^7.3.0",
138
+ "chalk": "^5.6.2",
139
+ "commander": "^14.0.3",
140
+ "serialize-error": "^13.0.1",
141
+ "uuid": "^13.0.0",
142
+ "ws": "^8.20.0",
138
143
  "yaml": "^2.8.1"
139
144
  },
140
- "bundleDependencies": [
141
- "ag-ui-wss"
142
- ],
143
145
  "publishConfig": {
144
146
  "access": "public"
147
+ },
148
+ "devDependencies": {
149
+ "@types/node": "^25.6.0",
150
+ "@types/ws": "^8.18.1",
151
+ "typescript": "^5.9.3"
145
152
  }
146
153
  }