npm - vieval - Versions diffs - 0.0.1 - Mend

vieval 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

package/README.md +290 -0
package/dist/assertions-DcAjfVDA.mjs +183 -0
package/dist/assertions-DcAjfVDA.mjs.map +1 -0
package/dist/cli/index.d.mts +11 -0
package/dist/cli/index.mjs +1434 -0
package/dist/cli/index.mjs.map +1 -0
package/dist/config-D2fe1SnT.mjs +17 -0
package/dist/config-D2fe1SnT.mjs.map +1 -0
package/dist/config.d.mts +3 -0
package/dist/config.mjs +3 -0
package/dist/core/assertions/index.d.mts +2 -0
package/dist/core/assertions/index.mjs +2 -0
package/dist/core/inference-executors/index.d.mts +273 -0
package/dist/core/inference-executors/index.mjs +225 -0
package/dist/core/inference-executors/index.mjs.map +1 -0
package/dist/core/processors/results/index.d.mts +96 -0
package/dist/core/processors/results/index.mjs +64 -0
package/dist/core/processors/results/index.mjs.map +1 -0
package/dist/core/runner/index.d.mts +2 -0
package/dist/core/runner/index.mjs +2 -0
package/dist/expect-0jPJ7Zio.d.mts +2318 -0
package/dist/expect-extensions-CwPtgTz8.mjs +13471 -0
package/dist/expect-extensions-CwPtgTz8.mjs.map +1 -0
package/dist/expect-i9WZWGrA.mjs +17 -0
package/dist/expect-i9WZWGrA.mjs.map +1 -0
package/dist/expect.d.mts +2 -0
package/dist/expect.mjs +2 -0
package/dist/index-DP7jsORl.d.mts +947 -0
package/dist/index-oSXhM1zx.d.mts +314 -0
package/dist/index.d.mts +92 -0
package/dist/index.mjs +150 -0
package/dist/index.mjs.map +1 -0
package/dist/magic-string.es-CH1jwzMg.mjs +1013 -0
package/dist/magic-string.es-CH1jwzMg.mjs.map +1 -0
package/dist/models-D_MsBtYw.mjs +14 -0
package/dist/models-D_MsBtYw.mjs.map +1 -0
package/dist/plugin-DVaRZY2x.d.mts +84 -0
package/dist/plugins/chat-models/index.d.mts +90 -0
package/dist/plugins/chat-models/index.mjs +48 -0
package/dist/plugins/chat-models/index.mjs.map +1 -0
package/dist/registry-ChOjjdEC.mjs +245 -0
package/dist/registry-ChOjjdEC.mjs.map +1 -0
package/dist/runner-4ZsOveoY.mjs +480 -0
package/dist/runner-4ZsOveoY.mjs.map +1 -0
package/dist/testing/expect-extensions.d.mts +86 -0
package/dist/testing/expect-extensions.mjs +2 -0
package/package.json +88 -0

package/README.md ADDED Viewed

@@ -0,0 +1,290 @@
+# Vieval
+[![npm version][npm-version-src]][npm-version-href]
+[![npm downloads][npm-downloads-src]][npm-downloads-href]
+[![bundle][bundle-src]][bundle-href]
+[![JSDocs][jsdocs-src]][jsdocs-href]
+[![License][license-src]][license-href]
+[![Ask DeepWiki][deepwiki-src]][deepwiki-href]
+Vitest-style evaluation framework for agents, models, and task pipelines.
+`vieval` keeps eval authoring close to product code while giving you repeatable project/eval/task matrix runs and a CLI summary experience.
+## Why Vieval
+- Familiar authoring model (`describeEval`, `caseOf`, `expect`) instead of a separate eval DSL.
+- Matrix control at three levels (project, eval, task) with deterministic merge rules.
+- Works for chat and non-chat workloads through custom `projects[].executor`.
+- Human-readable TTY output and machine-readable JSON output from the same command.
+## Quick Start
+### 1) Create a config
+```ts
+// vieval.config.ts
+import { defineConfig } from 'vieval'
+export default defineConfig({
+  projects: [
+    {
+      name: 'default',
+      root: '.',
+      include: ['evals/*.eval.ts'],
+    },
+  ],
+})
+```
+### 2) Create an eval
+```ts
+// evals/smoke.eval.ts
+import { caseOf, describeEval, expect } from 'vieval'
+export default describeEval('smoke', () => {
+  caseOf('2 + 2 = 4', () => {
+    expect(2 + 2).toBe(4)
+  }, {})
+})
+```
+### 3) Run
+```bash
+pnpm -F vieval eval:run -- --config ./vieval.config.ts
+```
+## Core Concepts
+### Matrix layering
+`vieval` expands matrices in scope order:
+1. `project` from `vieval.config.*`
+2. `eval` from `*.eval.ts`
+3. `task` from `defineTask(...)`
+Within each scope, matrix layers apply in this order:
+1. `disable`
+2. `extend`
+3. `override`
+Both `runMatrix` and `evalMatrix` are supported at each scope.
+### Matrix compatibility alias
+`matrix` remains supported as a compatibility alias for `runMatrix.extend`.
+### Stable matrix artifact
+Each scheduled run includes:
+- `matrix.run`
+- `matrix.eval`
+- `matrix.meta.runRowId`
+- `matrix.meta.evalRowId`
+Use these fields to group and compare runs across models, rubrics, and scenarios.
+## Architecture
+```mermaid
+flowchart LR
+  CLI["src/cli/index.ts\n(runTopLevelCli)"] --> RUN["src/cli/run.ts\n(runVievalCli + formatter)"]
+  RUN --> CFG["src/cli/config.ts\n(loadVievalCliConfig)"]
+  RUN --> DISC["src/cli/discovery.ts\n(discoverEvalFiles)"]
+  RUN --> REG["src/dsl/registry.ts\n(module registrations)"]
+  RUN --> DSL["src/dsl/task.ts\n(describeTask/caseOf hooks)"]
+  RUN --> REP["src/cli/reporters/*\n(summary + windowed + noop)"]
+  RUN --> COLLECT["src/core/runner/collect.ts\n(collectEvalEntries)"]
+  RUN --> SCHEDULE["src/core/runner/schedule.ts\n(createRunnerSchedule)"]
+  RUN --> EXEC["src/core/runner/run.ts\n(runScheduledTasks)"]
+  EXEC --> CTX["src/core/runner/task-context.ts\n(createTaskExecutionContext)"]
+  EXEC --> AGG["src/core/runner/aggregate.ts\n(aggregateRunResults)"]
+  AGG --> POLICY["src/core/processors/results/*\n(hybrid-threshold, max-failed-runs)"]
+  RUN --> POLICY
+  PROVIDERS["src/core/inference-executors/*\n(env, adapters, retry, openai)"] --> CTX
+  PLUGINS["src/plugins/chat-models/*\n(model aliases/plugins)"] --> CFG
+  TESTS["src/**/*.test.ts + tests/projects/*"] --> CLI
+  TESTS --> RUN
+  TESTS --> EXEC
+  TESTS --> DSL
+  TESTS --> REP
+```
+### Connection Notes
+- `src/cli/run.ts` is the integration hub: it loads config, discovers eval files, prepares schedules, runs tasks, emits live reporter events, and formats static summaries.
+- `src/dsl/task.ts` emits case lifecycle hooks (`onCaseStart` / `onCaseEnd`) that feed the live reporter when `reporterHooks` is present in task context.
+- `src/core/runner/run.ts` owns task lifecycle (`onTaskStart` / `onTaskEnd`) and result aggregation boundaries.
+- `src/cli/reporters/summary-reporter.ts` and `src/cli/reporters/renderers/windowed-renderer.ts` provide the Vitest-style live TTY experience; non-TTY falls back to noop reporter + final static formatter.
+### Runtime Sequence (`eval:run`)
+```mermaid
+sequenceDiagram
+  participant U as User
+  participant C as src/cli/index.ts
+  participant R as src/cli/run.ts
+  participant L as src/cli/config.ts
+  participant D as src/cli/discovery.ts
+  participant S as src/core/runner/*
+  participant T as src/dsl/task.ts
+  participant P as src/cli/reporters/*
+  U->>C: pnpm run eval:run -- --config ...
+  C->>R: runVievalCli(options)
+  R->>L: loadVievalCliConfig()
+  R->>D: discoverEvalFiles()
+  R->>S: collectEvalEntries() + createRunnerSchedule()
+  R->>P: createCliReporter(isTTY)
+  R->>P: onRunStart + onTaskQueued
+  R->>S: runScheduledTasks(...)
+  S->>P: onTaskStart / onTaskEnd
+  S->>T: task.run(context)
+  T->>P: reporterHooks.onCaseStart / onCaseEnd
+  S-->>R: aggregated run results
+  R->>P: onRunEnd + dispose
+  R-->>C: CliRunOutput
+  C->>U: static summary (or JSON)
+```
+## Config Example (Control Group Style)
+```ts
+import { defineConfig } from 'vieval'
+export default defineConfig({
+  projects: [
+    {
+      name: 'chat-evals',
+      runMatrix: {
+        extend: {
+          model: ['gpt-4.1-mini', 'gpt-4.1'],
+          promptLanguage: ['en', 'zh'],
+          scenario: ['baseline', 'stress'],
+        },
+      },
+      evalMatrix: {
+        extend: {
+          rubric: ['strict', 'lenient'],
+          rubricModel: ['judge-mini', 'judge-large'],
+        },
+      },
+    },
+  ],
+})
+```
+## Custom Executor Example
+Use `projects[].executor` for non-chat workloads such as ASR, TTS, image, motion, or other domain-specific evaluators.
+```ts
+import { defineConfig } from 'vieval'
+export default defineConfig({
+  projects: [
+    {
+      name: 'motion-evals',
+      inferenceExecutors: [{ id: 'motion-engine' }],
+      models: [
+        {
+          id: 'motion-engine:v2',
+          inferenceExecutor: 'motion-engine',
+          inferenceExecutorId: 'motion-engine',
+          model: 'v2',
+          aliases: ['motion-default'],
+        },
+      ],
+      async executor(task, context) {
+        const model = context.model()
+        const success = model.model === 'v2' && task.matrix.run.scenario === 'baseline'
+        return {
+          id: task.id,
+          entryId: task.entry.id,
+          inferenceExecutorId: task.inferenceExecutor.id,
+          matrix: task.matrix,
+          scores: [{ kind: 'exact', score: success ? 1 : 0 }],
+        }
+      },
+    },
+  ],
+})
+```
+## CLI
+```bash
+vieval run [--config <path>] [--project <name>] [--json]
+```
+Common usage:
+```bash
+pnpm -F vieval eval:run
+pnpm -F vieval eval:run -- --config ./vieval.config.ts
+pnpm -F vieval eval:run -- --config ./vieval.config.ts --project chess --project moderation
+pnpm -F vieval eval:run -- --json
+pnpm -F vieval eval:run -- --help
+```
+## Examples In This Repository
+- `packages/vieval/tests/projects/example-api-defining-new-task`
+- `packages/vieval/tests/projects/example-api-config-matrix`
+- `packages/vieval/tests/projects/example-api-load-datasource-as-cases`
+- `packages/vieval/tests/projects/example-pattern-byoa-bring-your-own-agent`
+## Development
+```bash
+pnpm install
+pnpm -F vieval test:run
+pnpm -F vieval typecheck
+pnpm lint:fix
+```
+## When To Use / Not Use
+Use `vieval` when:
+- you want evals close to app code with Vitest-like ergonomics;
+- you need matrix experiments and repeatable run metadata;
+- you want one CLI for local diagnostics and CI export (`--json`).
+Do not use `vieval` when:
+- you need hosted dataset management, annotation UI, or SaaS observability out of the box;
+- you only need one-off scripts without reusable eval definitions or matrix scheduling.
+## Acknowledgements
+- [Vitest](https://github.com/vitest-dev/vitest)
+- [LobeHub](https://github.com/lobehub/lobehub)
+- [EvalSys](https://github.com/evalsys)
+## License
+MIT
+[npm-version-src]: https://img.shields.io/npm/v/vieval?style=flat&colorA=080f12&colorB=1fa669
+[npm-version-href]: https://npmjs.com/package/vieval
+[npm-downloads-src]: https://img.shields.io/npm/dm/vieval?style=flat&colorA=080f12&colorB=1fa669
+[npm-downloads-href]: https://npmjs.com/package/vieval
+[bundle-src]: https://img.shields.io/bundlephobia/minzip/vieval?style=flat&colorA=080f12&colorB=1fa669&label=minzip
+[bundle-href]: https://bundlephobia.com/result?p=vieval
+[license-src]: https://img.shields.io/github/license/vieval-dev/vieval.svg?style=flat&colorA=080f12&colorB=1fa669
+[license-href]: https://github.com/vieval-dev/vieval/blob/main/LICENSE
+[jsdocs-src]: https://img.shields.io/badge/jsdocs-reference-080f12?style=flat&colorA=080f12&colorB=1fa669
+[jsdocs-href]: https://www.jsdocs.io/package/vieval
+[deepwiki-src]: https://deepwiki.com/badge.svg
+[deepwiki-href]: https://deepwiki.com/vieval-dev/vieval

package/dist/assertions-DcAjfVDA.mjs ADDED Viewed

@@ -0,0 +1,183 @@
+//#region src/core/assertions/index.ts
+/**
+* Normalizes text for matching.
+*
+* Before: `"  Hello\nWorld  "`
+* After: `"hello world"`
+*/
+function normalizeMatchText(value, caseSensitive) {
+	const compactedWhitespace = value.trim().replaceAll(/\s+/g, " ");
+	if (caseSensitive) return compactedWhitespace;
+	return compactedWhitespace.toLowerCase();
+}
+function clampScore(score) {
+	if (Number.isNaN(score)) return 0;
+	if (score < 0) return 0;
+	if (score > 1) return 1;
+	return score;
+}
+function createOutcome(id, scoreKind, pass, score, reason) {
+	return {
+		id,
+		pass,
+		reason,
+		score: clampScore(score),
+		scoreKind
+	};
+}
+/**
+* Creates an assertion that requires specific keywords in model text.
+*
+* Example:
+* `expectMustInclude({ id: 'tone', keywords: ['calm', 'move'] })`
+*/
+function expectMustInclude(options) {
+	return async (context) => {
+		if (options.keywords.length === 0) return createOutcome(options.id, "exact", true, 1, "No required keywords configured.");
+		const caseSensitive = options.caseSensitive ?? false;
+		const normalizedText = normalizeMatchText(context.text, caseSensitive);
+		const matches = options.keywords.filter((keyword) => {
+			const normalizedKeyword = normalizeMatchText(keyword, caseSensitive);
+			return normalizedText.includes(normalizedKeyword);
+		});
+		const pass = (options.mode ?? "all") === "all" ? matches.length === options.keywords.length : matches.length > 0;
+		const score = options.keywords.length === 0 ? 1 : matches.length / options.keywords.length;
+		return createOutcome(options.id, "exact", pass, score, pass ? `Matched ${matches.length}/${options.keywords.length} required keywords.` : `Matched ${matches.length}/${options.keywords.length} required keywords.`);
+	};
+}
+/**
+* Creates an assertion that forbids specific keywords.
+*
+* Example:
+* `expectMustExclude({ id: 'no-engine-dump', keywords: ['bestmove', 'ponder'] })`
+*/
+function expectMustExclude(options) {
+	return async (context) => {
+		if (options.keywords.length === 0) return createOutcome(options.id, "exact", true, 1, "No excluded keywords configured.");
+		const caseSensitive = options.caseSensitive ?? false;
+		const normalizedText = normalizeMatchText(context.text, caseSensitive);
+		const forbiddenMatches = options.keywords.filter((keyword) => {
+			const normalizedKeyword = normalizeMatchText(keyword, caseSensitive);
+			return normalizedText.includes(normalizedKeyword);
+		});
+		const pass = forbiddenMatches.length === 0;
+		const score = pass ? 1 : 0;
+		return createOutcome(options.id, "exact", pass, score, pass ? "No forbidden keywords found." : `Forbidden keywords found: ${forbiddenMatches.join(", ")}`);
+	};
+}
+/**
+* Creates an assertion based on a regular expression.
+*
+* Example:
+* `expectRegex({ id: 'starts-with-act', pattern: /^<\|ACT:/ })`
+*/
+function expectRegex(options) {
+	return async (context) => {
+		const pass = options.pattern.test(context.text);
+		return createOutcome(options.id, "exact", pass, pass ? 1 : 0, pass ? "Regex matched response text." : `Regex did not match: ${options.pattern}`);
+	};
+}
+/**
+* Creates an assertion for structured model output.
+*
+* Example:
+* `expectStructuredOutput({ id: 'json-shape', validate: isMySchema })`
+*/
+function expectStructuredOutput(options) {
+	return async (context) => {
+		const pass = options.validate(context.structuredOutput);
+		return createOutcome(options.id, "exact", pass, pass ? 1 : 0, pass ? "Structured output matched validator." : options.failureReason ?? "Structured output validation failed.");
+	};
+}
+/**
+* Creates an assertion for validating tool-call arguments.
+*
+* Example:
+* `expectToolCallArgs({ id: 'spark-command-shape', toolName: 'builtIn_sparkCommand', validate: isSparkArgs })`
+*/
+function expectToolCallArgs(options) {
+	return async (context) => {
+		const targetCall = (context.toolCalls ?? []).find((call) => call.name === options.toolName);
+		if (targetCall == null) return createOutcome(options.id, "exact", false, 0, `Missing tool call: ${options.toolName}`);
+		const pass = options.validate(targetCall.args);
+		return createOutcome(options.id, "exact", pass, pass ? 1 : 0, pass ? `Tool call args validated for ${options.toolName}.` : `Tool call args validation failed for ${options.toolName}.`);
+	};
+}
+/**
+* Creates a rubric assertion driven by teacher-model style scoring.
+*
+* Example:
+* `expectRubric({ id: 'human-like-tone', judge: judgeFn, minScore: 0.8 })`
+*/
+function expectRubric(options) {
+	return async (context) => {
+		const result = await options.judge(context);
+		const minScore = options.minScore ?? .7;
+		const normalizedScore = clampScore(result.score);
+		const pass = normalizedScore >= minScore;
+		return createOutcome(options.id, "judge", pass, normalizedScore, `${result.reason}${result.judgeModel ? ` (judge: ${result.judgeModel})` : ""}`);
+	};
+}
+/**
+* Creates a custom assertion with fully user-defined logic.
+*
+* Example:
+* `expectCustom({ id: 'stateful-window', scoreKind: 'exact', evaluate: (ctx) => ... })`
+*/
+function expectCustom(options) {
+	return async (context) => {
+		const result = await options.evaluate(context);
+		return createOutcome(options.id, options.scoreKind, result.pass, result.score, result.reason);
+	};
+}
+/**
+* Creates an inverse assertion.
+*
+* Example:
+* `expectNot(expectMustInclude({ id: 'contains-engine-word', keywords: ['bestmove'] }), { id: 'no-engine-word' })`
+*/
+function expectNot(assertion, options) {
+	return async (context) => {
+		const baseOutcome = await assertion(context);
+		return createOutcome(options.id, baseOutcome.scoreKind, !baseOutcome.pass, 1 - baseOutcome.score, `NOT(${baseOutcome.id}): ${baseOutcome.reason}`);
+	};
+}
+/**
+* Executes assertion list and returns all outcomes.
+*
+* Call stack:
+*
+* {@link evaluateAssertions}
+*   -> `assertion(context)`
+*     -> {@link AssertionOutcome}[]
+*/
+async function evaluateAssertions(assertions, context) {
+	const normalizedContext = {
+		state: context.state ?? /* @__PURE__ */ new Map(),
+		structuredOutput: context.structuredOutput,
+		text: context.text,
+		toolCalls: context.toolCalls
+	};
+	const outcomes = [];
+	for (const assertion of assertions) outcomes.push(await assertion(normalizedContext));
+	return outcomes;
+}
+/**
+* Converts assertion outcomes to run-score tuples consumed by aggregation.
+*/
+function toRunScores(outcomes) {
+	return outcomes.map((outcome) => ({
+		kind: outcome.scoreKind,
+		score: outcome.score
+	}));
+}
+/**
+* Returns failing assertion outcomes in original order.
+*/
+function collectFailedAssertions(outcomes) {
+	return outcomes.filter((outcome) => !outcome.pass);
+}
+//#endregion
+export { expectMustInclude as a, expectRubric as c, normalizeMatchText as d, toRunScores as f, expectMustExclude as i, expectStructuredOutput as l, evaluateAssertions as n, expectNot as o, expectCustom as r, expectRegex as s, collectFailedAssertions as t, expectToolCallArgs as u };
+//# sourceMappingURL=assertions-DcAjfVDA.mjs.map

package/dist/assertions-DcAjfVDA.mjs.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"assertions-DcAjfVDA.mjs","names":[],"sources":["../src/core/assertions/index.ts"],"sourcesContent":["import type { RunScore, RunScoreKind } from '../runner/aggregate'\n\n/**\n * Stores mutable evaluation state for stateful assertion flows.\n *\n * Use when:\n * - assertions need to share counters, rolling metrics, or memoized values\n * - a scenario evaluates multiple steps and expects state-aware checks\n */\nexport type AssertionState = Map<string, unknown>\n\n/**\n * Represents one tool call emitted by a model response.\n */\nexport interface ToolCall {\n /**\n * Tool name used by the call.\n */\n name: string\n /**\n * Tool arguments payload.\n */\n args: unknown\n}\n\n/**\n * Normalized assertion context for one model output.\n */\nexport interface AssertionContext {\n /**\n * Plain text model output used by text assertions.\n */\n text: string\n /**\n * Optional structured output parsed from the model response.\n */\n structuredOutput?: unknown\n /**\n * Optional tool calls extracted from the model response.\n */\n toolCalls?: readonly ToolCall[]\n /**\n * Shared mutable state for stateful assertion measurement.\n */\n state: AssertionState\n}\n\n/**\n * Result for one assertion evaluation.\n */\nexport interface AssertionOutcome {\n /**\n * Stable assertion id.\n */\n id: string\n /**\n * Assertion family emitted as run score kind.\n */\n scoreKind: RunScoreKind\n /**\n * Whether the assertion passed.\n */\n pass: boolean\n /**\n * Normalized score in the `0..1` range.\n */\n score: number\n /**\n * Human-readable reason for logs and reports.\n */\n reason: string\n}\n\n/**\n * Async assertion function used by eval scenarios.\n */\nexport type Assertion = (context: AssertionContext) => Promise<AssertionOutcome>\n\n/**\n * Normalizes text for matching.\n *\n * Before: `\" Hello\\nWorld \"`\n * After: `\"hello world\"`\n */\nexport function normalizeMatchText(value: string, caseSensitive: boolean): string {\n const compactedWhitespace = 
value.trim().replaceAll(/\\s+/g, ' ')\n\n if (caseSensitive) {\n return compactedWhitespace\n }\n\n return compactedWhitespace.toLowerCase()\n}\n\nfunction clampScore(score: number): number {\n if (Number.isNaN(score)) {\n return 0\n }\n\n if (score < 0) {\n return 0\n }\n\n if (score > 1) {\n return 1\n }\n\n return score\n}\n\nfunction createOutcome(\n id: string,\n scoreKind: RunScoreKind,\n pass: boolean,\n score: number,\n reason: string,\n): AssertionOutcome {\n return {\n id,\n pass,\n reason,\n score: clampScore(score),\n scoreKind,\n }\n}\n\n/**\n * Options for include-keyword assertions.\n */\nexport interface MustIncludeAssertionOptions {\n /**\n * Stable assertion id.\n */\n id: string\n /**\n * Keywords that must be present.\n */\n keywords: readonly string[]\n /**\n * Match mode for keywords.\n *\n * @default 'all'\n */\n mode?: 'all' | 'any'\n /**\n * Case-sensitive matching toggle.\n *\n * @default false\n */\n caseSensitive?: boolean\n}\n\n/**\n * Creates an assertion that requires specific keywords in model text.\n *\n * Example:\n * `expectMustInclude({ id: 'tone', keywords: ['calm', 'move'] })`\n */\nexport function expectMustInclude(options: MustIncludeAssertionOptions): Assertion {\n return async (context) => {\n if (options.keywords.length === 0) {\n return createOutcome(options.id, 'exact', true, 1, 'No required keywords configured.')\n }\n\n const caseSensitive = options.caseSensitive ?? false\n const normalizedText = normalizeMatchText(context.text, caseSensitive)\n const matches = options.keywords.filter((keyword) => {\n const normalizedKeyword = normalizeMatchText(keyword, caseSensitive)\n return normalizedText.includes(normalizedKeyword)\n })\n\n const mode = options.mode ?? 'all'\n const pass = mode === 'all'\n ? matches.length === options.keywords.length\n : matches.length > 0\n\n const score = options.keywords.length === 0 ? 
1 : matches.length / options.keywords.length\n\n return createOutcome(\n options.id,\n 'exact',\n pass,\n score,\n pass\n ? `Matched ${matches.length}/${options.keywords.length} required keywords.`\n : `Matched ${matches.length}/${options.keywords.length} required keywords.`,\n )\n }\n}\n\n/**\n * Options for exclude-keyword assertions.\n */\nexport interface MustExcludeAssertionOptions {\n /**\n * Stable assertion id.\n */\n id: string\n /**\n * Keywords that must not appear.\n */\n keywords: readonly string[]\n /**\n * Case-sensitive matching toggle.\n *\n * @default false\n */\n caseSensitive?: boolean\n}\n\n/**\n * Creates an assertion that forbids specific keywords.\n *\n * Example:\n * `expectMustExclude({ id: 'no-engine-dump', keywords: ['bestmove', 'ponder'] })`\n */\nexport function expectMustExclude(options: MustExcludeAssertionOptions): Assertion {\n return async (context) => {\n if (options.keywords.length === 0) {\n return createOutcome(options.id, 'exact', true, 1, 'No excluded keywords configured.')\n }\n\n const caseSensitive = options.caseSensitive ?? false\n const normalizedText = normalizeMatchText(context.text, caseSensitive)\n const forbiddenMatches = options.keywords.filter((keyword) => {\n const normalizedKeyword = normalizeMatchText(keyword, caseSensitive)\n return normalizedText.includes(normalizedKeyword)\n })\n\n const pass = forbiddenMatches.length === 0\n const score = pass ? 1 : 0\n\n return createOutcome(\n options.id,\n 'exact',\n pass,\n score,\n pass\n ? 
'No forbidden keywords found.'\n : `Forbidden keywords found: ${forbiddenMatches.join(', ')}`,\n )\n }\n}\n\n/**\n * Options for regular-expression assertions.\n */\nexport interface RegexAssertionOptions {\n /**\n * Stable assertion id.\n */\n id: string\n /**\n * Pattern to apply to model text.\n */\n pattern: RegExp\n}\n\n/**\n * Creates an assertion based on a regular expression.\n *\n * Example:\n * `expectRegex({ id: 'starts-with-act', pattern: /^<\\|ACT:/ })`\n */\nexport function expectRegex(options: RegexAssertionOptions): Assertion {\n return async (context) => {\n const pass = options.pattern.test(context.text)\n\n return createOutcome(\n options.id,\n 'exact',\n pass,\n pass ? 1 : 0,\n pass ? 'Regex matched response text.' : `Regex did not match: ${options.pattern}`,\n )\n }\n}\n\n/**\n * Options for structured-output assertions.\n */\nexport interface StructuredOutputAssertionOptions<TValue> {\n /**\n * Stable assertion id.\n */\n id: string\n /**\n * Runtime validator for structured output.\n */\n validate: (value: unknown) => value is TValue\n /**\n * Optional failure reason.\n */\n failureReason?: string\n}\n\n/**\n * Creates an assertion for structured model output.\n *\n * Example:\n * `expectStructuredOutput({ id: 'json-shape', validate: isMySchema })`\n */\nexport function expectStructuredOutput<TValue>(options: StructuredOutputAssertionOptions<TValue>): Assertion {\n return async (context) => {\n const pass = options.validate(context.structuredOutput)\n\n return createOutcome(\n options.id,\n 'exact',\n pass,\n pass ? 1 : 0,\n pass ? 'Structured output matched validator.' : (options.failureReason ?? 
'Structured output validation failed.'),\n )\n }\n}\n\n/**\n * Options for tool-call argument assertions.\n */\nexport interface ToolCallArgsAssertionOptions {\n /**\n * Stable assertion id.\n */\n id: string\n /**\n * Tool name to inspect.\n */\n toolName: string\n /**\n * Runtime validator for tool arguments.\n */\n validate: (args: unknown) => boolean\n}\n\n/**\n * Creates an assertion for validating tool-call arguments.\n *\n * Example:\n * `expectToolCallArgs({ id: 'spark-command-shape', toolName: 'builtIn_sparkCommand', validate: isSparkArgs })`\n */\nexport function expectToolCallArgs(options: ToolCallArgsAssertionOptions): Assertion {\n return async (context) => {\n const targetCall = (context.toolCalls ?? []).find(call => call.name === options.toolName)\n\n if (targetCall == null) {\n return createOutcome(options.id, 'exact', false, 0, `Missing tool call: ${options.toolName}`)\n }\n\n const pass = options.validate(targetCall.args)\n\n return createOutcome(\n options.id,\n 'exact',\n pass,\n pass ? 1 : 0,\n pass ? 
`Tool call args validated for ${options.toolName}.` : `Tool call args validation failed for ${options.toolName}.`,\n )\n }\n}\n\n/**\n * Rubric judge result returned by teacher-model or rubric logic.\n */\nexport interface RubricJudgeResult {\n /**\n * Normalized score in the `0..1` range.\n */\n score: number\n /**\n * Judge explanation text.\n */\n reason: string\n /**\n * Optional judge model id.\n */\n judgeModel?: string\n}\n\n/**\n * Options for rubric assertions.\n */\nexport interface RubricAssertionOptions {\n /**\n * Stable assertion id.\n */\n id: string\n /**\n * Async rubric judge callback.\n */\n judge: (context: AssertionContext) => Promise<RubricJudgeResult>\n /**\n * Minimum passing score.\n *\n * @default 0.7\n */\n minScore?: number\n}\n\n/**\n * Creates a rubric assertion driven by teacher-model style scoring.\n *\n * Example:\n * `expectRubric({ id: 'human-like-tone', judge: judgeFn, minScore: 0.8 })`\n */\nexport function expectRubric(options: RubricAssertionOptions): Assertion {\n return async (context) => {\n const result = await options.judge(context)\n const minScore = options.minScore ?? 0.7\n const normalizedScore = clampScore(result.score)\n const pass = normalizedScore >= minScore\n\n return createOutcome(\n options.id,\n 'judge',\n pass,\n normalizedScore,\n `${result.reason}${result.judgeModel ? ` (judge: ${result.judgeModel})` : ''}`,\n )\n }\n}\n\n/**\n * Options for custom assertions.\n */\nexport interface CustomAssertionOptions {\n /**\n * Stable assertion id.\n */\n id: string\n /**\n * Score family emitted by this custom assertion.\n */\n scoreKind: RunScoreKind\n /**\n * Custom evaluator callback.\n */\n evaluate: (context: AssertionContext) => Promise<{ pass: boolean, reason: string, score: number }> | { pass: boolean, reason: string, score: number }\n}\n\n/**\n * Creates a custom assertion with fully user-defined logic.\n *\n * Example:\n * `expectCustom({ id: 'stateful-window', scoreKind: 'exact', evaluate: (ctx) => ... 
})`\n */\nexport function expectCustom(options: CustomAssertionOptions): Assertion {\n return async (context) => {\n const result = await options.evaluate(context)\n\n return createOutcome(options.id, options.scoreKind, result.pass, result.score, result.reason)\n }\n}\n\n/**\n * Creates an inverse assertion.\n *\n * Example:\n * `expectNot(expectMustInclude({ id: 'contains-engine-word', keywords: ['bestmove'] }), { id: 'no-engine-word' })`\n */\nexport function expectNot(assertion: Assertion, options: { id: string }): Assertion {\n return async (context) => {\n const baseOutcome = await assertion(context)\n\n return createOutcome(\n options.id,\n baseOutcome.scoreKind,\n !baseOutcome.pass,\n 1 - baseOutcome.score,\n `NOT(${baseOutcome.id}): ${baseOutcome.reason}`,\n )\n }\n}\n\n/**\n * Executes assertion list and returns all outcomes.\n *\n * Call stack:\n *\n * {@link evaluateAssertions}\n * -> `assertion(context)`\n * -> {@link AssertionOutcome}[]\n */\nexport async function evaluateAssertions(\n assertions: readonly Assertion[],\n context: Omit<AssertionContext, 'state'> & { state?: AssertionState },\n): Promise<AssertionOutcome[]> {\n const state = context.state ?? 
new Map<string, unknown>()\n const normalizedContext: AssertionContext = {\n state,\n structuredOutput: context.structuredOutput,\n text: context.text,\n toolCalls: context.toolCalls,\n }\n\n const outcomes: AssertionOutcome[] = []\n\n for (const assertion of assertions) {\n outcomes.push(await assertion(normalizedContext))\n }\n\n return outcomes\n}\n\n/**\n * Converts assertion outcomes to run-score tuples consumed by aggregation.\n */\nexport function toRunScores(outcomes: readonly AssertionOutcome[]): RunScore[] {\n return outcomes.map(outcome => ({\n kind: outcome.scoreKind,\n score: outcome.score,\n }))\n}\n\n/**\n * Returns failing assertion outcomes in original order.\n */\nexport function collectFailedAssertions(outcomes: readonly AssertionOutcome[]): AssertionOutcome[] {\n return outcomes.filter(outcome => !outcome.pass)\n}\n"],"mappings":";;;;;;;AAoFA,SAAgB,mBAAmB,OAAe,eAAgC;CAChF,MAAM,sBAAsB,MAAM,MAAM,CAAC,WAAW,QAAQ,IAAI;AAEhE,KAAI,cACF,QAAO;AAGT,QAAO,oBAAoB,aAAa;;AAG1C,SAAS,WAAW,OAAuB;AACzC,KAAI,OAAO,MAAM,MAAM,CACrB,QAAO;AAGT,KAAI,QAAQ,EACV,QAAO;AAGT,KAAI,QAAQ,EACV,QAAO;AAGT,QAAO;;AAGT,SAAS,cACP,IACA,WACA,MACA,OACA,QACkB;AAClB,QAAO;EACL;EACA;EACA;EACA,OAAO,WAAW,MAAM;EACxB;EACD;;;;;;;;AAmCH,SAAgB,kBAAkB,SAAiD;AACjF,QAAO,OAAO,YAAY;AACxB,MAAI,QAAQ,SAAS,WAAW,EAC9B,QAAO,cAAc,QAAQ,IAAI,SAAS,MAAM,GAAG,mCAAmC;EAGxF,MAAM,gBAAgB,QAAQ,iBAAiB;EAC/C,MAAM,iBAAiB,mBAAmB,QAAQ,MAAM,cAAc;EACtE,MAAM,UAAU,QAAQ,SAAS,QAAQ,YAAY;GACnD,MAAM,oBAAoB,mBAAmB,SAAS,cAAc;AACpE,UAAO,eAAe,SAAS,kBAAkB;IACjD;EAGF,MAAM,QADO,QAAQ,QAAQ,WACP,QAClB,QAAQ,WAAW,QAAQ,SAAS,SACpC,QAAQ,SAAS;EAErB,MAAM,QAAQ,QAAQ,SAAS,WAAW,IAAI,IAAI,QAAQ,SAAS,QAAQ,SAAS;AAEpF,SAAO,cACL,QAAQ,IACR,SACA,MACA,OACA,OACI,WAAW,QAAQ,OAAO,GAAG,QAAQ,SAAS,OAAO,uBACrD,WAAW,QAAQ,OAAO,GAAG,QAAQ,SAAS,OAAO,qBAC1D;;;;;;;;;AA8BL,SAAgB,kBAAkB,SAAiD;AACjF,QAAO,OAAO,YAAY;AACxB,MAAI,QAAQ,SAAS,WAAW,EAC9B,QAAO,cAAc,QAAQ,IAAI,SAAS,MAAM,GAAG,mCAAmC;EAGxF,MAAM,gBAAgB,QAAQ,iBAAiB;EAC/C,MAAM,iBAAiB,mBAAmB,QAAQ,MAAM,cAAc;EACtE,MAAM,mBAAmB,QAAQ,SAAS,
QAAQ,YAAY;GAC5D,MAAM,oBAAoB,mBAAmB,SAAS,cAAc;AACpE,UAAO,eAAe,SAAS,kBAAkB;IACjD;EAEF,MAAM,OAAO,iBAAiB,WAAW;EACzC,MAAM,QAAQ,OAAO,IAAI;AAEzB,SAAO,cACL,QAAQ,IACR,SACA,MACA,OACA,OACI,iCACA,6BAA6B,iBAAiB,KAAK,KAAK,GAC7D;;;;;;;;;AAwBL,SAAgB,YAAY,SAA2C;AACrE,QAAO,OAAO,YAAY;EACxB,MAAM,OAAO,QAAQ,QAAQ,KAAK,QAAQ,KAAK;AAE/C,SAAO,cACL,QAAQ,IACR,SACA,MACA,OAAO,IAAI,GACX,OAAO,iCAAiC,wBAAwB,QAAQ,UACzE;;;;;;;;;AA4BL,SAAgB,uBAA+B,SAA8D;AAC3G,QAAO,OAAO,YAAY;EACxB,MAAM,OAAO,QAAQ,SAAS,QAAQ,iBAAiB;AAEvD,SAAO,cACL,QAAQ,IACR,SACA,MACA,OAAO,IAAI,GACX,OAAO,yCAA0C,QAAQ,iBAAiB,uCAC3E;;;;;;;;;AA4BL,SAAgB,mBAAmB,SAAkD;AACnF,QAAO,OAAO,YAAY;EACxB,MAAM,cAAc,QAAQ,aAAa,EAAE,EAAE,MAAK,SAAQ,KAAK,SAAS,QAAQ,SAAS;AAEzF,MAAI,cAAc,KAChB,QAAO,cAAc,QAAQ,IAAI,SAAS,OAAO,GAAG,sBAAsB,QAAQ,WAAW;EAG/F,MAAM,OAAO,QAAQ,SAAS,WAAW,KAAK;AAE9C,SAAO,cACL,QAAQ,IACR,SACA,MACA,OAAO,IAAI,GACX,OAAO,gCAAgC,QAAQ,SAAS,KAAK,wCAAwC,QAAQ,SAAS,GACvH;;;;;;;;;AAgDL,SAAgB,aAAa,SAA4C;AACvE,QAAO,OAAO,YAAY;EACxB,MAAM,SAAS,MAAM,QAAQ,MAAM,QAAQ;EAC3C,MAAM,WAAW,QAAQ,YAAY;EACrC,MAAM,kBAAkB,WAAW,OAAO,MAAM;EAChD,MAAM,OAAO,mBAAmB;AAEhC,SAAO,cACL,QAAQ,IACR,SACA,MACA,iBACA,GAAG,OAAO,SAAS,OAAO,aAAa,YAAY,OAAO,WAAW,KAAK,KAC3E;;;;;;;;;AA4BL,SAAgB,aAAa,SAA4C;AACvE,QAAO,OAAO,YAAY;EACxB,MAAM,SAAS,MAAM,QAAQ,SAAS,QAAQ;AAE9C,SAAO,cAAc,QAAQ,IAAI,QAAQ,WAAW,OAAO,MAAM,OAAO,OAAO,OAAO,OAAO;;;;;;;;;AAUjG,SAAgB,UAAU,WAAsB,SAAoC;AAClF,QAAO,OAAO,YAAY;EACxB,MAAM,cAAc,MAAM,UAAU,QAAQ;AAE5C,SAAO,cACL,QAAQ,IACR,YAAY,WACZ,CAAC,YAAY,MACb,IAAI,YAAY,OAChB,OAAO,YAAY,GAAG,KAAK,YAAY,SACxC;;;;;;;;;;;;AAaL,eAAsB,mBACpB,YACA,SAC6B;CAE7B,MAAM,oBAAsC;EAC1C,OAFY,QAAQ,yBAAS,IAAI,KAAsB;EAGvD,kBAAkB,QAAQ;EAC1B,MAAM,QAAQ;EACd,WAAW,QAAQ;EACpB;CAED,MAAM,WAA+B,EAAE;AAEvC,MAAK,MAAM,aAAa,WACtB,UAAS,KAAK,MAAM,UAAU,kBAAkB,CAAC;AAGnD,QAAO;;;;;AAMT,SAAgB,YAAY,UAAmD;AAC7E,QAAO,SAAS,KAAI,aAAY;EAC9B,MAAM,QAAQ;EACd,OAAO,QAAQ;EAChB,EAAE;;;;;AAML,SAAgB,wBAAwB,UAA2D;AACjG,QAAO,SAAS,QAAO,YAAW,CAAC,QAAQ,KAAK"}

package/dist/cli/index.d.mts ADDED Viewed

@@ -0,0 +1,11 @@
+//#region src/cli/index.d.ts
+type Command = 'run'; // Subcommand names; 'run' is the only one declared here.
+interface ParsedTopLevelCliArguments {
+  command: Command | 'help'; // Either a declared subcommand or the literal 'help'.
+  commandArgv: string[]; // presumably the argv entries left over for the subcommand — TODO confirm against the implementation
+}
+declare function parseTopLevelCliArguments(argv: readonly string[]): ParsedTopLevelCliArguments; // Maps a read-only argv list to the parsed shape above.
+declare function runTopLevelCli(argv: readonly string[]): Promise<void>; // Async entry point that resolves with no value. NOTE(review): runtime behavior is not visible in this declaration file.
+//#endregion
+export { parseTopLevelCliArguments, runTopLevelCli };
+//# sourceMappingURL=index.d.mts.map