npm - @checklabs/core - Versions diffs - 0.2.1 - Mend

@checklabs/core 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

package/package.json +31 -0
package/src/adapters/index.ts +136 -0
package/src/assertions/expect.ts +218 -0
package/src/config.ts +89 -0
package/src/discovery.ts +57 -0
package/src/env.ts +35 -0
package/src/generate/index.ts +103 -0
package/src/generate/templates.ts +225 -0
package/src/index.ts +93 -0
package/src/judge/index.ts +158 -0
package/src/pricing.ts +56 -0
package/src/registry.ts +23 -0
package/src/reporters/colors.ts +36 -0
package/src/reporters/console.ts +154 -0
package/src/reporters/html.ts +189 -0
package/src/reporters/index.ts +4 -0
package/src/reporters/json.ts +11 -0
package/src/runner/compare.ts +84 -0
package/src/runner/runner.ts +144 -0
package/src/types.ts +197 -0

package/src/types.ts ADDED Viewed

@@ -0,0 +1,197 @@
+/**
+ * CheckAI public types.
+ *
+ * The contract that matters most is {@link AgentResponse} (what an agent returns)
+ * and {@link AgentAdapter} (how CheckAI talks to any agent). Everything else is
+ * built on top of these two.
+ */
+/** Token accounting for a single agent turn, or aggregated across a test (see TestResult.usage). */
+export interface TokenUsage {
+  promptTokens: number;
+  completionTokens: number;
+  totalTokens: number;
+}
+/** The value every agent turn resolves to, i.e. the result of {@link AgentAdapter.run}. */
+export interface AgentResponse {
+  /** Natural-language reply shown to the user. */
+  output: string;
+  /** Tools/functions the agent invoked this turn (presumably their names; confirm with adapters). */
+  toolsUsed: string[];
+  /** Wall-clock latency of the turn, in milliseconds. */
+  latencyMs: number;
+  /** Model identifier that produced the response. */
+  model: string;
+  /** Token usage, if the adapter can report it. Estimated by CheckAI otherwise. */
+  usage?: TokenUsage;
+  /** Pre-computed cost in USD, if known. Estimated from usage + pricing otherwise. */
+  costUsd?: number;
+  /** Adapter-specific raw payload (never inspected by CheckAI). */
+  raw?: unknown;
+}
+/**
+ * A connected agent. Adapters wrap OpenAI, a local function, or an HTTP endpoint
+ * behind this single interface so tests are agent-agnostic.
+ */
+export interface AgentAdapter {
+  /** Human-friendly name, e.g. "support-agent". */
+  name: string;
+  /** Model id the adapter drives, e.g. "gpt-4.1-mini". */
+  model: string;
+  /** Run one turn for the given input; resolves to that turn's {@link AgentResponse}. */
+  run(input: string): Promise<AgentResponse>;
+}
+/** Context injected into every test body; currently carries only the agent adapter. */
+export interface TestContext {
+  agent: AgentAdapter;
+}
+/** A test body: receives the {@link TestContext} and may be synchronous or return a promise. */
+export type TestFn = (ctx: TestContext) => void | Promise<void>;
+/** A registered test case: its name, its body function, and the file it is associated with. */
+export interface TestCase {
+  name: string;
+  fn: TestFn;
+  file: string;
+}
+/** Outcome of a single judge evaluation (OpenAI-backed or heuristic, see `backend`). */
+export interface JudgeResult {
+  /** Behavior under evaluation. */
+  behavior: string;
+  /** Graded score in [0, 1]. */
+  score: number;
+  /** Minimum score required to pass. */
+  threshold: number;
+  /** True when score >= threshold. */
+  pass: boolean;
+  /** Short human-readable justification. */
+  reasoning: string;
+  /** Which backend produced the score. */
+  backend: "openai" | "heuristic";
+}
+/** Outcome of a single assertion. */
+export interface AssertionResult {
+  /** Matcher name, e.g. "toAskFor" (prefixed with "not." when negated). */
+  matcher: string;
+  negated: boolean;
+  pass: boolean;
+  expected: string;
+  actual: string;
+  /** Judge score; present (together with `threshold`) for judge-backed assertions. */
+  score?: number;
+  threshold?: number;
+}
+/** Status of one test against one agent: "fail" = an assertion failed, "error" = a non-assertion error was thrown. */
+export type TestStatus = "pass" | "fail" | "error";
+/** Result of running one test against one agent. */
+export interface TestResult {
+  name: string;
+  file: string;
+  status: TestStatus;
+  /** Every assertion attempted, up to (and including) the first failure. */
+  assertions: AssertionResult[];
+  /** The first failing assertion, if status === "fail". */
+  failure?: AssertionResult;
+  /** Stack/message if status === "error" (a thrown non-assertion error). */
+  errorMessage?: string;
+  /** Latency of every agent.run() call in the test (presumably ms, as AgentResponse.latencyMs; confirm). */
+  latencies: number[];
+  /** Union of tools used across the test. */
+  toolsUsed: string[];
+  /** Aggregated token usage across the test. */
+  usage: TokenUsage;
+  /** Aggregated estimated cost across the test (USD). */
+  costUsd: number;
+  /** Total test duration (ms). */
+  durationMs: number;
+}
+/** Aggregate stats for a suite run against one agent. */
+export interface SuiteSummary {
+  total: number;
+  passed: number;
+  failed: number;
+  errored: number;
+  /** passed / total in [0, 1]. NOTE(review): the value for total === 0 is not specified here; confirm. */
+  passRate: number;
+  avgLatencyMs: number;
+  totalTokens: number;
+  totalCostUsd: number;
+}
+/** Full report for one agent run, tagged `kind: "run"` (the unit the JSON/HTML reporters serialize). */
+export interface RunReport {
+  kind: "run";
+  agent: { name: string; model: string; backend: string };
+  results: TestResult[];
+  summary: SuiteSummary;
+  startedAt: string;
+  finishedAt: string;
+}
+/** A single test compared across agents. */
+export interface ComparisonRow {
+  name: string;
+  file: string;
+  statuses: TestStatus[];
+  /** Per-agent first-failure detail (aligned with `statuses`). */
+  failures: (AssertionResult | undefined)[];
+  /** How this test's outcome changed relative to the baseline agent. */
+  delta: "regression" | "improvement" | "unchanged" | "error";
+}
+/** Result of comparing two or more agents over the same suite, tagged `kind: "comparison"`. */
+export interface ComparisonResult {
+  kind: "comparison";
+  agents: { name: string; model: string; backend: string }[];
+  baseline: string;
+  rows: ComparisonRow[];
+  summaries: SuiteSummary[];
+  regressions: ComparisonRow[];
+  improvements: ComparisonRow[];
+  unchanged: number;
+  /** Candidate cost vs baseline cost, percent (+/-). NOTE(review): unclear which candidate when >2 agents. */
+  costDeltaPct: number;
+  /** Candidate latency vs baseline latency, percent (+/-). NOTE(review): same ambiguity as costDeltaPct. */
+  latencyDeltaPct: number;
+  startedAt: string;
+  finishedAt: string;
+}
+/** Named agent: a module path (string), or an inline {@link AgentAdapter}. */
+export type AgentSource = string | AgentAdapter;
+/** User-facing CheckAI configuration (checkai.config.ts default export). Every field is optional. */
+export interface CheckAIConfig {
+  /** Directory scanned for *.test.ts files (relative to the config file). */
+  testDir?: string;
+  /** Model used by the LLM judge. */
+  judgeModel?: string;
+  /** Default pass threshold for toSatisfyBehavior (0..1). */
+  judgeThreshold?: number;
+  /** Named agents. compare runs the suite against every entry. */
+  agents?: Record<string, AgentSource>;
+  /** Single-agent shorthand. */
+  agent?: AgentSource;
+  /** Which agent `checkai run` uses (presumably a key of `agents`; confirm against the config loader). */
+  defaultAgent?: string;
+}
+/** Config with defaults resolved and paths made absolute; `configPath` may be null. */
+export interface ResolvedConfig {
+  configPath: string | null;
+  rootDir: string;
+  testDir: string;
+  judgeModel: string;
+  judgeThreshold: number;
+  agents: Record<string, AgentSource>;
+  defaultAgent: string;
+}