npm - badgr-eval-check - Versions diffs - 0.1.0 - Mend

badgr-eval-check 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/README.md ADDED Viewed

@@ -0,0 +1,139 @@
+# badgr-eval-check
+Assert that AI model output contains what it should, excludes what it shouldn't, and matches the shape you expect — repeatable checks you can run in CI.
+```bash
+npx badgr-eval-check \
+  --output "The answer is 42. Confidence: high." \
+  --must-include "42" \
+  --must-not-include "I cannot"
+```
+**Free. No signup required.** Checks run entirely on your machine. The CLI reports anonymous usage telemetry unless you set `BADGR_TELEMETRY=0`.
+---
+## The problem it solves
+LLM outputs are non-deterministic, so regressions are easy to miss. A model update silently starts refusing questions it used to answer, hallucinating success, or dropping required fields from JSON responses. `badgr-eval-check` lets you write simple assertions against model output and run them on every deploy — like unit tests for your prompts.
+---
+## Quick start
+```bash
+# Check that output includes required text
+npx badgr-eval-check --output "The answer is 42." --must-include "42"
+# Check that output excludes banned phrases
+npx badgr-eval-check --output "The answer is 42." --must-not-include "I cannot,sorry,I don't know"
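+# Check that a JSON response mentions required keys (the CLI does a plain substring match; use `requiredJsonFields` in the TypeScript API for structural field checks)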
+npx badgr-eval-check --output '{"answer": "42", "confidence": "high"}' --must-include answer,confidence
+# Machine-readable JSON
+npx badgr-eval-check --output "..." --must-include "42" --json
+```
+---
+## CLI flags
+| Flag | Description |
+|------|-------------|
+| `--output <str>` | The model output string to check (required) |
+| `--must-include <items>` | Comma-separated strings that **must** appear in the output |
+| `--must-not-include <items>` | Comma-separated strings that **must not** appear in the output |
+| `--json` | Output machine-readable JSON |
+**Exit codes:** `0` = all checks passed, `1` = one or more checks failed, `2` = `--output` was missing or empty
+---
+## Example output
+```
+badgr-eval-check
+  ✓  must-include     "42" found in output
+  ✓  must-include     "confidence" found in output
+  ✗  must-not-include "I cannot" found in output — model refused to answer
+  1 check failed.
+```
+```
+badgr-eval-check
+  ✓  must-include     "42" found in output
+  ✓  must-not-include "I cannot" not found ✓
+  ✓  must-not-include "sorry" not found ✓
+  All checks passed.
+```
+---
+## TypeScript API
+Use the programmatic API to run multiple fixtures as a batch:
+```ts
+import { runEvalCheck } from "badgr-eval-check";
+const result = runEvalCheck([
+  {
+    name: "answer present",
+    output: modelOutput,
+    mustInclude: ["42", "confidence"],
+    mustNotInclude: ["I cannot", "sorry", "I don't know"],
+  },
+  {
+    name: "JSON fields present",
+    output: modelOutput,
+    requiredJsonFields: ["answer", "confidence"],  // dot-path: "data.items.0.name"
+  },
+]);
+console.log(result.passed);  // true / false
+console.log(result.checks);  // per-fixture results
+```
+**Types:**
+```ts
+interface EvalFixture {
+  name: string;
+  output: string;
+  mustInclude?: string[];
+  mustNotInclude?: string[];
+  requiredJsonFields?: string[];  // supports dot-path, e.g. "data.items.0.name"
+}
+interface EvalCheckResult {
+  checks: DiagnosticCheck[];
+  passed: boolean;
+  report: JsonReport;
+}
+```
+---
+## Use in CI
+```yaml
+# GitHub Actions example
+- name: Run model eval checks
+  run: |
+    OUTPUT=$(curl -s https://api.openai.com/v1/chat/completions \
+      -H "Authorization: Bearer $OPENAI_API_KEY" \
+      -d '{"model":"gpt-4o","messages":[{"role":"user","content":"What is 6*7?"}]}' \
+      | jq -r '.choices[0].message.content')
+    npx badgr-eval-check --output "$OUTPUT" --must-include "42" --must-not-include "I cannot"
+```
+---
+## Requirements
+- Node.js 18+

package/dist/cli.d.ts ADDED Viewed

@@ -0,0 +1,3 @@
+#!/usr/bin/env node
+export {};
+//# sourceMappingURL=cli.d.ts.map

package/dist/cli.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"cli.d.ts","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":""}

package/dist/cli.js ADDED Viewed

@@ -0,0 +1,58 @@
+#!/usr/bin/env node
+import { createLogger, fireTelemetry } from "badgr-shared";
+import { runEvalCheck } from "./index.js";
+fireTelemetry({ package: "badgr-eval-check" });
+function readArg(name) {
+    const index = process.argv.indexOf(name);
+    return index >= 0 ? process.argv[index + 1] : undefined;
+}
+if (process.argv.includes("--help") || process.argv.includes("-h")) {
+    console.log(`badgr-eval-check — local AI output assertions
+Usage:
+  npx badgr-eval-check --output "<model output>" --must-include "<text>"
+  npx badgr-eval-check --output '{"answer":"ok"}' --must-include answer --json
+  npx badgr-eval-check --output "<text>" --must-not-include "error" --json
+Flags:
+  --output <str>           Model output string to check
+  --must-include <items>   Comma-separated strings that must appear in output
+  --must-not-include <items> Comma-separated strings that must NOT appear
+  --json                   Output machine-readable JSON
+Exit codes:
+  0  All checks passed
+  1  One or more checks failed
+No signup required. Attaching eval status to AI Badgr receipts is optional.
+Environment:
+  BADGR_TELEMETRY=0      Disable anonymous usage telemetry`);
+    process.exit(0);
+}
+const json = process.argv.includes("--json");
+const output = readArg("--output") ?? "";
+const mustInclude = (readArg("--must-include") ?? "").split(",").map((v) => v.trim()).filter(Boolean);
+const mustNotInclude = (readArg("--must-not-include") ?? "").split(",").map((v) => v.trim()).filter(Boolean);
+if (!output) {
+    console.error("Error: --output is required");
+    console.error("Usage: npx badgr-eval-check --output '<model output>' --must-include '<text>'");
+    process.exit(2);
+}
+const result = runEvalCheck([{ name: "cli-output", output, mustInclude, mustNotInclude }]);
+const logger = createLogger(json);
+if (json) {
+    logger.report(result.report);
+}
+else {
+    logger.line(`Eval result: ${result.passed ? "passed" : "failed"}`);
+    logger.line("");
+    for (const check of result.checks) {
+        const sym = check.status === "pass" ? "✓" : check.status === "fail" ? "✗" : "!";
+        logger.line(`${sym} ${check.message}`);
+    }
+    logger.line("");
+    logger.line("Optional: include eval status in AI Badgr run receipts.");
+}
+process.exitCode = result.report.status === "failed" ? 1 : 0;
+//# sourceMappingURL=cli.js.map

package/dist/cli.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"cli.js","sourceRoot":"","sources":["../src/cli.ts"],"names":[],"mappings":";AACA,OAAO,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,cAAc,CAAC;AAC3D,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAE1C,aAAa,CAAC,EAAE,OAAO,EAAE,kBAAkB,EAAE,CAAC,CAAC;AAE/C,SAAS,OAAO,CAAC,IAAY;IAC3B,MAAM,KAAK,GAAG,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC;IACzC,OAAO,KAAK,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS,CAAC;AAC1D,CAAC;AAED,IAAI,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,IAAI,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;IACnE,OAAO,CAAC,GAAG,CAAC;;;;;;;;;;;;;;;;;;;;2DAoB6C,CAAC,CAAC;IAC3D,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC;AAED,MAAM,IAAI,GAAG,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;AAC7C,MAAM,MAAM,GAAG,OAAO,CAAC,UAAU,CAAC,IAAI,EAAE,CAAC;AACzC,MAAM,WAAW,GAAG,CAAC,OAAO,CAAC,gBAAgB,CAAC,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;AACtG,MAAM,cAAc,GAAG,CAAC,OAAO,CAAC,oBAAoB,CAAC,IAAI,EAAE,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;AAE7G,IAAI,CAAC,MAAM,EAAE,CAAC;IACZ,OAAO,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;IAC7C,OAAO,CAAC,KAAK,CAAC,+EAA+E,CAAC,CAAC;IAC/F,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAClB,CAAC;AAED,MAAM,MAAM,GAAG,YAAY,CAAC,CAAC,EAAE,IAAI,EAAE,YAAY,EAAE,MAAM,EAAE,WAAW,EAAE,cAAc,EAAE,CAAC,CAAC,CAAC;AAC3F,MAAM,MAAM,GAAG,YAAY,CAAC,IAAI,CAAC,CAAC;AAClC,IAAI,IAAI,EAAE,CAAC;IACT,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;AAC/B,CAAC;KAAM,CAAC;IACN,MAAM,CAAC,IAAI,CAAC,gBAAgB,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,EAAE,CAAC,CAAC;IACnE,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAChB,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,MAAM,EAAE,CAAC;QAClC,MAAM,GAAG,GAAG,KAAK,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,MAAM,KAAK,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC;QAChF,MAAM,CAAC,IAAI,CAAC,GAAG,GAAG,IAAI,KAAK,CAAC,OAAO,EAAE,CAAC,CAAC;IACzC,CAA
C;IACD,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAChB,MAAM,CAAC,IAAI,CAAC,yDAAyD,CAAC,CAAC;AACzE,CAAC;AACD,OAAO,CAAC,QAAQ,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC"}

package/dist/index.d.ts ADDED Viewed

@@ -0,0 +1,15 @@
+import { type DiagnosticCheck, type JsonReport } from "badgr-shared";
+/** One model output together with the assertions to run against it. */
+export interface EvalFixture {
+    /** Label used as the prefix of every check name and message for this fixture. */
+    name: string;
+    /** Raw model output text that the assertions are evaluated against. */
+    output: string;
+    /** Substrings that must each occur in `output` (case-sensitive substring match). */
+    mustInclude?: string[];
+    /** Substrings that must not occur in `output` (case-sensitive substring match). */
+    mustNotInclude?: string[];
+    /**
+     * Dot-separated paths (e.g. "data.items.0.name") that must exist after `output`
+     * is parsed as JSON. If `output` is not valid JSON, every listed field fails.
+     */
+    requiredJsonFields?: string[];
+}
+/** Outcome of evaluating a batch of fixtures. */
+export interface EvalCheckResult {
+    /**
+     * One entry per assertion, in fixture order; a single "info" entry when no
+     * assertion was produced at all.
+     */
+    checks: DiagnosticCheck[];
+    /** True only when the report status is "passed". */
+    passed: boolean;
+    /** Report object built from the checks by badgr-shared's `createReport`. */
+    report: JsonReport;
+}
+/**
+ * Evaluates every assertion declared on each fixture and returns the individual
+ * checks together with an aggregated report.
+ *
+ * @param fixtures Model outputs and the assertions to run against them.
+ * @returns The per-assertion checks, an overall pass flag, and the report.
+ */
+export declare function runEvalCheck(fixtures: EvalFixture[]): EvalCheckResult;
+//# sourceMappingURL=index.d.ts.map

package/dist/index.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAA0D,KAAK,eAAe,EAAE,KAAK,UAAU,EAAE,MAAM,cAAc,CAAC;AAE7H,MAAM,WAAW,WAAW;IAAG,IAAI,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,CAAC;IAAC,WAAW,CAAC,EAAE,MAAM,EAAE,CAAC;IAAC,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;IAAC,kBAAkB,CAAC,EAAE,MAAM,EAAE,CAAC;CAAE;AAChJ,MAAM,WAAW,eAAe;IAAG,MAAM,EAAE,eAAe,EAAE,CAAC;IAAC,MAAM,EAAE,OAAO,CAAC;IAAC,MAAM,EAAE,UAAU,CAAC;CAAE;AAEpG,wBAAgB,YAAY,CAAC,QAAQ,EAAE,WAAW,EAAE,GAAG,eAAe,CAiCrE"}

package/dist/index.js ADDED Viewed

@@ -0,0 +1,51 @@
+import { checkToFinding, createReport, reportStatusFromFindings } from "badgr-shared";
+export function runEvalCheck(fixtures) {
+    const checks = [];
+    for (const fixture of fixtures) {
+        for (const text of fixture.mustInclude ?? []) {
+            checks.push({ name: `${fixture.name}-include-${text}`, status: fixture.output.includes(text) ? "pass" : "fail", message: fixture.output.includes(text) ? `${fixture.name} includes required content: ${text}` : `${fixture.name} is missing required content: ${text}` });
+        }
+        for (const text of fixture.mustNotInclude ?? []) {
+            checks.push({ name: `${fixture.name}-ban-${text}`, status: fixture.output.includes(text) ? "fail" : "pass", message: fixture.output.includes(text) ? `${fixture.name} contains banned output: ${text}` : `${fixture.name} avoided banned output: ${text}` });
+        }
+        if (fixture.requiredJsonFields?.length) {
+            let parsed;
+            try {
+                parsed = JSON.parse(fixture.output);
+            }
+            catch {
+                parsed = undefined;
+            }
+            for (const field of fixture.requiredJsonFields) {
+                checks.push({ name: `${fixture.name}-json-${field}`, status: hasPath(parsed, field) ? "pass" : "fail", message: hasPath(parsed, field) ? `${fixture.name} JSON includes ${field}` : `${fixture.name} JSON is missing ${field}` });
+            }
+        }
+    }
+    if (checks.length === 0)
+        checks.push({ name: "eval-input", status: "info", message: "No eval fixtures were supplied" });
+    const findings = checks.map((check) => checkToFinding(check, "EVAL"));
+    const status = reportStatusFromFindings(findings);
+    return {
+        checks,
+        passed: status === "passed",
+        report: createReport({
+            tool: "eval-check",
+            status,
+            summary: status === "failed" ? "AI output eval failed" : status === "warning" ? "AI output eval has warnings" : "AI output eval passed",
+            findings,
+            recommendedActions: ["Run eval-check before deployment", "Attach eval status to future AI Badgr execution receipts"],
+            nextCommand: "npx @aibadgr/eval-check --json",
+            actionUrl: "https://aibadgr.com/agents/connect",
+        }),
+    };
+}
+function hasPath(value, path) {
+    let current = value;
+    for (const part of path.split(".")) {
+        if (typeof current !== "object" || current === null || !(part in current))
+            return false;
+        current = current[part];
+    }
+    return true;
+}
+//# sourceMappingURL=index.js.map

package/dist/index.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,YAAY,EAAE,wBAAwB,EAAyC,MAAM,cAAc,CAAC;AAK7H,MAAM,UAAU,YAAY,CAAC,QAAuB;IAClD,MAAM,MAAM,GAAsB,EAAE,CAAC;IACrC,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,KAAK,MAAM,IAAI,IAAI,OAAO,CAAC,WAAW,IAAI,EAAE,EAAE,CAAC;YAC7C,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,GAAG,OAAO,CAAC,IAAI,YAAY,IAAI,EAAE,EAAE,MAAM,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,OAAO,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,IAAI,+BAA+B,IAAI,EAAE,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,IAAI,iCAAiC,IAAI,EAAE,EAAE,CAAC,CAAC;QAC5Q,CAAC;QACD,KAAK,MAAM,IAAI,IAAI,OAAO,CAAC,cAAc,IAAI,EAAE,EAAE,CAAC;YAChD,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,GAAG,OAAO,CAAC,IAAI,QAAQ,IAAI,EAAE,EAAE,MAAM,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,OAAO,EAAE,OAAO,CAAC,MAAM,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,IAAI,4BAA4B,IAAI,EAAE,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,IAAI,2BAA2B,IAAI,EAAE,EAAE,CAAC,CAAC;QAC/P,CAAC;QACD,IAAI,OAAO,CAAC,kBAAkB,EAAE,MAAM,EAAE,CAAC;YACvC,IAAI,MAAe,CAAC;YACpB,IAAI,CAAC;gBAAC,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC;YAAC,CAAC;YAAC,MAAM,CAAC;gBAAC,MAAM,GAAG,SAAS,CAAC;YAAC,CAAC;YAC1E,KAAK,MAAM,KAAK,IAAI,OAAO,CAAC,kBAAkB,EAAE,CAAC;gBAC/C,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,GAAG,OAAO,CAAC,IAAI,SAAS,KAAK,EAAE,EAAE,MAAM,EAAE,OAAO,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,OAAO,EAAE,OAAO,CAAC,MAAM,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,IAAI,kBAAkB,KAAK,EAAE,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,IAAI,oBAAoB,KAAK,EAAE,EAAE,CAAC,CAAC;YACpO,CAAC;QACH,CAAC;IACH,CAAC;IACD,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;QAAE,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,YAAY,EAAE,MAAM,EAAE,MAAM,EAAE,OAAO,EAAE,gCAAgC,EAAE,CAAC,CAAC;IACxH,MAAM,QAAQ,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,cAAc,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC,CAAC;IACtE,MAAM,MAAM,GAAG,wBAAwB,CAAC,QAAQ,
CAAC,CAAC;IAClD,OAAO;QACL,MAAM;QACN,MAAM,EAAE,MAAM,KAAK,QAAQ;QAC3B,MAAM,EAAE,YAAY,CAAC;YACnB,IAAI,EAAE,YAAY;YAClB,MAAM;YACN,OAAO,EAAE,MAAM,KAAK,QAAQ,CAAC,CAAC,CAAC,uBAAuB,CAAC,CAAC,CAAC,MAAM,KAAK,SAAS,CAAC,CAAC,CAAC,6BAA6B,CAAC,CAAC,CAAC,uBAAuB;YACvI,QAAQ;YACR,kBAAkB,EAAE,CAAC,kCAAkC,EAAE,0DAA0D,CAAC;YACpH,WAAW,EAAE,gCAAgC;YAC7C,SAAS,EAAE,oCAAoC;SAChD,CAAC;KACH,CAAC;AACJ,CAAC;AAED,SAAS,OAAO,CAAC,KAAc,EAAE,IAAY;IAC3C,IAAI,OAAO,GAAY,KAAK,CAAC;IAC7B,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,EAAE,CAAC;QACnC,IAAI,OAAO,OAAO,KAAK,QAAQ,IAAI,OAAO,KAAK,IAAI,IAAI,CAAC,CAAC,IAAI,IAAI,OAAO,CAAC;YAAE,OAAO,KAAK,CAAC;QACxF,OAAO,GAAI,OAAmC,CAAC,IAAI,CAAC,CAAC;IACvD,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC"}

package/package.json ADDED Viewed

@@ -0,0 +1,16 @@
+{
+  "name": "badgr-eval-check",
+  "version": "0.1.0",
+  "description": "Local-first repeatable AI output checks for required content, banned output, schemas, and regressions.",
+  "type": "module",
+  "main": "dist/index.js",
+  "types": "dist/index.d.ts",
+  "bin": { "badgr-eval-check": "dist/cli.js" },
+  "exports": { ".": { "types": "./dist/index.d.ts", "import": "./dist/index.js" } },
+  "files": ["dist", "README.md"],
+  "scripts": { "build": "tsc -b", "typecheck": "tsc -b --pretty false", "test": "vitest run" },
+  "dependencies": { "badgr-shared": "0.1.1" },
+  "engines": { "node": ">=18.0.0" },
+  "keywords": ["eval", "llm", "output-validation", "regression", "rag"],
+  "license": "MIT"
+}