npm - @tangle-network/agent-eval - Versions diffs - 0.72.0 → 0.72.3 - Mend

@tangle-network/agent-eval 0.72.0 → 0.72.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (69)

package/CHANGELOG.md +39 -0
package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/otel.d.ts +3 -2
package/dist/agent-profile-DYRboYWu.d.ts +364 -0
package/dist/analyst/index.d.ts +221 -0
package/dist/analyst/index.js +371 -0
package/dist/analyst/index.js.map +1 -0
package/dist/analyst-t7zZS3TV.d.ts +88 -0
package/dist/campaign/index.d.ts +485 -9
package/dist/campaign/index.js +597 -22
package/dist/campaign/index.js.map +1 -1
package/dist/chunk-7W4SM7FD.js +1075 -0
package/dist/chunk-7W4SM7FD.js.map +1 -0
package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
package/dist/chunk-JHA3ZGSO.js +1496 -0
package/dist/chunk-JHA3ZGSO.js.map +1 -0
package/dist/{chunk-4QJN7RDX.js → chunk-JYE3WOTE.js} +55 -7
package/dist/{chunk-4QJN7RDX.js.map → chunk-JYE3WOTE.js.map} +1 -1
package/dist/chunk-LB2UOI5F.js +412 -0
package/dist/chunk-LB2UOI5F.js.map +1 -0
package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
package/dist/chunk-VUINJM5M.js.map +1 -0
package/dist/chunk-WYIHD6EB.js +1044 -0
package/dist/chunk-WYIHD6EB.js.map +1 -0
package/dist/{chunk-UD6EF73X.js → chunk-XPILG2CA.js} +119 -2
package/dist/chunk-XPILG2CA.js.map +1 -0
package/dist/contract/index.d.ts +17 -13
package/dist/contract/index.js +13 -7
package/dist/contract/index.js.map +1 -1
package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
package/dist/control.d.ts +2 -2
package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
package/dist/hosted/index.d.ts +223 -2
package/dist/index.d.ts +49 -1323
package/dist/index.js +353 -2496
package/dist/index.js.map +1 -1
package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
package/dist/openapi.json +1 -1
package/dist/pareto-E-pembql.d.ts +81 -0
package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
package/dist/redact-B40YG2M_.d.ts +45 -0
package/dist/registry-DuVYiTvw.d.ts +128 -0
package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
package/dist/rl.d.ts +4 -3
package/dist/rl.js +4 -4
package/dist/run-critic-BAIjX99r.d.ts +56 -0
package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
package/dist/traces.d.ts +371 -308
package/dist/traces.js +43 -18
package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
package/dist/wire/index.d.ts +1 -1
package/dist/workflow/index.d.ts +494 -0
package/dist/workflow/index.js +2177 -0
package/dist/workflow/index.js.map +1 -0
package/docs/design/self-improvement-roadmap.md +106 -0
package/package.json +36 -12
package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
package/dist/chunk-ODGETRTM.js.map +0 -1
package/dist/chunk-SL55X4VN.js +0 -186
package/dist/chunk-SL55X4VN.js.map +0 -1
package/dist/chunk-UD6EF73X.js.map +0 -1
package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.72.0",
+  "version": "0.72.3",
   "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {
@@ -19,6 +19,11 @@
       "import": "./dist/index.js",
       "default": "./dist/index.js"
     },
+    "./analyst": {
+      "types": "./dist/analyst/index.d.ts",
+      "import": "./dist/analyst/index.js",
+      "default": "./dist/analyst/index.js"
+    },
     "./control": {
       "types": "./dist/control.d.ts",
       "import": "./dist/control.js",
@@ -104,6 +109,11 @@
       "import": "./dist/campaign/index.js",
       "default": "./dist/campaign/index.js"
     },
+    "./workflow": {
+      "types": "./dist/workflow/index.d.ts",
+      "import": "./dist/workflow/index.js",
+      "default": "./dist/workflow/index.js"
+    },
     "./contract": {
       "types": "./dist/contract/index.d.ts",
       "import": "./dist/contract/index.js",
@@ -144,6 +154,19 @@
   "publishConfig": {
     "access": "public"
   },
+  "scripts": {
+    "build": "tsup && pnpm openapi",
+    "dev": "tsup --watch",
+    "prepare": "husky",
+    "prepublishOnly": "pnpm build",
+    "test": "vitest run",
+    "test:watch": "vitest",
+    "typecheck": "tsc --noEmit",
+    "lint": "biome check src",
+    "format": "biome format --write src",
+    "openapi": "node dist/cli.js openapi --out dist/openapi.json",
+    "verify:package": "node scripts/verify-package-exports.mjs"
+  },
   "dependencies": {
     "@asteasolutions/zod-to-openapi": "^8.5.0",
     "@ax-llm/ax": "^19.0.25",
@@ -171,6 +194,16 @@
     "typescript": "^5.7.0",
     "vitest": "^3.0.0"
   },
+  "pnpm": {
+    "minimumReleaseAge": 4320,
+    "minimumReleaseAgeExclude": [
+      "@tangle-network/sandbox"
+    ],
+    "overrides": {
+      "postcss@<8.5.10": "^8.5.10",
+      "ws@>=8.0.0 <8.20.1": "^8.20.1"
+    }
+  },
   "engines": {
     "node": ">=20"
   },
@@ -183,14 +216,5 @@
     ]
   },
   "license": "MIT",
-  "scripts": {
-    "build": "tsup && pnpm openapi",
-    "dev": "tsup --watch",
-    "test": "vitest run",
-    "test:watch": "vitest",
-    "typecheck": "tsc --noEmit",
-    "lint": "biome check src",
-    "format": "biome format --write src",
-    "openapi": "node dist/cli.js openapi --out dist/openapi.json"
-  }
-}
+  "packageManager": "pnpm@10.22.0"
+}

package/dist/agent-profile-DzcPHR1Z.d.ts DELETED Viewed

@@ -1,114 +0,0 @@
-import { A as AgentEvalError } from './errors-Dwqw-T_m.js';
-import { R as RunRecord } from './run-record-BgTFzO2r.js';
-/**
- * Backend-integrity guard: distinguish "agent failed" from "eval ran against
- * a stub / unconfigured backend." Without this guard a canonical eval can
- * silently report `0/N passed` and look like an agent-quality problem when
- * the LLM was never actually called — the failure mode we just hit running
- * the 4-vertical parallel eval (legal-sandbox-stub returned hard-coded 33-104
- * char strings; gtm/creative defaulted to a cli-bridge that wasn't running).
- *
- * The shape:
- *
- *   const report = summarizeBackendIntegrity(records)
- *   assertRealBackend(records)   // throws BackendIntegrityError if 100% stub
- *
- * A record is "stub-mode" when both `tokenUsage.input === 0` and `tokenUsage.output === 0`.
- * (`costUsd` alone is unreliable — some backends successfully call LLMs but
- *  don't propagate pricing, producing real tokens with $0 cost.)
- *
- * Verdicts:
- *   - `real`   — at least one record has nonzero token usage
- *   - `stub`   — every record is stub-mode (eval ran blind)
- *   - `mixed`  — some records real, some stub (partial backend failure;
- *                often the 429-cascade or auth-half-failed case)
- */
-interface BackendIntegrityReport {
-    /** Total records inspected. */
-    totalRecords: number;
-    /** Records with input=0 AND output=0 (a stub fingerprint). */
-    stubRecords: number;
-    /** Records with nonzero token usage (real LLM activity). */
-    realRecords: number;
-    /** Records where output>0 but costUsd=0 (real LLM, broken cost ledger). */
-    uncostedRecords: number;
-    /** Sum of input tokens across all records. */
-    totalInputTokens: number;
-    /** Sum of output tokens across all records. */
-    totalOutputTokens: number;
-    /** Sum of costUsd across all records. */
-    totalCostUsd: number;
-    /** Worst-case integrity verdict. */
-    verdict: 'real' | 'mixed' | 'stub';
-    /** Human-readable diagnosis suitable for terminal output. */
-    diagnosis: string;
-}
-/**
- * Error thrown when an integrity assertion fails. Caller can pattern-match
- * by `code === 'AGENT_EVAL_BACKEND_STUB'` to differentiate from other
- * errors.
- */
-declare class BackendIntegrityError extends AgentEvalError {
-    readonly report: BackendIntegrityReport;
-    constructor(message: string, report: BackendIntegrityReport);
-}
-/**
- * Inspect a batch of RunRecords and return an integrity report. Pure
- * function — no I/O, no logging. The caller decides what to do with the
- * verdict (print warning, throw, gate CI, etc.).
- */
-declare function summarizeBackendIntegrity(records: ReadonlyArray<RunRecord>): BackendIntegrityReport;
-/**
- * Throw BackendIntegrityError if the verdict is 'stub' — i.e. every record
- * shows zero LLM activity. Stricter callers can pass `{ allowMixed: false }`
- * to also reject mixed verdicts (recommended for CI gates).
- *
- * Real backends pass through silently.
- */
-declare function assertRealBackend(records: ReadonlyArray<RunRecord>, opts?: {
-    allowMixed?: boolean;
-}): BackendIntegrityReport;
-/**
- * @stable
- *
- * AgentProfile — the eval harness's unit of variation.
- *
- * A profile pins everything that changes agent behaviour for a benchmark
- * cell: the model, the active skills, the prompt version, the available
- * tools. Vary the profile — swap a model, add a skill — and re-run the suite
- * to benchmark the change. The scorecard keys a cell on
- * `(scenarioId, profileHash)`, so the model is not a separate axis: it lives
- * inside the profile, and two profiles with the same model but different
- * skills are different cells.
- *
- * `agentProfileHash` is the profile's behaviour identity. Two profiles that
- * produce the same agent behaviour share a hash (and a scorecard cell);
- * reordering `skills` or `tools` does not change it; the human-facing `id`
- * label does not affect it.
- */
-interface AgentProfile {
-    /** Human-facing label, e.g. `sonnet-legal-skills-v3`. Not part of the hash. */
-    id: string;
-    /** Model snapshot id this profile pins, e.g. `claude-sonnet-4-6@2025-04-15`. */
-    model: string;
-    /** Skill ids/versions active in this profile — the primary behaviour lever. */
-    skills?: string[];
-    /** Prompt version identifier. */
-    promptVersion?: string;
-    /** Tool ids available to the agent. */
-    tools?: string[];
-    /** Any other behaviour-bearing knobs that should fingerprint into the hash. */
-    metadata?: Record<string, string | number | boolean>;
-}
-/**
- * Deterministic behaviour identity of a profile — a sha256 over the
- * behaviour-bearing fields. `skills` and `tools` are order-insensitive; the
- * `id` label is excluded. Throws on a profile with no `model` — an unkeyable
- * profile must fail loud rather than collapse into a blank-model cell.
- */
-declare function agentProfileHash(profile: AgentProfile): string;
-export { type AgentProfile as A, type BackendIntegrityReport as B, BackendIntegrityError as a, agentProfileHash as b, assertRealBackend as c, summarizeBackendIntegrity as s };