npm - @tangle-network/agent-app - Versions diffs - 0.1.14 → 0.2.0 - Mend

@tangle-network/agent-app 0.1.14 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/dist/eval-campaign/index.d.ts +75 -0
package/dist/eval-campaign/index.js +47 -0
package/dist/eval-campaign/index.js.map +1 -0
package/dist/index.js +12 -12
package/package.json +8 -3

package/dist/eval-campaign/index.d.ts ADDED Viewed

@@ -0,0 +1,75 @@
+import { JudgeVerdict } from '@tangle-network/agent-eval';
+export { EnsembleAggregate, JudgeVerdict, RunRecord, aggregateJudgeVerdicts } from '@tangle-network/agent-eval';
+import { Scenario, JudgeConfig } from '@tangle-network/agent-eval/campaign';
+export { CampaignResult, DispatchContext, Gate, ImprovementDriver, JudgeConfig, JudgeDimension, JudgeScore, LabeledScenarioStore, MutableSurface, Mutator, Scenario, defaultProductionGate, evolutionaryDriver, gepaDriver, paretoSignificanceGate, runCampaign } from '@tangle-network/agent-eval/campaign';
+export { SelfImproveBudget, SelfImproveOptions, SelfImproveResult, selfImprove } from '@tangle-network/agent-eval/contract';
+/**
+ * Eval-campaign — the app-shell's curated surface for a product's
+ * self-improvement loop, NOT a reimplementation.
+ *
+ * The loop ENGINE lives in `@tangle-network/agent-eval` (a peer dependency):
+ * `selfImprove` already owns the whole cycle — train/holdout split, the GEPA
+ * driver, the held-out production gate, durable provenance + hosted ingest, and
+ * every default. A product should NOT hand-roll `runImprovementLoop` +
+ * `emitLoopProvenance` around it (that is the boilerplate this surface exists to
+ * delete). It should call `selfImprove` with three things it actually owns:
+ * scenarios, an `agent` dispatch, and a `judge`.
+ *
+ * This module adds the one piece `selfImprove` does not own and which every
+ * multi-model product re-hand-rolls — the ensemble judge:
+ *
+ *   {@link buildEnsembleJudge} — turn a per-rubric `scoreOne` into a
+ *   `JudgeConfig` that fans out N uncorrelated judge calls and reduces them via
+ *   the substrate's `aggregateJudgeVerdicts` (survivor-mean, inter-rater spread,
+ *   fail-loud on all-failed). A product writes its rubric + one judge call; the
+ *   fan-out, partial-failure handling, and composite are the scaffold's.
+ *
+ * Everything else is a curated re-export so a product has ONE eval import:
+ * `selfImprove` + the gates + the drivers + the types. See
+ * `.claude/skills/eval-campaign/SKILL.md` for the wiring contract.
+ */
+/**
+ * Config for {@link buildEnsembleJudge}. `D` = the rubric's dimension union.
+ * `TArtifact` and `TScenario` are the artifact / scenario types that are
+ * handed to `scoreOne`.
+ */
+interface EnsembleJudgeConfig<TArtifact, TScenario extends Scenario, D extends string> {
+    /** Judge name — appears in traces and scorecards. Also used to label a rep
+     *  whose `scoreOne` promise rejected, as `<name>-rep<rep>`. */
+    name: string;
+    /** Stable-ordered rubric dimensions. Drives the `JudgeDimension` list AND the
+     *  reducer keys, so a judge that omits a dimension scores it 0 (never silently
+     *  dropped). Must be non-empty: `buildEnsembleJudge` throws on an empty
+     *  rubric. */
+    rubric: readonly D[];
+    /**
+     * Score ONE artifact on the rubric → a raw per-dimension verdict. Called
+     * `judgeReps` times per artifact; vary the model by `rep` for an uncorrelated
+     * ensemble (judges that share a base model share its bias). Return
+     * `{ model, perDimension: null }` to record a judge failure WITHOUT killing
+     * the ensemble; throw only on an unrecoverable error (the whole rep is then
+     * treated as a failed judge).
+     *
+     * `rep` is the zero-based index of the call. All reps for one artifact are
+     * started together and receive the same `artifact`, `scenario` and `signal`.
+     * A rejected promise is recorded as a verdict with `perDimension: null`,
+     * `model` set to `<name>-rep<rep>` and `rationale` set to
+     * `String(reason)`.
+     */
+    scoreOne: (input: {
+        artifact: TArtifact;
+        scenario: TScenario;
+        signal: AbortSignal;
+        rep: number;
+    }) => Promise<JudgeVerdict<D>>;
+    /** Independent judge calls per artifact, reduced by `aggregateJudgeVerdicts`.
+     *  Default 1. Raise (with model variety in `scoreOne`) for inter-rater bands.
+     *  A value below 1 makes `buildEnsembleJudge` throw. */
+    judgeReps?: number;
+    /** Per-dimension composite weights. Default: uniform over `rubric`. A partial
+     *  map selects-and-weights exactly the named dimensions. Handed to
+     *  `aggregateJudgeVerdicts` as-is. */
+    weights?: Partial<Record<D, number>>;
+    /** Optional human-readable dimension descriptions. Default: the key itself
+     *  (the key is also used when `describe` yields `null`/`undefined`). */
+    describe?: (dim: D) => string;
+}
+/**
+ * Build a `JudgeConfig` whose `score()` fans out `judgeReps` independent
+ * `scoreOne` calls and reduces them with the substrate's
+ * `aggregateJudgeVerdicts`. A single judge call failing does NOT fail the cell
+ * (it is recorded and dropped); only ALL judges failing throws — which the
+ * campaign records as a failed cell, never a silent zero.
+ *
+ * Pass the result straight to `selfImprove({ judge })` (or `runCampaign`).
+ *
+ * @param cfg Ensemble settings; see {@link EnsembleJudgeConfig}.
+ * @returns A `JudgeConfig` carrying `cfg.name`, one dimension entry per rubric
+ *   key (described via `cfg.describe`, falling back to the key), and the
+ *   fan-out `score()`.
+ * @throws Error at build time when `cfg.judgeReps` is below 1 or when
+ *   `cfg.rubric` is empty.
+ */
+declare function buildEnsembleJudge<TArtifact, TScenario extends Scenario, D extends string>(cfg: EnsembleJudgeConfig<TArtifact, TScenario, D>): JudgeConfig<TArtifact, TScenario>;
+export { type EnsembleJudgeConfig, buildEnsembleJudge };

package/dist/eval-campaign/index.js ADDED Viewed

@@ -0,0 +1,47 @@
+// src/eval-campaign/index.ts
+import {
+  aggregateJudgeVerdicts
+} from "@tangle-network/agent-eval";
+import { aggregateJudgeVerdicts as aggregateJudgeVerdicts2 } from "@tangle-network/agent-eval";
+import {
+  defaultProductionGate,
+  evolutionaryDriver,
+  gepaDriver,
+  paretoSignificanceGate,
+  runCampaign
+} from "@tangle-network/agent-eval/campaign";
+import { selfImprove } from "@tangle-network/agent-eval/contract";
+function buildEnsembleJudge(cfg) {
+  const reps = cfg.judgeReps ?? 1;
+  if (reps < 1) {
+    throw new Error(`buildEnsembleJudge: judgeReps must be >= 1 (got ${reps})`);
+  }
+  if (cfg.rubric.length === 0) {
+    throw new Error("buildEnsembleJudge: rubric is empty");
+  }
+  return {
+    name: cfg.name,
+    dimensions: cfg.rubric.map((key) => ({ key, description: cfg.describe?.(key) ?? key })),
+    async score({ artifact, scenario, signal }) {
+      const settled = await Promise.allSettled(
+        Array.from({ length: reps }, (_, rep) => cfg.scoreOne({ artifact, scenario, signal, rep }))
+      );
+      const verdicts = settled.map(
+        (r, rep) => r.status === "fulfilled" ? r.value : { model: `${cfg.name}-rep${rep}`, perDimension: null, rationale: String(r.reason) }
+      );
+      const agg = aggregateJudgeVerdicts(verdicts, cfg.rubric, cfg.weights);
+      return { composite: agg.composite, dimensions: agg.perDimension, notes: agg.rationale };
+    }
+  };
+}
+export {
+  aggregateJudgeVerdicts2 as aggregateJudgeVerdicts,
+  buildEnsembleJudge,
+  defaultProductionGate,
+  evolutionaryDriver,
+  gepaDriver,
+  paretoSignificanceGate,
+  runCampaign,
+  selfImprove
+};
+//# sourceMappingURL=index.js.map

package/dist/eval-campaign/index.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../../src/eval-campaign/index.ts"],"sourcesContent":["/**\n * Eval-campaign — the app-shell's curated surface for a product's\n * self-improvement loop, NOT a reimplementation.\n *\n * The loop ENGINE lives in `@tangle-network/agent-eval` (a peer dependency):\n * `selfImprove` already owns the whole cycle — train/holdout split, the GEPA\n * driver, the held-out production gate, durable provenance + hosted ingest, and\n * every default. A product should NOT hand-roll `runImprovementLoop` +\n * `emitLoopProvenance` around it (that is the boilerplate this surface exists to\n * delete). It should call `selfImprove` with three things it actually owns:\n * scenarios, an `agent` dispatch, and a `judge`.\n *\n * This module adds the one piece `selfImprove` does not own and which every\n * multi-model product re-hand-rolls — the ensemble judge:\n *\n * {@link buildEnsembleJudge} — turn a per-rubric `scoreOne` into a\n * `JudgeConfig` that fans out N uncorrelated judge calls and reduces them via\n * the substrate's `aggregateJudgeVerdicts` (survivor-mean, inter-rater spread,\n * fail-loud on all-failed). A product writes its rubric + one judge call; the\n * fan-out, partial-failure handling, and composite are the scaffold's.\n *\n * Everything else is a curated re-export so a product has ONE eval import:\n * `selfImprove` + the gates + the drivers + the types. See\n * `.claude/skills/eval-campaign/SKILL.md` for the wiring contract.\n */\n\nimport {\n aggregateJudgeVerdicts,\n type JudgeVerdict,\n} from '@tangle-network/agent-eval'\nimport type {\n JudgeConfig,\n JudgeScore,\n Scenario,\n} from '@tangle-network/agent-eval/campaign'\n\n/** Config for {@link buildEnsembleJudge}. `D` = the rubric's dimension union. */\nexport interface EnsembleJudgeConfig<TArtifact, TScenario extends Scenario, D extends string> {\n /** Judge name — appears in traces and scorecards. */\n name: string\n /** Stable-ordered rubric dimensions. 
Drives the `JudgeDimension` list AND the\n * reducer keys, so a judge that omits a dimension scores it 0 (never silently\n * dropped). */\n rubric: readonly D[]\n /**\n * Score ONE artifact on the rubric → a raw per-dimension verdict. Called\n * `judgeReps` times per artifact; vary the model by `rep` for an uncorrelated\n * ensemble (judges that share a base model share its bias). Return\n * `{ model, perDimension: null }` to record a judge failure WITHOUT killing\n * the ensemble; throw only on an unrecoverable error (the whole rep is then\n * treated as a failed judge).\n */\n scoreOne: (input: {\n artifact: TArtifact\n scenario: TScenario\n signal: AbortSignal\n rep: number\n }) => Promise<JudgeVerdict<D>>\n /** Independent judge calls per artifact, reduced by `aggregateJudgeVerdicts`.\n * Default 1. Raise (with model variety in `scoreOne`) for inter-rater bands. */\n judgeReps?: number\n /** Per-dimension composite weights. Default: uniform over `rubric`. A partial\n * map selects-and-weights exactly the named dimensions. */\n weights?: Partial<Record<D, number>>\n /** Optional human-readable dimension descriptions. Default: the key itself. */\n describe?: (dim: D) => string\n}\n\n/**\n * Build a `JudgeConfig` whose `score()` fans out `judgeReps` independent\n * `scoreOne` calls and reduces them with the substrate's\n * `aggregateJudgeVerdicts`. A single judge call failing does NOT fail the cell\n * (it is recorded and dropped); only ALL judges failing throws — which the\n * campaign records as a failed cell, never a silent zero.\n *\n * Pass the result straight to `selfImprove({ judge })` (or `runCampaign`).\n */\nexport function buildEnsembleJudge<TArtifact, TScenario extends Scenario, D extends string>(\n cfg: EnsembleJudgeConfig<TArtifact, TScenario, D>,\n): JudgeConfig<TArtifact, TScenario> {\n const reps = cfg.judgeReps ?? 
1\n if (reps < 1) {\n throw new Error(`buildEnsembleJudge: judgeReps must be >= 1 (got ${reps})`)\n }\n if (cfg.rubric.length === 0) {\n throw new Error('buildEnsembleJudge: rubric is empty')\n }\n return {\n name: cfg.name,\n dimensions: cfg.rubric.map((key) => ({ key, description: cfg.describe?.(key) ?? key })),\n async score({ artifact, scenario, signal }): Promise<JudgeScore> {\n const settled = await Promise.allSettled(\n Array.from({ length: reps }, (_, rep) => cfg.scoreOne({ artifact, scenario, signal, rep })),\n )\n const verdicts: JudgeVerdict<D>[] = settled.map((r, rep) =>\n r.status === 'fulfilled'\n ? r.value\n : { model: `${cfg.name}-rep${rep}`, perDimension: null, rationale: String(r.reason) },\n )\n // Throws iff EVERY rep failed → the campaign records a failed cell.\n const agg = aggregateJudgeVerdicts(verdicts, cfg.rubric, cfg.weights)\n return { composite: agg.composite, dimensions: agg.perDimension, notes: agg.rationale }\n },\n }\n}\n\n// ── Curated re-exports — the one eval import for a product loop ──────────────\n// The loop engine + gates + drivers + the ensemble reducer, so a product wires\n// its self-improvement loop from a single module instead of reaching across\n// three agent-eval subpaths. 
All DOWNWARD imports (agent-app consumes the\n// substrate); the layering rule is preserved.\n\nexport { aggregateJudgeVerdicts } from '@tangle-network/agent-eval'\nexport type {\n EnsembleAggregate,\n JudgeVerdict,\n RunRecord,\n} from '@tangle-network/agent-eval'\nexport {\n defaultProductionGate,\n evolutionaryDriver,\n gepaDriver,\n paretoSignificanceGate,\n runCampaign,\n} from '@tangle-network/agent-eval/campaign'\nexport type {\n CampaignResult,\n DispatchContext,\n Gate,\n ImprovementDriver,\n JudgeConfig,\n JudgeDimension,\n JudgeScore,\n LabeledScenarioStore,\n MutableSurface,\n Mutator,\n Scenario,\n} from '@tangle-network/agent-eval/campaign'\nexport { selfImprove } from '@tangle-network/agent-eval/contract'\nexport type {\n SelfImproveBudget,\n SelfImproveOptions,\n SelfImproveResult,\n} from '@tangle-network/agent-eval/contract'\n"],"mappings":";AA0BA;AAAA,EACE;AAAA,OAEK;AAmFP,SAAS,0BAAAA,+BAA8B;AAMvC;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AAcP,SAAS,mBAAmB;AA7DrB,SAAS,mBACd,KACmC;AACnC,QAAM,OAAO,IAAI,aAAa;AAC9B,MAAI,OAAO,GAAG;AACZ,UAAM,IAAI,MAAM,mDAAmD,IAAI,GAAG;AAAA,EAC5E;AACA,MAAI,IAAI,OAAO,WAAW,GAAG;AAC3B,UAAM,IAAI,MAAM,qCAAqC;AAAA,EACvD;AACA,SAAO;AAAA,IACL,MAAM,IAAI;AAAA,IACV,YAAY,IAAI,OAAO,IAAI,CAAC,SAAS,EAAE,KAAK,aAAa,IAAI,WAAW,GAAG,KAAK,IAAI,EAAE;AAAA,IACtF,MAAM,MAAM,EAAE,UAAU,UAAU,OAAO,GAAwB;AAC/D,YAAM,UAAU,MAAM,QAAQ;AAAA,QAC5B,MAAM,KAAK,EAAE,QAAQ,KAAK,GAAG,CAAC,GAAG,QAAQ,IAAI,SAAS,EAAE,UAAU,UAAU,QAAQ,IAAI,CAAC,CAAC;AAAA,MAC5F;AACA,YAAM,WAA8B,QAAQ;AAAA,QAAI,CAAC,GAAG,QAClD,EAAE,WAAW,cACT,EAAE,QACF,EAAE,OAAO,GAAG,IAAI,IAAI,OAAO,GAAG,IAAI,cAAc,MAAM,WAAW,OAAO,EAAE,MAAM,EAAE;AAAA,MACxF;AAEA,YAAM,MAAM,uBAAuB,UAAU,IAAI,QAAQ,IAAI,OAAO;AACpE,aAAO,EAAE,WAAW,IAAI,WAAW,YAAY,IAAI,cAAc,OAAO,IAAI,UAAU;AAAA,IACxF;AAAA,EACF;AACF;","names":["aggregateJudgeVerdicts"]}

package/dist/index.js CHANGED Viewed

@@ -1,3 +1,10 @@
+import {
+  addSecurityHeaders,
+  checkRateLimit,
+  extractRequestContext,
+  parseJsonObjectBody,
+  requireString
+} from "./chunk-CN75FIPT.js";
 import {
   DEFAULT_REDACTION_PATTERNS,
   buildRedactedDocument,
@@ -6,6 +13,11 @@ import {
   redactForIngestion,
   revealSpan
 } from "./chunk-5RMIUJDI.js";
+import {
+  createKnowledgeLoop,
+  createReviewerDecider,
+  reviewCandidate
+} from "./chunk-EEPJGZJW.js";
 import {
   DEFAULT_HARNESS,
   KNOWN_HARNESSES,
@@ -65,13 +77,6 @@ import {
   invokeIntegrationHub,
   resolveIntegrationAction
 } from "./chunk-L2TG5DBW.js";
-import {
-  addSecurityHeaders,
-  checkRateLimit,
-  extractRequestContext,
-  parseJsonObjectBody,
-  requireString
-} from "./chunk-CN75FIPT.js";
 import {
   DEFAULT_APP_TOOL_PATHS,
   DEFAULT_HEADER_NAMES,
@@ -123,11 +128,6 @@ import {
   buildKnowledgeRequirements,
   deriveSignals
 } from "./chunk-ZXNXAQAH.js";
-import {
-  createKnowledgeLoop,
-  createReviewerDecider,
-  reviewCandidate
-} from "./chunk-EEPJGZJW.js";
 export {
   APP_TOOL_NAMES,
   DEFAULT_APP_TOOL_PATHS,

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-app",
-  "version": "0.1.14",
+  "version": "0.2.0",
   "packageManager": "pnpm@10.33.4",
   "description": "Application-shell framework for Tangle agent products: a bounded tool loop, the structured agent→app tool side channel, integration-hub client, per-workspace billing, and crypto — composed over the Tangle agent substrate through typed seams.",
   "keywords": [
@@ -61,6 +61,11 @@
       "import": "./dist/eval/index.js",
       "default": "./dist/eval/index.js"
     },
+    "./eval-campaign": {
+      "types": "./dist/eval-campaign/index.d.ts",
+      "import": "./dist/eval-campaign/index.js",
+      "default": "./dist/eval-campaign/index.js"
+    },
     "./knowledge": {
       "types": "./dist/knowledge/index.d.ts",
       "import": "./dist/knowledge/index.js",
@@ -131,7 +136,7 @@
     "typecheck": "tsc --noEmit"
   },
   "devDependencies": {
-    "@tangle-network/agent-eval": "^0.70.0",
+    "@tangle-network/agent-eval": "^0.82.0",
     "@tangle-network/agent-integrations": "^0.32.0",
     "@tangle-network/agent-knowledge": "^1.5.2",
     "@types/node": "^25.6.0",
@@ -140,7 +145,7 @@
     "vitest": "^3.0.0"
   },
   "peerDependencies": {
-    "@tangle-network/agent-eval": ">=0.50.0",
+    "@tangle-network/agent-eval": ">=0.82.0",
     "@tangle-network/agent-integrations": ">=0.32.0",
     "@tangle-network/agent-knowledge": ">=1.5.0",
     "@tangle-network/agent-runtime": ">=0.21.0"