@tangle-network/agent-app 0.1.14 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,75 @@
1
+ import { JudgeVerdict } from '@tangle-network/agent-eval';
2
+ export { EnsembleAggregate, JudgeVerdict, RunRecord, aggregateJudgeVerdicts } from '@tangle-network/agent-eval';
3
+ import { Scenario, JudgeConfig } from '@tangle-network/agent-eval/campaign';
4
+ export { CampaignResult, DispatchContext, Gate, ImprovementDriver, JudgeConfig, JudgeDimension, JudgeScore, LabeledScenarioStore, MutableSurface, Mutator, Scenario, defaultProductionGate, evolutionaryDriver, gepaDriver, paretoSignificanceGate, runCampaign } from '@tangle-network/agent-eval/campaign';
5
+ export { SelfImproveBudget, SelfImproveOptions, SelfImproveResult, selfImprove } from '@tangle-network/agent-eval/contract';
6
+
7
+ /**
8
+ * Eval-campaign — the app-shell's curated surface for a product's
9
+ * self-improvement loop, NOT a reimplementation.
10
+ *
11
+ * The loop ENGINE lives in `@tangle-network/agent-eval` (a peer dependency):
12
+ * `selfImprove` already owns the whole cycle — train/holdout split, the GEPA
13
+ * driver, the held-out production gate, durable provenance + hosted ingest, and
14
+ * every default. A product should NOT hand-roll `runImprovementLoop` +
15
+ * `emitLoopProvenance` around it (that is the boilerplate this surface exists to
16
+ * delete). It should call `selfImprove` with three things it actually owns:
17
+ * scenarios, an `agent` dispatch, and a `judge`.
18
+ *
19
+ * This module adds the one piece `selfImprove` does not own and which every
20
+ * multi-model product re-hand-rolls — the ensemble judge:
21
+ *
22
+ * {@link buildEnsembleJudge} — turn a per-rubric `scoreOne` into a
23
+ * `JudgeConfig` that fans out N uncorrelated judge calls and reduces them via
24
+ * the substrate's `aggregateJudgeVerdicts` (survivor-mean, inter-rater spread,
25
+ * fail-loud on all-failed). A product writes its rubric + one judge call; the
26
+ * fan-out, partial-failure handling, and composite are the scaffold's.
27
+ *
28
+ * Everything else is a curated re-export so a product has ONE eval import:
29
+ * `selfImprove` + the gates + the drivers + the types. See
30
+ * `.claude/skills/eval-campaign/SKILL.md` for the wiring contract.
31
+ */
32
+
33
+ /** Config for {@link buildEnsembleJudge}. `D` = the rubric's dimension union. */
34
+ interface EnsembleJudgeConfig<TArtifact, TScenario extends Scenario, D extends string> {
35
+ /** Judge name — appears in traces and scorecards. Copied to the returned `JudgeConfig.name` and used to label a rep whose `scoreOne` rejected (`<name>-rep<i>`). */
36
+ name: string;
37
+ /** Stable-ordered rubric dimensions. Drives the `JudgeDimension` list AND the
38
+ * reducer keys, so a judge that omits a dimension scores it 0 (never silently
39
+ * dropped). Must be non-empty: `buildEnsembleJudge` throws on an empty rubric. */
40
+ rubric: readonly D[];
41
+ /**
42
+ * Score ONE artifact on the rubric → a raw per-dimension verdict. Called
43
+ * `judgeReps` times per artifact; vary the model by `rep` for an uncorrelated
44
+ * ensemble (judges that share a base model share its bias). Return
45
+ * `{ model, perDimension: null }` to record a judge failure WITHOUT killing
46
+ * the ensemble; throw only on an unrecoverable error (the whole rep is then
47
+ * treated as a failed judge). `rep` is the 0-based index of the call.
48
+ */
49
+ scoreOne: (input: {
50
+ artifact: TArtifact;
51
+ scenario: TScenario;
52
+ signal: AbortSignal;
53
+ rep: number;
54
+ }) => Promise<JudgeVerdict<D>>;
55
+ /** Independent judge calls per artifact, reduced by `aggregateJudgeVerdicts`.
56
+ * Default 1; `buildEnsembleJudge` throws when it is below 1. Raise (with model variety in `scoreOne`) for inter-rater bands. */
57
+ judgeReps?: number;
58
+ /** Per-dimension composite weights. Default: uniform over `rubric`. A partial
59
+ * map selects-and-weights exactly the named dimensions. Passed through unchanged to `aggregateJudgeVerdicts`. */
60
+ weights?: Partial<Record<D, number>>;
61
+ /** Optional human-readable dimension descriptions. Default (also used when it returns null/undefined): the key itself. */
62
+ describe?: (dim: D) => string;
63
+ }
64
+ /**
65
+ * Build a `JudgeConfig` whose `score()` fans out `judgeReps` independent
66
+ * `scoreOne` calls (all started together, awaited via `Promise.allSettled`) and reduces them with the substrate's
67
+ * `aggregateJudgeVerdicts`. A single judge call failing does NOT fail the cell
68
+ * (a rejected call is recorded as a `perDimension: null` verdict and dropped); only ALL judges failing throws — which the
69
+ * campaign records as a failed cell, never a silent zero.
70
+ *
71
+ * Pass the result straight to `selfImprove({ judge })` (or `runCampaign`). Throws at build time if `judgeReps` < 1 or `rubric` is empty.
72
+ */
73
+ declare function buildEnsembleJudge<TArtifact, TScenario extends Scenario, D extends string>(cfg: EnsembleJudgeConfig<TArtifact, TScenario, D>): JudgeConfig<TArtifact, TScenario>;
74
+
75
+ export { type EnsembleJudgeConfig, buildEnsembleJudge };
@@ -0,0 +1,47 @@
1
+ // src/eval-campaign/index.ts
2
+ import {
3
+ aggregateJudgeVerdicts
4
+ } from "@tangle-network/agent-eval";
5
+ import { aggregateJudgeVerdicts as aggregateJudgeVerdicts2 } from "@tangle-network/agent-eval";
6
+ import {
7
+ defaultProductionGate,
8
+ evolutionaryDriver,
9
+ gepaDriver,
10
+ paretoSignificanceGate,
11
+ runCampaign
12
+ } from "@tangle-network/agent-eval/campaign";
13
+ import { selfImprove } from "@tangle-network/agent-eval/contract";
14
+ function buildEnsembleJudge(cfg) {
15
+ const reps = cfg.judgeReps ?? 1;
16
+ if (reps < 1) {
17
+ throw new Error(`buildEnsembleJudge: judgeReps must be >= 1 (got ${reps})`);
18
+ }
19
+ if (cfg.rubric.length === 0) {
20
+ throw new Error("buildEnsembleJudge: rubric is empty");
21
+ }
22
+ return {
23
+ name: cfg.name,
24
+ dimensions: cfg.rubric.map((key) => ({ key, description: cfg.describe?.(key) ?? key })),
25
+ async score({ artifact, scenario, signal }) {
26
+ const settled = await Promise.allSettled(
27
+ Array.from({ length: reps }, (_, rep) => cfg.scoreOne({ artifact, scenario, signal, rep }))
28
+ );
29
+ const verdicts = settled.map(
30
+ (r, rep) => r.status === "fulfilled" ? r.value : { model: `${cfg.name}-rep${rep}`, perDimension: null, rationale: String(r.reason) }
31
+ );
32
+ const agg = aggregateJudgeVerdicts(verdicts, cfg.rubric, cfg.weights);
33
+ return { composite: agg.composite, dimensions: agg.perDimension, notes: agg.rationale };
34
+ }
35
+ };
36
+ }
37
+ export {
38
+ aggregateJudgeVerdicts2 as aggregateJudgeVerdicts,
39
+ buildEnsembleJudge,
40
+ defaultProductionGate,
41
+ evolutionaryDriver,
42
+ gepaDriver,
43
+ paretoSignificanceGate,
44
+ runCampaign,
45
+ selfImprove
46
+ };
47
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/eval-campaign/index.ts"],"sourcesContent":["/**\n * Eval-campaign — the app-shell's curated surface for a product's\n * self-improvement loop, NOT a reimplementation.\n *\n * The loop ENGINE lives in `@tangle-network/agent-eval` (a peer dependency):\n * `selfImprove` already owns the whole cycle — train/holdout split, the GEPA\n * driver, the held-out production gate, durable provenance + hosted ingest, and\n * every default. A product should NOT hand-roll `runImprovementLoop` +\n * `emitLoopProvenance` around it (that is the boilerplate this surface exists to\n * delete). It should call `selfImprove` with three things it actually owns:\n * scenarios, an `agent` dispatch, and a `judge`.\n *\n * This module adds the one piece `selfImprove` does not own and which every\n * multi-model product re-hand-rolls — the ensemble judge:\n *\n * {@link buildEnsembleJudge} — turn a per-rubric `scoreOne` into a\n * `JudgeConfig` that fans out N uncorrelated judge calls and reduces them via\n * the substrate's `aggregateJudgeVerdicts` (survivor-mean, inter-rater spread,\n * fail-loud on all-failed). A product writes its rubric + one judge call; the\n * fan-out, partial-failure handling, and composite are the scaffold's.\n *\n * Everything else is a curated re-export so a product has ONE eval import:\n * `selfImprove` + the gates + the drivers + the types. See\n * `.claude/skills/eval-campaign/SKILL.md` for the wiring contract.\n */\n\nimport {\n aggregateJudgeVerdicts,\n type JudgeVerdict,\n} from '@tangle-network/agent-eval'\nimport type {\n JudgeConfig,\n JudgeScore,\n Scenario,\n} from '@tangle-network/agent-eval/campaign'\n\n/** Config for {@link buildEnsembleJudge}. `D` = the rubric's dimension union. */\nexport interface EnsembleJudgeConfig<TArtifact, TScenario extends Scenario, D extends string> {\n /** Judge name — appears in traces and scorecards. */\n name: string\n /** Stable-ordered rubric dimensions. 
Drives the `JudgeDimension` list AND the\n * reducer keys, so a judge that omits a dimension scores it 0 (never silently\n * dropped). */\n rubric: readonly D[]\n /**\n * Score ONE artifact on the rubric → a raw per-dimension verdict. Called\n * `judgeReps` times per artifact; vary the model by `rep` for an uncorrelated\n * ensemble (judges that share a base model share its bias). Return\n * `{ model, perDimension: null }` to record a judge failure WITHOUT killing\n * the ensemble; throw only on an unrecoverable error (the whole rep is then\n * treated as a failed judge).\n */\n scoreOne: (input: {\n artifact: TArtifact\n scenario: TScenario\n signal: AbortSignal\n rep: number\n }) => Promise<JudgeVerdict<D>>\n /** Independent judge calls per artifact, reduced by `aggregateJudgeVerdicts`.\n * Default 1. Raise (with model variety in `scoreOne`) for inter-rater bands. */\n judgeReps?: number\n /** Per-dimension composite weights. Default: uniform over `rubric`. A partial\n * map selects-and-weights exactly the named dimensions. */\n weights?: Partial<Record<D, number>>\n /** Optional human-readable dimension descriptions. Default: the key itself. */\n describe?: (dim: D) => string\n}\n\n/**\n * Build a `JudgeConfig` whose `score()` fans out `judgeReps` independent\n * `scoreOne` calls and reduces them with the substrate's\n * `aggregateJudgeVerdicts`. A single judge call failing does NOT fail the cell\n * (it is recorded and dropped); only ALL judges failing throws — which the\n * campaign records as a failed cell, never a silent zero.\n *\n * Pass the result straight to `selfImprove({ judge })` (or `runCampaign`).\n */\nexport function buildEnsembleJudge<TArtifact, TScenario extends Scenario, D extends string>(\n cfg: EnsembleJudgeConfig<TArtifact, TScenario, D>,\n): JudgeConfig<TArtifact, TScenario> {\n const reps = cfg.judgeReps ?? 
1\n if (reps < 1) {\n throw new Error(`buildEnsembleJudge: judgeReps must be >= 1 (got ${reps})`)\n }\n if (cfg.rubric.length === 0) {\n throw new Error('buildEnsembleJudge: rubric is empty')\n }\n return {\n name: cfg.name,\n dimensions: cfg.rubric.map((key) => ({ key, description: cfg.describe?.(key) ?? key })),\n async score({ artifact, scenario, signal }): Promise<JudgeScore> {\n const settled = await Promise.allSettled(\n Array.from({ length: reps }, (_, rep) => cfg.scoreOne({ artifact, scenario, signal, rep })),\n )\n const verdicts: JudgeVerdict<D>[] = settled.map((r, rep) =>\n r.status === 'fulfilled'\n ? r.value\n : { model: `${cfg.name}-rep${rep}`, perDimension: null, rationale: String(r.reason) },\n )\n // Throws iff EVERY rep failed → the campaign records a failed cell.\n const agg = aggregateJudgeVerdicts(verdicts, cfg.rubric, cfg.weights)\n return { composite: agg.composite, dimensions: agg.perDimension, notes: agg.rationale }\n },\n }\n}\n\n// ── Curated re-exports — the one eval import for a product loop ──────────────\n// The loop engine + gates + drivers + the ensemble reducer, so a product wires\n// its self-improvement loop from a single module instead of reaching across\n// three agent-eval subpaths. 
All DOWNWARD imports (agent-app consumes the\n// substrate); the layering rule is preserved.\n\nexport { aggregateJudgeVerdicts } from '@tangle-network/agent-eval'\nexport type {\n EnsembleAggregate,\n JudgeVerdict,\n RunRecord,\n} from '@tangle-network/agent-eval'\nexport {\n defaultProductionGate,\n evolutionaryDriver,\n gepaDriver,\n paretoSignificanceGate,\n runCampaign,\n} from '@tangle-network/agent-eval/campaign'\nexport type {\n CampaignResult,\n DispatchContext,\n Gate,\n ImprovementDriver,\n JudgeConfig,\n JudgeDimension,\n JudgeScore,\n LabeledScenarioStore,\n MutableSurface,\n Mutator,\n Scenario,\n} from '@tangle-network/agent-eval/campaign'\nexport { selfImprove } from '@tangle-network/agent-eval/contract'\nexport type {\n SelfImproveBudget,\n SelfImproveOptions,\n SelfImproveResult,\n} from '@tangle-network/agent-eval/contract'\n"],"mappings":";AA0BA;AAAA,EACE;AAAA,OAEK;AAmFP,SAAS,0BAAAA,+BAA8B;AAMvC;AAAA,EACE;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,EACA;AAAA,OACK;AAcP,SAAS,mBAAmB;AA7DrB,SAAS,mBACd,KACmC;AACnC,QAAM,OAAO,IAAI,aAAa;AAC9B,MAAI,OAAO,GAAG;AACZ,UAAM,IAAI,MAAM,mDAAmD,IAAI,GAAG;AAAA,EAC5E;AACA,MAAI,IAAI,OAAO,WAAW,GAAG;AAC3B,UAAM,IAAI,MAAM,qCAAqC;AAAA,EACvD;AACA,SAAO;AAAA,IACL,MAAM,IAAI;AAAA,IACV,YAAY,IAAI,OAAO,IAAI,CAAC,SAAS,EAAE,KAAK,aAAa,IAAI,WAAW,GAAG,KAAK,IAAI,EAAE;AAAA,IACtF,MAAM,MAAM,EAAE,UAAU,UAAU,OAAO,GAAwB;AAC/D,YAAM,UAAU,MAAM,QAAQ;AAAA,QAC5B,MAAM,KAAK,EAAE,QAAQ,KAAK,GAAG,CAAC,GAAG,QAAQ,IAAI,SAAS,EAAE,UAAU,UAAU,QAAQ,IAAI,CAAC,CAAC;AAAA,MAC5F;AACA,YAAM,WAA8B,QAAQ;AAAA,QAAI,CAAC,GAAG,QAClD,EAAE,WAAW,cACT,EAAE,QACF,EAAE,OAAO,GAAG,IAAI,IAAI,OAAO,GAAG,IAAI,cAAc,MAAM,WAAW,OAAO,EAAE,MAAM,EAAE;AAAA,MACxF;AAEA,YAAM,MAAM,uBAAuB,UAAU,IAAI,QAAQ,IAAI,OAAO;AACpE,aAAO,EAAE,WAAW,IAAI,WAAW,YAAY,IAAI,cAAc,OAAO,IAAI,UAAU;AAAA,IACxF;AAAA,EACF;AACF;","names":["aggregateJudgeVerdicts"]}
package/dist/index.js CHANGED
@@ -1,3 +1,10 @@
1
+ import {
2
+ addSecurityHeaders,
3
+ checkRateLimit,
4
+ extractRequestContext,
5
+ parseJsonObjectBody,
6
+ requireString
7
+ } from "./chunk-CN75FIPT.js";
1
8
  import {
2
9
  DEFAULT_REDACTION_PATTERNS,
3
10
  buildRedactedDocument,
@@ -6,6 +13,11 @@ import {
6
13
  redactForIngestion,
7
14
  revealSpan
8
15
  } from "./chunk-5RMIUJDI.js";
16
+ import {
17
+ createKnowledgeLoop,
18
+ createReviewerDecider,
19
+ reviewCandidate
20
+ } from "./chunk-EEPJGZJW.js";
9
21
  import {
10
22
  DEFAULT_HARNESS,
11
23
  KNOWN_HARNESSES,
@@ -65,13 +77,6 @@ import {
65
77
  invokeIntegrationHub,
66
78
  resolveIntegrationAction
67
79
  } from "./chunk-L2TG5DBW.js";
68
- import {
69
- addSecurityHeaders,
70
- checkRateLimit,
71
- extractRequestContext,
72
- parseJsonObjectBody,
73
- requireString
74
- } from "./chunk-CN75FIPT.js";
75
80
  import {
76
81
  DEFAULT_APP_TOOL_PATHS,
77
82
  DEFAULT_HEADER_NAMES,
@@ -123,11 +128,6 @@ import {
123
128
  buildKnowledgeRequirements,
124
129
  deriveSignals
125
130
  } from "./chunk-ZXNXAQAH.js";
126
- import {
127
- createKnowledgeLoop,
128
- createReviewerDecider,
129
- reviewCandidate
130
- } from "./chunk-EEPJGZJW.js";
131
131
  export {
132
132
  APP_TOOL_NAMES,
133
133
  DEFAULT_APP_TOOL_PATHS,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-app",
3
- "version": "0.1.14",
3
+ "version": "0.2.0",
4
4
  "packageManager": "pnpm@10.33.4",
5
5
  "description": "Application-shell framework for Tangle agent products: a bounded tool loop, the structured agent→app tool side channel, integration-hub client, per-workspace billing, and crypto — composed over the Tangle agent substrate through typed seams.",
6
6
  "keywords": [
@@ -61,6 +61,11 @@
61
61
  "import": "./dist/eval/index.js",
62
62
  "default": "./dist/eval/index.js"
63
63
  },
64
+ "./eval-campaign": {
65
+ "types": "./dist/eval-campaign/index.d.ts",
66
+ "import": "./dist/eval-campaign/index.js",
67
+ "default": "./dist/eval-campaign/index.js"
68
+ },
64
69
  "./knowledge": {
65
70
  "types": "./dist/knowledge/index.d.ts",
66
71
  "import": "./dist/knowledge/index.js",
@@ -131,7 +136,7 @@
131
136
  "typecheck": "tsc --noEmit"
132
137
  },
133
138
  "devDependencies": {
134
- "@tangle-network/agent-eval": "^0.70.0",
139
+ "@tangle-network/agent-eval": "^0.82.0",
135
140
  "@tangle-network/agent-integrations": "^0.32.0",
136
141
  "@tangle-network/agent-knowledge": "^1.5.2",
137
142
  "@types/node": "^25.6.0",
@@ -140,7 +145,7 @@
140
145
  "vitest": "^3.0.0"
141
146
  },
142
147
  "peerDependencies": {
143
- "@tangle-network/agent-eval": ">=0.50.0",
148
+ "@tangle-network/agent-eval": ">=0.82.0",
144
149
  "@tangle-network/agent-integrations": ">=0.32.0",
145
150
  "@tangle-network/agent-knowledge": ">=1.5.0",
146
151
  "@tangle-network/agent-runtime": ">=0.21.0"