@tangle-network/agent-eval 0.44.0 → 0.44.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,91 @@
1
+ import { S as Scenario, n as JudgeScore, D as DispatchFn, J as JudgeConfig } from '../types-DToGONFA.js';
2
+
3
+ /**
4
+ * # `@tangle-network/agent-eval/adapters/langchain` — wrap any LangChain
5
+ * Runnable as a `Dispatch` (or `JudgeConfig`).
6
+ *
7
+ * **Why structural, not pinned**: we don't depend on `@langchain/core` at
8
+ * install time. The adapter accepts anything with the canonical LangChain
9
+ * Runnable shape (`invoke(input, config?)`), so it works with their
10
+ * `Runnable`, `RunnableSequence`, `RunnableMap`, `RunnablePassthrough`,
11
+ * and any custom Runnable-shaped object. No version pin, no peer dep,
12
+ * no bundle-bloat risk.
13
+ *
14
+ * **Why this exists**: the most-asked question from foreign agent
15
+ * builders is "I'm already on LangChain — how do I plug in?". The answer
16
+ * is one function. Wrap your existing Runnable, pass the Dispatch into
17
+ * `runEval` / `runImprovementLoop`, ship.
18
+ */
19
+
20
+ interface RunnableLike<TInput, TOutput> {
21
+ invoke(input: TInput, config?: {
22
+ signal?: AbortSignal;
23
+ [key: string]: unknown;
24
+ }): Promise<TOutput>;
25
+ }
26
+ interface LangchainDispatchOptions<TScenario extends Scenario, TArtifact> {
27
+ /** The Runnable (or RunnableSequence, or anything `.invoke`able). */
28
+ runnable: RunnableLike<TScenario, TArtifact>;
29
+ /**
30
+ * Optional config merged into every `invoke` call — tags, metadata,
31
+ * callbacks, runName. The substrate's per-cell `AbortSignal` is merged
32
+ * in last, so any `signal` key supplied here is always overridden.
33
+ */
34
+ config?: Record<string, unknown>;
35
+ }
36
+ /**
37
+ * Wrap a LangChain Runnable as a `Dispatch`. The Runnable's input must
38
+ * accept the scenario (typically you'll shape it via
39
+ * `RunnableMap`/`RunnableLambda` upstream); its output is the artifact
40
+ * the engine + judges see.
41
+ *
42
+ * @example
43
+ * const chain = prompt.pipe(model).pipe(parser)
44
+ * const dispatch = langchainDispatch({ runnable: chain })
45
+ * await runEval({ scenarios, dispatch, judges: [...], storage, runDir })
46
+ */
47
+ declare function langchainDispatch<TScenario extends Scenario, TArtifact>(opts: LangchainDispatchOptions<TScenario, TArtifact>): DispatchFn<TScenario, TArtifact>;
48
+ interface LangchainJudgeOptions<TArtifact, TScenario extends Scenario> {
49
+ /** Judge name; appears in `CampaignResult.aggregates.byJudge`. */
50
+ name: string;
51
+ /**
52
+ * Dimensions the judge scores. Used both for the judge's own prompt
53
+ * (if it reads them) and for the aggregator's `byJudge` rollup.
54
+ */
55
+ dimensions: {
56
+ key: string;
57
+ description: string;
58
+ }[];
59
+ /**
60
+ * A Runnable that takes `{ artifact, scenario }` and returns a
61
+ * partial `JudgeScore` — the dimensions map at minimum. `composite`
62
+ * is computed by averaging `dimensions` when the Runnable doesn't
63
+ * provide it; `notes` defaults to an empty string.
64
+ */
65
+ runnable: RunnableLike<{
66
+ artifact: TArtifact;
67
+ scenario: TScenario;
68
+ }, Partial<JudgeScore>>;
69
+ appliesTo?: (scenario: TScenario) => boolean;
70
+ }
71
+ /**
72
+ * Wrap a LangChain Runnable as a `JudgeConfig`. The Runnable can be any
73
+ * structured-output chain (e.g. `prompt.pipe(model).pipe(StructuredOutputParser)`)
74
+ * that returns a `Partial<JudgeScore>`.
75
+ *
76
+ * The substrate's invariant — throw on judge failure, never silently
77
+ * fold errors into a zero — is preserved: any error from the Runnable
78
+ * propagates and the substrate records a failed cell.
79
+ *
80
+ * @example
81
+ * const scorePrompt = ChatPromptTemplate.fromTemplate(`...`)
82
+ * const judgeChain = scorePrompt.pipe(judgeModel).pipe(jsonParser)
83
+ * const judge = langchainJudge({
84
+ * name: 'marketing-quality',
85
+ * dimensions: [{ key: 'hook_strength', description: '...' }, ...],
86
+ * runnable: judgeChain,
87
+ * })
88
+ */
89
+ declare function langchainJudge<TArtifact, TScenario extends Scenario>(opts: LangchainJudgeOptions<TArtifact, TScenario>): JudgeConfig<TArtifact, TScenario>;
90
+
91
+ export { type LangchainDispatchOptions, type LangchainJudgeOptions, type RunnableLike, langchainDispatch, langchainJudge };
@@ -0,0 +1,34 @@
1
+ import "../chunk-NSBPE2FW.js";
2
+
3
+ // src/adapters/langchain.ts
4
+ function langchainDispatch(opts) {
5
+ return async (scenario, ctx) => {
6
+ return opts.runnable.invoke(scenario, {
7
+ ...opts.config,
8
+ signal: ctx.signal
9
+ });
10
+ };
11
+ }
12
+ function langchainJudge(opts) {
13
+ return {
14
+ name: opts.name,
15
+ dimensions: opts.dimensions,
16
+ appliesTo: opts.appliesTo,
17
+ async score({ artifact, scenario, signal }) {
18
+ const result = await opts.runnable.invoke({ artifact, scenario }, { signal });
19
+ const dims = result.dimensions ?? {};
20
+ const dimValues = Object.values(dims);
21
+ const composite = result.composite ?? (dimValues.length > 0 ? dimValues.reduce((a, b) => a + b, 0) / dimValues.length : 0);
22
+ return {
23
+ dimensions: dims,
24
+ composite,
25
+ notes: result.notes ?? ""
26
+ };
27
+ }
28
+ };
29
+ }
30
+ export {
31
+ langchainDispatch,
32
+ langchainJudge
33
+ };
34
+ //# sourceMappingURL=langchain.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../../src/adapters/langchain.ts"],"sourcesContent":["/**\n * # `@tangle-network/agent-eval/adapters/langchain` — wrap any LangChain\n * Runnable as a `Dispatch` (or `JudgeConfig`).\n *\n * **Why structural, not pinned**: we don't depend on `@langchain/core` at\n * install time. The adapter accepts anything with the canonical LangChain\n * Runnable shape (`invoke(input, config?)`), so it works with their\n * `Runnable`, `RunnableSequence`, `RunnableMap`, `RunnablePassthrough`,\n * and any custom Runnable-shaped object. No version pin, no peer dep,\n * no bundle-bloat risk.\n *\n * **Why this exists**: the most-asked question from foreign agent\n * builders is \"I'm already on LangChain — how do I plug in?\". The answer\n * is one function. Wrap your existing Runnable, pass the Dispatch into\n * `runEval` / `runImprovementLoop`, ship.\n */\n\nimport type { Dispatch, JudgeConfig, JudgeScore, Scenario } from '../contract'\n\n// ── Minimal structural type ──────────────────────────────────────────\n//\n// Whatever has `invoke(input, config?)` qualifies. We accept any\n// config shape (LangChain's RunnableConfig has many optional fields)\n// — the only thing we need is the AbortSignal seam, which LangChain's\n// RunnableConfig already supports as `signal?: AbortSignal`.\n\nexport interface RunnableLike<TInput, TOutput> {\n invoke(input: TInput, config?: { signal?: AbortSignal; [key: string]: unknown }): Promise<TOutput>\n}\n\n// ── Dispatch wrapper ────────────────────────────────────────────────\n\nexport interface LangchainDispatchOptions<TScenario extends Scenario, TArtifact> {\n /** The Runnable (or RunnableSequence, or anything `.invoke`able). */\n runnable: RunnableLike<TScenario, TArtifact>\n /**\n * Optional config merged into every `invoke` call — tags, metadata,\n * callbacks, runName. 
The substrate's per-cell `AbortSignal` is\n * always merged in last (and so wins).\n */\n config?: Record<string, unknown>\n}\n\n/**\n * Wrap a LangChain Runnable as a `Dispatch`. The Runnable's input must\n * accept the scenario (typically you'll shape it via\n * `RunnableMap`/`RunnableLambda` upstream); its output is the artifact\n * the engine + judges see.\n *\n * @example\n * const chain = prompt.pipe(model).pipe(parser)\n * const dispatch = langchainDispatch({ runnable: chain })\n * await runEval({ scenarios, dispatch, judges: [...], storage, runDir })\n */\nexport function langchainDispatch<TScenario extends Scenario, TArtifact>(\n opts: LangchainDispatchOptions<TScenario, TArtifact>,\n): Dispatch<TScenario, TArtifact> {\n return async (scenario, ctx) => {\n return opts.runnable.invoke(scenario, {\n ...opts.config,\n signal: ctx.signal,\n })\n }\n}\n\n// ── Judge wrapper ───────────────────────────────────────────────────\n\nexport interface LangchainJudgeOptions<TArtifact, TScenario extends Scenario> {\n /** Judge name; appears in `CampaignResult.aggregates.byJudge`. */\n name: string\n /**\n * Dimensions the judge scores. Used both for the judge's own prompt\n * (if it reads them) and for the aggregator's `byJudge` rollup.\n */\n dimensions: { key: string; description: string }[]\n /**\n * A Runnable that takes `{ artifact, scenario }` and returns a\n * partial `JudgeScore` — the dimensions map at minimum. `composite`\n * is computed by averaging `dimensions` when the Runnable doesn't\n * provide it; `notes` defaults to an empty string.\n */\n runnable: RunnableLike<{ artifact: TArtifact; scenario: TScenario }, Partial<JudgeScore>>\n appliesTo?: (scenario: TScenario) => boolean\n}\n\n/**\n * Wrap a LangChain Runnable as a `JudgeConfig`. The Runnable can be any\n * structured-output chain (e.g. 
`prompt.pipe(model).pipe(StructuredOutputParser)`)\n * that returns a `Partial<JudgeScore>`.\n *\n * The substrate's invariant — throw on judge failure, never silently\n * fold errors into a zero — is preserved: any error from the Runnable\n * propagates and the substrate records a failed cell.\n *\n * @example\n * const scorePrompt = ChatPromptTemplate.fromTemplate(`...`)\n * const judgeChain = scorePrompt.pipe(judgeModel).pipe(jsonParser)\n * const judge = langchainJudge({\n * name: 'marketing-quality',\n * dimensions: [{ key: 'hook_strength', description: '...' }, ...],\n * runnable: judgeChain,\n * })\n */\nexport function langchainJudge<TArtifact, TScenario extends Scenario>(\n opts: LangchainJudgeOptions<TArtifact, TScenario>,\n): JudgeConfig<TArtifact, TScenario> {\n return {\n name: opts.name,\n dimensions: opts.dimensions,\n appliesTo: opts.appliesTo,\n async score({ artifact, scenario, signal }) {\n const result = await opts.runnable.invoke({ artifact, scenario }, { signal })\n const dims = (result.dimensions ?? {}) as Record<string, number>\n const dimValues = Object.values(dims)\n const composite = result.composite ?? (dimValues.length > 0 ? dimValues.reduce((a, b) => a + b, 0) / dimValues.length : 0)\n return {\n dimensions: dims,\n composite,\n notes: result.notes ?? '',\n }\n },\n }\n}\n"],"mappings":";;;AAsDO,SAAS,kBACd,MACgC;AAChC,SAAO,OAAO,UAAU,QAAQ;AAC9B,WAAO,KAAK,SAAS,OAAO,UAAU;AAAA,MACpC,GAAG,KAAK;AAAA,MACR,QAAQ,IAAI;AAAA,IACd,CAAC;AAAA,EACH;AACF;AAwCO,SAAS,eACd,MACmC;AACnC,SAAO;AAAA,IACL,MAAM,KAAK;AAAA,IACX,YAAY,KAAK;AAAA,IACjB,WAAW,KAAK;AAAA,IAChB,MAAM,MAAM,EAAE,UAAU,UAAU,OAAO,GAAG;AAC1C,YAAM,SAAS,MAAM,KAAK,SAAS,OAAO,EAAE,UAAU,SAAS,GAAG,EAAE,OAAO,CAAC;AAC5E,YAAM,OAAQ,OAAO,cAAc,CAAC;AACpC,YAAM,YAAY,OAAO,OAAO,IAAI;AACpC,YAAM,YAAY,OAAO,cAAc,UAAU,SAAS,IAAI,UAAU,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,UAAU,SAAS;AACxH,aAAO;AAAA,QACL,YAAY;AAAA,QACZ;AAAA,QACA,OAAO,OAAO,SAAS;AAAA,MACzB;AAAA,IACF;AAAA,EACF;AACF;","names":[]}
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.43.2",
5
+ "version": "0.44.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -0,0 +1,190 @@
1
+ # Quickstart — self-improvement loop for any agent (15 minutes)
2
+
3
+ This is the standalone walkthrough that mirrors
4
+ `examples/foreign-agent-quickstart/`. Read this first; copy the runnable
5
+ example second.
6
+
7
+ ## What you get
8
+
9
+ After 15 minutes you have a closed self-improvement loop running
10
+ against your agent — measured, gated, and reproducible — with no
11
+ Tangle sandbox, no Tangle account, and no hosted infrastructure.
12
+
13
+ ## Install
14
+
15
+ ```sh
16
+ npm i @tangle-network/agent-eval@^0.44.0
17
+ ```
18
+
19
+ The package's `@tangle-network/sandbox` peer dependency is optional (as of
20
+ 0.44.0). Foreign consumers can install agent-eval and run the full LAND
21
+ tier (the local-only tier in the table under "When you want more") without our sandbox or its dependencies.
22
+
23
+ ## Five types, four functions
24
+
25
+ ```ts
26
+ import {
27
+ // Types
28
+ type Scenario, // what you evaluate against (id + kind + your fields)
29
+ type Dispatch, // your agent, wrapped as one function
30
+ type JudgeConfig, // pluggable dimensional scorer
31
+ type Mutator, // proposes a next surface
32
+ type Gate, // promotion guard
33
+
34
+ // Functions
35
+ runEval,
36
+ runCampaign,
37
+ runImprovementLoop,
38
+ defaultProductionGate,
39
+
40
+ // Storage
41
+ fsCampaignStorage,
42
+ inMemoryCampaignStorage,
43
+ } from '@tangle-network/agent-eval/contract'
44
+ ```
45
+
46
+ Every export above is committed under semver. New minors only ADD;
47
+ nothing here changes shape in a 0.x minor.
48
+
49
+ ## Three steps to wire your agent
50
+
51
+ ### 1. Scenarios
52
+
53
+ ```ts
54
+ interface MarketingScenario extends Scenario {
55
+ blurb: string
56
+ surface: 'landing-hero' | 'tweet' | 'email-subject'
57
+ audience: string
58
+ }
59
+
60
+ const scenarios: MarketingScenario[] = [
61
+ { id: 's1', kind: 'marketing-rewrite', blurb: '...', surface: 'tweet', audience: '...' },
62
+ // ...
63
+ ]
64
+ ```
65
+
66
+ ### 2. Wrap your agent as `Dispatch`
67
+
68
+ ```ts
69
+ const dispatch: Dispatch<MarketingScenario, MarketingArtifact> = async (scenario, ctx) => {
70
+ const rewrite = await callYourAgent(scenario, { signal: ctx.signal })
71
+ return { rewrite, modelUsed: '...' }
72
+ }
73
+ ```
74
+
75
+ `ctx` carries `signal` (cancellation), `trace` (write spans), `artifacts`
76
+ (write blobs), `cost` (token + $ meter). Use them or ignore them.
77
+
78
+ ### 3. Bring a judge
79
+
80
+ ```ts
81
+ const judge: JudgeConfig<MarketingArtifact, MarketingScenario> = {
82
+ name: 'marketing-quality',
83
+ dimensions: [
84
+ { key: 'hook_strength', description: '...' },
85
+ { key: 'voice_match', description: '...' },
86
+ { key: 'cta_clarity', description: '...' },
87
+ { key: 'factual_grounding', description: '...' },
88
+ ],
89
+ async score({ artifact, scenario, signal }) {
90
+ // LLM call, heuristic, ensemble — anything. Return JudgeScore.
91
+ return { dimensions: { ... }, composite: 0.72, notes: '...' }
92
+ },
93
+ }
94
+ ```
95
+
96
+ Throw on failure; the substrate records it as a failed cell. No silent
97
+ zeros.
98
+
99
+ ## Baseline
100
+
101
+ ```ts
102
+ const baseline = await runEval({
103
+ scenarios,
104
+ dispatch,
105
+ judges: [judge],
106
+ storage: inMemoryCampaignStorage(),
107
+ runDir: 'mem://my-baseline',
108
+ })
109
+
110
+ const score = Object.values(baseline.aggregates.byScenario)
111
+ .reduce((sum, s) => sum + s.meanComposite, 0) / scenarios.length
112
+
113
+ console.log(`Baseline composite: ${score.toFixed(3)}`)
114
+ ```
115
+
116
+ ## Self-improvement loop
117
+
118
+ ```ts
119
+ import { gepaDriver, defaultProductionGate } from '@tangle-network/agent-eval/contract'
120
+
121
+ const result = await runImprovementLoop({
122
+ scenarios: trainScenarios,
123
+ baselineSurface,
124
+ dispatchWithSurface: (surface, scenario, ctx) =>
125
+ runYourAgent({ systemPrompt: surface as string }, scenario, ctx),
126
+ driver: gepaDriver({
127
+ llm: { apiKey: process.env.OPENAI_API_KEY, baseUrl: '...' },
128
+ model: 'gpt-4o-mini',
129
+ target: 'marketing copywriting system prompt',
130
+ mutationPrimitives: [
131
+ 'Tighten the hook: lead with the concrete user outcome.',
132
+ 'Replace generic adjectives with specific verbs.',
133
+ // ...
134
+ ],
135
+ }),
136
+ judges: [judge],
137
+ populationSize: 2,
138
+ maxGenerations: 3,
139
+ holdoutScenarios,
140
+ gate: defaultProductionGate({
141
+ holdoutScenarios,
142
+ deltaThreshold: 0.05,
143
+ }),
144
+ autoOnPromote: 'none',
145
+ storage: inMemoryCampaignStorage(),
146
+ runDir: 'mem://my-improve',
147
+ })
148
+
149
+ if (result.gateResult.decision === 'ship') {
150
+ // Deploy result.winnerSurface — we don't push it for you.
151
+ }
152
+ ```
153
+
154
+ The gate decision is `'ship'` | `'hold'` | `'need_more_work'` |
155
+ `'model_ceiling'` | `'arch_ceiling'`. You define what each means in
156
+ your deploy pipeline.
157
+
158
+ ## What you control
159
+
160
+ - The agent (any framework, any model, any backend).
161
+ - The judge (LLM, heuristic, ensemble; we don't pick).
162
+ - The mutation strategy (`gepaDriver` for reflective LLM mutation,
163
+ `evolutionaryDriver({ mutator })` for population search, or
164
+ implement `ImprovementDriver` directly).
165
+ - The gate (compose `defaultProductionGate` with custom checks via
166
+ `composeGate`).
167
+ - The deploy step (`autoOnPromote: 'pr'` opens a GitHub PR with the
168
+ winner; `'none'` returns the surface and you ship however you ship).
169
+
170
+ ## What this does NOT install
171
+
172
+ - No `@tangle-network/sandbox` — nothing runs in a Tangle sandbox.
173
+ - No hosted orchestrator — traces, artifacts, judge scores stay on
174
+ your machine (or in `inMemoryCampaignStorage` for Workers/edge).
175
+ - No daemons — `runEval` and `runImprovementLoop` complete in-process
176
+ and return.
177
+
178
+ ## When you want more
179
+
180
+ The wedge doc (`docs/design/external-agent-wedge.md`) lays out three
181
+ graduated tiers:
182
+
183
+ | Tier | What you do | What you get |
184
+ |---|---|---|
185
+ | **LAND** (this quickstart) | `npm i @tangle-network/agent-eval`, wrap dispatch + judge, run loops | Local artifacts; full self-improvement; no Tangle infra |
186
+ | **EXPAND** | Point trace/eval data at our hosted orchestrator | Hosted dashboards, cross-run intelligence, billing on data routed to us |
187
+ | **PLATFORM** | Move execution into our sandbox | Substrate + orchestrator data pre-wired; sandbox usage billing |
188
+
189
+ Each tier is opt-in. EXPAND and PLATFORM build on the same primitives;
190
+ upgrading is adding configuration, not rewriting your wiring.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tangle-network/agent-eval",
3
- "version": "0.44.0",
3
+ "version": "0.44.1",
4
4
  "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
5
5
  "homepage": "https://github.com/tangle-network/agent-eval#readme",
6
6
  "repository": {
@@ -109,6 +109,11 @@
109
109
  "import": "./dist/contract/index.js",
110
110
  "default": "./dist/contract/index.js"
111
111
  },
112
+ "./adapters/langchain": {
113
+ "types": "./dist/adapters/langchain.d.ts",
114
+ "import": "./dist/adapters/langchain.js",
115
+ "default": "./dist/adapters/langchain.js"
116
+ },
112
117
  "./openapi.json": {
113
118
  "default": "./dist/openapi.json"
114
119
  }