npm - @tangle-network/agent-eval - Versions diffs - 0.44.0 → 0.44.1 - Mend

@tangle-network/agent-eval 0.44.0 → 0.44.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/dist/adapters/langchain.d.ts +91 -0
package/dist/adapters/langchain.js +34 -0
package/dist/adapters/langchain.js.map +1 -0
package/dist/openapi.json +1 -1
package/docs/quickstart-external.md +190 -0
package/package.json +6 -1

package/dist/adapters/langchain.d.ts ADDED Viewed

@@ -0,0 +1,91 @@
+import { S as Scenario, n as JudgeScore, D as DispatchFn, J as JudgeConfig } from '../types-DToGONFA.js';
+/**
+ * # `@tangle-network/agent-eval/adapters/langchain` — wrap any LangChain
+ * Runnable as a `Dispatch` (or `JudgeConfig`).
+ *
+ * **Why structural, not pinned**: we don't depend on `@langchain/core` at
+ * install time. The adapter accepts anything with the canonical LangChain
+ * Runnable shape (`invoke(input, config?)`), so it works with their
+ * `Runnable`, `RunnableSequence`, `RunnableMap`, `RunnablePassthrough`,
+ * and any custom Runnable-shaped object. No version pin, no peer dep,
+ * no bundle-bloat risk.
+ *
+ * **Why this exists**: the most-asked question from foreign agent
+ * builders is "I'm already on LangChain — how do I plug in?". The answer
+ * is one function. Wrap your existing Runnable, pass the Dispatch into
+ * `runEval` / `runImprovementLoop`, ship.
+ */
+interface RunnableLike<TInput, TOutput> { // Minimal structural shape the adapters need: any object exposing `invoke`.
+    invoke(input: TInput, config?: { // `config` is optional; only `signal` is typed explicitly
+        signal?: AbortSignal; // abort signal; the adapters in this module pass one on each invoke call
+        [key: string]: unknown; // any other config keys are accepted and left untyped
+    }): Promise<TOutput>; // resolves to the Runnable's output
+}
+interface LangchainDispatchOptions<TScenario extends Scenario, TArtifact> { // Options accepted by `langchainDispatch`.
+    /** The Runnable (or RunnableSequence, or anything `.invoke`able). */
+    runnable: RunnableLike<TScenario, TArtifact>; // invoked with the scenario itself as input
+    /**
+     * Optional config merged into every `invoke` call — tags, metadata,
+     * callbacks, runName. The substrate's per-cell `AbortSignal` is
+     * always merged in last (and so wins).
+     */
+    config?: Record<string, unknown>; // shallow-spread into the invoke config; a `signal` key set here is overridden
+}
+/**
+ * Wrap a LangChain Runnable as a `Dispatch`. The Runnable's input must
+ * accept the scenario (typically you'll shape it via
+ * `RunnableMap`/`RunnableLambda` upstream); its output is the artifact
+ * the engine + judges see.
+ *
+ * @param opts - The Runnable to call, plus optional config that is merged
+ *   into each `invoke` call.
+ * @returns A dispatch function that invokes `opts.runnable` with the
+ *   scenario and resolves to the Runnable's output.
+ *
+ * @example
+ *   const chain = prompt.pipe(model).pipe(parser)
+ *   const dispatch = langchainDispatch({ runnable: chain })
+ *   await runEval({ scenarios, dispatch, judges: [...], storage, runDir })
+ */
+declare function langchainDispatch<TScenario extends Scenario, TArtifact>(opts: LangchainDispatchOptions<TScenario, TArtifact>): DispatchFn<TScenario, TArtifact>;
+interface LangchainJudgeOptions<TArtifact, TScenario extends Scenario> { // Options accepted by `langchainJudge`.
+    /** Judge name; appears in `CampaignResult.aggregates.byJudge`. */
+    name: string;
+    /**
+     * Dimensions the judge scores. Used both for the judge's own prompt
+     * (if it reads them) and for the aggregator's `byJudge` rollup.
+     */
+    dimensions: {
+        key: string;
+        description: string;
+    }[];
+    /**
+     * A Runnable that takes `{ artifact, scenario }` and returns a
+     * partial `JudgeScore` — the dimensions map at minimum. `composite`
+     * is computed by averaging `dimensions` when the Runnable doesn't
+     * provide it; `notes` defaults to an empty string.
+     */
+    runnable: RunnableLike<{
+        artifact: TArtifact;
+        scenario: TScenario;
+    }, Partial<JudgeScore>>;
+    appliesTo?: (scenario: TScenario) => boolean; // optional predicate, forwarded as-is to the returned JudgeConfig; presumably selects which scenarios this judge scores — TODO confirm
+}
+/**
+ * Wrap a LangChain Runnable as a `JudgeConfig`. The Runnable can be any
+ * structured-output chain (e.g. `prompt.pipe(model).pipe(StructuredOutputParser)`)
+ * that returns a `Partial<JudgeScore>`.
+ *
+ * The substrate's invariant — throw on judge failure, never silently
+ * fold errors into a zero — is preserved: any error from the Runnable
+ * propagates and the substrate records a failed cell.
+ *
+ * NOTE(review): when the Runnable resolves with neither `composite` nor
+ * any dimension values, the implementation falls back to a `composite`
+ * of 0 instead of throwing — confirm that is intended alongside the
+ * invariant described above.
+ *
+ * @param opts - Judge name, scored dimensions, the scoring Runnable, and
+ *   an optional `appliesTo` predicate.
+ * @returns A `JudgeConfig` whose `score` invokes the Runnable with
+ *   `{ artifact, scenario }` and fills in `composite` / `notes` when the
+ *   Runnable omits them.
+ *
+ * @example
+ *   const scorePrompt = ChatPromptTemplate.fromTemplate(`...`)
+ *   const judgeChain = scorePrompt.pipe(judgeModel).pipe(jsonParser)
+ *   const judge = langchainJudge({
+ *     name: 'marketing-quality',
+ *     dimensions: [{ key: 'hook_strength', description: '...' }, ...],
+ *     runnable: judgeChain,
+ *   })
+ */
+declare function langchainJudge<TArtifact, TScenario extends Scenario>(opts: LangchainJudgeOptions<TArtifact, TScenario>): JudgeConfig<TArtifact, TScenario>;
+export { type LangchainDispatchOptions, type LangchainJudgeOptions, type RunnableLike, langchainDispatch, langchainJudge };

package/dist/adapters/langchain.js ADDED Viewed

@@ -0,0 +1,34 @@
+import "../chunk-NSBPE2FW.js";
+// src/adapters/langchain.ts
+function langchainDispatch(opts) { // Adapts opts.runnable into an async (scenario, ctx) dispatch function.
+  return async (scenario, ctx) => {
+    return opts.runnable.invoke(scenario, { // the scenario is handed to the Runnable unchanged as its input
+      ...opts.config, // caller-supplied config is spread first (spreading undefined adds nothing)
+      signal: ctx.signal // written last, so it replaces any `signal` key coming from opts.config
+    });
+  };
+}
+function langchainJudge(opts) { // Builds a judge config object whose score() delegates to opts.runnable.
+  return {
+    name: opts.name,
+    dimensions: opts.dimensions,
+    appliesTo: opts.appliesTo, // forwarded as-is; undefined when the caller did not supply one
+    async score({ artifact, scenario, signal }) {
+      const result = await opts.runnable.invoke({ artifact, scenario }, { signal }); // no try/catch here: a rejection from the Runnable propagates to the caller
+      const dims = result.dimensions ?? {}; // a missing dimensions map is treated as empty
+      const dimValues = Object.values(dims); // NOTE(review): values are summed below without validation — assumes every dimension value is a number; confirm the Runnable's parser guarantees that
+      const composite = result.composite ?? (dimValues.length > 0 ? dimValues.reduce((a, b) => a + b, 0) / dimValues.length : 0); // Runnable's composite when not null/undefined, else the mean of the dimension values, else 0 when there are none
+      return {
+        dimensions: dims,
+        composite,
+        notes: result.notes ?? "" // notes default to an empty string
+      };
+    }
+  };
+}
+export {
+  langchainDispatch,
+  langchainJudge
+};
+//# sourceMappingURL=langchain.js.map

package/dist/adapters/langchain.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"sources":["../../src/adapters/langchain.ts"],"sourcesContent":["/**\n * # `@tangle-network/agent-eval/adapters/langchain` — wrap any LangChain\n * Runnable as a `Dispatch` (or `JudgeConfig`).\n *\n * **Why structural, not pinned**: we don't depend on `@langchain/core` at\n * install time. The adapter accepts anything with the canonical LangChain\n * Runnable shape (`invoke(input, config?)`), so it works with their\n * `Runnable`, `RunnableSequence`, `RunnableMap`, `RunnablePassthrough`,\n * and any custom Runnable-shaped object. No version pin, no peer dep,\n * no bundle-bloat risk.\n *\n * **Why this exists**: the most-asked question from foreign agent\n * builders is \"I'm already on LangChain — how do I plug in?\". The answer\n * is one function. Wrap your existing Runnable, pass the Dispatch into\n * `runEval` / `runImprovementLoop`, ship.\n */\n\nimport type { Dispatch, JudgeConfig, JudgeScore, Scenario } from '../contract'\n\n// ── Minimal structural type ──────────────────────────────────────────\n//\n// Whatever has `invoke(input, config?)` qualifies. We accept any\n// config shape (LangChain's RunnableConfig has many optional fields)\n// — the only thing we need is the AbortSignal seam, which LangChain's\n// RunnableConfig already supports as `signal?: AbortSignal`.\n\nexport interface RunnableLike<TInput, TOutput> {\n invoke(input: TInput, config?: { signal?: AbortSignal; [key: string]: unknown }): Promise<TOutput>\n}\n\n// ── Dispatch wrapper ────────────────────────────────────────────────\n\nexport interface LangchainDispatchOptions<TScenario extends Scenario, TArtifact> {\n /** The Runnable (or RunnableSequence, or anything `.invoke`able). */\n runnable: RunnableLike<TScenario, TArtifact>\n /**\n * Optional config merged into every `invoke` call — tags, metadata,\n * callbacks, runName. 
The substrate's per-cell `AbortSignal` is\n * always merged in last (and so wins).\n */\n config?: Record<string, unknown>\n}\n\n/**\n * Wrap a LangChain Runnable as a `Dispatch`. The Runnable's input must\n * accept the scenario (typically you'll shape it via\n * `RunnableMap`/`RunnableLambda` upstream); its output is the artifact\n * the engine + judges see.\n *\n * @example\n * const chain = prompt.pipe(model).pipe(parser)\n * const dispatch = langchainDispatch({ runnable: chain })\n * await runEval({ scenarios, dispatch, judges: [...], storage, runDir })\n */\nexport function langchainDispatch<TScenario extends Scenario, TArtifact>(\n opts: LangchainDispatchOptions<TScenario, TArtifact>,\n): Dispatch<TScenario, TArtifact> {\n return async (scenario, ctx) => {\n return opts.runnable.invoke(scenario, {\n ...opts.config,\n signal: ctx.signal,\n })\n }\n}\n\n// ── Judge wrapper ───────────────────────────────────────────────────\n\nexport interface LangchainJudgeOptions<TArtifact, TScenario extends Scenario> {\n /** Judge name; appears in `CampaignResult.aggregates.byJudge`. */\n name: string\n /**\n * Dimensions the judge scores. Used both for the judge's own prompt\n * (if it reads them) and for the aggregator's `byJudge` rollup.\n */\n dimensions: { key: string; description: string }[]\n /**\n * A Runnable that takes `{ artifact, scenario }` and returns a\n * partial `JudgeScore` — the dimensions map at minimum. `composite`\n * is computed by averaging `dimensions` when the Runnable doesn't\n * provide it; `notes` defaults to an empty string.\n */\n runnable: RunnableLike<{ artifact: TArtifact; scenario: TScenario }, Partial<JudgeScore>>\n appliesTo?: (scenario: TScenario) => boolean\n}\n\n/**\n * Wrap a LangChain Runnable as a `JudgeConfig`. The Runnable can be any\n * structured-output chain (e.g. 
`prompt.pipe(model).pipe(StructuredOutputParser)`)\n * that returns a `Partial<JudgeScore>`.\n *\n * The substrate's invariant — throw on judge failure, never silently\n * fold errors into a zero — is preserved: any error from the Runnable\n * propagates and the substrate records a failed cell.\n *\n * @example\n * const scorePrompt = ChatPromptTemplate.fromTemplate(`...`)\n * const judgeChain = scorePrompt.pipe(judgeModel).pipe(jsonParser)\n * const judge = langchainJudge({\n * name: 'marketing-quality',\n * dimensions: [{ key: 'hook_strength', description: '...' }, ...],\n * runnable: judgeChain,\n * })\n */\nexport function langchainJudge<TArtifact, TScenario extends Scenario>(\n opts: LangchainJudgeOptions<TArtifact, TScenario>,\n): JudgeConfig<TArtifact, TScenario> {\n return {\n name: opts.name,\n dimensions: opts.dimensions,\n appliesTo: opts.appliesTo,\n async score({ artifact, scenario, signal }) {\n const result = await opts.runnable.invoke({ artifact, scenario }, { signal })\n const dims = (result.dimensions ?? {}) as Record<string, number>\n const dimValues = Object.values(dims)\n const composite = result.composite ?? (dimValues.length > 0 ? dimValues.reduce((a, b) => a + b, 0) / dimValues.length : 0)\n return {\n dimensions: dims,\n composite,\n notes: result.notes ?? '',\n }\n },\n }\n}\n"],"mappings":";;;AAsDO,SAAS,kBACd,MACgC;AAChC,SAAO,OAAO,UAAU,QAAQ;AAC9B,WAAO,KAAK,SAAS,OAAO,UAAU;AAAA,MACpC,GAAG,KAAK;AAAA,MACR,QAAQ,IAAI;AAAA,IACd,CAAC;AAAA,EACH;AACF;AAwCO,SAAS,eACd,MACmC;AACnC,SAAO;AAAA,IACL,MAAM,KAAK;AAAA,IACX,YAAY,KAAK;AAAA,IACjB,WAAW,KAAK;AAAA,IAChB,MAAM,MAAM,EAAE,UAAU,UAAU,OAAO,GAAG;AAC1C,YAAM,SAAS,MAAM,KAAK,SAAS,OAAO,EAAE,UAAU,SAAS,GAAG,EAAE,OAAO,CAAC;AAC5E,YAAM,OAAQ,OAAO,cAAc,CAAC;AACpC,YAAM,YAAY,OAAO,OAAO,IAAI;AACpC,YAAM,YAAY,OAAO,cAAc,UAAU,SAAS,IAAI,UAAU,OAAO,CAAC,GAAG,MAAM,IAAI,GAAG,CAAC,IAAI,UAAU,SAAS;AACxH,aAAO;AAAA,QACL,YAAY;AAAA,QACZ;AAAA,QACA,OAAO,OAAO,SAAS;AAAA,MACzB;AAAA,IACF;AAAA,EACF;AACF;","names":[]}

package/dist/openapi.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "openapi": "3.1.0",
   "info": {
     "title": "@tangle-network/agent-eval — wire protocol",
-    "version": "0.43.2",
+    "version": "0.44.0",
     "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
     "contact": {
       "name": "Tangle Network",

package/docs/quickstart-external.md ADDED Viewed

@@ -0,0 +1,190 @@
+# Quickstart — self-improvement loop for any agent (15 minutes)
+The standalone walkthrough mirroring
+`examples/foreign-agent-quickstart/`. Read this first; copy the runnable
+example second.
+## What you get
+After 15 minutes you have a closed self-improvement loop running
+against your agent — measured, gated, and reproducible — with no
+Tangle sandbox, no Tangle account, and no hosted infrastructure.
+## Install
+```sh
+npm i @tangle-network/agent-eval@^0.44.0
+```
+The package's `@tangle-network/sandbox` peer is `optional` (as of
+0.44.0). Foreign consumers can install agent-eval and run the full LAND
+tier without our sandbox or its dependencies.
+## Five types, four functions
+```ts
+import {
+  // Types
+  type Scenario,        // what you evaluate against (id + kind + your fields)
+  type Dispatch,        // your agent, wrapped as one function
+  type JudgeConfig,     // pluggable dimensional scorer
+  type Mutator,         // proposes a next surface
+  type Gate,            // promotion guard
+  // Functions
+  runEval,
+  runCampaign,
+  runImprovementLoop,
+  defaultProductionGate,
+  // Storage
+  fsCampaignStorage,
+  inMemoryCampaignStorage,
+} from '@tangle-network/agent-eval/contract'
+```
+Every export above is committed under semver. New minors only ADD;
+nothing here changes shape in a 0.x minor.
+## Three steps to wire your agent
+### 1. Scenarios
+```ts
+interface MarketingScenario extends Scenario {
+  blurb: string
+  surface: 'landing-hero' | 'tweet' | 'email-subject'
+  audience: string
+}
+const scenarios: MarketingScenario[] = [
+  { id: 's1', kind: 'marketing-rewrite', blurb: '...', surface: 'tweet', audience: '...' },
+  // ...
+]
+```
+### 2. Wrap your agent as `Dispatch`
+```ts
+const dispatch: Dispatch<MarketingScenario, MarketingArtifact> = async (scenario, ctx) => {
+  const rewrite = await callYourAgent(scenario, { signal: ctx.signal })
+  return { rewrite, modelUsed: '...' }
+}
+```
+`ctx` carries `signal` (cancellation), `trace` (write spans), `artifacts`
+(write blobs), `cost` (token + $ meter). Use them or ignore them.
+### 3. Bring a judge
+```ts
+const judge: JudgeConfig<MarketingArtifact, MarketingScenario> = {
+  name: 'marketing-quality',
+  dimensions: [
+    { key: 'hook_strength', description: '...' },
+    { key: 'voice_match', description: '...' },
+    { key: 'cta_clarity', description: '...' },
+    { key: 'factual_grounding', description: '...' },
+  ],
+  async score({ artifact, scenario, signal }) {
+    // LLM call, heuristic, ensemble — anything. Return JudgeScore.
+    return { dimensions: { ... }, composite: 0.72, notes: '...' }
+  },
+}
+```
+Throw on failure; the substrate records it as a failed cell. No silent
+zeros.
+## Baseline
+```ts
+const baseline = await runEval({
+  scenarios,
+  dispatch,
+  judges: [judge],
+  storage: inMemoryCampaignStorage(),
+  runDir: 'mem://my-baseline',
+})
+const score = Object.values(baseline.aggregates.byScenario)
+  .reduce((sum, s) => sum + s.meanComposite, 0) / scenarios.length
+console.log(`Baseline composite: ${score.toFixed(3)}`)
+```
+## Self-improvement loop
+```ts
+import { gepaDriver, defaultProductionGate } from '@tangle-network/agent-eval/contract'
+const result = await runImprovementLoop({
+  scenarios: trainScenarios,
+  baselineSurface,
+  dispatchWithSurface: (surface, scenario, ctx) =>
+    runYourAgent({ systemPrompt: surface as string }, scenario, ctx),
+  driver: gepaDriver({
+    llm: { apiKey: process.env.OPENAI_API_KEY, baseUrl: '...' },
+    model: 'gpt-4o-mini',
+    target: 'marketing copywriting system prompt',
+    mutationPrimitives: [
+      'Tighten the hook: lead with the concrete user outcome.',
+      'Replace generic adjectives with specific verbs.',
+      // ...
+    ],
+  }),
+  judges: [judge],
+  populationSize: 2,
+  maxGenerations: 3,
+  holdoutScenarios,
+  gate: defaultProductionGate({
+    holdoutScenarios,
+    deltaThreshold: 0.05,
+  }),
+  autoOnPromote: 'none',
+  storage: inMemoryCampaignStorage(),
+  runDir: 'mem://my-improve',
+})
+if (result.gateResult.decision === 'ship') {
+  // Deploy result.winnerSurface — we don't push it for you.
+}
+```
+The gate decision is `'ship'` | `'hold'` | `'need_more_work'` |
+`'model_ceiling'` | `'arch_ceiling'`. You define what each means in
+your deploy pipeline.
+## What you control
+- The agent (any framework, any model, any backend).
+- The judge (LLM, heuristic, ensemble; we don't pick).
+- The mutation strategy (`gepaDriver` for reflective LLM mutation,
+  `evolutionaryDriver({ mutator })` for population search, or
+  implement `ImprovementDriver` directly).
+- The gate (compose `defaultProductionGate` with custom checks via
+  `composeGate`).
+- The deploy step (`autoOnPromote: 'pr'` opens a GitHub PR with the
+  winner; `'none'` returns the surface and you ship however you ship).
+## What this does NOT install
+- No `@tangle-network/sandbox` — nothing runs in a Tangle sandbox.
+- No hosted orchestrator — traces, artifacts, judge scores stay on
+  your machine (or in `inMemoryCampaignStorage` for Workers/edge).
+- No daemons — `runEval` and `runImprovementLoop` complete in-process
+  and return.
+## When you want more
+The wedge doc (`docs/design/external-agent-wedge.md`) lays out three
+graduated tiers:
+| Tier | What you do | What you get |
+|---|---|---|
+| **LAND** (this quickstart) | `npm i @tangle-network/agent-eval`, wrap dispatch + judge, run loops | Local artifacts; full self-improvement; no Tangle infra |
+| **EXPAND** | Point trace/eval data at our hosted orchestrator | Hosted dashboards, cross-run intelligence, billing on data routed to us |
+| **PLATFORM** | Move execution into our sandbox | Substrate + orchestrator data pre-wired; sandbox usage billing |
+Each tier is opt-in. EXPAND and PLATFORM build on the same primitives;
+upgrading is adding configuration, not rewriting your wiring.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.44.0",
+  "version": "0.44.1",
   "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {
@@ -109,6 +109,11 @@
       "import": "./dist/contract/index.js",
       "default": "./dist/contract/index.js"
     },
+    "./adapters/langchain": {
+      "types": "./dist/adapters/langchain.d.ts",
+      "import": "./dist/adapters/langchain.js",
+      "default": "./dist/adapters/langchain.js"
+    },
     "./openapi.json": {
       "default": "./dist/openapi.json"
     }