npm - @tangle-network/agent-eval - Versions diffs - 0.20.8 → 0.20.10 - Mend

@tangle-network/agent-eval 0.20.8 → 0.20.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31)

package/CHANGELOG.md +302 -0
package/LICENSE +21 -0
package/README.md +16 -9
package/dist/benchmarks/index.d.ts +1 -0
package/dist/benchmarks/index.js +12 -0
package/dist/benchmarks/index.js.map +1 -0
package/dist/chunk-42I2QC2L.js +219 -0
package/dist/chunk-42I2QC2L.js.map +1 -0
package/dist/{chunk-CJJSB6ZQ.js → chunk-LSR4IAYN.js} +90 -11
package/dist/chunk-LSR4IAYN.js.map +1 -0
package/dist/cli.js +1 -1
package/dist/index-1PZOtZFr.d.ts +290 -0
package/dist/index.d.ts +37 -298
package/dist/index.js +130 -252
package/dist/index.js.map +1 -1
package/dist/openapi.json +502 -0
package/dist/{sink-fetch-C0B8ximv.d.ts → sink-fetch-B1Yg4Til.d.ts} +1 -1
package/dist/telemetry/file.d.ts +1 -1
package/dist/telemetry/index.d.ts +2 -2
package/dist/telemetry/index.js.map +1 -1
package/dist/wire/index.js +1 -1
package/docs/concepts.md +4 -4
package/docs/knowledge-readiness.md +2 -2
package/docs/wire-protocol.md +3 -3
package/package.json +13 -5
package/dist/chunk-CJJSB6ZQ.js.map +0 -1
package/examples/benchmarks/README.md +0 -44
package/examples/benchmarks/gsm8k/index.ts +0 -126
package/examples/benchmarks/swebench-lite/index.ts +0 -178
package/examples/multi-shot-optimization/index.ts +0 -114
package/examples/same-sandbox-harness/index.ts +0 -63

package/dist/openapi.json ADDED Viewed

@@ -0,0 +1,502 @@
+{
+  "openapi": "3.1.0",
+  "info": {
+    "title": "@tangle-network/agent-eval — wire protocol",
+    "version": "0.20.10",
+    "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract from which cross-language clients (Python, Rust, Go) are generated.\n\nWire-protocol version: 1.0.0. The wire-protocol version is bumped on breaking changes to request/response schemas.",
+    "contact": {
+      "name": "Tangle Network",
+      "url": "https://github.com/tangle-network/agent-eval"
+    },
+    "license": {
+      "name": "MIT"
+    }
+  },
+  "servers": [
+    {
+      "url": "http://localhost:5005",
+      "description": "Local agent-eval serve"
+    }
+  ],
+  "components": {
+    "schemas": {
+      "JudgeRequest": {
+        "oneOf": [
+          {
+            "type": "object",
+            "additionalProperties": false,
+            "required": [
+              "rubricName",
+              "content"
+            ],
+            "properties": {
+              "rubricName": {
+                "type": "string",
+                "minLength": 1
+              },
+              "content": {
+                "type": "string",
+                "minLength": 1
+              },
+              "context": {
+                "type": "object",
+                "additionalProperties": true
+              },
+              "model": {
+                "type": "string"
+              }
+            }
+          },
+          {
+            "type": "object",
+            "additionalProperties": false,
+            "required": [
+              "rubric",
+              "content"
+            ],
+            "properties": {
+              "rubric": {
+                "$ref": "#/components/schemas/Rubric"
+              },
+              "content": {
+                "type": "string",
+                "minLength": 1
+              },
+              "context": {
+                "type": "object",
+                "additionalProperties": true
+              },
+              "model": {
+                "type": "string"
+              }
+            }
+          }
+        ],
+        "description": "Judge request. Provide exactly one of rubricName or rubric."
+      },
+      "Rubric": {
+        "type": "object",
+        "properties": {
+          "name": {
+            "type": "string",
+            "minLength": 1,
+            "description": "Stable name like \"anti-slop\" — used by clients to invoke this rubric."
+          },
+          "description": {
+            "type": "string",
+            "minLength": 1,
+            "description": "What this rubric measures. Shown in /v1/rubrics listing."
+          },
+          "systemPrompt": {
+            "type": "string",
+            "minLength": 1,
+            "description": "Instructs the judging LLM. Should explain the persona (e.g. \"senior engineer reviewing voice\"), what to score on, and what to return."
+          },
+          "dimensions": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/RubricDimension"
+            },
+            "minItems": 1,
+            "description": "Scoring axes. The composite score is a weighted sum of these."
+          },
+          "failureModes": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/FailureMode"
+            },
+            "default": [],
+            "description": "Patterns to detect; each detected mode appears in the result.failureModes list."
+          },
+          "wins": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/FailureMode"
+            },
+            "default": [],
+            "description": "Positive patterns; each detected one appears in the result.wins list."
+          }
+        },
+        "required": [
+          "name",
+          "description",
+          "systemPrompt",
+          "dimensions"
+        ],
+        "description": "Inline rubric definition. Mutually exclusive with `rubricName`."
+      },
+      "RubricDimension": {
+        "type": "object",
+        "properties": {
+          "id": {
+            "type": "string",
+            "minLength": 1,
+            "description": "Short stable id like \"buyer_quality\" — used as the key in scoring output."
+          },
+          "description": {
+            "type": "string",
+            "minLength": 1,
+            "description": "One-line plain-English meaning. Read by humans reviewing low scores."
+          },
+          "weight": {
+            "type": "number",
+            "minimum": 0,
+            "default": 1,
+            "description": "Relative weight in the composite score. Default 1; 0 disables."
+          },
+          "min": {
+            "type": "number",
+            "default": 0,
+            "description": "Lower bound of valid score for this dimension."
+          },
+          "max": {
+            "type": "number",
+            "default": 1,
+            "description": "Upper bound of valid score for this dimension."
+          }
+        },
+        "required": [
+          "id",
+          "description"
+        ]
+      },
+      "FailureMode": {
+        "type": "object",
+        "properties": {
+          "id": {
+            "type": "string",
+            "minLength": 1,
+            "description": "Short stable id like \"ai-cadence\" — used in detection lists."
+          },
+          "description": {
+            "type": "string",
+            "minLength": 1,
+            "description": "Plain-English description of the pattern to detect: a failure pattern, or a positive pattern when this schema is used for `wins`."
+          }
+        },
+        "required": [
+          "id",
+          "description"
+        ]
+      },
+      "JudgeResult": {
+        "type": "object",
+        "properties": {
+          "composite": {
+            "type": "number",
+            "minimum": 0,
+            "maximum": 1,
+            "description": "Weighted combination of the dimension scores; the composite itself lies in 0..1. The single number to gate on."
+          },
+          "dimensions": {
+            "type": "object",
+            "additionalProperties": {
+              "type": "number"
+            },
+            "description": "Per-dimension score, keyed by RubricDimension.id."
+          },
+          "failureModes": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            },
+            "default": [],
+            "description": "Failure-mode ids detected in the content (subset of rubric.failureModes ids)."
+          },
+          "wins": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            },
+            "default": [],
+            "description": "Win ids detected in the content (subset of rubric.wins ids)."
+          },
+          "rationale": {
+            "type": "string",
+            "description": "Plain-English explanation of the score. Surfaced to the human reviewer."
+          },
+          "rubricVersion": {
+            "type": "string",
+            "description": "Stable hash of the rubric used. Scores are only comparable across runs when this matches."
+          },
+          "model": {
+            "type": "string",
+            "description": "Model that produced the judgement, for reproducibility."
+          },
+          "durationMs": {
+            "type": "integer",
+            "minimum": 0,
+            "description": "End-to-end wall time for this call."
+          }
+        },
+        "required": [
+          "composite",
+          "dimensions",
+          "rationale",
+          "rubricVersion",
+          "model",
+          "durationMs"
+        ]
+      },
+      "ListRubricsResponse": {
+        "type": "object",
+        "properties": {
+          "rubrics": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/RubricInfo"
+            }
+          }
+        },
+        "required": [
+          "rubrics"
+        ]
+      },
+      "RubricInfo": {
+        "type": "object",
+        "properties": {
+          "name": {
+            "type": "string",
+            "description": "Pass this to /v1/judge as `rubricName`."
+          },
+          "description": {
+            "type": "string",
+            "description": "What this rubric measures."
+          },
+          "dimensions": {
+            "type": "array",
+            "items": {
+              "type": "object",
+              "properties": {
+                "id": {
+                  "type": "string"
+                },
+                "description": {
+                  "type": "string"
+                },
+                "weight": {
+                  "type": "number"
+                }
+              },
+              "required": [
+                "id",
+                "description",
+                "weight"
+              ]
+            },
+            "description": "The scoring axes this rubric uses, with weights."
+          },
+          "failureModes": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            },
+            "default": [],
+            "description": "Failure-mode ids this rubric detects."
+          },
+          "rubricVersion": {
+            "type": "string",
+            "description": "Stable hash — match this to compare scores across runs."
+          }
+        },
+        "required": [
+          "name",
+          "description",
+          "dimensions",
+          "rubricVersion"
+        ]
+      },
+      "VersionResponse": {
+        "type": "object",
+        "properties": {
+          "package": {
+            "type": "string",
+            "description": "Package name (always \"@tangle-network/agent-eval\")."
+          },
+          "version": {
+            "type": "string",
+            "description": "Semver of the running server. Match your client to this."
+          },
+          "wireVersion": {
+            "type": "string",
+            "description": "Wire-protocol semver. Bumps separately from package version when the schema changes."
+          },
+          "apiSurface": {
+            "type": "array",
+            "items": {
+              "type": "string"
+            },
+            "description": "List of supported method names."
+          }
+        },
+        "required": [
+          "package",
+          "version",
+          "wireVersion",
+          "apiSurface"
+        ]
+      },
+      "HealthResponse": {
+        "type": "object",
+        "properties": {
+          "status": {
+            "type": "string",
+            "enum": [
+              "ok"
+            ]
+          },
+          "uptimeSec": {
+            "type": "number"
+          }
+        },
+        "required": [
+          "status",
+          "uptimeSec"
+        ]
+      },
+      "ErrorResponse": {
+        "type": "object",
+        "properties": {
+          "error": {
+            "type": "object",
+            "properties": {
+              "code": {
+                "type": "string",
+                "description": "Machine-readable code: \"validation_error\", \"rubric_not_found\", \"judge_error\"."
+              },
+              "message": {
+                "type": "string",
+                "description": "Human-readable message."
+              },
+              "details": {
+                "description": "Optional structured detail."
+              }
+            },
+            "required": [
+              "code",
+              "message"
+            ],
+            "description": "Errors are always wrapped in this shape across all endpoints."
+          }
+        },
+        "required": [
+          "error"
+        ]
+      }
+    },
+    "parameters": {}
+  },
+  "paths": {
+    "/v1/judge": {
+      "post": {
+        "summary": "Score a piece of content against a rubric",
+        "description": "Runs the judging LLM with the named (or inline) rubric and returns dimension scores, detected failure modes, wins, and a composite score in 0..1.",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/JudgeRequest"
+              }
+            }
+          }
+        },
+        "responses": {
+          "200": {
+            "description": "Successful judgement",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/JudgeResult"
+                }
+              }
+            }
+          },
+          "400": {
+            "description": "Validation error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                }
+              }
+            }
+          },
+          "404": {
+            "description": "Rubric not found",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                }
+              }
+            }
+          },
+          "500": {
+            "description": "Judge error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
+    "/v1/rubrics": {
+      "get": {
+        "summary": "List built-in rubrics",
+        "description": "Returns every rubric registered server-side, each with its dimensions and stable rubricVersion hash.",
+        "responses": {
+          "200": {
+            "description": "Listing",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ListRubricsResponse"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
+    "/v1/version": {
+      "get": {
+        "summary": "Server and wire-protocol version",
+        "description": "Match your client version to `version`; check `wireVersion` for compatibility.",
+        "responses": {
+          "200": {
+            "description": "Version info",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/VersionResponse"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
+    "/healthz": {
+      "get": {
+        "summary": "Liveness check",
+        "responses": {
+          "200": {
+            "description": "OK",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/HealthResponse"
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  },
+  "webhooks": {}
+}

package/dist/{sink-fetch-C0B8ximv.d.ts → sink-fetch-B1Yg4Til.d.ts} RENAMED Viewed

@@ -64,7 +64,7 @@ interface TelemetryModel {
  * `child_process`. Safe to import from a Cloudflare Worker, Lambda, edge
  * function, or browser extension.
  *
- * For Node-only file persistence, import from './sink-file' instead.
+ * For Node-only file persistence, import from '@tangle-network/agent-eval/telemetry/file'.
  */
 interface TelemetrySink {

package/dist/telemetry/file.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { c as TelemetrySink, e as TelemetryEnvelope } from '../sink-fetch-C0B8ximv.js';
+import { c as TelemetrySink, e as TelemetryEnvelope } from '../sink-fetch-B1Yg4Til.js';
 /**
  * Node-only file sink. Imports `node:fs` — DO NOT import this from a Worker

package/dist/telemetry/index.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
-import { T as TelemetryKind, a as TelemetryModel, b as TelemetrySource, c as TelemetrySink } from '../sink-fetch-C0B8ximv.js';
-export { F as FanoutTelemetrySink, H as HttpTelemetrySink, I as InMemoryTelemetrySink, N as NullTelemetrySink, d as TELEMETRY_SCHEMA_VERSION, e as TelemetryEnvelope } from '../sink-fetch-C0B8ximv.js';
+import { T as TelemetryKind, a as TelemetryModel, b as TelemetrySource, c as TelemetrySink } from '../sink-fetch-B1Yg4Til.js';
+export { F as FanoutTelemetrySink, H as HttpTelemetrySink, I as InMemoryTelemetrySink, N as NullTelemetrySink, d as TELEMETRY_SCHEMA_VERSION, e as TelemetryEnvelope } from '../sink-fetch-B1Yg4Til.js';
 /**
  * Telemetry client — thin wrapper that builds envelopes from `EmitArgs` and

package/dist/telemetry/index.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"sources":["../../src/telemetry/schema.ts","../../src/telemetry/sink-fetch.ts","../../src/telemetry/client.ts"],"sourcesContent":["/*\n Fleet telemetry envelope — agent-eval's portable observability shape.\n \n Designed so any consumer (Node CLI, Cloudflare Worker, Lambda, browser\n * extension) can emit structured rows describing one unit of work — a page\n * audit, a tool call, an evolve round, a full agent run — to a central sink.\n \n The schema is intentionally a strict superset of agent-eval's `Run` shape\n * so a future TraceStore adapter can promote envelopes into traces without\n * translation.\n /\n\nexport const TELEMETRY_SCHEMA_VERSION = 1\n\n/* Discriminator for the unit of work this envelope describes. /\nexport type TelemetryKind =\n \| 'agent-run'\n \| 'design-audit-page'\n \| 'design-audit-run'\n \| 'design-evolve-round'\n \| 'design-evolve-run'\n \| 'gepa-trial'\n \| 'gepa-generation'\n \| 'tool-call'\n \| 'judge-verdict'\n \| 'custom'\n\nexport interface TelemetryEnvelope {\n schemaVersion: typeof TELEMETRY_SCHEMA_VERSION\n envelopeId: string\n runId: string\n timestamp: string\n parentRunId?: string\n\n source: TelemetrySource\n model?: TelemetryModel\n kind: TelemetryKind\n ok: boolean\n durationMs: number\n\n data: Record<string, unknown>\n metrics: Record<string, number>\n tags?: Record<string, string>\n\n error?: string\n}\n\nexport interface TelemetrySource {\n /* Repo identity — basename of cwd plus git remote if discoverable. /\n repo: string\n cwd: string\n gitSha?: string\n gitBranch?: string\n cliVersion: string\n /* What was invoked, e.g. `design-audit`, `bad run`, `gepa --target`. /\n invocation: string\n /* Sanitised argv minus secrets. /\n argv?: string[]\n /\n Multi-tenant identity. Set when the consumer runs inside a hosted\n * product so a fleet rollup can group by tenant without leaking customer\n * URLs or PII.\n /\n tenantId?: string\n /* Optional sub-tenant identity (project, suite, walkthrough, customer). 
/\n customerId?: string\n /* SHA-256 (12 hex) of the API key used to authenticate this run, when applicable. /\n apiKeyHash?: string\n}\n\nexport interface TelemetryModel {\n provider: string\n name: string\n /* SHA-256 (12 hex chars) of the prompt(s) used. /\n promptHash?: string\n /* SHA-256 (12 hex chars) of the composed rubric body, if applicable. /\n rubricHash?: string\n}\n","/\n Workers-safe telemetry sinks — only `fetch` and pure JS. No `fs`, no\n * `child_process`. Safe to import from a Cloudflare Worker, Lambda, edge\n * function, or browser extension.\n \n For Node-only file persistence, import from './sink-file' instead.\n /\n\nimport type { TelemetryEnvelope } from './schema'\n\nexport interface TelemetrySink {\n emit(envelope: TelemetryEnvelope): Promise<void> \| void\n close?(): Promise<void> \| void\n}\n\n/* Best-effort POST to a remote collector. Fire-and-forget; never throws. /\nexport class HttpTelemetrySink implements TelemetrySink {\n private inflight = new Set<Promise<void>>()\n\n constructor(\n private readonly endpoint: string,\n private readonly bearer?: string,\n ) {}\n\n emit(envelope: TelemetryEnvelope): void {\n const body = JSON.stringify(envelope)\n const headers: Record<string, string> = { 'content-type': 'application/json' }\n if (this.bearer) headers.authorization = `Bearer ${this.bearer}`\n const promise = fetch(this.endpoint, { method: 'POST', headers, body })\n .then(() => undefined)\n .catch(() => undefined)\n this.inflight.add(promise)\n promise.finally(() => this.inflight.delete(promise))\n }\n\n async close(): Promise<void> {\n await Promise.allSettled(Array.from(this.inflight))\n }\n}\n\n/* Fanout to multiple sinks — failures in one do not affect others. 
/\nexport class FanoutTelemetrySink implements TelemetrySink {\n constructor(private readonly sinks: TelemetrySink[]) {}\n\n emit(envelope: TelemetryEnvelope): void {\n for (const sink of this.sinks) {\n try {\n const result = sink.emit(envelope)\n if (result && typeof (result as Promise<unknown>).catch === 'function') {\n ;(result as Promise<unknown>).catch(() => undefined)\n }\n } catch {\n // swallow — telemetry must never break a run\n }\n }\n }\n\n async close(): Promise<void> {\n await Promise.allSettled(this.sinks.map((s) => Promise.resolve(s.close?.())))\n }\n}\n\n/* No-op sink — used when telemetry is explicitly disabled. /\nexport class NullTelemetrySink implements TelemetrySink {\n emit(): void {}\n}\n\n/* In-memory sink — useful for tests + downstream adapters. /\nexport class InMemoryTelemetrySink implements TelemetrySink {\n readonly envelopes: TelemetryEnvelope[] = []\n emit(envelope: TelemetryEnvelope): void {\n this.envelopes.push(envelope)\n }\n clear(): void { this.envelopes.length = 0 }\n}\n","/\n Telemetry client — thin wrapper that builds envelopes from `EmitArgs` and\n * delegates to a `TelemetrySink`. Pure logic; no I/O. Use this from any\n * runtime — Workers, Node, browser — and choose the sink accordingly.\n \n For an opinionated singleton with env-var-driven sink wiring (the bad CLI\n * pattern), see `./node-client.ts`.\n /\n\nimport type { TelemetryEnvelope, TelemetryKind, TelemetryModel, TelemetrySource } from './schema'\nimport { TELEMETRY_SCHEMA_VERSION } from './schema'\nimport type { TelemetrySink } from './sink-fetch'\n\nexport interface EmitArgs {\n kind: TelemetryKind\n runId: string\n parentRunId?: string\n ok: boolean\n durationMs: number\n data?: Record<string, unknown>\n metrics?: Record<string, number>\n tags?: Record<string, string>\n model?: TelemetryModel\n error?: string\n /* Override the source for this envelope. Falls back to `defaultSource`. 
/\n source?: TelemetrySource\n}\n\nexport class TelemetryClient {\n constructor(\n private readonly sink: TelemetrySink,\n private readonly defaultSource: TelemetrySource,\n ) {}\n\n emit(args: EmitArgs): void {\n const envelope: TelemetryEnvelope = {\n schemaVersion: TELEMETRY_SCHEMA_VERSION,\n envelopeId: makeEnvelopeId(),\n runId: args.runId,\n timestamp: new Date().toISOString(),\n source: args.source ?? this.defaultSource,\n kind: args.kind,\n ok: args.ok,\n durationMs: args.durationMs,\n data: args.data ?? {},\n metrics: args.metrics ?? {},\n ...(args.parentRunId ? { parentRunId: args.parentRunId } : {}),\n ...(args.model ? { model: args.model } : {}),\n ...(args.tags ? { tags: args.tags } : {}),\n ...(args.error ? { error: args.error } : {}),\n }\n try {\n this.sink.emit(envelope)\n } catch {\n // swallow — telemetry never breaks the calling code path\n }\n }\n\n async close(): Promise<void> {\n await this.sink.close?.()\n }\n}\n\n/* Generate a UUIDv4 with whatever crypto is available (Node, Workers, browsers). /\nfunction makeEnvelopeId(): string {\n if (typeof crypto !== 'undefined' && typeof crypto.randomUUID === 'function') {\n return crypto.randomUUID()\n }\n // Last-resort fallback. Lower entropy but never throws.\n return 'env-' + Date.now().toString(36) + '-' + Math.random().toString(36).slice(2, 10)\n}\n\nexport const SECRET_FLAGS = new Set(['--api-key', '--bearer', '--token', '--password'])\n\n/* Strip likely-secret values from argv, preserving structure. 
/\nexport function sanitiseArgv(argv: string[]): string[] {\n const out: string[] = []\n for (let i = 0; i < argv.length; i++) {\n const a = argv[i]!\n if (SECRET_FLAGS.has(a)) {\n out.push(a, '<redacted>')\n i++\n continue\n }\n if (/^(?:--api-key\|--bearer\|--token\|--password)=/.test(a)) {\n out.push(a.replace(/=.$/, '=<redacted>'))\n continue\n }\n out.push(a)\n }\n return out\n}\n"],"mappings":";;;AAYO,IAAM,2BAA2B;;;ACIjC,IAAM,oBAAN,MAAiD;AAAA,EAGtD,YACmB,UACA,QACjB;AAFiB;AACA;AAAA,EAChB;AAAA,EAFgB;AAAA,EACA;AAAA,EAJX,WAAW,oBAAI,IAAmB;AAAA,EAO1C,KAAK,UAAmC;AACtC,UAAM,OAAO,KAAK,UAAU,QAAQ;AACpC,UAAM,UAAkC,EAAE,gBAAgB,mBAAmB;AAC7E,QAAI,KAAK,OAAQ,SAAQ,gBAAgB,UAAU,KAAK,MAAM;AAC9D,UAAM,UAAU,MAAM,KAAK,UAAU,EAAE,QAAQ,QAAQ,SAAS,KAAK,CAAC,EACnE,KAAK,MAAM,MAAS,EACpB,MAAM,MAAM,MAAS;AACxB,SAAK,SAAS,IAAI,OAAO;AACzB,YAAQ,QAAQ,MAAM,KAAK,SAAS,OAAO,OAAO,CAAC;AAAA,EACrD;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,QAAQ,WAAW,MAAM,KAAK,KAAK,QAAQ,CAAC;AAAA,EACpD;AACF;AAGO,IAAM,sBAAN,MAAmD;AAAA,EACxD,YAA6B,OAAwB;AAAxB;AAAA,EAAyB;AAAA,EAAzB;AAAA,EAE7B,KAAK,UAAmC;AACtC,eAAW,QAAQ,KAAK,OAAO;AAC7B,UAAI;AACF,cAAM,SAAS,KAAK,KAAK,QAAQ;AACjC,YAAI,UAAU,OAAQ,OAA4B,UAAU,YAAY;AACtE;AAAC,UAAC,OAA4B,MAAM,MAAM,MAAS;AAAA,QACrD;AAAA,MACF,QAAQ;AAAA,MAER;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,QAAQ,WAAW,KAAK,MAAM,IAAI,CAAC,MAAM,QAAQ,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAAA,EAC9E;AACF;AAGO,IAAM,oBAAN,MAAiD;AAAA,EACtD,OAAa;AAAA,EAAC;AAChB;AAGO,IAAM,wBAAN,MAAqD;AAAA,EACjD,YAAiC,CAAC;AAAA,EAC3C,KAAK,UAAmC;AACtC,SAAK,UAAU,KAAK,QAAQ;AAAA,EAC9B;AAAA,EACA,QAAc;AAAE,SAAK,UAAU,SAAS;AAAA,EAAE;AAC5C;;;AC9CO,IAAM,kBAAN,MAAsB;AAAA,EAC3B,YACmB,MACA,eACjB;AAFiB;AACA;AAAA,EAChB;AAAA,EAFgB;AAAA,EACA;AAAA,EAGnB,KAAK,MAAsB;AACzB,UAAM,WAA8B;AAAA,MAClC,eAAe;AAAA,MACf,YAAY,eAAe;AAAA,MAC3B,OAAO,KAAK;AAAA,MACZ,YAAW,oBAAI,KAAK,GAAE,YAAY;AAAA,MAClC,QAAQ,KAAK,UAAU,KAAK;AAAA,MAC5B,MAAM,KAAK;AAAA,MACX,IAAI,KAAK;AAAA,MACT,YAAY,KAAK;AAAA,MACjB,MAAM,KAAK,QAAQ,CAAC;AAAA,MACpB,SAAS,KAAK,WAAW,CAAC;AAAA,MAC1B,GAAI,KAAK,cAAc,EAAE,aAAa,KAAK,YAAY,IA
AI,CAAC;AAAA,MAC5D,GAAI,KAAK,QAAQ,EAAE,OAAO,KAAK,MAAM,IAAI,CAAC;AAAA,MAC1C,GAAI,KAAK,OAAO,EAAE,MAAM,KAAK,KAAK,IAAI,CAAC;AAAA,MACvC,GAAI,KAAK,QAAQ,EAAE,OAAO,KAAK,MAAM,IAAI,CAAC;AAAA,IAC5C;AACA,QAAI;AACF,WAAK,KAAK,KAAK,QAAQ;AAAA,IACzB,QAAQ;AAAA,IAER;AAAA,EACF;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,KAAK,KAAK,QAAQ;AAAA,EAC1B;AACF;AAGA,SAAS,iBAAyB;AAChC,MAAI,OAAO,WAAW,eAAe,OAAO,OAAO,eAAe,YAAY;AAC5E,WAAO,OAAO,WAAW;AAAA,EAC3B;AAEA,SAAO,SAAS,KAAK,IAAI,EAAE,SAAS,EAAE,IAAI,MAAM,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,EAAE;AACxF;AAEO,IAAM,eAAe,oBAAI,IAAI,CAAC,aAAa,YAAY,WAAW,YAAY,CAAC;AAG/E,SAAS,aAAa,MAA0B;AACrD,QAAM,MAAgB,CAAC;AACvB,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,IAAI,KAAK,CAAC;AAChB,QAAI,aAAa,IAAI,CAAC,GAAG;AACvB,UAAI,KAAK,GAAG,YAAY;AACxB;AACA;AAAA,IACF;AACA,QAAI,8CAA8C,KAAK,CAAC,GAAG;AACzD,UAAI,KAAK,EAAE,QAAQ,QAAQ,aAAa,CAAC;AACzC;AAAA,IACF;AACA,QAAI,KAAK,CAAC;AAAA,EACZ;AACA,SAAO;AACT;","names":[]}
1	+ {"version":3,"sources":["../../src/telemetry/schema.ts","../../src/telemetry/sink-fetch.ts","../../src/telemetry/client.ts"],"sourcesContent":["/*\n Fleet telemetry envelope — agent-eval's portable observability shape.\n \n Designed so any consumer (Node CLI, Cloudflare Worker, Lambda, browser\n * extension) can emit structured rows describing one unit of work — a page\n * audit, a tool call, an evolve round, a full agent run — to a central sink.\n \n The schema is intentionally a strict superset of agent-eval's `Run` shape\n * so a future TraceStore adapter can promote envelopes into traces without\n * translation.\n /\n\nexport const TELEMETRY_SCHEMA_VERSION = 1\n\n/* Discriminator for the unit of work this envelope describes. /\nexport type TelemetryKind =\n \| 'agent-run'\n \| 'design-audit-page'\n \| 'design-audit-run'\n \| 'design-evolve-round'\n \| 'design-evolve-run'\n \| 'gepa-trial'\n \| 'gepa-generation'\n \| 'tool-call'\n \| 'judge-verdict'\n \| 'custom'\n\nexport interface TelemetryEnvelope {\n schemaVersion: typeof TELEMETRY_SCHEMA_VERSION\n envelopeId: string\n runId: string\n timestamp: string\n parentRunId?: string\n\n source: TelemetrySource\n model?: TelemetryModel\n kind: TelemetryKind\n ok: boolean\n durationMs: number\n\n data: Record<string, unknown>\n metrics: Record<string, number>\n tags?: Record<string, string>\n\n error?: string\n}\n\nexport interface TelemetrySource {\n /* Repo identity — basename of cwd plus git remote if discoverable. /\n repo: string\n cwd: string\n gitSha?: string\n gitBranch?: string\n cliVersion: string\n /* What was invoked, e.g. `design-audit`, `bad run`, `gepa --target`. /\n invocation: string\n /* Sanitised argv minus secrets. /\n argv?: string[]\n /\n Multi-tenant identity. Set when the consumer runs inside a hosted\n * product so a fleet rollup can group by tenant without leaking customer\n * URLs or PII.\n /\n tenantId?: string\n /* Optional sub-tenant identity (project, suite, walkthrough, customer). 
/\n customerId?: string\n /* SHA-256 (12 hex) of the API key used to authenticate this run, when applicable. /\n apiKeyHash?: string\n}\n\nexport interface TelemetryModel {\n provider: string\n name: string\n /* SHA-256 (12 hex chars) of the prompt(s) used. /\n promptHash?: string\n /* SHA-256 (12 hex chars) of the composed rubric body, if applicable. /\n rubricHash?: string\n}\n","/\n Workers-safe telemetry sinks — only `fetch` and pure JS. No `fs`, no\n * `child_process`. Safe to import from a Cloudflare Worker, Lambda, edge\n * function, or browser extension.\n \n For Node-only file persistence, import from '@tangle-network/agent-eval/telemetry/file'.\n /\n\nimport type { TelemetryEnvelope } from './schema'\n\nexport interface TelemetrySink {\n emit(envelope: TelemetryEnvelope): Promise<void> \| void\n close?(): Promise<void> \| void\n}\n\n/* Best-effort POST to a remote collector. Fire-and-forget; never throws. /\nexport class HttpTelemetrySink implements TelemetrySink {\n private inflight = new Set<Promise<void>>()\n\n constructor(\n private readonly endpoint: string,\n private readonly bearer?: string,\n ) {}\n\n emit(envelope: TelemetryEnvelope): void {\n const body = JSON.stringify(envelope)\n const headers: Record<string, string> = { 'content-type': 'application/json' }\n if (this.bearer) headers.authorization = `Bearer ${this.bearer}`\n const promise = fetch(this.endpoint, { method: 'POST', headers, body })\n .then(() => undefined)\n .catch(() => undefined)\n this.inflight.add(promise)\n promise.finally(() => this.inflight.delete(promise))\n }\n\n async close(): Promise<void> {\n await Promise.allSettled(Array.from(this.inflight))\n }\n}\n\n/* Fanout to multiple sinks — failures in one do not affect others. 
/\nexport class FanoutTelemetrySink implements TelemetrySink {\n constructor(private readonly sinks: TelemetrySink[]) {}\n\n emit(envelope: TelemetryEnvelope): void {\n for (const sink of this.sinks) {\n try {\n const result = sink.emit(envelope)\n if (result && typeof (result as Promise<unknown>).catch === 'function') {\n ;(result as Promise<unknown>).catch(() => undefined)\n }\n } catch {\n // swallow — telemetry must never break a run\n }\n }\n }\n\n async close(): Promise<void> {\n await Promise.allSettled(this.sinks.map((s) => Promise.resolve(s.close?.())))\n }\n}\n\n/* No-op sink — used when telemetry is explicitly disabled. /\nexport class NullTelemetrySink implements TelemetrySink {\n emit(): void {}\n}\n\n/* In-memory sink — useful for tests + downstream adapters. /\nexport class InMemoryTelemetrySink implements TelemetrySink {\n readonly envelopes: TelemetryEnvelope[] = []\n emit(envelope: TelemetryEnvelope): void {\n this.envelopes.push(envelope)\n }\n clear(): void { this.envelopes.length = 0 }\n}\n","/\n Telemetry client — thin wrapper that builds envelopes from `EmitArgs` and\n * delegates to a `TelemetrySink`. Pure logic; no I/O. Use this from any\n * runtime — Workers, Node, browser — and choose the sink accordingly.\n \n For an opinionated singleton with env-var-driven sink wiring (the bad CLI\n * pattern), see `./node-client.ts`.\n /\n\nimport type { TelemetryEnvelope, TelemetryKind, TelemetryModel, TelemetrySource } from './schema'\nimport { TELEMETRY_SCHEMA_VERSION } from './schema'\nimport type { TelemetrySink } from './sink-fetch'\n\nexport interface EmitArgs {\n kind: TelemetryKind\n runId: string\n parentRunId?: string\n ok: boolean\n durationMs: number\n data?: Record<string, unknown>\n metrics?: Record<string, number>\n tags?: Record<string, string>\n model?: TelemetryModel\n error?: string\n /* Override the source for this envelope. Falls back to `defaultSource`. 
/\n source?: TelemetrySource\n}\n\nexport class TelemetryClient {\n constructor(\n private readonly sink: TelemetrySink,\n private readonly defaultSource: TelemetrySource,\n ) {}\n\n emit(args: EmitArgs): void {\n const envelope: TelemetryEnvelope = {\n schemaVersion: TELEMETRY_SCHEMA_VERSION,\n envelopeId: makeEnvelopeId(),\n runId: args.runId,\n timestamp: new Date().toISOString(),\n source: args.source ?? this.defaultSource,\n kind: args.kind,\n ok: args.ok,\n durationMs: args.durationMs,\n data: args.data ?? {},\n metrics: args.metrics ?? {},\n ...(args.parentRunId ? { parentRunId: args.parentRunId } : {}),\n ...(args.model ? { model: args.model } : {}),\n ...(args.tags ? { tags: args.tags } : {}),\n ...(args.error ? { error: args.error } : {}),\n }\n try {\n this.sink.emit(envelope)\n } catch {\n // swallow — telemetry never breaks the calling code path\n }\n }\n\n async close(): Promise<void> {\n await this.sink.close?.()\n }\n}\n\n/* Generate a UUIDv4 with whatever crypto is available (Node, Workers, browsers). /\nfunction makeEnvelopeId(): string {\n if (typeof crypto !== 'undefined' && typeof crypto.randomUUID === 'function') {\n return crypto.randomUUID()\n }\n // Last-resort fallback. Lower entropy but never throws.\n return 'env-' + Date.now().toString(36) + '-' + Math.random().toString(36).slice(2, 10)\n}\n\nexport const SECRET_FLAGS = new Set(['--api-key', '--bearer', '--token', '--password'])\n\n/* Strip likely-secret values from argv, preserving structure. 
/\nexport function sanitiseArgv(argv: string[]): string[] {\n const out: string[] = []\n for (let i = 0; i < argv.length; i++) {\n const a = argv[i]!\n if (SECRET_FLAGS.has(a)) {\n out.push(a, '<redacted>')\n i++\n continue\n }\n if (/^(?:--api-key\|--bearer\|--token\|--password)=/.test(a)) {\n out.push(a.replace(/=.$/, '=<redacted>'))\n continue\n }\n out.push(a)\n }\n return out\n}\n"],"mappings":";;;AAYO,IAAM,2BAA2B;;;ACIjC,IAAM,oBAAN,MAAiD;AAAA,EAGtD,YACmB,UACA,QACjB;AAFiB;AACA;AAAA,EAChB;AAAA,EAFgB;AAAA,EACA;AAAA,EAJX,WAAW,oBAAI,IAAmB;AAAA,EAO1C,KAAK,UAAmC;AACtC,UAAM,OAAO,KAAK,UAAU,QAAQ;AACpC,UAAM,UAAkC,EAAE,gBAAgB,mBAAmB;AAC7E,QAAI,KAAK,OAAQ,SAAQ,gBAAgB,UAAU,KAAK,MAAM;AAC9D,UAAM,UAAU,MAAM,KAAK,UAAU,EAAE,QAAQ,QAAQ,SAAS,KAAK,CAAC,EACnE,KAAK,MAAM,MAAS,EACpB,MAAM,MAAM,MAAS;AACxB,SAAK,SAAS,IAAI,OAAO;AACzB,YAAQ,QAAQ,MAAM,KAAK,SAAS,OAAO,OAAO,CAAC;AAAA,EACrD;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,QAAQ,WAAW,MAAM,KAAK,KAAK,QAAQ,CAAC;AAAA,EACpD;AACF;AAGO,IAAM,sBAAN,MAAmD;AAAA,EACxD,YAA6B,OAAwB;AAAxB;AAAA,EAAyB;AAAA,EAAzB;AAAA,EAE7B,KAAK,UAAmC;AACtC,eAAW,QAAQ,KAAK,OAAO;AAC7B,UAAI;AACF,cAAM,SAAS,KAAK,KAAK,QAAQ;AACjC,YAAI,UAAU,OAAQ,OAA4B,UAAU,YAAY;AACtE;AAAC,UAAC,OAA4B,MAAM,MAAM,MAAS;AAAA,QACrD;AAAA,MACF,QAAQ;AAAA,MAER;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,QAAQ,WAAW,KAAK,MAAM,IAAI,CAAC,MAAM,QAAQ,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAAA,EAC9E;AACF;AAGO,IAAM,oBAAN,MAAiD;AAAA,EACtD,OAAa;AAAA,EAAC;AAChB;AAGO,IAAM,wBAAN,MAAqD;AAAA,EACjD,YAAiC,CAAC;AAAA,EAC3C,KAAK,UAAmC;AACtC,SAAK,UAAU,KAAK,QAAQ;AAAA,EAC9B;AAAA,EACA,QAAc;AAAE,SAAK,UAAU,SAAS;AAAA,EAAE;AAC5C;;;AC9CO,IAAM,kBAAN,MAAsB;AAAA,EAC3B,YACmB,MACA,eACjB;AAFiB;AACA;AAAA,EAChB;AAAA,EAFgB;AAAA,EACA;AAAA,EAGnB,KAAK,MAAsB;AACzB,UAAM,WAA8B;AAAA,MAClC,eAAe;AAAA,MACf,YAAY,eAAe;AAAA,MAC3B,OAAO,KAAK;AAAA,MACZ,YAAW,oBAAI,KAAK,GAAE,YAAY;AAAA,MAClC,QAAQ,KAAK,UAAU,KAAK;AAAA,MAC5B,MAAM,KAAK;AAAA,MACX,IAAI,KAAK;AAAA,MACT,YAAY,KAAK;AAAA,MACjB,MAAM,KAAK,QAAQ,CAAC;AAAA,MACpB,SAAS,KAAK,WAAW,CAAC;AAAA,MAC1B,GAAI,KAAK,cAAc,EAAE,aAAa,KAAK,YAAY,IA
AI,CAAC;AAAA,MAC5D,GAAI,KAAK,QAAQ,EAAE,OAAO,KAAK,MAAM,IAAI,CAAC;AAAA,MAC1C,GAAI,KAAK,OAAO,EAAE,MAAM,KAAK,KAAK,IAAI,CAAC;AAAA,MACvC,GAAI,KAAK,QAAQ,EAAE,OAAO,KAAK,MAAM,IAAI,CAAC;AAAA,IAC5C;AACA,QAAI;AACF,WAAK,KAAK,KAAK,QAAQ;AAAA,IACzB,QAAQ;AAAA,IAER;AAAA,EACF;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,KAAK,KAAK,QAAQ;AAAA,EAC1B;AACF;AAGA,SAAS,iBAAyB;AAChC,MAAI,OAAO,WAAW,eAAe,OAAO,OAAO,eAAe,YAAY;AAC5E,WAAO,OAAO,WAAW;AAAA,EAC3B;AAEA,SAAO,SAAS,KAAK,IAAI,EAAE,SAAS,EAAE,IAAI,MAAM,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,EAAE;AACxF;AAEO,IAAM,eAAe,oBAAI,IAAI,CAAC,aAAa,YAAY,WAAW,YAAY,CAAC;AAG/E,SAAS,aAAa,MAA0B;AACrD,QAAM,MAAgB,CAAC;AACvB,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,IAAI,KAAK,CAAC;AAChB,QAAI,aAAa,IAAI,CAAC,GAAG;AACvB,UAAI,KAAK,GAAG,YAAY;AACxB;AACA;AAAA,IACF;AACA,QAAI,8CAA8C,KAAK,CAAC,GAAG;AACzD,UAAI,KAAK,EAAE,QAAQ,QAAQ,aAAa,CAAC;AACzC;AAAA,IACF;AACA,QAAI,KAAK,CAAC;AAAA,EACZ;AACA,SAAO;AACT;","names":[]}

package/dist/wire/index.js CHANGED Viewed

@@ -24,7 +24,7 @@ import {
   runRpcBatch,
   runRpcOnce,
   startServer
-} from "../chunk-CJJSB6ZQ.js";
+} from "../chunk-LSR4IAYN.js";
 import "../chunk-JAOLXRIA.js";
 import "../chunk-PZ5AY32C.js";
 export {

package/docs/concepts.md CHANGED Viewed

@@ -43,7 +43,7 @@ that can seed memory, replay scenarios, and optimization.
 | **Trace store** | The append-only log of every span/event during a run. Replay = read this back. |
 | **Composite score** | A 0..1 number combining all dimensions. The single number you gate on. |
 | **Rubric version** | A stable hash of the rubric. Scores from different rubric versions are not comparable. |
-| **Muffled gate** | A check that should fail loud but silently passes (e.g. `command || true`). The most expensive bug class in this codebase — see SKILL.md. |
+| **Muffled gate** | A check that should fail loud but silently passes (e.g. `command || true`). The most expensive bug class in this codebase. |
 ## The feedback trajectory loop
@@ -119,7 +119,7 @@ report.blendedScore   // 0..1 — weighted aggregate
 report.layers         // per-layer status, findings, duration
 ```
-Two rules that will save you bugs (paid for in real incidents — see SKILL.md):
+Two rules that will save you bugs:
 1. **Run both gates.** Build gates catch code that doesn't compile; structural assertions catch missing files. Run both unconditionally — they catch orthogonal failures.
@@ -150,6 +150,6 @@ You don't need to build the trace tree by hand. `BuilderSession` does it for you
 - **Just want to score a string against a rubric?** → [wire-protocol.md](./wire-protocol.md) — HTTP/RPC interface, pluggable from any language.
 - **Need a reusable driver/worker/evaluator loop?** → [control-runtime.md](./control-runtime.md) — generic runtime plus coding, browser, computer-use, and research integration patterns.
 - **Want review feedback to become eval/optimization data?** → [feedback-trajectories.md](./feedback-trajectories.md) — turn feedback into datasets, optimizer rows, and preference memory.
-- **Building a code-generator eval?** → SKILL.md §Minimal working path — the `BuilderSession` recipe.
-- **Multi-layer verifier?** → SKILL.md §Verification pipeline.
+- **Building a code-generator eval?** → Start with `BuilderSession`, `SandboxHarness`, and `MultiLayerVerifier`.
+- **Multi-layer verifier?** → Use [control-runtime.md](./control-runtime.md) and `MultiLayerVerifier` for ordered gates with dependencies.
 - **Adding a new judge or rubric?** → `src/wire/rubrics.ts` for the cross-language path; `src/anti-slop.ts` and `src/judges.ts` for the in-process path.

package/docs/knowledge-readiness.md CHANGED Viewed

@@ -2,8 +2,8 @@
 `agent-eval` owns the contract for deciding whether an agent had enough
 task-world context to run. It does not own web crawling, connector storage, wiki
-pages, credentials, or product policy. Those live in `agent-knowledge` and
-product repos.
+pages, credentials, or product policy. Those live in
+`@tangle-network/agent-knowledge` and product repos.
 The core loop is:

package/docs/wire-protocol.md CHANGED Viewed

@@ -96,13 +96,13 @@ GET /v1/version
 ```json
 {
   "package": "@tangle-network/agent-eval",
-  "version": "0.19.0",
+  "version": "0.20.10",
   "wireVersion": "1.0.0",
   "apiSurface": ["judge", "listRubrics", "version"]
 }
 ```
-`version` matches the npm/PyPI package version. `wireVersion` bumps independently — only on breaking request/response schema changes. Package versions can differ across releases as long as `wireVersion` matches.
+`version` matches the package version. `wireVersion` bumps independently — only on breaking request/response schema changes. Package versions can differ across releases as long as `wireVersion` matches.
 ### `GET /healthz` — liveness
@@ -176,7 +176,7 @@ Each invocation is one process — Node startup adds ~500 ms. For more than a fe
 ## Clients
-- **Python**: [`tangle-agent-eval`](../clients/python/README.md) on PyPI. Auto-detects HTTP, falls back to subprocess. Version-locked to npm.
+- **Python**: source lives in [`clients/python`](https://github.com/tangle-network/agent-eval/tree/main/clients/python). Auto-detects HTTP, falls back to subprocess. Version-locked to npm.
 - **TypeScript**: import directly from `@tangle-network/agent-eval` (no wire round-trip needed in-process).
 - **Rust / Go / Other**: generate from `dist/openapi.json`. PRs welcome to add an officially-maintained client.