@tangle-network/agent-eval 0.24.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,70 @@
1
1
  # Changelog
2
2
 
3
+ ## 0.25.0 — ProductionLoop primitive: close the eval → prod → eval cycle
4
+
5
+ This release ships the **orchestration layer** that turns the existing
6
+ eval substrate into a continuously-improving production system. Static
7
+ prompts decay; today's regulation flips tomorrow. The pieces to close
8
+ the loop were already in the package (`runMultiShotOptimization`,
9
+ `failureClusterView`, `evaluateReleaseConfidence`, `extractPreferences`,
10
+ `FeedbackTrajectoryStore`, `TraceStore`); this release adds the one
11
+ clean primitive that wires them together end-to-end.
12
+
13
+ ### Added
14
+
15
+ - **`runProductionLoop({ ... })`** (`src/production-loop.ts`,
16
+ `@experimental`) — one call = one cycle. Ingests production traces
17
+ and feedback, clusters failures, runs evolve against the worst
18
+ cluster, gates with `HeldOutGate` + `evaluateReleaseConfidence`
19
+ (fail-closed), and — when wired with an `AutoPrClient` — opens a PR
20
+ with the improved prompt. Idempotent + replayable: same `runId`
21
+ yields the same plan. Cron / GitHub Actions are the consumer's job;
22
+ the primitive doesn't own scheduling.
23
+
24
+ - **`proposeAutomatedPullRequest(client, input)`** + two transports
25
+ (`src/auto-pr.ts`, `@experimental`):
26
+ - `httpGithubClient({ token, ... })` — direct REST against
27
+ `api.github.com`, no extra deps. Idempotent on branch name:
28
+ existing open PRs are returned, not duplicated.
29
+ - `ghCliClient({ ... })` — shells out to `gh` for environments
30
+ where developer auth state is already configured.
31
+ Both validate inputs (no `..` paths, no whitespace in branch names, no
32
+ duplicate file changes) and surface `ValidationError` / `ConfigError`
33
+ from the typed taxonomy.
34
+
35
+ - **`POST /v1/feedback` + `POST /v1/traces/ingest`** wire endpoints
36
+ (`src/wire/`). Both Zod-validated, both append to the configured
37
+ store (`FeedbackTrajectoryStore` / `TraceStore`). 503 when no store
38
+ is wired (fail loud, not silent). Traces ingest accepts both
39
+ `application/json` (`{events:[...]}`) and `application/x-ndjson` for
40
+ streaming production runtimes. Schemas (`TraceEvent`,
41
+ `FeedbackTrajectory`, `TracesIngestRequest/Response`,
42
+ `FeedbackIngestResponse`) added to `openapi.json` for cross-language
43
+ clients.
44
+
45
+ - **Optional bearer-token auth** on the wire server, configured via
46
+ `createApp({ auth: { bearer: '...' } })` or as a verifier function
47
+ for rotating tokens. `/healthz`, `/v1/version`, and `/openapi.json` remain unprotected
48
+ (regression guard: never lock monitoring out of the runtime).
49
+
50
+ - **`examples/production-loop/`** — synthetic end-to-end demo wiring
51
+ the loop against in-memory trace + feedback stores and a fake
52
+ auto-PR client. Shows the failure-cluster trigger, the evolve round,
53
+ the gate verdict, and the PR-shaped output without requiring
54
+ credentials or a live model.
55
+
56
+ ### Changed
57
+
58
+ - **Wire server** (`createApp(opts)`) now accepts optional
59
+ `IngestionStores` (`{ traceStore?, feedbackStore? }`) and `auth`.
60
+ Existing zero-arg callers continue to work — judge / rubrics /
61
+ version / healthz are unchanged.
62
+
63
+ ### Status tags
64
+
65
+ - Every new export is `@experimental` initially. Pin the patch version
66
+ if you depend on it. All other 0.24.0 stability tags are preserved.
67
+
3
68
  ## 0.24.0 — DX cleanup: framing, stability tags, lint, taxonomy, strict indices
4
69
 
5
70
  This release is **DX + correctness**. No production behavior moved; consumer
package/README.md CHANGED
@@ -88,6 +88,75 @@ await product.storeEvalResult(task.id, result)
88
88
  Same loop shape in production, replay, benchmark, and optimization. Swap the
89
89
  dependencies behind `observe()` and `act()`, never the eval contract.
90
90
 
91
+ ## Production loop — close the eval → prod → eval cycle (0.25.0)
92
+
93
+ Static prompts decay. Yesterday's FTC rule flips today; yesterday's tool quirk
94
+ becomes today's incident. The production agents that win are the ones that
95
+ **continuously re-train against live failure modes**.
96
+
97
+ `runProductionLoop` is the orchestration layer that wires the existing eval
98
+ substrate into a self-improvement cron:
99
+
100
+ ```ts
101
+ import {
102
+ runProductionLoop,
103
+ httpGithubClient,
104
+ FileSystemFeedbackTrajectoryStore,
105
+ } from '@tangle-network/agent-eval'
106
+ import { FileSystemTraceStore } from '@tangle-network/agent-eval/traces'
107
+
108
+ const result = await runProductionLoop({
109
+ runId: `weekly-${new Date().toISOString().slice(0, 10)}`,
110
+ target: 'tax-agent',
111
+
112
+ // 1. Where production traces + feedback land. Wire the HTTP ingestion
113
+ // endpoints (POST /v1/traces/ingest, POST /v1/feedback) from your
114
+ // runtime; the same store reads them here.
115
+ traceStore: new FileSystemTraceStore({ dir: 'data/prod-traces' }),
116
+ feedbackStore: new FileSystemFeedbackTrajectoryStore({ dir: 'data/prod-feedback' }),
117
+
118
+ // 2. Cluster threshold: act on failure groups ≥ 20 runs or ≥ 5% of corpus.
119
+ cluster: { minClusterSize: 20, minSeverityRatio: 0.05, maxClustersPerCycle: 1 },
120
+
121
+ // 3. Evolve: seed = current prompt, gate against holdout scenarios.
122
+ evolve: {
123
+ baselinePrompt: currentSystemPrompt,
124
+ holdoutScenarios: productionShapeScenarios,
125
+ runner, // your agent driver
126
+ scorer, // calibrated judge or rubric
127
+ mutator, // GEPA-style or addendum-style mutator
128
+ gate: {
129
+ baselineKey: 'baseline',
130
+ minProductiveRuns: 5,
131
+ pairedDeltaThreshold: 0.03, // require Nσ improvement on holdout
132
+ overfitGapThreshold: 0.10,
133
+ },
134
+ },
135
+
136
+ // 4. Ship: when the gate passes, open a PR with the new prompt.
137
+ ship: {
138
+ client: httpGithubClient({ token: process.env.GITHUB_TOKEN! }),
139
+ repo: { owner: 'tangle-network', name: 'tax-agent' },
140
+ branchPrefix: 'eval/auto-improve',
141
+ promptFilePath: 'prompts/tax-agent-system.txt',
142
+ reviewers: ['drew'],
143
+ },
144
+
145
+ cron: { cadence: 'weekly' }, // surface-only; consumer schedules
146
+ })
147
+
148
+ console.log(result.decision) // 'pr_opened' | 'gate_failed' | 'no_actionable_failures' | ...
149
+ console.log(result.pullRequest?.prUrl) // populated when a PR was opened
150
+ ```
151
+
152
+ The primitive runs **one cycle**. Schedule it with a `schedule` (cron) trigger, plus `workflow_dispatch` for manual runs, in
153
+ GitHub Actions. It is **idempotent + replayable**: same `runId` → same plan.
154
+ Gate failures are fail-closed — a candidate that beats baseline on search but
155
+ overfits on holdout never lands.
156
+
157
+ Full runnable demo (synthetic traces, no credentials) in
158
+ [`examples/production-loop`](./examples/production-loop/README.md).
159
+
91
160
  ## Self-improvement loop
92
161
 
93
162
  Eval doesn't end at "pass/fail." Outcomes become training signal, mutation
@@ -222,6 +291,8 @@ and runtime. See [`examples/`](./examples/).
222
291
  closed loop — score, reflect, mutate, re-score, repeat.
223
292
  - [`examples/fine-tune-with-prime-rl`](./examples/fine-tune-with-prime-rl/README.md):
224
293
  RunRecord → preferences → trainer (prime-rl) → next campaign.
294
+ - [`examples/production-loop`](./examples/production-loop/README.md):
295
+ ingest prod traces + feedback, cluster failures, evolve, gate, open a PR.
225
296
 
226
297
  ## Docs
227
298
 
@@ -74,6 +74,114 @@ var HealthResponseSchema = z.object({
74
74
  status: z.literal("ok"),
75
75
  uptimeSec: z.number()
76
76
  }).openapi("HealthResponse");
77
+ var TraceEventSchema = z.object({
78
+ eventId: z.string().min(1).describe("Stable id for the event. Use ULID or UUID."),
79
+ runId: z.string().min(1).describe("Run this event belongs to."),
80
+ spanId: z.string().optional().describe("Span that emitted the event, if any."),
81
+ kind: z.enum([
82
+ "log",
83
+ "error",
84
+ "budget_decrement",
85
+ "budget_breach",
86
+ "state_mutation",
87
+ "policy_violation",
88
+ "redaction_applied",
89
+ "custom"
90
+ ]).describe("Coarse event category \u2014 matches the TraceSchema v1 EventKind enum."),
91
+ timestamp: z.number().int().nonnegative().describe("Unix millis. Must be monotonically non-decreasing within a span."),
92
+ payload: z.record(z.string(), z.unknown()).describe("Free-form payload \u2014 the runtime owns the shape.")
93
+ }).openapi("TraceEvent");
94
+ var TracesIngestRequestSchema = z.object({
95
+ events: z.array(TraceEventSchema).min(1).max(1e4).describe("Batch of events. Max 10k per call \u2014 bigger streams should be chunked.")
96
+ }).openapi("TracesIngestRequest");
97
+ var TracesIngestResponseSchema = z.object({
98
+ accepted: z.number().int().nonnegative().describe("Number of events persisted."),
99
+ rejected: z.number().int().nonnegative().describe("Number of events the store refused \u2014 see `errors[]` for reasons."),
100
+ errors: z.array(
101
+ z.object({
102
+ eventId: z.string().describe("Event id this error applies to."),
103
+ message: z.string().describe("Why the event was rejected.")
104
+ })
105
+ ).default([])
106
+ }).openapi("TracesIngestResponse");
107
+ var FeedbackLabelSchema = z.object({
108
+ id: z.string().optional(),
109
+ source: z.enum(["user", "judge", "environment", "metric", "policy", "system"]),
110
+ kind: z.enum([
111
+ "approve",
112
+ "reject",
113
+ "select",
114
+ "edit",
115
+ "rank",
116
+ "rate",
117
+ "comment",
118
+ "metric_outcome",
119
+ "policy_block",
120
+ "revision_request"
121
+ ]),
122
+ value: z.unknown(),
123
+ reason: z.string().optional(),
124
+ severity: z.enum(["info", "warning", "error", "critical"]).optional(),
125
+ createdAt: z.string().describe("ISO-8601 UTC."),
126
+ metadata: z.record(z.string(), z.unknown()).optional()
127
+ }).openapi("FeedbackLabel");
128
+ var FeedbackAttemptSchema = z.object({
129
+ id: z.string().min(1),
130
+ stepIndex: z.number().int().nonnegative(),
131
+ artifactType: z.enum([
132
+ "text",
133
+ "code",
134
+ "plan",
135
+ "research",
136
+ "action",
137
+ "ui",
138
+ "decision",
139
+ "data",
140
+ "other"
141
+ ]),
142
+ artifact: z.unknown(),
143
+ options: z.array(z.unknown()).optional(),
144
+ proposedAction: z.object({
145
+ type: z.string(),
146
+ risk: z.enum(["low", "medium", "high"]).optional(),
147
+ costUsd: z.number().optional(),
148
+ externalSideEffect: z.boolean().optional(),
149
+ requiresApproval: z.boolean().optional(),
150
+ metadata: z.record(z.string(), z.unknown()).optional()
151
+ }).optional(),
152
+ feedback: z.array(FeedbackLabelSchema).optional(),
153
+ createdAt: z.string(),
154
+ metadata: z.record(z.string(), z.unknown()).optional()
155
+ }).openapi("FeedbackAttempt");
156
+ var FeedbackTrajectorySchema = z.object({
157
+ id: z.string().min(1).describe("Stable id; idempotency key for the trajectory."),
158
+ projectId: z.string().optional(),
159
+ scenarioId: z.string().optional(),
160
+ task: z.object({
161
+ intent: z.string().min(1),
162
+ context: z.unknown().optional()
163
+ }),
164
+ attempts: z.array(FeedbackAttemptSchema).default([]),
165
+ labels: z.array(FeedbackLabelSchema).default([]),
166
+ outcome: z.object({
167
+ success: z.boolean().optional(),
168
+ score: z.number().optional(),
169
+ metrics: z.record(z.string(), z.number()).optional(),
170
+ costUsd: z.number().optional(),
171
+ detail: z.string().optional(),
172
+ observedAt: z.string().optional(),
173
+ metadata: z.record(z.string(), z.unknown()).optional()
174
+ }).optional(),
175
+ split: z.enum(["train", "dev", "test", "holdout"]).optional(),
176
+ tags: z.record(z.string(), z.string()).optional(),
177
+ createdAt: z.string().describe("ISO-8601 UTC."),
178
+ updatedAt: z.string().optional(),
179
+ metadata: z.record(z.string(), z.unknown()).optional()
180
+ }).openapi("FeedbackTrajectory");
181
+ var FeedbackIngestResponseSchema = z.object({
182
+ id: z.string().describe("Trajectory id that was persisted."),
183
+ persisted: z.boolean().describe("True when the trajectory was saved (idempotent on id).")
184
+ }).openapi("FeedbackIngestResponse");
77
185
  var ErrorResponseSchema = z.object({
78
186
  error: z.object({
79
187
  code: z.string().describe(
@@ -378,9 +486,43 @@ function handleVersion() {
378
486
  package: "@tangle-network/agent-eval",
379
487
  version: readPackageVersion(),
380
488
  wireVersion: WIRE_VERSION,
381
- apiSurface: ["judge", "listRubrics", "version"]
489
+ apiSurface: ["judge", "listRubrics", "version", "feedback.ingest", "traces.ingest"]
382
490
  };
383
491
  }
492
+ async function handleTracesIngest(req, stores) {
493
+ if (!stores.traceStore) {
494
+ throw new WireError(
495
+ "service_unavailable",
496
+ "No trace store configured on this server. Pass `traceStore` to `createApp`.",
497
+ 503
498
+ );
499
+ }
500
+ const errors = [];
501
+ let accepted = 0;
502
+ for (const event of req.events) {
503
+ try {
504
+ await stores.traceStore.appendEvent(event);
505
+ accepted++;
506
+ } catch (err) {
507
+ errors.push({
508
+ eventId: event.eventId,
509
+ message: err instanceof Error ? err.message : String(err)
510
+ });
511
+ }
512
+ }
513
+ return { accepted, rejected: errors.length, errors };
514
+ }
515
+ async function handleFeedbackIngest(req, stores) {
516
+ if (!stores.feedbackStore) {
517
+ throw new WireError(
518
+ "service_unavailable",
519
+ "No feedback store configured on this server. Pass `feedbackStore` to `createApp`.",
520
+ 503
521
+ );
522
+ }
523
+ await stores.feedbackStore.save(req);
524
+ return { id: req.id, persisted: true };
525
+ }
384
526
 
385
527
  // src/wire/openapi.ts
386
528
  import { OpenAPIRegistry, OpenApiGeneratorV31 } from "@asteasolutions/zod-to-openapi";
@@ -392,6 +534,10 @@ function buildOpenApi(packageVersion) {
392
534
  registry.register("VersionResponse", VersionResponseSchema);
393
535
  registry.register("HealthResponse", HealthResponseSchema);
394
536
  registry.register("ErrorResponse", ErrorResponseSchema);
537
+ registry.register("TracesIngestRequest", TracesIngestRequestSchema);
538
+ registry.register("TracesIngestResponse", TracesIngestResponseSchema);
539
+ registry.register("FeedbackTrajectory", FeedbackTrajectorySchema);
540
+ registry.register("FeedbackIngestResponse", FeedbackIngestResponseSchema);
395
541
  registry.registerPath({
396
542
  method: "post",
397
543
  path: "/v1/judge",
@@ -458,6 +604,69 @@ function buildOpenApi(packageVersion) {
458
604
  }
459
605
  }
460
606
  });
607
+ registry.registerPath({
608
+ method: "post",
609
+ path: "/v1/traces/ingest",
610
+ summary: "Ingest a batch of production TraceEvents",
611
+ description: "Append a batch of TraceEvents to the configured TraceStore. Accepts application/json ({events:[...]}) or application/x-ndjson (one event per line). Returns counts of accepted + rejected events.",
612
+ request: {
613
+ body: {
614
+ content: {
615
+ "application/json": { schema: TracesIngestRequestSchema },
616
+ "application/x-ndjson": { schema: TracesIngestRequestSchema }
617
+ }
618
+ }
619
+ },
620
+ responses: {
621
+ 200: {
622
+ description: "Ingestion summary",
623
+ content: { "application/json": { schema: TracesIngestResponseSchema } }
624
+ },
625
+ 400: {
626
+ description: "Validation error",
627
+ content: { "application/json": { schema: ErrorResponseSchema } }
628
+ },
629
+ 401: {
630
+ description: "Unauthorized (when bearer auth is configured)",
631
+ content: { "application/json": { schema: ErrorResponseSchema } }
632
+ },
633
+ 503: {
634
+ description: "No trace store configured",
635
+ content: { "application/json": { schema: ErrorResponseSchema } }
636
+ }
637
+ }
638
+ });
639
+ registry.registerPath({
640
+ method: "post",
641
+ path: "/v1/feedback",
642
+ summary: "Ingest a FeedbackTrajectory from production",
643
+ description: "Persist a single FeedbackTrajectory. Idempotent on trajectory.id \u2014 re-posting replaces the prior record. Used by production runtimes to forward user \u{1F44D}/\u{1F44E}/edits into the eval substrate.",
644
+ request: {
645
+ body: {
646
+ content: {
647
+ "application/json": { schema: FeedbackTrajectorySchema }
648
+ }
649
+ }
650
+ },
651
+ responses: {
652
+ 200: {
653
+ description: "Persisted",
654
+ content: { "application/json": { schema: FeedbackIngestResponseSchema } }
655
+ },
656
+ 400: {
657
+ description: "Validation error",
658
+ content: { "application/json": { schema: ErrorResponseSchema } }
659
+ },
660
+ 401: {
661
+ description: "Unauthorized (when bearer auth is configured)",
662
+ content: { "application/json": { schema: ErrorResponseSchema } }
663
+ },
664
+ 503: {
665
+ description: "No feedback store configured",
666
+ content: { "application/json": { schema: ErrorResponseSchema } }
667
+ }
668
+ }
669
+ });
461
670
  const generator = new OpenApiGeneratorV31(registry.definitions);
462
671
  const doc = generator.generateDocument({
463
672
  openapi: "3.1.0",
@@ -608,14 +817,34 @@ import { serve } from "@hono/node-server";
608
817
  import { Hono } from "hono";
609
818
  import { cors } from "hono/cors";
610
819
  var STARTED_AT = Date.now();
611
- function createApp() {
820
+ var AUTH_EXEMPT_PATHS = /* @__PURE__ */ new Set(["/healthz", "/v1/version", "/openapi.json"]);
821
+ function createApp(opts = {}) {
612
822
  const app = new Hono();
613
823
  app.use("*", cors());
824
+ if (opts.auth) {
825
+ const verify = opts.auth.bearer;
826
+ app.use("*", async (c, next) => {
827
+ const path = new URL(c.req.url).pathname;
828
+ if (AUTH_EXEMPT_PATHS.has(path)) return next();
829
+ const raw = c.req.header("authorization") ?? "";
830
+ const match = raw.match(/^Bearer\s+(.+)$/i);
831
+ if (!match) {
832
+ throw new WireError("unauthorized", "Missing or malformed Authorization header.", 401);
833
+ }
834
+ const token = match[1];
835
+ const ok = typeof verify === "string" ? token === verify : await verify(token);
836
+ if (!ok) {
837
+ throw new WireError("unauthorized", "Invalid bearer token.", 401);
838
+ }
839
+ return next();
840
+ });
841
+ }
614
842
  app.onError((err, c) => {
615
843
  if (err instanceof WireError) {
844
+ const status = err.status;
616
845
  return c.json(
617
846
  { error: { code: err.code, message: err.message, details: err.details } },
618
- err.status
847
+ status
619
848
  );
620
849
  }
621
850
  console.error("[agent-eval] unhandled error:", err);
@@ -644,11 +873,64 @@ function createApp() {
644
873
  const result = await handleJudge(parsed.data);
645
874
  return c.json(result);
646
875
  });
876
+ app.post("/v1/traces/ingest", async (c) => {
877
+ const contentType = c.req.header("content-type") ?? "";
878
+ let payload;
879
+ if (contentType.includes("application/x-ndjson")) {
880
+ const text = await c.req.text();
881
+ const events = text.split("\n").map((line) => line.trim()).filter((line) => line.length > 0).map((line) => {
882
+ try {
883
+ return JSON.parse(line);
884
+ } catch {
885
+ throw new WireError(
886
+ "validation_error",
887
+ "NDJSON line did not parse as JSON.",
888
+ 400,
889
+ line.slice(0, 200)
890
+ );
891
+ }
892
+ });
893
+ payload = { events };
894
+ } else {
895
+ payload = await c.req.json().catch(() => null);
896
+ }
897
+ if (payload == null) {
898
+ throw new WireError("validation_error", "Request body must be JSON or NDJSON.", 400);
899
+ }
900
+ const parsed = TracesIngestRequestSchema.safeParse(payload);
901
+ if (!parsed.success) {
902
+ throw new WireError(
903
+ "validation_error",
904
+ "Request did not match TracesIngestRequest schema.",
905
+ 400,
906
+ parsed.error.issues
907
+ );
908
+ }
909
+ const result = await handleTracesIngest(parsed.data, opts.stores ?? {});
910
+ return c.json(result);
911
+ });
912
+ app.post("/v1/feedback", async (c) => {
913
+ const raw = await c.req.json().catch(() => null);
914
+ if (raw == null) {
915
+ throw new WireError("validation_error", "Request body must be JSON.", 400);
916
+ }
917
+ const parsed = FeedbackTrajectorySchema.safeParse(raw);
918
+ if (!parsed.success) {
919
+ throw new WireError(
920
+ "validation_error",
921
+ "Request did not match FeedbackTrajectory schema.",
922
+ 400,
923
+ parsed.error.issues
924
+ );
925
+ }
926
+ const result = await handleFeedbackIngest(parsed.data, opts.stores ?? {});
927
+ return c.json(result);
928
+ });
647
929
  app.get("/openapi.json", (c) => c.json(buildOpenApi(handleVersion().version)));
648
930
  return app;
649
931
  }
650
932
  function startServer(opts = {}) {
651
- const app = createApp();
933
+ const app = createApp(opts);
652
934
  const port = opts.port ?? 5005;
653
935
  const host = opts.host ?? "127.0.0.1";
654
936
  return serve({ fetch: app.fetch, port, hostname: host }, ({ address, port: actualPort }) => {
@@ -666,6 +948,13 @@ export {
666
948
  ListRubricsResponseSchema,
667
949
  VersionResponseSchema,
668
950
  HealthResponseSchema,
951
+ TraceEventSchema,
952
+ TracesIngestRequestSchema,
953
+ TracesIngestResponseSchema,
954
+ FeedbackLabelSchema,
955
+ FeedbackAttemptSchema,
956
+ FeedbackTrajectorySchema,
957
+ FeedbackIngestResponseSchema,
669
958
  ErrorResponseSchema,
670
959
  WIRE_VERSION,
671
960
  hashRubric,
@@ -676,6 +965,8 @@ export {
676
965
  handleJudge,
677
966
  handleListRubrics,
678
967
  handleVersion,
968
+ handleTracesIngest,
969
+ handleFeedbackIngest,
679
970
  buildOpenApi,
680
971
  dispatchRpc,
681
972
  runRpcOnce,
@@ -683,4 +974,4 @@ export {
683
974
  createApp,
684
975
  startServer
685
976
  };
686
- //# sourceMappingURL=chunk-SY6WAAAD.js.map
977
+ //# sourceMappingURL=chunk-5LBB5B3Z.js.map