npm - @tangle-network/agent-eval - Versions diffs - 0.24.0 → 0.27.0 - Mend

@tangle-network/agent-eval 0.24.0 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

package/CHANGELOG.md +138 -0
package/README.md +72 -0
package/dist/{chunk-SY6WAAAD.js → chunk-5LBB5B3Z.js} +296 -5
package/dist/chunk-5LBB5B3Z.js.map +1 -0
package/dist/{chunk-OHEPNJQN.js → chunk-JLZQWFV3.js} +65 -1
package/dist/chunk-JLZQWFV3.js.map +1 -0
package/dist/{chunk-VRJVTXRV.js → chunk-WHZMVFUV.js} +85 -85
package/dist/chunk-WHZMVFUV.js.map +1 -0
package/dist/cli.js +1 -1
package/dist/governance/index.d.ts +1 -1
package/dist/{index-Oj9fAPPN.d.ts → index-D3iBCjdF.d.ts} +63 -2
package/dist/index.d.ts +529 -12
package/dist/index.js +1106 -17
package/dist/index.js.map +1 -1
package/dist/openapi.json +491 -1
package/dist/optimization.d.ts +2 -2
package/dist/optimization.js +1 -1
package/dist/pipelines/index.js +3 -67
package/dist/pipelines/index.js.map +1 -1
package/dist/{release-report-TDPn1cxq.d.ts → release-report-wfUySN5F.d.ts} +1 -1
package/dist/reporting.d.ts +2 -2
package/dist/{researcher-CUOiGcGv.d.ts → researcher-bGkI7vCl.d.ts} +1 -1
package/dist/rl.d.ts +3 -3
package/dist/{summary-report-BXGs_9V0.d.ts → summary-report-DZVXOCK_.d.ts} +13 -1
package/dist/wire/index.d.ts +347 -3
package/dist/wire/index.js +19 -1
package/docs/concepts.md +11 -0
package/package.json +1 -1
package/dist/chunk-OHEPNJQN.js.map +0 -1
package/dist/chunk-SY6WAAAD.js.map +0 -1
package/dist/chunk-VRJVTXRV.js.map +0 -1

package/dist/openapi.json CHANGED Viewed

@@ -2,7 +2,7 @@
   "openapi": "3.1.0",
   "info": {
     "title": "@tangle-network/agent-eval — wire protocol",
-    "version": "0.24.0",
+    "version": "0.27.0",
     "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
     "contact": {
       "name": "Tangle Network",
@@ -382,6 +382,377 @@
         "required": [
           "error"
         ]
+      },
+      "TracesIngestRequest": {
+        "type": "object",
+        "properties": {
+          "events": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/TraceEvent"
+            },
+            "minItems": 1,
+            "maxItems": 10000,
+            "description": "Batch of events. Max 10k per call — bigger streams should be chunked."
+          }
+        },
+        "required": [
+          "events"
+        ]
+      },
+      "TraceEvent": {
+        "type": "object",
+        "properties": {
+          "eventId": {
+            "type": "string",
+            "minLength": 1,
+            "description": "Stable id for the event. Use ULID or UUID."
+          },
+          "runId": {
+            "type": "string",
+            "minLength": 1,
+            "description": "Run this event belongs to."
+          },
+          "spanId": {
+            "type": "string",
+            "description": "Span that emitted the event, if any."
+          },
+          "kind": {
+            "type": "string",
+            "enum": [
+              "log",
+              "error",
+              "budget_decrement",
+              "budget_breach",
+              "state_mutation",
+              "policy_violation",
+              "redaction_applied",
+              "custom"
+            ],
+            "description": "Coarse event category — matches the TraceSchema v1 EventKind enum."
+          },
+          "timestamp": {
+            "type": "integer",
+            "minimum": 0,
+            "description": "Unix millis. Must be monotonically non-decreasing within a span."
+          },
+          "payload": {
+            "type": "object",
+            "additionalProperties": {},
+            "description": "Free-form payload — the runtime owns the shape."
+          }
+        },
+        "required": [
+          "eventId",
+          "runId",
+          "kind",
+          "timestamp",
+          "payload"
+        ]
+      },
+      "TracesIngestResponse": {
+        "type": "object",
+        "properties": {
+          "accepted": {
+            "type": "integer",
+            "minimum": 0,
+            "description": "Number of events persisted."
+          },
+          "rejected": {
+            "type": "integer",
+            "minimum": 0,
+            "description": "Number of events the store refused — see `errors[]` for reasons."
+          },
+          "errors": {
+            "type": "array",
+            "items": {
+              "type": "object",
+              "properties": {
+                "eventId": {
+                  "type": "string",
+                  "description": "Event id this error applies to."
+                },
+                "message": {
+                  "type": "string",
+                  "description": "Why the event was rejected."
+                }
+              },
+              "required": [
+                "eventId",
+                "message"
+              ]
+            },
+            "default": []
+          }
+        },
+        "required": [
+          "accepted",
+          "rejected"
+        ]
+      },
+      "FeedbackTrajectory": {
+        "type": "object",
+        "properties": {
+          "id": {
+            "type": "string",
+            "minLength": 1,
+            "description": "Stable id; idempotency key for the trajectory."
+          },
+          "projectId": {
+            "type": "string"
+          },
+          "scenarioId": {
+            "type": "string"
+          },
+          "task": {
+            "type": "object",
+            "properties": {
+              "intent": {
+                "type": "string",
+                "minLength": 1
+              },
+              "context": {}
+            },
+            "required": [
+              "intent"
+            ]
+          },
+          "attempts": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/FeedbackAttempt"
+            },
+            "default": []
+          },
+          "labels": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/FeedbackLabel"
+            },
+            "default": []
+          },
+          "outcome": {
+            "type": "object",
+            "properties": {
+              "success": {
+                "type": "boolean"
+              },
+              "score": {
+                "type": "number"
+              },
+              "metrics": {
+                "type": "object",
+                "additionalProperties": {
+                  "type": "number"
+                }
+              },
+              "costUsd": {
+                "type": "number"
+              },
+              "detail": {
+                "type": "string"
+              },
+              "observedAt": {
+                "type": "string"
+              },
+              "metadata": {
+                "type": "object",
+                "additionalProperties": {}
+              }
+            }
+          },
+          "split": {
+            "type": "string",
+            "enum": [
+              "train",
+              "dev",
+              "test",
+              "holdout"
+            ]
+          },
+          "tags": {
+            "type": "object",
+            "additionalProperties": {
+              "type": "string"
+            }
+          },
+          "createdAt": {
+            "type": "string",
+            "description": "ISO-8601 UTC."
+          },
+          "updatedAt": {
+            "type": "string"
+          },
+          "metadata": {
+            "type": "object",
+            "additionalProperties": {}
+          }
+        },
+        "required": [
+          "id",
+          "task",
+          "createdAt"
+        ]
+      },
+      "FeedbackAttempt": {
+        "type": "object",
+        "properties": {
+          "id": {
+            "type": "string",
+            "minLength": 1
+          },
+          "stepIndex": {
+            "type": "integer",
+            "minimum": 0
+          },
+          "artifactType": {
+            "type": "string",
+            "enum": [
+              "text",
+              "code",
+              "plan",
+              "research",
+              "action",
+              "ui",
+              "decision",
+              "data",
+              "other"
+            ]
+          },
+          "artifact": {},
+          "options": {
+            "type": "array",
+            "items": {}
+          },
+          "proposedAction": {
+            "type": "object",
+            "properties": {
+              "type": {
+                "type": "string"
+              },
+              "risk": {
+                "type": "string",
+                "enum": [
+                  "low",
+                  "medium",
+                  "high"
+                ]
+              },
+              "costUsd": {
+                "type": "number"
+              },
+              "externalSideEffect": {
+                "type": "boolean"
+              },
+              "requiresApproval": {
+                "type": "boolean"
+              },
+              "metadata": {
+                "type": "object",
+                "additionalProperties": {}
+              }
+            },
+            "required": [
+              "type"
+            ]
+          },
+          "feedback": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/FeedbackLabel"
+            }
+          },
+          "createdAt": {
+            "type": "string"
+          },
+          "metadata": {
+            "type": "object",
+            "additionalProperties": {}
+          }
+        },
+        "required": [
+          "id",
+          "stepIndex",
+          "artifactType",
+          "createdAt"
+        ]
+      },
+      "FeedbackLabel": {
+        "type": "object",
+        "properties": {
+          "id": {
+            "type": "string"
+          },
+          "source": {
+            "type": "string",
+            "enum": [
+              "user",
+              "judge",
+              "environment",
+              "metric",
+              "policy",
+              "system"
+            ]
+          },
+          "kind": {
+            "type": "string",
+            "enum": [
+              "approve",
+              "reject",
+              "select",
+              "edit",
+              "rank",
+              "rate",
+              "comment",
+              "metric_outcome",
+              "policy_block",
+              "revision_request"
+            ]
+          },
+          "value": {},
+          "reason": {
+            "type": "string"
+          },
+          "severity": {
+            "type": "string",
+            "enum": [
+              "info",
+              "warning",
+              "error",
+              "critical"
+            ]
+          },
+          "createdAt": {
+            "type": "string",
+            "description": "ISO-8601 UTC."
+          },
+          "metadata": {
+            "type": "object",
+            "additionalProperties": {}
+          }
+        },
+        "required": [
+          "source",
+          "kind",
+          "createdAt"
+        ]
+      },
+      "FeedbackIngestResponse": {
+        "type": "object",
+        "properties": {
+          "id": {
+            "type": "string",
+            "description": "Trajectory id that was persisted."
+          },
+          "persisted": {
+            "type": "boolean",
+            "description": "True when the trajectory was saved (idempotent on id)."
+          }
+        },
+        "required": [
+          "id",
+          "persisted"
+        ]
       }
     },
     "parameters": {}
@@ -496,6 +867,125 @@
           }
         }
       }
+    },
+    "/v1/traces/ingest": {
+      "post": {
+        "summary": "Ingest a batch of production TraceEvents",
+        "description": "Append a batch of TraceEvents to the configured TraceStore. Accepts application/json ({events:[...]}) or application/x-ndjson (one event per line). Returns counts of accepted + rejected events.",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/TracesIngestRequest"
+              }
+            },
+            "application/x-ndjson": {
+              "schema": {
+                "$ref": "#/components/schemas/TracesIngestRequest"
+              }
+            }
+          }
+        },
+        "responses": {
+          "200": {
+            "description": "Ingestion summary",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/TracesIngestResponse"
+                }
+              }
+            }
+          },
+          "400": {
+            "description": "Validation error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                }
+              }
+            }
+          },
+          "401": {
+            "description": "Unauthorized (when bearer auth is configured)",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                }
+              }
+            }
+          },
+          "503": {
+            "description": "No trace store configured",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                }
+              }
+            }
+          }
+        }
+      }
+    },
+    "/v1/feedback": {
+      "post": {
+        "summary": "Ingest a FeedbackTrajectory from production",
+        "description": "Persist a single FeedbackTrajectory. Idempotent on trajectory.id — re-posting replaces the prior record. Used by production runtimes to forward user 👍/👎/edits into the eval substrate.",
+        "requestBody": {
+          "content": {
+            "application/json": {
+              "schema": {
+                "$ref": "#/components/schemas/FeedbackTrajectory"
+              }
+            }
+          }
+        },
+        "responses": {
+          "200": {
+            "description": "Persisted",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/FeedbackIngestResponse"
+                }
+              }
+            }
+          },
+          "400": {
+            "description": "Validation error",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                }
+              }
+            }
+          },
+          "401": {
+            "description": "Unauthorized (when bearer auth is configured)",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                }
+              }
+            }
+          },
+          "503": {
+            "description": "No feedback store configured",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "$ref": "#/components/schemas/ErrorResponse"
+                }
+              }
+            }
+          }
+        }
+      }
     }
   },
   "webhooks": {}

package/dist/optimization.d.ts CHANGED Viewed

@@ -1,6 +1,6 @@
-export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-CUOiGcGv.js';
+export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-bGkI7vCl.js';
 export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-DfFdrraJ.js';
-export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-BXGs_9V0.js';
+export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-DZVXOCK_.js';
 import './errors-BZ9sTdz7.js';
 import './integrity-DK2EBVZC.js';
 import './store-Db2Bv8Cf.js';

package/dist/optimization.js CHANGED Viewed

@@ -25,7 +25,7 @@ import {
   summarizePreferenceMemory,
   trialTraceFromMultiShotTrial,
   withAssignedFeedbackSplit
-} from "./chunk-VRJVTXRV.js";
+} from "./chunk-WHZMVFUV.js";
 import "./chunk-NLMNWKVM.js";
 import {
   runEvalCampaign

package/dist/pipelines/index.js CHANGED Viewed

@@ -1,9 +1,8 @@
 import {
-  DEFAULT_RULES,
-  classifyFailure,
   compareToBaseline,
-  computeToolUseMetrics
-} from "../chunk-OHEPNJQN.js";
+  computeToolUseMetrics,
+  failureClusterView
+} from "../chunk-JLZQWFV3.js";
 import {
   buildTrajectory
 } from "../chunk-RZTMDUO7.js";
@@ -62,69 +61,6 @@ async function budgetBreachView(store, options = {}) {
   };
 }
-// src/pipelines/failure-cluster.ts
-async function failureClusterView(store, options = {}) {
-  const rules = options.rules ?? DEFAULT_RULES;
-  const minSize = options.minClusterSize ?? 1;
-  const runs = await store.listRuns();
-  const clusters = /* @__PURE__ */ new Map();
-  let totalFailures = 0;
-  for (const run of runs) {
-    if (run.status === "completed" && run.outcome?.pass !== false) continue;
-    totalFailures++;
-    const spans = await store.spans({ runId: run.runId });
-    const events = await store.events({ runId: run.runId });
-    const cls = classifyFailure({ run, spans, events }, rules);
-    let toolName;
-    let argPrefix;
-    let dimension;
-    if (cls.triggerSpanId) {
-      const trig = spans.find((s) => s.spanId === cls.triggerSpanId);
-      if (trig?.kind === "tool") {
-        toolName = trig.toolName;
-        argPrefix = argHash(trig.args).slice(0, 16);
-      } else if (trig?.kind === "judge") {
-        dimension = trig.dimension;
-      }
-    }
-    if (!toolName) {
-      const ts = await toolSpans(store, run.runId);
-      const errored = ts.filter((t) => t.status === "error").pop();
-      if (errored) {
-        toolName = errored.toolName;
-        argPrefix = argHash(errored.args).slice(0, 16);
-      }
-    }
-    if (!dimension) {
-      const judge = spans.find((s) => s.kind === "judge" && typeof s.dimension === "string");
-      if (judge?.kind === "judge") dimension = judge.dimension;
-    }
-    const key = `${cls.failureClass}|${toolName ?? ""}|${argPrefix ?? ""}|${dimension ?? ""}`;
-    let cluster = clusters.get(key);
-    if (!cluster) {
-      cluster = {
-        failureClass: cls.failureClass,
-        toolName,
-        argPrefix,
-        dimension,
-        runCount: 0,
-        scenarioIds: [],
-        exampleRunId: run.runId,
-        exampleError: firstErrorMessage(spans) ?? cls.reason
-      };
-      clusters.set(key, cluster);
-    }
-    cluster.runCount++;
-    if (!cluster.scenarioIds.includes(run.scenarioId)) cluster.scenarioIds.push(run.scenarioId);
-  }
-  const arr = [...clusters.values()].filter((c) => c.runCount >= minSize).sort((a, b) => b.runCount - a.runCount);
-  return { clusters: arr, totalFailures, totalRuns: runs.length };
-}
-function firstErrorMessage(spans) {
-  const errored = spans.find((s) => s.status === "error");
-  return errored?.error;
-}
 // src/pipelines/first-divergence.ts
 async function firstDivergenceView(store, runA, runB, options = {}) {
   const [a, b] = await Promise.all([buildTrajectory(store, runA), buildTrajectory(store, runB)]);