npm - @tangle-network/agent-eval - Versions diffs - 0.23.1 → 0.25.0 - Mend

@tangle-network/agent-eval 0.23.1 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (148)

package/CHANGELOG.md +145 -0
package/README.md +212 -79
package/dist/baseline-4R5deP0N.d.ts +108 -0
package/dist/benchmarks/index.d.ts +3 -2
package/dist/benchmarks/index.js +1 -1
package/dist/builder-eval/index.d.ts +249 -0
package/dist/builder-eval/index.js +391 -0
package/dist/builder-eval/index.js.map +1 -0
package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
package/dist/chunk-2A5XJB43.js.map +1 -0
package/dist/chunk-47X6LRCE.js +76 -0
package/dist/chunk-47X6LRCE.js.map +1 -0
package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
package/dist/chunk-4F5DQN55.js.map +1 -0
package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
package/dist/chunk-4S4BM3QQ.js.map +1 -0
package/dist/chunk-5BKGXME7.js +65 -0
package/dist/chunk-5BKGXME7.js.map +1 -0
package/dist/{chunk-6KQG5HAH.js → chunk-5LBB5B3Z.js} +376 -72
package/dist/chunk-5LBB5B3Z.js.map +1 -0
package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
package/dist/chunk-6QDKWHLS.js.map +1 -0
package/dist/{chunk-VQQSPGSM.js → chunk-EDUKQ5AM.js} +247 -189
package/dist/chunk-EDUKQ5AM.js.map +1 -0
package/dist/chunk-I4MBDTY5.js +272 -0
package/dist/chunk-I4MBDTY5.js.map +1 -0
package/dist/chunk-JLZQWFV3.js +618 -0
package/dist/chunk-JLZQWFV3.js.map +1 -0
package/dist/chunk-K2TPS5LB.js +569 -0
package/dist/chunk-K2TPS5LB.js.map +1 -0
package/dist/chunk-KKHDIONI.js +414 -0
package/dist/chunk-KKHDIONI.js.map +1 -0
package/dist/chunk-KMPRBJK4.js +74 -0
package/dist/chunk-KMPRBJK4.js.map +1 -0
package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
package/dist/chunk-KTGTIOFD.js.map +1 -0
package/dist/chunk-LSH4MMOZ.js +838 -0
package/dist/chunk-LSH4MMOZ.js.map +1 -0
package/dist/chunk-NG236HPC.js +57 -0
package/dist/chunk-NG236HPC.js.map +1 -0
package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
package/dist/chunk-NLMNWKVM.js.map +1 -0
package/dist/chunk-NU65VQ7M.js +99 -0
package/dist/chunk-NU65VQ7M.js.map +1 -0
package/dist/chunk-OWLAAMME.js +250 -0
package/dist/chunk-OWLAAMME.js.map +1 -0
package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
package/dist/chunk-PC4UYEBM.js.map +1 -0
package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
package/dist/chunk-RAF443UI.js.map +1 -0
package/dist/chunk-RZTMDUO7.js +49 -0
package/dist/chunk-RZTMDUO7.js.map +1 -0
package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
package/dist/chunk-SESZDQPX.js.map +1 -0
package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
package/dist/chunk-TVVP3ZZQ.js.map +1 -0
package/dist/chunk-WWYCWKUM.js +196 -0
package/dist/chunk-WWYCWKUM.js.map +1 -0
package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
package/dist/chunk-YRZ4M5GS.js.map +1 -0
package/dist/chunk-ZN274SWR.js +613 -0
package/dist/chunk-ZN274SWR.js.map +1 -0
package/dist/cli.js +10 -6
package/dist/cli.js.map +1 -1
package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
package/dist/control.d.ts +8 -6
package/dist/control.js +10 -7
package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
package/dist/errors-BZ9sTdz7.d.ts +70 -0
package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
package/dist/governance/index.d.ts +5 -0
package/dist/governance/index.js +18 -0
package/dist/governance/index.js.map +1 -0
package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
package/dist/index-Oj9fAPPN.d.ts +270 -0
package/dist/index.d.ts +2018 -3003
package/dist/index.js +7443 -9102
package/dist/index.js.map +1 -1
package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
package/dist/knowledge/index.d.ts +102 -0
package/dist/knowledge/index.js +18 -0
package/dist/knowledge/index.js.map +1 -0
package/dist/meta-eval/index.d.ts +99 -0
package/dist/meta-eval/index.js +324 -0
package/dist/meta-eval/index.js.map +1 -0
package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
package/dist/openapi.json +491 -1
package/dist/optimization.d.ts +11 -8
package/dist/optimization.js +11 -9
package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
package/dist/pipelines/index.d.ts +172 -0
package/dist/pipelines/index.js +345 -0
package/dist/pipelines/index.js.map +1 -0
package/dist/prm/index.d.ts +99 -0
package/dist/prm/index.js +222 -0
package/dist/prm/index.js.map +1 -0
package/dist/query-DODUYdPg.d.ts +30 -0
package/dist/release-report-BNgMdqPF.d.ts +292 -0
package/dist/replay-BL96gCEP.d.ts +226 -0
package/dist/reporting.d.ts +10 -295
package/dist/reporting.js +10 -6
package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-BPT8x_NT.d.ts} +148 -146
package/dist/rl.d.ts +1762 -8
package/dist/rl.js +2035 -58
package/dist/rl.js.map +1 -1
package/dist/rubric-D5tjHNJQ.d.ts +72 -0
package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
package/dist/sequential-Dgz1n51-.d.ts +139 -0
package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} +3 -76
package/dist/telemetry/file.js +4 -1
package/dist/telemetry/file.js.map +1 -1
package/dist/telemetry/index.js +57 -57
package/dist/telemetry/index.js.map +1 -1
package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
package/dist/traces.d.ts +142 -387
package/dist/traces.js +1302 -40
package/dist/traces.js.map +1 -1
package/dist/trajectory-CnoBo-JY.d.ts +32 -0
package/dist/wire/index.d.ts +369 -25
package/dist/wire/index.js +22 -3
package/package.json +44 -18
package/dist/chunk-42I2QC2L.js.map +0 -1
package/dist/chunk-5IIQKMD5.js.map +0 -1
package/dist/chunk-6KQG5HAH.js.map +0 -1
package/dist/chunk-6M774GY6.js.map +0 -1
package/dist/chunk-7EAUOUQS.js.map +0 -1
package/dist/chunk-AXHNWLIX.js.map +0 -1
package/dist/chunk-EXGR4XEM.js.map +0 -1
package/dist/chunk-IOXMGMHQ.js.map +0 -1
package/dist/chunk-KAO3Q65R.js.map +0 -1
package/dist/chunk-LZKIOBG2.js +0 -2026
package/dist/chunk-LZKIOBG2.js.map +0 -1
package/dist/chunk-QBW3YBTR.js.map +0 -1
package/dist/chunk-QUKKGHTZ.js.map +0 -1
package/dist/chunk-SQQLHODJ.js.map +0 -1
package/dist/chunk-V5QSWN7L.js +0 -1310
package/dist/chunk-V5QSWN7L.js.map +0 -1
package/dist/chunk-VQQSPGSM.js.map +0 -1
package/dist/chunk-XPHOZPOM.js +0 -1947
package/dist/chunk-XPHOZPOM.js.map +0 -1
package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
package/dist/index-ekBXweiQ.d.ts +0 -1894
package/dist/sequential-DgU2mFsE.d.ts +0 -304

package/dist/rubric-predictive-validity-C0uDYwG6.d.ts ADDED Viewed

@@ -0,0 +1,105 @@
+import { R as RunRecord } from './run-record-CqzahIbx.js';
+import { O as OutcomeStore } from './outcome-store-D6KWmYvj.js';
+/**
+ * Rubric predictive validity — does our eval rubric predict deployment
+ * outcomes?
+ *
+ * `correlationStudy` (already in this package) joins a `TraceStore` to an
+ * `OutcomeStore` and computes Pearson + Spearman + bootstrap CI for each
+ * (eval-metric, outcome-metric) pair. That answers "does X correlate with
+ * Y at all." `rubricPredictiveValidity` is the campaign-shaped wrapper
+ * around it: take a sequence of `RunRecord`s (the canonical campaign
+ * artifact) and an `OutcomeStore`, join on `runId`, return a
+ * ranked verdict on every rubric whose dimension scores were captured in
+ * `outcome.raw`.
+ *
+ * The point — quoting the methodology doc — is that **without this loop
+ * every rubric is faith-based**. Once it's wired, you know which rubrics
+ * have earned their promotion power and which ones are decoration.
+ *
+ *   const validity = await rubricPredictiveValidity({
+ *     runs: lastQuarter,
+ *     outcomes: shipFlagOutcomeStore,
+ *     outcomeMetrics: ['revenue_lift', 'retention_30d', 'csat'],
+ *     rubrics: ['anti_slop', 'semantic_concept', 'tool_recovery'],
+ *   })
+ *   for (const r of validity.ranked) {
+ *     console.log(`${r.rubric} → ${r.bestOutcome}: ρ=${r.spearman.toFixed(2)}`)
+ *   }
+ *
+ * The function is intentionally read-only. Use the verdict to deprecate
+ * decorative rubrics, re-weight composite scores, or trigger a
+ * recalibration sweep when predictive validity drops below a threshold.
+ */
+interface RubricPredictiveValidityInput {
+    /**
+     * Canonical campaign output. Each record's `outcome.raw[<rubricId>]`
+     * provides the eval score; missing keys are silently skipped per pair.
+     */
+    runs: RunRecord[];
+    outcomes: OutcomeStore;
+    /**
+     * Outcome metric names to evaluate against. Each must appear in at
+     * least one `DeploymentOutcome.metrics` keyspace; pairs with too few
+     * joined samples are excluded from the result.
+     */
+    outcomeMetrics: string[];
+    /**
+     * Rubric ids to evaluate. Must appear as keys in `RunRecord.outcome.raw`.
+     * If omitted, every numeric key in `outcome.raw` across the run set is
+     * treated as a rubric.
+     */
+    rubrics?: string[];
+    /** Minimum joined-sample count before a pair is reported. Default 8. */
+    minSamples?: number;
+    /** Bootstrap resamples for CI. Default 500. */
+    bootstrapResamples?: number;
+    /** Random seed for the bootstrap (mulberry32). Default unset (Math.random). */
+    seed?: number;
+    /**
+     * Reduction when multiple outcomes attach to one runId. Default `'latest'`
+     * (most recently captured).
+     */
+    reduction?: 'latest' | 'mean' | 'max';
+}
+interface RubricOutcomePair {
+    rubric: string;
+    outcome: string;
+    n: number;
+    pearson: number;
+    spearman: number;
+    ci95: {
+        low: number;
+        high: number;
+    };
+    /**
+     * Verdict bucket. `load_bearing` ≥ 0.7, `informative` ≥ 0.4,
+     * `decorative` < 0.4 in absolute correlation. A negative correlation
+     * with a desired outcome is also `decorative` — actively misleading
+     * is worse than uninformative.
+     */
+    verdict: 'load_bearing' | 'informative' | 'decorative';
+}
+interface RubricRanking {
+    rubric: string;
+    /** Outcome metric this rubric correlated best with. */
+    bestOutcome: string;
+    spearman: number;
+    pearson: number;
+    n: number;
+    verdict: RubricOutcomePair['verdict'];
+}
+interface RubricPredictiveValidityReport {
+    pairs: RubricOutcomePair[];
+    /** Per-rubric best pair, sorted descending by |spearman|. */
+    ranked: RubricRanking[];
+    joinedSamples: number;
+    skippedRuns: number;
+    /** Rubrics that were declared but never produced a usable score. */
+    rubricsWithoutData: string[];
+}
+declare function rubricPredictiveValidity(input: RubricPredictiveValidityInput): Promise<RubricPredictiveValidityReport>;
+export { type RubricOutcomePair as R, type RubricPredictiveValidityInput as a, type RubricPredictiveValidityReport as b, type RubricRanking as c, rubricPredictiveValidity as r };

package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} RENAMED Viewed

@@ -1,3 +1,5 @@
+import { V as ValidationError } from './errors-BZ9sTdz7.js';
 /**
  * Paper-grade RunRecord schema + runtime validator.
  *
@@ -117,7 +119,8 @@ interface RunRecord {
      */
     scenarioId?: string;
 }
-declare class RunRecordValidationError extends Error {
+declare class RunRecordValidationError extends ValidationError {
     readonly path: string;
     constructor(message: string, path?: string);
 }

package/dist/sequential-Dgz1n51-.d.ts ADDED Viewed

@@ -0,0 +1,139 @@
+/**
+ * Always-valid sequential evaluation.
+ *
+ * `researchReport` (0.21+) assumes a single pre-specified analysis. Real
+ * consumers run campaigns weekly / nightly / per-PR; each new run silently
+ * inflates the false-discovery rate, because the BH-FDR guarantee was for
+ * the *first* look, not the 47th. Without time-uniform inference,
+ * launch-decision teams either (a) don't peek, which forfeits the cost
+ * advantage of stop-when-decisive, or (b) peek and pretend they didn't,
+ * which forfeits scientific validity.
+ *
+ * This module ships **e-value-based confidence sequences** for paired
+ * bounded outcomes. The methodology is the predictable plug-in betting
+ * martingale of Waudby-Smith & Ramdas (2024) — provably valid at *any*
+ * stopping time. Concretely:
+ *
+ *   For paired deltas D_1, D_2, … ∈ [-c, c] with the null H_0: E[D] ≤ 0,
+ *   a betting fraction λ_i is chosen using only D_{1..i-1} (predictable
+ *   plug-in), and the running e-value is
+ *
+ *     E_t = ∏_{i=1}^{t} (1 + λ_i · D_i)
+ *
+ *   E_t is a non-negative supermartingale under H_0 with E[E_t] ≤ 1, so by
+ *   Ville's inequality, P(∃ t : E_t ≥ 1/α) ≤ α — we can reject the null
+ *   at any time without inflating the type-I error.
+ *
+ * Combined with `runEvalCampaign`, every consumer running rolling
+ * campaigns gains the ability to ship the moment evidence is decisive,
+ * stop-early on dead-on-arrival variants, and accumulate evidence across
+ * partial runs without spending the FDR budget. No new sweep is wasted.
+ *
+ * References:
+ *   - Howard, S. R., Ramdas, A., McAuliffe, J., Sekhon, J. (2021).
+ *     Time-uniform, nonparametric, nonasymptotic confidence sequences.
+ *     Annals of Statistics, 49(2), 1055–1080.
+ *   - Waudby-Smith, I., Ramdas, A. (2024). Estimating means of bounded
+ *     random variables by betting. JRSS B, 86(1), 1–27.
+ */
+type SequentialDecision = 'promote_now' | 'continue' | 'reject_now' | 'equivalent';
+interface PairedEvalueOptions {
+    /**
+     * Bound c on |delta|. Default 1 (matching most score scales). Must satisfy
+     * c > 0; deltas outside [-c, c] are clipped, and the clipping is reported
+     * via the `clipped` flag on the return value.
+     */
+    bound?: number;
+    /** Target Type-I error. Default 0.05. */
+    alpha?: number;
+    /**
+     * Region of Practical Equivalence on the *mean* paired delta. When
+     * supplied, the verdict can return `'equivalent'` once the running
+     * confidence sequence on the mean is fully contained in [low, high].
+     */
+    rope?: {
+        low: number;
+        high: number;
+    };
+    /** Initial bet shrinkage (0 < scale ≤ 1). Default 0.5 — empirically robust. */
+    initialBetShrinkage?: number;
+}
+interface PairedEvalueStep {
+    /** 1-indexed observation count. */
+    t: number;
+    delta: number;
+    /** Running e-value E_t = ∏ (1 + λ_i · D_i). */
+    evalue: number;
+    /** Time-uniform p-value at stopping time t. */
+    pValue: number;
+    /** Lower (`csLow`) and upper (`csHigh`) bounds of the empirical Bernstein confidence sequence at level 1-α. */
+    csLow: number;
+    csHigh: number;
+    /** Verdict at this stopping time. */
+    decision: SequentialDecision;
+}
+interface PairedEvalueSequence {
+    steps: PairedEvalueStep[];
+    /** The decision at the final step. */
+    finalDecision: SequentialDecision;
+    /** Index (1-based) at which a non-`continue` decision first fired, or null. */
+    decisionFiredAt: number | null;
+    /** True if any deltas were clipped to [-bound, bound]. */
+    clipped: boolean;
+}
+/**
+ * Run the paired e-value sequence over an in-order delta stream.
+ *
+ * Use for *streaming* / interim analyses: pass the deltas you have so
+ * far, get the verdict at every prefix length. The decision is
+ * monotone-stable in the sense that once `'reject_now'` or `'promote_now'`
+ * fires, the verdict at later steps remains decisive (the e-value is a
+ * non-negative martingale; once it crosses the threshold, it's crossed).
+ */
+declare function pairedEvalueSequence(deltas: number[], opts?: PairedEvalueOptions): PairedEvalueSequence;
+interface InterimReleaseConfidenceInput {
+    /**
+     * One delta series per candidate (paired deltas vs comparator). Order
+     * within a series is the order the campaigns were run.
+     */
+    deltaSeries: Array<{
+        candidateId: string;
+        deltas: number[];
+    }>;
+    alpha?: number;
+    bound?: number;
+    rope?: {
+        low: number;
+        high: number;
+    };
+}
+interface InterimReleaseConfidence {
+    candidates: Array<{
+        candidateId: string;
+        decision: SequentialDecision;
+        decisionFiredAt: number | null;
+        finalEvalue: number;
+        finalPValue: number;
+        pairs: number;
+        csLow: number;
+        csHigh: number;
+    }>;
+    /**
+     * Campaign-level recommendation: pick the strongest 'promote_now', else
+     * 'continue' if any candidate is still live, else 'reject_now' if every
+     * candidate is dead, else 'equivalent'.
+     */
+    recommendation: {
+        decision: SequentialDecision;
+        candidateId: string | null;
+    };
+}
+/**
+ * Run interim sequential analyses across many candidates at once,
+ * preserving the time-uniform α guarantee for each candidate's series and
+ * synthesising a campaign-level recommendation. Designed to be called on
+ * every campaign tick — the recommendation is anytime-valid.
+ */
+declare function evaluateInterimReleaseConfidence(input: InterimReleaseConfidenceInput): InterimReleaseConfidence;
+export { type InterimReleaseConfidence as I, type PairedEvalueOptions as P, type SequentialDecision as S, type InterimReleaseConfidenceInput as a, type PairedEvalueSequence as b, type PairedEvalueStep as c, evaluateInterimReleaseConfidence as e, pairedEvalueSequence as p };

package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} RENAMED Viewed

@@ -294,4 +294,4 @@ declare class FileSystemTraceStore implements TraceStore {
     artifacts(runId: string): Promise<Artifact[]>;
 }
-export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type RunOutcome as R, type Span as S, type TraceStore as T, type Run as a, type SpanKind as b, type ToolSpan as c, type RetrievalSpan as d, type SandboxSpan as e, type TraceEvent as f, type BudgetSpec as g, type RunFilter as h, type EventFilter as i, FAILURE_CLASSES as j, FileSystemTraceStore as k, type FileSystemTraceStoreOptions as l, type RunLayer as m, type RunStatus as n, type SpanBase as o, type SpanFilter as p, type SpanStatus as q, TRACE_SCHEMA_VERSION as r, isJudgeSpan as s, isLlmSpan as t, isRetrievalSpan as u, isSandboxSpan as v, isToolSpan as w };
+export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type TraceStore as T, type ToolSpan as a, type TraceEvent as b, type RunOutcome as c, type SpanKind as d, type RetrievalSpan as e, type SandboxSpan as f, type BudgetSpec as g, type RunFilter as h, type EventFilter as i, FAILURE_CLASSES as j, FileSystemTraceStore as k, type FileSystemTraceStoreOptions as l, type RunLayer as m, type RunStatus as n, type SpanBase as o, type SpanFilter as p, type SpanStatus as q, TRACE_SCHEMA_VERSION as r, isJudgeSpan as s, isLlmSpan as t, isRetrievalSpan as u, isSandboxSpan as v, isToolSpan as w };

package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { R as RunRecord, a as RunSplitTag } from './run-record-DNiOMBrZ.js';
-import { a as Run, S as Span, f as TraceEvent, F as FailureClass, T as TraceStore } from './store-u47QaJ9G.js';
+import { R as RunRecord, a as RunSplitTag } from './run-record-CqzahIbx.js';
+import { F as FailureClusterReport } from './failure-cluster-C2EGSDiT.js';
 /**
  * HeldOutGate — first-class held-out paired-delta promotion gate.
@@ -596,79 +596,6 @@ declare function runMultiShotOptimization<P>(config: MultiShotOptimizationConfig
 declare function defaultMultiShotObjectives(): Objective<VariantAggregate>[];
 declare function trialTraceFromMultiShotTrial(trial: MultiShotTrialResult): TrialTrace;
-/**
- * Failure taxonomy — canonical classes + a default classifier.
- *
- * Every failed run should end up in a named class. The classifier here
- * is rule-based (fast, deterministic); an LLM fallback can be added by
- * the consumer for novel cases and trained into the rule base over time.
- *
- * Consumers call `classifyFailure(run, spans, events)` and persist the
- * returned class as `Run.outcome.failureClass`.
- */
-interface FailureContext {
-    run: Run;
-    spans: Span[];
-    events: TraceEvent[];
-}
-interface FailureClassification {
-    failureClass: FailureClass;
-    reason: string;
-    triggerSpanId?: string;
-    triggerEventId?: string;
-}
-/** Ordered rules — first match wins. */
-interface FailureRule {
-    id: string;
-    match: (ctx: FailureContext) => {
-        failureClass: FailureClass;
-        reason: string;
-        triggerSpanId?: string;
-        triggerEventId?: string;
-    } | null;
-}
-declare const DEFAULT_RULES: FailureRule[];
-/** Classify the failure mode of a run using an ordered rule list. */
-declare function classifyFailure(ctx: FailureContext, rules?: FailureRule[]): FailureClassification;
-/**
- * FailureClusterView — groups failed runs by (failureClass, triggerTool,
- * argHash-prefix) so weekly reviews can prioritize the top-N clusters.
- *
- * Each cluster includes: N runs, scenarios affected, representative
- * error message, a proposed mitigation hint (rule → action table).
- */
-interface FailureCluster {
-    failureClass: FailureClass;
-    /** Tool name when the trigger was a tool span, else undefined. */
-    toolName?: string;
-    /** First 16 chars of argHash — clusters similar args. */
-    argPrefix?: string;
-    /**
-     * Source dimension when the trigger was a judge span (e.g. `'format'`,
-     * `'safety'`, `'correctness'`). Lets cross-template aggregators
-     * group failures by the dimension that fired without overloading
-     * `argPrefix`. Optional — legacy clusters without this field
-     * deserialize cleanly.
-     */
-    dimension?: string;
-    runCount: number;
-    scenarioIds: string[];
-    exampleError?: string;
-    exampleRunId: string;
-}
-interface FailureClusterReport {
-    clusters: FailureCluster[];
-    totalFailures: number;
-    totalRuns: number;
-}
-declare function failureClusterView(store: TraceStore, options?: {
-    rules?: FailureRule[];
-    minClusterSize?: number;
-}): Promise<FailureClusterReport>;
 /**
  * Reporting helpers — production summaries and paper-quality figures — sit alongside `reporter.ts` rather
  * than replacing it.
@@ -975,4 +902,4 @@ interface ResearchReport {
  */
 declare function researchReport(runs: RunRecord[], opts?: ResearchReportOptions): Promise<ResearchReport>;
-export { type GateEvidence as $, type ActionableSideInfo as A, trialTraceFromMultiShotTrial as B, type GateDecision as C, DEFAULT_MUTATION_PRIMITIVES as D, type EvolvableVariant as E, type ResearchReportOptions as F, type GenerationReport as G, type ResearchReport as H, InMemoryTrialCache as I, type ParetoResult as J, DEFAULT_RULES as K, type Direction as L, type MultiShotGateConfig as M, type FailureClassification as N, type Objective as O, type PromptEvolutionConfig as P, type FailureCluster as Q, type ReflectionContext as R, type ScenarioAggregate as S, type TrialCache as T, type FailureClusterReport as U, type VariantAggregate as V, type FailureContext as W, type FailureRule as X, type GainDistributionBin as Y, type GainDistributionFigureSpec as Z, type GainDistributionOptions as _, type AsiSeverity as a, HeldOutGate as a0, type HeldOutGateConfig as a1, type HeldOutGateRejectionCode as a2, type ParetoFigureSpec as a3, type ParetoPoint as a4, RESEARCH_REPORT_HARD_PAIR_FLOOR as a5, type ResearchReportCandidate as a6, type ResearchReportDecision as a7, type ResearchReportMethodology as a8, type ResearchReportRecommendation as a9, type SummaryTable as aa, type SummaryTableOptions as ab, type SummaryTableRow as ac, classifyFailure as ad, crowdingDistance as ae, dominates as af, failureClusterView as ag, gainHistogram as ah, paretoChart as ai, paretoFrontier as aj, paretoFrontierWithCrowding as ak, researchReport as al, scalarScore as am, summaryTable as an, type MultiShotGateResult as b, type MultiShotMutateAdapter as c, type MultiShotOptimizationConfig as d, type MultiShotOptimizationResult as e, type MultiShotRun as f, type MultiShotRunInput as g, type MultiShotRunner as h, type MultiShotScore as i, type MultiShotScorer as j, type MultiShotSplit as k, type MultiShotTrace as l, type MultiShotTrialResult as m, type MultiShotVariant as n, type MutateAdapter as o, type PromptEvolutionEvent as p, type PromptEvolutionResult as q, type ReflectionProposal as r, type 
ScoreAdapter as s, type TrialResult as t, type TrialTrace as u, buildReflectionPrompt as v, defaultMultiShotObjectives as w, parseReflectionResponse as x, runMultiShotOptimization as y, runPromptEvolution as z };
+export { gainHistogram as $, type ActionableSideInfo as A, trialTraceFromMultiShotTrial as B, type GainDistributionBin as C, DEFAULT_MUTATION_PRIMITIVES as D, type EvolvableVariant as E, type GainDistributionFigureSpec as F, type GenerationReport as G, type GainDistributionOptions as H, InMemoryTrialCache as I, type ParetoFigureSpec as J, type ParetoPoint as K, RESEARCH_REPORT_HARD_PAIR_FLOOR as L, type MultiShotGateConfig as M, type ResearchReport as N, type ResearchReportCandidate as O, type PromptEvolutionConfig as P, type ResearchReportDecision as Q, type ReflectionContext as R, type ScenarioAggregate as S, type TrialCache as T, type ResearchReportMethodology as U, type VariantAggregate as V, type ResearchReportOptions as W, type ResearchReportRecommendation as X, type SummaryTable as Y, type SummaryTableOptions as Z, type SummaryTableRow as _, type AsiSeverity as a, paretoChart as a0, researchReport as a1, summaryTable as a2, type GateDecision as a3, type HeldOutGateConfig as a4, type Objective as a5, type ParetoResult as a6, type Direction as a7, type GateEvidence as a8, HeldOutGate as a9, type HeldOutGateRejectionCode as aa, crowdingDistance as ab, dominates as ac, paretoFrontier as ad, paretoFrontierWithCrowding as ae, scalarScore as af, type MultiShotGateResult as b, type MultiShotMutateAdapter as c, type MultiShotOptimizationConfig as d, type MultiShotOptimizationResult as e, type MultiShotRun as f, type MultiShotRunInput as g, type MultiShotRunner as h, type MultiShotScore as i, type MultiShotScorer as j, type MultiShotSplit as k, type MultiShotTrace as l, type MultiShotTrialResult as m, type MultiShotVariant as n, type MutateAdapter as o, type PromptEvolutionEvent as p, type PromptEvolutionResult as q, type ReflectionProposal as r, type ScoreAdapter as s, type TrialResult as t, type TrialTrace as u, buildReflectionPrompt as v, defaultMultiShotObjectives as w, parseReflectionResponse as x, runMultiShotOptimization as y, runPromptEvolution as z };

package/dist/telemetry/file.js CHANGED Viewed

@@ -18,7 +18,10 @@ var FileTelemetrySink = class {
     if (!stream) {
       const dir = path.join(this.baseDir, repo);
       fs.mkdirSync(dir, { recursive: true });
-      stream = fs.createWriteStream(path.join(dir, `${date}.jsonl`), { flags: "a", encoding: "utf-8" });
+      stream = fs.createWriteStream(path.join(dir, `${date}.jsonl`), {
+        flags: "a",
+        encoding: "utf-8"
+      });
       this.streams.set(key, stream);
     }
     stream.write(`${JSON.stringify(envelope)}

package/dist/telemetry/file.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"sources":["../../src/telemetry/sink-file.ts"],"sourcesContent":["/*\n Node-only file sink. Imports `node:fs` — DO NOT import this from a Worker\n * or edge runtime; use `./sink-fetch` instead.\n /\n\nimport as fs from 'node:fs'\nimport * as path from 'node:path'\nimport type { TelemetryEnvelope } from './schema'\nimport type { TelemetrySink } from './sink-fetch'\n\n/** Append envelopes to a JSONL file, partitioned by repo + date. /\nexport class FileTelemetrySink implements TelemetrySink {\n private streams = new Map<string, fs.WriteStream>()\n\n constructor(private readonly baseDir: string) {\n fs.mkdirSync(baseDir, { recursive: true })\n }\n\n emit(envelope: TelemetryEnvelope): void {\n const date = envelope.timestamp.slice(0, 10) // YYYY-MM-DD\n const repo = envelope.source.repo \|\| 'unknown'\n const key = `${repo}/${date}`\n let stream = this.streams.get(key)\n if (!stream) {\n const dir = path.join(this.baseDir, repo)\n fs.mkdirSync(dir, { recursive: true })\n stream = fs.createWriteStream(path.join(dir, `${date}.jsonl`), { flags: 'a', encoding: 'utf-8' })\n this.streams.set(key, stream)\n }\n stream.write(`${JSON.stringify(envelope)}\\n`)\n }\n\n async close(): Promise<void> {\n const closes = Array.from(this.streams.values()).map(\n (s) => new Promise<void>((resolve) => s.end(() => resolve())),\n )\n this.streams.clear()\n await Promise.all(closes)\n }\n}\n\n/* Default location for local telemetry, mirroring bad CLI's convention. 
*/\nexport function defaultTelemetryDir(homeDir: string, override?: string): string {\n return override \|\| path.join(homeDir, '.agent-eval', 'telemetry')\n}\n"],"mappings":";;;AAKA,YAAY,QAAQ;AACpB,YAAY,UAAU;AAKf,IAAM,oBAAN,MAAiD;AAAA,EAGtD,YAA6B,SAAiB;AAAjB;AAC3B,IAAG,aAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AAAA,EAC3C;AAAA,EAF6B;AAAA,EAFrB,UAAU,oBAAI,IAA4B;AAAA,EAMlD,KAAK,UAAmC;AACtC,UAAM,OAAO,SAAS,UAAU,MAAM,GAAG,EAAE;AAC3C,UAAM,OAAO,SAAS,OAAO,QAAQ;AACrC,UAAM,MAAM,GAAG,IAAI,IAAI,IAAI;AAC3B,QAAI,SAAS,KAAK,QAAQ,IAAI,GAAG;AACjC,QAAI,CAAC,QAAQ;AACX,YAAM,MAAW,UAAK,KAAK,SAAS,IAAI;AACxC,MAAG,aAAU,KAAK,EAAE,WAAW,KAAK,CAAC;AACrC,eAAY,qBAAuB,UAAK,KAAK,GAAG,IAAI,QAAQ,GAAG,~~EAAE~~,OAAO,~~KAAK~~,UAAU,~~QAAQ~~,CAAC;~~AAChG~~,WAAK,QAAQ,IAAI,KAAK,MAAM;AAAA,IAC9B;AACA,WAAO,MAAM,GAAG,KAAK,UAAU,QAAQ,CAAC;AAAA,CAAI;AAAA,EAC9C;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,SAAS,MAAM,KAAK,KAAK,QAAQ,OAAO,CAAC,EAAE;AAAA,MAC/C,CAAC,MAAM,IAAI,QAAc,CAAC,YAAY,EAAE,IAAI,MAAM,QAAQ,CAAC,CAAC;AAAA,IAC9D;AACA,SAAK,QAAQ,MAAM;AACnB,UAAM,QAAQ,IAAI,MAAM;AAAA,EAC1B;AACF;AAGO,SAAS,oBAAoB,SAAiB,UAA2B;AAC9E,SAAO,YAAiB,UAAK,SAAS,eAAe,WAAW;AAClE;","names":[]}
1	+ {"version":3,"sources":["../../src/telemetry/sink-file.ts"],"sourcesContent":["/*\n Node-only file sink. Imports `node:fs` — DO NOT import this from a Worker\n * or edge runtime; use `./sink-fetch` instead.\n /\n\nimport as fs from 'node:fs'\nimport * as path from 'node:path'\nimport type { TelemetryEnvelope } from './schema'\nimport type { TelemetrySink } from './sink-fetch'\n\n/** Append envelopes to a JSONL file, partitioned by repo + date. /\nexport class FileTelemetrySink implements TelemetrySink {\n private streams = new Map<string, fs.WriteStream>()\n\n constructor(private readonly baseDir: string) {\n fs.mkdirSync(baseDir, { recursive: true })\n }\n\n emit(envelope: TelemetryEnvelope): void {\n const date = envelope.timestamp.slice(0, 10) // YYYY-MM-DD\n const repo = envelope.source.repo \|\| 'unknown'\n const key = `${repo}/${date}`\n let stream = this.streams.get(key)\n if (!stream) {\n const dir = path.join(this.baseDir, repo)\n fs.mkdirSync(dir, { recursive: true })\n stream = fs.createWriteStream(path.join(dir, `${date}.jsonl`), {\n flags: 'a',\n encoding: 'utf-8',\n })\n this.streams.set(key, stream)\n }\n stream.write(`${JSON.stringify(envelope)}\\n`)\n }\n\n async close(): Promise<void> {\n const closes = Array.from(this.streams.values()).map(\n (s) => new Promise<void>((resolve) => s.end(() => resolve())),\n )\n this.streams.clear()\n await Promise.all(closes)\n }\n}\n\n/* Default location for local telemetry, mirroring bad CLI's convention. 
*/\nexport function defaultTelemetryDir(homeDir: string, override?: string): string {\n return override \|\| path.join(homeDir, '.agent-eval', 'telemetry')\n}\n"],"mappings":";;;AAKA,YAAY,QAAQ;AACpB,YAAY,UAAU;AAKf,IAAM,oBAAN,MAAiD;AAAA,EAGtD,YAA6B,SAAiB;AAAjB;AAC3B,IAAG,aAAU,SAAS,EAAE,WAAW,KAAK,CAAC;AAAA,EAC3C;AAAA,EAF6B;AAAA,EAFrB,UAAU,oBAAI,IAA4B;AAAA,EAMlD,KAAK,UAAmC;AACtC,UAAM,OAAO,SAAS,UAAU,MAAM,GAAG,EAAE;AAC3C,UAAM,OAAO,SAAS,OAAO,QAAQ;AACrC,UAAM,MAAM,GAAG,IAAI,IAAI,IAAI;AAC3B,QAAI,SAAS,KAAK,QAAQ,IAAI,GAAG;AACjC,QAAI,CAAC,QAAQ;AACX,YAAM,MAAW,UAAK,KAAK,SAAS,IAAI;AACxC,MAAG,aAAU,KAAK,EAAE,WAAW,KAAK,CAAC;AACrC,eAAY,qBAAuB,UAAK,KAAK,GAAG,IAAI,QAAQ,GAAG;AAAA,QAC7D,OAAO;AAAA,QACP,UAAU;AAAA,MACZ,CAAC;AACD,WAAK,QAAQ,IAAI,KAAK,MAAM;AAAA,IAC9B;AACA,WAAO,MAAM,GAAG,KAAK,UAAU,QAAQ,CAAC;AAAA,CAAI;AAAA,EAC9C;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,SAAS,MAAM,KAAK,KAAK,QAAQ,OAAO,CAAC,EAAE;AAAA,MAC/C,CAAC,MAAM,IAAI,QAAc,CAAC,YAAY,EAAE,IAAI,MAAM,QAAQ,CAAC,CAAC;AAAA,IAC9D;AACA,SAAK,QAAQ,MAAM;AACnB,UAAM,QAAQ,IAAI,MAAM;AAAA,EAC1B;AACF;AAGO,SAAS,oBAAoB,SAAiB,UAA2B;AAC9E,SAAO,YAAiB,UAAK,SAAS,eAAe,WAAW;AAClE;","names":[]}

package/dist/telemetry/index.js CHANGED Viewed

@@ -3,62 +3,6 @@ import "../chunk-PZ5AY32C.js";
 // src/telemetry/schema.ts
 var TELEMETRY_SCHEMA_VERSION = 1;
-// src/telemetry/sink-fetch.ts
-var HttpTelemetrySink = class {
-  constructor(endpoint, bearer) {
-    this.endpoint = endpoint;
-    this.bearer = bearer;
-  }
-  endpoint;
-  bearer;
-  inflight = /* @__PURE__ */ new Set();
-  emit(envelope) {
-    const body = JSON.stringify(envelope);
-    const headers = { "content-type": "application/json" };
-    if (this.bearer) headers.authorization = `Bearer ${this.bearer}`;
-    const promise = fetch(this.endpoint, { method: "POST", headers, body }).then(() => void 0).catch(() => void 0);
-    this.inflight.add(promise);
-    promise.finally(() => this.inflight.delete(promise));
-  }
-  async close() {
-    await Promise.allSettled(Array.from(this.inflight));
-  }
-};
-var FanoutTelemetrySink = class {
-  constructor(sinks) {
-    this.sinks = sinks;
-  }
-  sinks;
-  emit(envelope) {
-    for (const sink of this.sinks) {
-      try {
-        const result = sink.emit(envelope);
-        if (result && typeof result.catch === "function") {
-          ;
-          result.catch(() => void 0);
-        }
-      } catch {
-      }
-    }
-  }
-  async close() {
-    await Promise.allSettled(this.sinks.map((s) => Promise.resolve(s.close?.())));
-  }
-};
-var NullTelemetrySink = class {
-  emit() {
-  }
-};
-var InMemoryTelemetrySink = class {
-  envelopes = [];
-  emit(envelope) {
-    this.envelopes.push(envelope);
-  }
-  clear() {
-    this.envelopes.length = 0;
-  }
-};
 // src/telemetry/client.ts
 var TelemetryClient = class {
   constructor(sink, defaultSource) {
@@ -97,7 +41,7 @@ function makeEnvelopeId() {
   if (typeof crypto !== "undefined" && typeof crypto.randomUUID === "function") {
     return crypto.randomUUID();
   }
-  return "env-" + Date.now().toString(36) + "-" + Math.random().toString(36).slice(2, 10);
+  return `env-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`;
 }
 var SECRET_FLAGS = /* @__PURE__ */ new Set(["--api-key", "--bearer", "--token", "--password"]);
 function sanitiseArgv(argv) {
@@ -117,6 +61,62 @@ function sanitiseArgv(argv) {
   }
   return out;
 }
+// src/telemetry/sink-fetch.ts
+var HttpTelemetrySink = class {
+  constructor(endpoint, bearer) {
+    this.endpoint = endpoint;
+    this.bearer = bearer;
+  }
+  endpoint;
+  bearer;
+  inflight = /* @__PURE__ */ new Set();
+  emit(envelope) {
+    const body = JSON.stringify(envelope);
+    const headers = { "content-type": "application/json" };
+    if (this.bearer) headers.authorization = `Bearer ${this.bearer}`;
+    const promise = fetch(this.endpoint, { method: "POST", headers, body }).then(() => void 0).catch(() => void 0);
+    this.inflight.add(promise);
+    promise.finally(() => this.inflight.delete(promise));
+  }
+  async close() {
+    await Promise.allSettled(Array.from(this.inflight));
+  }
+};
+var FanoutTelemetrySink = class {
+  constructor(sinks) {
+    this.sinks = sinks;
+  }
+  sinks;
+  emit(envelope) {
+    for (const sink of this.sinks) {
+      try {
+        const result = sink.emit(envelope);
+        if (result && typeof result.catch === "function") {
+          ;
+          result.catch(() => void 0);
+        }
+      } catch {
+      }
+    }
+  }
+  async close() {
+    await Promise.allSettled(this.sinks.map((s) => Promise.resolve(s.close?.())));
+  }
+};
+var NullTelemetrySink = class {
+  emit() {
+  }
+};
+var InMemoryTelemetrySink = class {
+  envelopes = [];
+  emit(envelope) {
+    this.envelopes.push(envelope);
+  }
+  clear() {
+    this.envelopes.length = 0;
+  }
+};
 export {
   FanoutTelemetrySink,
   HttpTelemetrySink,

package/dist/telemetry/index.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"sources":["../../src/telemetry/schema.ts","../../src/telemetry/sink-fetch.ts","../../src/telemetry/client.ts"],"sourcesContent":["/*\n Fleet telemetry envelope — agent-eval's portable observability shape.\n \n Designed so any consumer (Node CLI, Cloudflare Worker, Lambda, browser\n * extension) can emit structured rows describing one unit of work — a page\n * audit, a tool call, an evolve round, a full agent run — to a central sink.\n \n The schema is intentionally a strict superset of agent-eval's `Run` shape\n * so a future TraceStore adapter can promote envelopes into traces without\n * translation.\n /\n\nexport const TELEMETRY_SCHEMA_VERSION = 1\n\n/* Discriminator for the unit of work this envelope describes. /\nexport type TelemetryKind =\n \| 'agent-run'\n \| 'design-audit-page'\n \| 'design-audit-run'\n \| 'design-evolve-round'\n \| 'design-evolve-run'\n \| 'gepa-trial'\n \| 'gepa-generation'\n \| 'tool-call'\n \| 'judge-verdict'\n \| 'custom'\n\nexport interface TelemetryEnvelope {\n schemaVersion: typeof TELEMETRY_SCHEMA_VERSION\n envelopeId: string\n runId: string\n timestamp: string\n parentRunId?: string\n\n source: TelemetrySource\n model?: TelemetryModel\n kind: TelemetryKind\n ok: boolean\n durationMs: number\n\n data: Record<string, unknown>\n metrics: Record<string, number>\n tags?: Record<string, string>\n\n error?: string\n}\n\nexport interface TelemetrySource {\n /* Repo identity — basename of cwd plus git remote if discoverable. /\n repo: string\n cwd: string\n gitSha?: string\n gitBranch?: string\n cliVersion: string\n /* What was invoked, e.g. `design-audit`, `bad run`, `gepa --target`. /\n invocation: string\n /* Sanitised argv minus secrets. /\n argv?: string[]\n /\n Multi-tenant identity. Set when the consumer runs inside a hosted\n * product so a fleet rollup can group by tenant without leaking customer\n * URLs or PII.\n /\n tenantId?: string\n /* Optional sub-tenant identity (project, suite, walkthrough, customer). 
/\n customerId?: string\n /* SHA-256 (12 hex) of the API key used to authenticate this run, when applicable. /\n apiKeyHash?: string\n}\n\nexport interface TelemetryModel {\n provider: string\n name: string\n /* SHA-256 (12 hex chars) of the prompt(s) used. /\n promptHash?: string\n /* SHA-256 (12 hex chars) of the composed rubric body, if applicable. /\n rubricHash?: string\n}\n","/\n Workers-safe telemetry sinks — only `fetch` and pure JS. No `fs`, no\n * `child_process`. Safe to import from a Cloudflare Worker, Lambda, edge\n * function, or browser extension.\n \n For Node-only file persistence, import from '@tangle-network/agent-eval/telemetry/file'.\n /\n\nimport type { TelemetryEnvelope } from './schema'\n\nexport interface TelemetrySink {\n emit(envelope: TelemetryEnvelope): Promise<void> \| void\n close?(): Promise<void> \| void\n}\n\n/* Best-effort POST to a remote collector. Fire-and-forget; never throws. /\nexport class HttpTelemetrySink implements TelemetrySink {\n private inflight = new Set<Promise<void>>()\n\n constructor(\n private readonly endpoint: string,\n private readonly bearer?: string,\n ) {}\n\n emit(envelope: TelemetryEnvelope): void {\n const body = JSON.stringify(envelope)\n const headers: Record<string, string> = { 'content-type': 'application/json' }\n if (this.bearer) headers.authorization = `Bearer ${this.bearer}`\n const promise = fetch(this.endpoint, { method: 'POST', headers, body })\n .then(() => undefined)\n .catch(() => undefined)\n this.inflight.add(promise)\n promise.finally(() => this.inflight.delete(promise))\n }\n\n async close(): Promise<void> {\n await Promise.allSettled(Array.from(this.inflight))\n }\n}\n\n/* Fanout to multiple sinks — failures in one do not affect others. 
/\nexport class FanoutTelemetrySink implements TelemetrySink {\n constructor(private readonly sinks: TelemetrySink[]) {}\n\n emit(envelope: TelemetryEnvelope): void {\n for (const sink of this.sinks) {\n try {\n const result = sink.emit(envelope)\n if (result && typeof (result as Promise<unknown>).catch === 'function') {\n ;(result as Promise<unknown>).catch(() => undefined)\n }\n } catch {\n // swallow — telemetry must never break a run\n }\n }\n }\n\n async close(): Promise<void> {\n await Promise.allSettled(this.sinks.map((s) => Promise.resolve(s.close?.())))\n }\n}\n\n/* No-op sink — used when telemetry is explicitly disabled. /\nexport class NullTelemetrySink implements TelemetrySink {\n emit(): void {}\n}\n\n/* In-memory sink — useful for tests + downstream adapters. /\nexport class InMemoryTelemetrySink implements TelemetrySink {\n readonly envelopes: TelemetryEnvelope[] = []\n emit(envelope: TelemetryEnvelope): void {\n this.envelopes.push(envelope)\n }\n clear(): void { this.envelopes.length = 0 }\n}\n","/\n Telemetry client — thin wrapper that builds envelopes from `EmitArgs` and\n * delegates to a `TelemetrySink`. Pure logic; no I/O. Use this from any\n * runtime — Workers, Node, browser — and choose the sink accordingly.\n \n For an opinionated singleton with env-var-driven sink wiring (the bad CLI\n * pattern), see `./node-client.ts`.\n /\n\nimport type { TelemetryEnvelope, TelemetryKind, TelemetryModel, TelemetrySource } from './schema'\nimport { TELEMETRY_SCHEMA_VERSION } from './schema'\nimport type { TelemetrySink } from './sink-fetch'\n\nexport interface EmitArgs {\n kind: TelemetryKind\n runId: string\n parentRunId?: string\n ok: boolean\n durationMs: number\n data?: Record<string, unknown>\n metrics?: Record<string, number>\n tags?: Record<string, string>\n model?: TelemetryModel\n error?: string\n /* Override the source for this envelope. Falls back to `defaultSource`. 
/\n source?: TelemetrySource\n}\n\nexport class TelemetryClient {\n constructor(\n private readonly sink: TelemetrySink,\n private readonly defaultSource: TelemetrySource,\n ) {}\n\n emit(args: EmitArgs): void {\n const envelope: TelemetryEnvelope = {\n schemaVersion: TELEMETRY_SCHEMA_VERSION,\n envelopeId: makeEnvelopeId(),\n runId: args.runId,\n timestamp: new Date().toISOString(),\n source: args.source ?? this.defaultSource,\n kind: args.kind,\n ok: args.ok,\n durationMs: args.durationMs,\n data: args.data ?? {},\n metrics: args.metrics ?? {},\n ...(args.parentRunId ? { parentRunId: args.parentRunId } : {}),\n ...(args.model ? { model: args.model } : {}),\n ...(args.tags ? { tags: args.tags } : {}),\n ...(args.error ? { error: args.error } : {}),\n }\n try {\n this.sink.emit(envelope)\n } catch {\n // swallow — telemetry never breaks the calling code path\n }\n }\n\n async close(): Promise<void> {\n await this.sink.close?.()\n }\n}\n\n/* Generate a UUIDv4 with whatever crypto is available (Node, Workers, browsers). /\nfunction makeEnvelopeId(): string {\n if (typeof crypto !== 'undefined' && typeof crypto.randomUUID === 'function') {\n return crypto.randomUUID()\n }\n // Last-resort fallback. Lower entropy but never throws.\n return 'env-' + Date.now().toString(36) + '-' + Math.random().toString(36).slice(2, 10)\n}\n\nexport const SECRET_FLAGS = new Set(['--api-key', '--bearer', '--token', '--password'])\n\n/* Strip likely-secret values from argv, preserving structure. 
/\nexport function sanitiseArgv(argv: string[]): string[] {\n const out: string[] = []\n for (let i = 0; i < argv.length; i++) {\n const a = argv[i]!\n if (SECRET_FLAGS.has(a)) {\n out.push(a, '<redacted>')\n i++\n continue\n }\n if (/^(?:--api-key\|--bearer\|--token\|--password)=/.test(a)) {\n out.push(a.replace(/=.$/, '=<redacted>'))\n continue\n }\n out.push(a)\n }\n return out\n}\n"],"mappings":";;;AAYO,IAAM,2BAA2B;;;ACIjC,IAAM,oBAAN,MAAiD;AAAA,EAGtD,YACmB,UACA,QACjB;AAFiB;AACA;AAAA,EAChB;AAAA,EAFgB;AAAA,EACA;AAAA,EAJX,WAAW,oBAAI,IAAmB;AAAA,EAO1C,KAAK,UAAmC;AACtC,UAAM,OAAO,KAAK,UAAU,QAAQ;AACpC,UAAM,UAAkC,EAAE,gBAAgB,mBAAmB;AAC7E,QAAI,KAAK,OAAQ,SAAQ,gBAAgB,UAAU,KAAK,MAAM;AAC9D,UAAM,UAAU,MAAM,KAAK,UAAU,EAAE,QAAQ,QAAQ,SAAS,KAAK,CAAC,EACnE,KAAK,MAAM,MAAS,EACpB,MAAM,MAAM,MAAS;AACxB,SAAK,SAAS,IAAI,OAAO;AACzB,YAAQ,QAAQ,MAAM,KAAK,SAAS,OAAO,OAAO,CAAC;AAAA,EACrD;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,QAAQ,WAAW,MAAM,KAAK,KAAK,QAAQ,CAAC;AAAA,EACpD;AACF;AAGO,IAAM,sBAAN,MAAmD;AAAA,EACxD,YAA6B,OAAwB;AAAxB;AAAA,EAAyB;AAAA,EAAzB;AAAA,EAE7B,KAAK,UAAmC;AACtC,eAAW,QAAQ,KAAK,OAAO;AAC7B,UAAI;AACF,cAAM,SAAS,KAAK,KAAK,QAAQ;AACjC,YAAI,UAAU,OAAQ,OAA4B,UAAU,YAAY;AACtE;AAAC,UAAC,OAA4B,MAAM,MAAM,MAAS;AAAA,QACrD;AAAA,MACF,QAAQ;AAAA,MAER;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,QAAQ,WAAW,KAAK,MAAM,IAAI,CAAC,MAAM,QAAQ,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAAA,EAC9E;AACF;AAGO,IAAM,oBAAN,MAAiD;AAAA,EACtD,OAAa;AAAA,EAAC;AAChB;AAGO,IAAM,wBAAN,MAAqD;AAAA,EACjD,YAAiC,CAAC;AAAA,EAC3C,KAAK,UAAmC;AACtC,SAAK,UAAU,KAAK,QAAQ;AAAA,EAC9B;AAAA,EACA,QAAc;AAAE,SAAK,UAAU,SAAS;AAAA,EAAE;AAC5C;;;AC9CO,IAAM,kBAAN,MAAsB;AAAA,EAC3B,YACmB,MACA,eACjB;AAFiB;AACA;AAAA,EAChB;AAAA,EAFgB;AAAA,EACA;AAAA,EAGnB,KAAK,MAAsB;AACzB,UAAM,WAA8B;AAAA,MAClC,eAAe;AAAA,MACf,YAAY,eAAe;AAAA,MAC3B,OAAO,KAAK;AAAA,MACZ,YAAW,oBAAI,KAAK,GAAE,YAAY;AAAA,MAClC,QAAQ,KAAK,UAAU,KAAK;AAAA,MAC5B,MAAM,KAAK;AAAA,MACX,IAAI,KAAK;AAAA,MACT,YAAY,KAAK;AAAA,MACjB,MAAM,KAAK,QAAQ,CAAC;AAAA,MACpB,SAAS,KAAK,WAAW,CAAC;AAAA,MAC1B,GAAI,KAAK,cAAc,EAAE,aAAa,KAAK,YAAY,IA
AI,CAAC;AAAA,MAC5D,GAAI,KAAK,QAAQ,EAAE,OAAO,KAAK,MAAM,IAAI,CAAC;AAAA,MAC1C,GAAI,KAAK,OAAO,EAAE,MAAM,KAAK,KAAK,IAAI,CAAC;AAAA,MACvC,GAAI,KAAK,QAAQ,EAAE,OAAO,KAAK,MAAM,IAAI,CAAC;AAAA,IAC5C;AACA,QAAI;AACF,WAAK,KAAK,KAAK,QAAQ;AAAA,IACzB,QAAQ;AAAA,IAER;AAAA,EACF;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,KAAK,KAAK,QAAQ;AAAA,EAC1B;AACF;AAGA,SAAS,iBAAyB;AAChC,MAAI,OAAO,WAAW,eAAe,OAAO,OAAO,eAAe,YAAY;AAC5E,WAAO,OAAO,WAAW;AAAA,EAC3B;AAEA,SAAO,SAAS,KAAK,IAAI,EAAE,SAAS,EAAE,IAAI,MAAM,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,EAAE;AACxF;AAEO,IAAM,eAAe,oBAAI,IAAI,CAAC,aAAa,YAAY,WAAW,YAAY,CAAC;AAG/E,SAAS,aAAa,MAA0B;AACrD,QAAM,MAAgB,CAAC;AACvB,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,IAAI,KAAK,CAAC;AAChB,QAAI,aAAa,IAAI,CAAC,GAAG;AACvB,UAAI,KAAK,GAAG,YAAY;AACxB;AACA;AAAA,IACF;AACA,QAAI,8CAA8C,KAAK,CAAC,GAAG;AACzD,UAAI,KAAK,EAAE,QAAQ,QAAQ,aAAa,CAAC;AACzC;AAAA,IACF;AACA,QAAI,KAAK,CAAC;AAAA,EACZ;AACA,SAAO;AACT;","names":[]}
1	+ {"version":3,"sources":["../../src/telemetry/schema.ts","../../src/telemetry/client.ts","../../src/telemetry/sink-fetch.ts"],"sourcesContent":["/*\n Fleet telemetry envelope — agent-eval's portable observability shape.\n \n Designed so any consumer (Node CLI, Cloudflare Worker, Lambda, browser\n * extension) can emit structured rows describing one unit of work — a page\n * audit, a tool call, an evolve round, a full agent run — to a central sink.\n \n The schema is intentionally a strict superset of agent-eval's `Run` shape\n * so a future TraceStore adapter can promote envelopes into traces without\n * translation.\n /\n\nexport const TELEMETRY_SCHEMA_VERSION = 1\n\n/* Discriminator for the unit of work this envelope describes. /\nexport type TelemetryKind =\n \| 'agent-run'\n \| 'design-audit-page'\n \| 'design-audit-run'\n \| 'design-evolve-round'\n \| 'design-evolve-run'\n \| 'gepa-trial'\n \| 'gepa-generation'\n \| 'tool-call'\n \| 'judge-verdict'\n \| 'custom'\n\nexport interface TelemetryEnvelope {\n schemaVersion: typeof TELEMETRY_SCHEMA_VERSION\n envelopeId: string\n runId: string\n timestamp: string\n parentRunId?: string\n\n source: TelemetrySource\n model?: TelemetryModel\n kind: TelemetryKind\n ok: boolean\n durationMs: number\n\n data: Record<string, unknown>\n metrics: Record<string, number>\n tags?: Record<string, string>\n\n error?: string\n}\n\nexport interface TelemetrySource {\n /* Repo identity — basename of cwd plus git remote if discoverable. /\n repo: string\n cwd: string\n gitSha?: string\n gitBranch?: string\n cliVersion: string\n /* What was invoked, e.g. `design-audit`, `bad run`, `gepa --target`. /\n invocation: string\n /* Sanitised argv minus secrets. /\n argv?: string[]\n /\n Multi-tenant identity. Set when the consumer runs inside a hosted\n * product so a fleet rollup can group by tenant without leaking customer\n * URLs or PII.\n /\n tenantId?: string\n /* Optional sub-tenant identity (project, suite, walkthrough, customer). 
/\n customerId?: string\n /* SHA-256 (12 hex) of the API key used to authenticate this run, when applicable. /\n apiKeyHash?: string\n}\n\nexport interface TelemetryModel {\n provider: string\n name: string\n /* SHA-256 (12 hex chars) of the prompt(s) used. /\n promptHash?: string\n /* SHA-256 (12 hex chars) of the composed rubric body, if applicable. /\n rubricHash?: string\n}\n","/\n Telemetry client — thin wrapper that builds envelopes from `EmitArgs` and\n * delegates to a `TelemetrySink`. Pure logic; no I/O. Use this from any\n * runtime — Workers, Node, browser — and choose the sink accordingly.\n \n For an opinionated singleton with env-var-driven sink wiring (the bad CLI\n * pattern), see `./node-client.ts`.\n /\n\nimport type { TelemetryEnvelope, TelemetryKind, TelemetryModel, TelemetrySource } from './schema'\nimport { TELEMETRY_SCHEMA_VERSION } from './schema'\nimport type { TelemetrySink } from './sink-fetch'\n\nexport interface EmitArgs {\n kind: TelemetryKind\n runId: string\n parentRunId?: string\n ok: boolean\n durationMs: number\n data?: Record<string, unknown>\n metrics?: Record<string, number>\n tags?: Record<string, string>\n model?: TelemetryModel\n error?: string\n /* Override the source for this envelope. Falls back to `defaultSource`. /\n source?: TelemetrySource\n}\n\nexport class TelemetryClient {\n constructor(\n private readonly sink: TelemetrySink,\n private readonly defaultSource: TelemetrySource,\n ) {}\n\n emit(args: EmitArgs): void {\n const envelope: TelemetryEnvelope = {\n schemaVersion: TELEMETRY_SCHEMA_VERSION,\n envelopeId: makeEnvelopeId(),\n runId: args.runId,\n timestamp: new Date().toISOString(),\n source: args.source ?? this.defaultSource,\n kind: args.kind,\n ok: args.ok,\n durationMs: args.durationMs,\n data: args.data ?? {},\n metrics: args.metrics ?? {},\n ...(args.parentRunId ? { parentRunId: args.parentRunId } : {}),\n ...(args.model ? { model: args.model } : {}),\n ...(args.tags ? 
{ tags: args.tags } : {}),\n ...(args.error ? { error: args.error } : {}),\n }\n try {\n this.sink.emit(envelope)\n } catch {\n // swallow — telemetry never breaks the calling code path\n }\n }\n\n async close(): Promise<void> {\n await this.sink.close?.()\n }\n}\n\n/* Generate a UUIDv4 with whatever crypto is available (Node, Workers, browsers). /\nfunction makeEnvelopeId(): string {\n if (typeof crypto !== 'undefined' && typeof crypto.randomUUID === 'function') {\n return crypto.randomUUID()\n }\n // Last-resort fallback. Lower entropy but never throws.\n return `env-${Date.now().toString(36)}-${Math.random().toString(36).slice(2, 10)}`\n}\n\nexport const SECRET_FLAGS = new Set(['--api-key', '--bearer', '--token', '--password'])\n\n/* Strip likely-secret values from argv, preserving structure. /\nexport function sanitiseArgv(argv: string[]): string[] {\n const out: string[] = []\n for (let i = 0; i < argv.length; i++) {\n const a = argv[i]!\n if (SECRET_FLAGS.has(a)) {\n out.push(a, '<redacted>')\n i++\n continue\n }\n if (/^(?:--api-key\|--bearer\|--token\|--password)=/.test(a)) {\n out.push(a.replace(/=.$/, '=<redacted>'))\n continue\n }\n out.push(a)\n }\n return out\n}\n","/*\n Workers-safe telemetry sinks — only `fetch` and pure JS. No `fs`, no\n * `child_process`. Safe to import from a Cloudflare Worker, Lambda, edge\n * function, or browser extension.\n \n For Node-only file persistence, import from '@tangle-network/agent-eval/telemetry/file'.\n /\n\nimport type { TelemetryEnvelope } from './schema'\n\nexport interface TelemetrySink {\n emit(envelope: TelemetryEnvelope): Promise<void> \| void\n close?(): Promise<void> \| void\n}\n\n/* Best-effort POST to a remote collector. Fire-and-forget; never throws. 
/\nexport class HttpTelemetrySink implements TelemetrySink {\n private inflight = new Set<Promise<void>>()\n\n constructor(\n private readonly endpoint: string,\n private readonly bearer?: string,\n ) {}\n\n emit(envelope: TelemetryEnvelope): void {\n const body = JSON.stringify(envelope)\n const headers: Record<string, string> = { 'content-type': 'application/json' }\n if (this.bearer) headers.authorization = `Bearer ${this.bearer}`\n const promise = fetch(this.endpoint, { method: 'POST', headers, body })\n .then(() => undefined)\n .catch(() => undefined)\n this.inflight.add(promise)\n promise.finally(() => this.inflight.delete(promise))\n }\n\n async close(): Promise<void> {\n await Promise.allSettled(Array.from(this.inflight))\n }\n}\n\n/* Fanout to multiple sinks — failures in one do not affect others. /\nexport class FanoutTelemetrySink implements TelemetrySink {\n constructor(private readonly sinks: TelemetrySink[]) {}\n\n emit(envelope: TelemetryEnvelope): void {\n for (const sink of this.sinks) {\n try {\n const result = sink.emit(envelope)\n if (result && typeof (result as Promise<unknown>).catch === 'function') {\n ;(result as Promise<unknown>).catch(() => undefined)\n }\n } catch {\n // swallow — telemetry must never break a run\n }\n }\n }\n\n async close(): Promise<void> {\n await Promise.allSettled(this.sinks.map((s) => Promise.resolve(s.close?.())))\n }\n}\n\n/* No-op sink — used when telemetry is explicitly disabled. /\nexport class NullTelemetrySink implements TelemetrySink {\n emit(): void {}\n}\n\n/* In-memory sink — useful for tests + downstream adapters. 
*/\nexport class InMemoryTelemetrySink implements TelemetrySink {\n readonly envelopes: TelemetryEnvelope[] = []\n emit(envelope: TelemetryEnvelope): void {\n this.envelopes.push(envelope)\n }\n clear(): void {\n this.envelopes.length = 0\n }\n}\n"],"mappings":";;;AAYO,IAAM,2BAA2B;;;ACgBjC,IAAM,kBAAN,MAAsB;AAAA,EAC3B,YACmB,MACA,eACjB;AAFiB;AACA;AAAA,EAChB;AAAA,EAFgB;AAAA,EACA;AAAA,EAGnB,KAAK,MAAsB;AACzB,UAAM,WAA8B;AAAA,MAClC,eAAe;AAAA,MACf,YAAY,eAAe;AAAA,MAC3B,OAAO,KAAK;AAAA,MACZ,YAAW,oBAAI,KAAK,GAAE,YAAY;AAAA,MAClC,QAAQ,KAAK,UAAU,KAAK;AAAA,MAC5B,MAAM,KAAK;AAAA,MACX,IAAI,KAAK;AAAA,MACT,YAAY,KAAK;AAAA,MACjB,MAAM,KAAK,QAAQ,CAAC;AAAA,MACpB,SAAS,KAAK,WAAW,CAAC;AAAA,MAC1B,GAAI,KAAK,cAAc,EAAE,aAAa,KAAK,YAAY,IAAI,CAAC;AAAA,MAC5D,GAAI,KAAK,QAAQ,EAAE,OAAO,KAAK,MAAM,IAAI,CAAC;AAAA,MAC1C,GAAI,KAAK,OAAO,EAAE,MAAM,KAAK,KAAK,IAAI,CAAC;AAAA,MACvC,GAAI,KAAK,QAAQ,EAAE,OAAO,KAAK,MAAM,IAAI,CAAC;AAAA,IAC5C;AACA,QAAI;AACF,WAAK,KAAK,KAAK,QAAQ;AAAA,IACzB,QAAQ;AAAA,IAER;AAAA,EACF;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,KAAK,KAAK,QAAQ;AAAA,EAC1B;AACF;AAGA,SAAS,iBAAyB;AAChC,MAAI,OAAO,WAAW,eAAe,OAAO,OAAO,eAAe,YAAY;AAC5E,WAAO,OAAO,WAAW;AAAA,EAC3B;AAEA,SAAO,OAAO,KAAK,IAAI,EAAE,SAAS,EAAE,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,GAAG,EAAE,CAAC;AAClF;AAEO,IAAM,eAAe,oBAAI,IAAI,CAAC,aAAa,YAAY,WAAW,YAAY,CAAC;AAG/E,SAAS,aAAa,MAA0B;AACrD,QAAM,MAAgB,CAAC;AACvB,WAAS,IAAI,GAAG,IAAI,KAAK,QAAQ,KAAK;AACpC,UAAM,IAAI,KAAK,CAAC;AAChB,QAAI,aAAa,IAAI,CAAC,GAAG;AACvB,UAAI,KAAK,GAAG,YAAY;AACxB;AACA;AAAA,IACF;AACA,QAAI,8CAA8C,KAAK,CAAC,GAAG;AACzD,UAAI,KAAK,EAAE,QAAQ,QAAQ,aAAa,CAAC;AACzC;AAAA,IACF;AACA,QAAI,KAAK,CAAC;AAAA,EACZ;AACA,SAAO;AACT;;;AC3EO,IAAM,oBAAN,MAAiD;AAAA,EAGtD,YACmB,UACA,QACjB;AAFiB;AACA;AAAA,EAChB;AAAA,EAFgB;AAAA,EACA;AAAA,EAJX,WAAW,oBAAI,IAAmB;AAAA,EAO1C,KAAK,UAAmC;AACtC,UAAM,OAAO,KAAK,UAAU,QAAQ;AACpC,UAAM,UAAkC,EAAE,gBAAgB,mBAAmB;AAC7E,QAAI,KAAK,OAAQ,SAAQ,gBAAgB,UAAU,KAAK,MAAM;AAC9D,UAAM,UAAU,MAAM,KAAK,UAAU,EAAE,QAAQ,QAAQ,SAAS,KAAK,CAAC,EACnE,KAAK,MAAM,MAAS,EACpB,MAAM,MAAM,MAAS;AACxB,SAAK,SAAS,IAAI,OA
AO;AACzB,YAAQ,QAAQ,MAAM,KAAK,SAAS,OAAO,OAAO,CAAC;AAAA,EACrD;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,QAAQ,WAAW,MAAM,KAAK,KAAK,QAAQ,CAAC;AAAA,EACpD;AACF;AAGO,IAAM,sBAAN,MAAmD;AAAA,EACxD,YAA6B,OAAwB;AAAxB;AAAA,EAAyB;AAAA,EAAzB;AAAA,EAE7B,KAAK,UAAmC;AACtC,eAAW,QAAQ,KAAK,OAAO;AAC7B,UAAI;AACF,cAAM,SAAS,KAAK,KAAK,QAAQ;AACjC,YAAI,UAAU,OAAQ,OAA4B,UAAU,YAAY;AACtE;AAAC,UAAC,OAA4B,MAAM,MAAM,MAAS;AAAA,QACrD;AAAA,MACF,QAAQ;AAAA,MAER;AAAA,IACF;AAAA,EACF;AAAA,EAEA,MAAM,QAAuB;AAC3B,UAAM,QAAQ,WAAW,KAAK,MAAM,IAAI,CAAC,MAAM,QAAQ,QAAQ,EAAE,QAAQ,CAAC,CAAC,CAAC;AAAA,EAC9E;AACF;AAGO,IAAM,oBAAN,MAAiD;AAAA,EACtD,OAAa;AAAA,EAAC;AAChB;AAGO,IAAM,wBAAN,MAAqD;AAAA,EACjD,YAAiC,CAAC;AAAA,EAC3C,KAAK,UAAmC;AACtC,SAAK,UAAU,KAAK,QAAQ;AAAA,EAC9B;AAAA,EACA,QAAc;AACZ,SAAK,UAAU,SAAS;AAAA,EAC1B;AACF;","names":[]}