npm - @tangle-network/agent-eval - Versions diffs - 0.52.0 → 0.54.0 - Mend

@tangle-network/agent-eval 0.52.0 → 0.54.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68) hide show

package/CHANGELOG.md +23 -0
package/dist/adapters/http.d.ts +1 -1
package/dist/adapters/langchain.d.ts +1 -1
package/dist/adapters/otel.d.ts +7 -6
package/dist/{baseline-4R5deP0N.d.ts → baseline-DE36-Np7.d.ts} +1 -1
package/dist/benchmarks/index.d.ts +3 -2
package/dist/builder-eval/index.d.ts +4 -3
package/dist/campaign/index.d.ts +9 -7
package/dist/campaign/index.js +33 -4
package/dist/campaign/index.js.map +1 -1
package/dist/{chunk-L7XMNXLO.js → chunk-J4DIMSRK.js} +2 -2
package/dist/{chunk-BWZEGTES.js → chunk-NCK5QLGT.js} +1 -1
package/dist/chunk-NCK5QLGT.js.map +1 -0
package/dist/{chunk-5KSDYBYH.js → chunk-YXTT6GSZ.js} +2 -2
package/dist/contract/index.d.ts +25 -12
package/dist/contract/index.js +171 -0
package/dist/contract/index.js.map +1 -1
package/dist/{control-ojEWkMfJ.d.ts → control-DjEgwWNo.d.ts} +6 -5
package/dist/{control-runtime-BZ_lVLYW.d.ts → control-runtime-DuFBYg7A.d.ts} +3 -2
package/dist/control.d.ts +7 -6
package/dist/control.js +2 -2
package/dist/{emitter-DP_cSSiw.d.ts → emitter-DEZwY14K.d.ts} +2 -1
package/dist/{failure-cluster-Cw65_5FY.d.ts → failure-cluster-CL7IVgkJ.d.ts} +2 -1
package/dist/{feedback-trajectory-BSxqEpu7.d.ts → feedback-trajectory-DpUmE90J.d.ts} +1 -1
package/dist/governance/index.d.ts +3 -2
package/dist/hosted/index.d.ts +7 -6
package/dist/{index-DQHtWQ57.d.ts → index-D2nT6_KT.d.ts} +66 -2
package/dist/{index-0pu_fBwZ.d.ts → index-wlaiph9Y.d.ts} +1 -1
package/dist/index.d.ts +31 -29
package/dist/index.js +3 -3
package/dist/{integrity-CTDhR1Sg.d.ts → integrity-CfXjSqEv.d.ts} +1 -1
package/dist/knowledge/index.d.ts +4 -3
package/dist/meta-eval/index.d.ts +4 -3
package/dist/openapi.json +1 -1
package/dist/pipelines/index.d.ts +7 -6
package/dist/prm/index.d.ts +5 -4
package/dist/{query-DODUYdPg.d.ts → query-CqTxMwDw.d.ts} +2 -1
package/dist/{red-team-30II1T4o.d.ts → red-team-CrC5MZYd.d.ts} +1 -1
package/dist/{registry-8KAs18kY.d.ts → registry-BSWy0rvH.d.ts} +1 -1
package/dist/{release-report-DSu0DWy8.d.ts → release-report-B6l5fi7T.d.ts} +2 -2
package/dist/reporting.d.ts +7 -6
package/dist/{researcher-LZD0qHEa.d.ts → researcher-D4AZjxNa.d.ts} +5 -5
package/dist/rl.d.ts +11 -10
package/dist/rl.js +2 -2
package/dist/{rubric-D5tjHNJQ.d.ts → rubric-BOfxn4ja.d.ts} +3 -2
package/dist/{rubric-predictive-validity-ByZEC3BX.d.ts → rubric-predictive-validity-B3qNa4aY.d.ts} +1 -1
package/dist/{run-improvement-loop-Cc7oZlRP.d.ts → run-improvement-loop-BhfdjrMY.d.ts} +3 -3
package/dist/{run-record-BGY6bHRh.d.ts → run-record-etiCMsUq.d.ts} +11 -3
package/dist/{store-Db2Bv8Cf.d.ts → schema-m0gsnbt3.d.ts} +1 -99
package/dist/store-CKUAgsJz.d.ts +101 -0
package/dist/{summary-report-B7gNRX-r.d.ts → summary-report-DLxh4yWk.d.ts} +2 -2
package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BdVaPyHT.d.ts} +3 -2
package/dist/traces.d.ts +7 -6
package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-GEdXJCL5.d.ts} +2 -1
package/dist/{types-Dbj5gu8n.d.ts → types-BgrxOJSf.d.ts} +31 -1
package/dist/wire/index.d.ts +5 -4
package/docs/design/self-improvement-protocol.md +223 -0
package/docs/pilot/README.md +62 -0
package/docs/pilot/customer-checklist.md +90 -0
package/docs/pilot/integration-foreign-stack.md +296 -0
package/docs/pilot/integration-tangle-stack.md +248 -0
package/docs/pilot/one-pager.md +161 -0
package/docs/pilot/sample-insight-report.json +172 -0
package/docs/research/research-roadmap.md +204 -0
package/package.json +1 -1
package/dist/chunk-BWZEGTES.js.map +0 -1
/package/dist/{chunk-L7XMNXLO.js.map → chunk-J4DIMSRK.js.map} +0 -0
/package/dist/{chunk-5KSDYBYH.js.map → chunk-YXTT6GSZ.js.map} +0 -0

package/dist/{store-Db2Bv8Cf.d.ts → schema-m0gsnbt3.d.ts} RENAMED Viewed

@@ -196,102 +196,4 @@ declare function isRetrievalSpan(s: Span): s is RetrievalSpan;
 declare function isJudgeSpan(s: Span): s is JudgeSpan;
 declare function isSandboxSpan(s: Span): s is SandboxSpan;
-interface RunFilter {
-    scenarioId?: string;
-    variantId?: string;
-    status?: RunStatus;
-    since?: number;
-    until?: number;
-    tag?: {
-        key: string;
-        value: string;
-    };
-    parentRunId?: string;
-    projectId?: string;
-    chatId?: string;
-    layer?: RunLayer;
-}
-interface SpanFilter {
-    runId?: string;
-    parentSpanId?: string;
-    kind?: SpanKind;
-    name?: string;
-    toolName?: string;
-    judgeId?: string;
-    since?: number;
-    until?: number;
-}
-interface EventFilter {
-    runId?: string;
-    spanId?: string;
-    kind?: EventKind;
-    since?: number;
-    until?: number;
-}
-interface TraceStore {
-    appendRun(run: Run): Promise<void>;
-    updateRun(runId: string, patch: Partial<Run>): Promise<void>;
-    appendSpan(span: Span): Promise<void>;
-    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
-    appendEvent(event: TraceEvent): Promise<void>;
-    appendArtifact(artifact: Artifact): Promise<void>;
-    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
-    getRun(runId: string): Promise<Run | undefined>;
-    listRuns(filter?: RunFilter): Promise<Run[]>;
-    spans(filter?: SpanFilter): Promise<Span[]>;
-    events(filter?: EventFilter): Promise<TraceEvent[]>;
-    budget(runId: string): Promise<BudgetLedgerEntry[]>;
-    artifacts(runId: string): Promise<Artifact[]>;
-}
-declare class InMemoryTraceStore implements TraceStore {
-    private runs;
-    private allSpans;
-    private allEvents;
-    private allArtifacts;
-    private allBudget;
-    appendRun(run: Run): Promise<void>;
-    updateRun(runId: string, patch: Partial<Run>): Promise<void>;
-    appendSpan(span: Span): Promise<void>;
-    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
-    appendEvent(event: TraceEvent): Promise<void>;
-    appendArtifact(artifact: Artifact): Promise<void>;
-    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
-    getRun(runId: string): Promise<Run | undefined>;
-    listRuns(filter?: RunFilter): Promise<Run[]>;
-    spans(filter?: SpanFilter): Promise<Span[]>;
-    events(filter?: EventFilter): Promise<TraceEvent[]>;
-    budget(runId: string): Promise<BudgetLedgerEntry[]>;
-    artifacts(runId: string): Promise<Artifact[]>;
-}
-interface FileSystemTraceStoreOptions {
-    dir: string;
-    /** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
-    maxBytes?: number;
-}
-declare class FileSystemTraceStore implements TraceStore {
-    private dir;
-    private maxBytes;
-    /** Lazy in-memory index for queries — populated on first read. */
-    private index?;
-    private loaded;
-    constructor(options: FileSystemTraceStoreOptions);
-    private ensureDir;
-    private append;
-    private insertInto;
-    private load;
-    appendRun(run: Run): Promise<void>;
-    updateRun(runId: string, patch: Partial<Run>): Promise<void>;
-    appendSpan(span: Span): Promise<void>;
-    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
-    appendEvent(event: TraceEvent): Promise<void>;
-    appendArtifact(artifact: Artifact): Promise<void>;
-    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
-    getRun(runId: string): Promise<Run | undefined>;
-    listRuns(filter?: RunFilter): Promise<Run[]>;
-    spans(filter?: SpanFilter): Promise<Span[]>;
-    events(filter?: EventFilter): Promise<TraceEvent[]>;
-    budget(runId: string): Promise<BudgetLedgerEntry[]>;
-    artifacts(runId: string): Promise<Artifact[]>;
-}
-export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type TraceStore as T, type ToolSpan as a, type TraceEvent as b, type RunOutcome as c, type SpanKind as d, type RetrievalSpan as e, type SandboxSpan as f, type BudgetSpec as g, type RunFilter as h, type EventFilter as i, FAILURE_CLASSES as j, FileSystemTraceStore as k, type FileSystemTraceStoreOptions as l, type RunLayer as m, type RunStatus as n, type SpanBase as o, type SpanFilter as p, type SpanStatus as q, TRACE_SCHEMA_VERSION as r, isJudgeSpan as s, isLlmSpan as t, isRetrievalSpan as u, isSandboxSpan as v, isToolSpan as w };
+export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, type JudgeSpan as J, type LlmSpan as L, type Message as M, type Run as R, type Span as S, type ToolSpan as T, type TraceEvent as a, type RunOutcome as b, type SpanKind as c, type RetrievalSpan as d, type SandboxSpan as e, type RunStatus as f, type RunLayer as g, type BudgetSpec as h, FAILURE_CLASSES as i, type SpanBase as j, type SpanStatus as k, TRACE_SCHEMA_VERSION as l, isJudgeSpan as m, isLlmSpan as n, isRetrievalSpan as o, isSandboxSpan as p, isToolSpan as q };

package/dist/store-CKUAgsJz.d.ts ADDED Viewed

@@ -0,0 +1,101 @@
+import { R as Run, S as Span, a as TraceEvent, A as Artifact, B as BudgetLedgerEntry, f as RunStatus, g as RunLayer, c as SpanKind, E as EventKind } from './schema-m0gsnbt3.js';
+interface RunFilter {
+    scenarioId?: string;
+    variantId?: string;
+    status?: RunStatus;
+    since?: number;
+    until?: number;
+    tag?: {
+        key: string;
+        value: string;
+    };
+    parentRunId?: string;
+    projectId?: string;
+    chatId?: string;
+    layer?: RunLayer;
+}
+interface SpanFilter {
+    runId?: string;
+    parentSpanId?: string;
+    kind?: SpanKind;
+    name?: string;
+    toolName?: string;
+    judgeId?: string;
+    since?: number;
+    until?: number;
+}
+interface EventFilter {
+    runId?: string;
+    spanId?: string;
+    kind?: EventKind;
+    since?: number;
+    until?: number;
+}
+interface TraceStore {
+    appendRun(run: Run): Promise<void>;
+    updateRun(runId: string, patch: Partial<Run>): Promise<void>;
+    appendSpan(span: Span): Promise<void>;
+    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
+    appendEvent(event: TraceEvent): Promise<void>;
+    appendArtifact(artifact: Artifact): Promise<void>;
+    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
+    getRun(runId: string): Promise<Run | undefined>;
+    listRuns(filter?: RunFilter): Promise<Run[]>;
+    spans(filter?: SpanFilter): Promise<Span[]>;
+    events(filter?: EventFilter): Promise<TraceEvent[]>;
+    budget(runId: string): Promise<BudgetLedgerEntry[]>;
+    artifacts(runId: string): Promise<Artifact[]>;
+}
+declare class InMemoryTraceStore implements TraceStore {
+    private runs;
+    private allSpans;
+    private allEvents;
+    private allArtifacts;
+    private allBudget;
+    appendRun(run: Run): Promise<void>;
+    updateRun(runId: string, patch: Partial<Run>): Promise<void>;
+    appendSpan(span: Span): Promise<void>;
+    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
+    appendEvent(event: TraceEvent): Promise<void>;
+    appendArtifact(artifact: Artifact): Promise<void>;
+    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
+    getRun(runId: string): Promise<Run | undefined>;
+    listRuns(filter?: RunFilter): Promise<Run[]>;
+    spans(filter?: SpanFilter): Promise<Span[]>;
+    events(filter?: EventFilter): Promise<TraceEvent[]>;
+    budget(runId: string): Promise<BudgetLedgerEntry[]>;
+    artifacts(runId: string): Promise<Artifact[]>;
+}
+interface FileSystemTraceStoreOptions {
+    dir: string;
+    /** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
+    maxBytes?: number;
+}
+declare class FileSystemTraceStore implements TraceStore {
+    private dir;
+    private maxBytes;
+    /** Lazy in-memory index for queries — populated on first read. */
+    private index?;
+    private loaded;
+    constructor(options: FileSystemTraceStoreOptions);
+    private ensureDir;
+    private append;
+    private insertInto;
+    private load;
+    appendRun(run: Run): Promise<void>;
+    updateRun(runId: string, patch: Partial<Run>): Promise<void>;
+    appendSpan(span: Span): Promise<void>;
+    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
+    appendEvent(event: TraceEvent): Promise<void>;
+    appendArtifact(artifact: Artifact): Promise<void>;
+    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
+    getRun(runId: string): Promise<Run | undefined>;
+    listRuns(filter?: RunFilter): Promise<Run[]>;
+    spans(filter?: SpanFilter): Promise<Span[]>;
+    events(filter?: EventFilter): Promise<TraceEvent[]>;
+    budget(runId: string): Promise<BudgetLedgerEntry[]>;
+    artifacts(runId: string): Promise<Artifact[]>;
+}
+export { type EventFilter as E, FileSystemTraceStore as F, InMemoryTraceStore as I, type RunFilter as R, type SpanFilter as S, type TraceStore as T, type FileSystemTraceStoreOptions as a };

package/dist/{summary-report-B7gNRX-r.d.ts → summary-report-DLxh4yWk.d.ts} RENAMED Viewed

@@ -1,5 +1,5 @@
-import { R as RunRecord } from './run-record-BGY6bHRh.js';
-import { F as FailureClusterReport } from './failure-cluster-Cw65_5FY.js';
+import { R as RunRecord } from './run-record-etiCMsUq.js';
+import { F as FailureClusterReport } from './failure-cluster-CL7IVgkJ.js';
 /**
  * HeldOutGate — first-class held-out paired-delta promotion gate.

package/dist/{test-graded-scenario-B2kWEdh9.d.ts → test-graded-scenario-BdVaPyHT.d.ts} RENAMED Viewed

@@ -1,5 +1,6 @@
-import { T as TraceEmitter } from './emitter-DP_cSSiw.js';
-import { R as Run, F as FailureClass, T as TraceStore } from './store-Db2Bv8Cf.js';
+import { T as TraceEmitter } from './emitter-DEZwY14K.js';
+import { R as Run, F as FailureClass } from './schema-m0gsnbt3.js';
+import { T as TraceStore } from './store-CKUAgsJz.js';
 /**
  * SandboxHarness — executes a scenario in an isolated environment and

package/dist/traces.d.ts CHANGED Viewed

@@ -1,12 +1,13 @@
 import { N as NotFoundError, R as ReplayError } from './errors-mje_cKOs.js';
 import { R as RawProviderSink, d as RawProviderEvent } from './raw-provider-sink-C46HDghv.js';
 export { F as FileSystemRawProviderSink, a as FileSystemRawProviderSinkOptions, I as InMemoryRawProviderSink, b as InMemoryRawProviderSinkOptions, N as NoopRawProviderSink, P as ProviderRedactor, c as RawProviderDirection, e as RawProviderSinkFilter, f as defaultProviderRedactor, p as providerFromBaseUrl } from './raw-provider-sink-C46HDghv.js';
-import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DP_cSSiw.js';
-export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DP_cSSiw.js';
-export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CTDhR1Sg.js';
-import { T as TraceStore } from './store-Db2Bv8Cf.js';
-export { A as Artifact, B as BudgetLedgerEntry, g as BudgetSpec, i as EventFilter, E as EventKind, j as FAILURE_CLASSES, F as FailureClass, k as FileSystemTraceStore, l as FileSystemTraceStoreOptions, G as GenericSpan, I as InMemoryTraceStore, J as JudgeSpan, L as LlmSpan, M as Message, e as RetrievalSpan, R as Run, h as RunFilter, m as RunLayer, c as RunOutcome, n as RunStatus, f as SandboxSpan, S as Span, o as SpanBase, p as SpanFilter, d as SpanKind, q as SpanStatus, r as TRACE_SCHEMA_VERSION, a as ToolSpan, b as TraceEvent, s as isJudgeSpan, t as isLlmSpan, u as isRetrievalSpan, v as isSandboxSpan, w as isToolSpan } from './store-Db2Bv8Cf.js';
-export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-DODUYdPg.js';
+import { R as RunCompleteHook, a as RunCompleteHookContext } from './emitter-DEZwY14K.js';
+export { S as SpanHandle, T as TraceEmitter, b as TraceEmitterOptions, l as llmSpanFromProvider } from './emitter-DEZwY14K.js';
+export { b as RunIntegrityError, R as RunIntegrityExpectations, c as RunIntegrityIssue, d as RunIntegrityIssueCode, a as RunIntegrityReport, e as assertRunCaptured, t as throwIfRunIncomplete } from './integrity-CfXjSqEv.js';
+import { T as TraceStore } from './store-CKUAgsJz.js';
+export { E as EventFilter, F as FileSystemTraceStore, a as FileSystemTraceStoreOptions, I as InMemoryTraceStore, R as RunFilter, S as SpanFilter } from './store-CKUAgsJz.js';
+export { a as aggregateLlm, b as argHash, g as groupBy, j as judgeSpans, l as llmSpans, r as runFailureClass, c as runsForScenario, t as toolSpans } from './query-CqTxMwDw.js';
+export { A as Artifact, B as BudgetLedgerEntry, h as BudgetSpec, E as EventKind, i as FAILURE_CLASSES, F as FailureClass, G as GenericSpan, J as JudgeSpan, L as LlmSpan, M as Message, d as RetrievalSpan, R as Run, g as RunLayer, b as RunOutcome, f as RunStatus, e as SandboxSpan, S as Span, j as SpanBase, c as SpanKind, k as SpanStatus, l as TRACE_SCHEMA_VERSION, T as ToolSpan, a as TraceEvent, m as isJudgeSpan, n as isLlmSpan, o as isRetrievalSpan, p as isSandboxSpan, q as isToolSpan } from './schema-m0gsnbt3.js';
 import { AxAIService, AxFunction } from '@ax-llm/ax';
 import { T as TraceAnalysisStore, f as TraceAnalystFilters, a as DatasetOverview, Q as QueryTracesPage, l as ViewTraceResult, V as ViewSpansResult, b as SearchTraceResult, S as SearchSpanResult } from './store-CJbzDxZ2.js';
 export { D as DEFAULT_TRACE_ANALYST_BUDGETS, c as SpanMatchRecord, d as TRACE_ANALYST_TRUNCATION_MARKER_PREFIX, e as TraceAnalystByteBudgets, g as TraceAnalystSpan, h as TraceAnalystSpanKind, i as TraceAnalystSpanStatus, j as TraceAnalystTraceSummary, k as ViewTraceOversized } from './store-CJbzDxZ2.js';

package/dist/{trajectory-CnoBo-JY.d.ts → trajectory-GEdXJCL5.d.ts} RENAMED Viewed

@@ -1,4 +1,5 @@
-import { S as Span, b as TraceEvent, T as TraceStore } from './store-Db2Bv8Cf.js';
+import { S as Span, a as TraceEvent } from './schema-m0gsnbt3.js';
+import { T as TraceStore } from './store-CKUAgsJz.js';
 /**
  * Trajectory — ordered, structured view over a run's spans.

package/dist/{types-Dbj5gu8n.d.ts → types-BgrxOJSf.d.ts} RENAMED Viewed

@@ -248,6 +248,23 @@ interface CampaignCostMeter {
  *  training scenarios unless explicitly opted in). */
 type LabeledScenarioSource = 'production-trace' | 'eval-run' | 'manual' | 'red-team' | 'synthetic';
 type RedactionStatus = 'raw' | 'redacted-pii' | 'redacted-secrets' | 'fully-redacted';
+/** How much a label can be trusted to evaluate against — the gold-admission
+ *  gate. Strictly ordered: a record qualifies for a `minTrust` filter when its
+ *  trust rank is >= the requested rank.
+ *
+ *   - `unverified`      — label is a heuristic (e.g. raw outcome success/fail).
+ *                          Fine as corpus; MUST NOT enter a gold set that lift
+ *                          numbers are computed against.
+ *   - `verified-signal` — an external signal confirmed the outcome (PR merged,
+ *                          tests green, user did not retry, downstream check).
+ *   - `human-rated`     — a human explicitly rated or corrected the artifact.
+ *
+ *  Absent on a write ⇒ treated as `unverified` (fail-closed: a writer must
+ *  explicitly assert trust to make a record gold-eligible — it never happens
+ *  by accident). */
+type LabelTrust = 'unverified' | 'verified-signal' | 'human-rated';
+/** Ordinal rank for a label-trust tier; absent ⇒ `unverified` (rank 0). */
+declare function labelTrustRank(trust: LabelTrust | undefined): number;
 /** @experimental Required-provenance write. The store rejects writes that
  *  lack provenance — a default-on flywheel without provenance is the
  *  data-poisoning vector flagged in the alignment review. */
@@ -259,6 +276,11 @@ interface LabeledScenarioWrite<TScenario extends Scenario = Scenario, TArtifact
     sourceVersionHash: string;
     capturedAt: string;
     redactionStatus: RedactionStatus;
+    /** Gold-admission trust tier. Absent ⇒ `unverified` (fail-closed): the
+     *  record is corpus, never gold. A writer must explicitly assert
+     *  `verified-signal` or `human-rated` to make it eligible for a gold
+     *  sample. See {@link LabelTrust}. */
+    labelTrust?: LabelTrust;
     /** Optional per-source rate-limit bucket key (e.g., the tenant id). */
     rateLimitBucket?: string;
 }
@@ -282,6 +304,11 @@ interface LabeledScenarioSampleArgs {
         source?: LabeledScenarioSource | LabeledScenarioSource[];
         minComposite?: number;
         maxComposite?: number;
+        /** Gold gate: only records whose trust rank is >= this tier are
+         *  returned. `sample({ split: 'test', minTrust: 'verified-signal' })` is
+         *  the canonical "give me the gold set" call. Absent ⇒ no trust gate
+         *  (corpus-level read). */
+        minTrust?: LabelTrust;
     };
 }
 interface LabeledScenarioStore {
@@ -291,6 +318,9 @@ interface LabeledScenarioStore {
         train: number;
         test: number;
         bySource: Record<string, number>;
+        /** Count by trust tier — tells the flywheel how much gold it has
+         *  accumulated vs. raw corpus. */
+        byTrust: Record<LabelTrust, number>;
     }>;
 }
 interface CampaignCellResult<TArtifact> {
@@ -372,4 +402,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
     scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
 }
-export type { CodeSurface as C, DispatchFn as D, Gate as G, ImprovementDriver as I, JudgeScore as J, LabeledScenarioStore as L, MutableSurface as M, OptimizerConfig as O, ProposeContext as P, RedactionStatus as R, Scenario as S, TraceSpan as T, JudgeConfig as a, DispatchContext as b, LabeledScenarioWrite as c, LabeledScenarioSampleArgs as d, LabeledScenarioRecord as e, CampaignAggregates as f, CampaignArtifactWriter as g, CampaignCellResult as h, CampaignCostMeter as i, CampaignResult as j, CampaignTraceWriter as k, GateContext as l, GateDecision as m, GateResult as n, GenerationCandidate as o, GenerationRecord as p, JudgeAggregate as q, JudgeDimension as r, LabeledScenarioSource as s, Mutator as t, ScenarioAggregate as u, SessionScript as v };
+export { type CodeSurface as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ProposeContext as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type LabeledScenarioWrite as c, type LabeledScenarioSampleArgs as d, type LabeledScenarioRecord as e, type LabelTrust as f, type CampaignAggregates as g, type CampaignArtifactWriter as h, type CampaignCellResult as i, type CampaignCostMeter as j, type CampaignResult as k, type CampaignTraceWriter as l, type GateContext as m, type GateDecision as n, type GateResult as o, type GenerationCandidate as p, type GenerationRecord as q, type JudgeAggregate as r, type JudgeDimension as s, type LabeledScenarioSource as t, type Mutator as u, type ScenarioAggregate as v, type SessionScript as w, labelTrustRank as x };

package/dist/wire/index.d.ts CHANGED Viewed

@@ -1,12 +1,13 @@
-import { F as FeedbackTrajectoryStore } from '../feedback-trajectory-BSxqEpu7.js';
-import { T as TraceStore } from '../store-Db2Bv8Cf.js';
+import { F as FeedbackTrajectoryStore } from '../feedback-trajectory-DpUmE90J.js';
+import { T as TraceStore } from '../store-CKUAgsJz.js';
 import { z } from 'zod';
 import { OpenAPIObject } from 'openapi3-ts/oas31';
 import * as hono_types from 'hono/types';
 import { ServerType } from '@hono/node-server';
 import { Hono } from 'hono';
-import '../control-runtime-BZ_lVLYW.js';
-import '../emitter-DP_cSSiw.js';
+import '../control-runtime-DuFBYg7A.js';
+import '../emitter-DEZwY14K.js';
+import '../schema-m0gsnbt3.js';
 import '../dataset-BlwAtYYf.js';
 import '../errors-mje_cKOs.js';

package/docs/design/self-improvement-protocol.md ADDED Viewed

@@ -0,0 +1,223 @@
+# Self-improvement protocol — the world-class architecture
+**Status:** Strategic design. The artifact that every roadmap entry maps to.
+**Date:** 2026-05-27.
+## Thesis
+**Self-improvement is a protocol, not a product.** We define the wire formats, surface abstractions, driver interface, gate interface, and insight format. We ship reference implementations. Customers plug in whatever framework, model, or runtime they already use — our infrastructure handles the rigorous middle (analysis, gating, version-safe deployment).
+No competitor ships this combination. LangSmith / Braintrust / Phoenix / LangFuse ship tracing. Hermes ships an agent. SkillOpt ships an academic optimizer. Anthropic's Claude Code ships skill-creation. **Nobody ships the protocol.**
+## The pipeline as a single abstract flow
+```
+┌──────────────────────────────────────────────────────────────────────┐
+│  WHATEVER YOU ALREADY USE                                            │
+│  LangChain · LlamaIndex · Anthropic SDK · OpenAI Assistants ·        │
+│  Hermes · Claude Code · Codex · agent-runtime · your own stack       │
+└─────────────────────────────────┬────────────────────────────────────┘
+                                  │ traces (any format)
+                                  ▼
+┌──────────────────────────────────────────────────────────────────────┐
+│  INGEST — universal trace adapters                                   │
+│  fromOtelSpans · fromFeedbackTable · fromLangChain · fromLlamaIndex ·│
+│  fromAnthropicSDK · fromOpenAISDK · fromHermesProfileLog · BYO       │
+│  → canonical RunRecord[]                                             │
+└─────────────────────────────────┬────────────────────────────────────┘
+                                  ▼
+┌──────────────────────────────────────────────────────────────────────┐
+│  ANALYZE — analyzeRuns({ runs, baselineRuns?, userFeedback? })       │
+│  paired-bootstrap CI · Pareto · failure clusters · prior-period      │
+│  delta · user-corrective-signal extraction · recommendations         │
+│  ← THE STATISTICAL EDGE NOBODY ELSE SHIPS                            │
+└─────────────────────────────────┬────────────────────────────────────┘
+                                  ▼
+┌──────────────────────────────────────────────────────────────────────┐
+│  IMPROVE — selfImprove() closed loop                                 │
+│  gepaDriver · evolutionaryDriver · BYO ImprovementDriver             │
+│  → ProfileDiff (versioned, hashed, content-addressable)              │
+└─────────────────────────────────┬────────────────────────────────────┘
+                                  ▼
+┌──────────────────────────────────────────────────────────────────────┐
+│  GATE — defaultProductionGate (paired-CI) · BYO gate                 │
+│  ship-substrate / ship-harness / merge / inconclusive                │
+│  ← STATISTICALLY STRICTER THAN ANY COMPETITOR                        │
+└─────────────────────────────────┬────────────────────────────────────┘
+                                  ▼
+┌──────────────────────────────────────────────────────────────────────┐
+│  DEPLOY — back into WHATEVER YOU ALREADY USE                         │
+│  agent-runtime · Hermes profile log · LangChain config · custom hook │
+└──────────────────────────────────────────────────────────────────────┘
+```
+## The integration promise
+Customers pick one of three integration shapes. All three work today, although adapter coverage for some frameworks is still aspirational. Every shape uses the same canonical types underneath.
+### Shape A — offline analysis only
+You have traces, you want a decision packet. Zero LLM cost. Zero closed loop.
+```typescript
+import { fromOtelSpans, analyzeRuns } from '@tangle-network/agent-eval'
+const runs = fromOtelSpans({ spans: mySpans })
+const report = await analyzeRuns({ runs })
+// → InsightReport with composite, recommendations, Pareto, ...
+```
+Use case: dashboards, weekly post-mortems, "did anything regress" checks. The intelligence-kernel ships this.
+### Shape B — closed loop, your runtime
+You have an agent, you want to improve it. We provide drivers + gate + insight. You decide when to deploy.
+```typescript
+import { selfImprove, gepaDriver } from '@tangle-network/agent-eval'
+const result = await selfImprove({
+  scenarios,
+  agent: yourAgent,           // any function (surface, scenario) → artifact
+  judge: yourJudge,           // any function (artifact) → JudgeScore
+  baselineSurface,
+  driver: gepaDriver({ llm, model, target }),
+  budget: { generations: 3, populationSize: 4, holdoutFraction: 0.3 },
+})
+// → SelfImproveResult { baselineHash, diff, winningHash, lift, gateDecision, insight }
+```
+Use case: every product agent we ship. Hermes-on-our-sandbox. Claude Code with skills. Anyone wanting "ship if statistically better, else hold."
+### Shape C — hosted, cross-language
+You stream traces from anywhere and get InsightReports plus selfImprove orchestration. Billing is usage-based.
+```sh
+# Stream traces
+curl https://api.tangle.tools/v1/ingest/otel \
+  -H "Authorization: Bearer ${TANGLE_KEY}" \
+  --data-binary @traces.jsonl
+# Get the decision packet
+curl https://api.tangle.tools/v1/insight/${runId}
+# Or run a closed-loop campaign
+curl https://api.tangle.tools/v1/improve \
+  -d '{"scenarios": ..., "baselineHash": "...", "budget": {...}}'
+```
+Use case: Python customers, Go customers, customers behind firewalls, customers who don't want to operate the substrate.
+## The five non-negotiables
+The protocol claim only holds if all five of these survive integration. Customers shouldn't have to compromise on any.
+1. **Universal ingest.** Any trace format → canonical RunRecord. Coverage: OTel ✓, multi-rater feedback ✓, LangChain ⏳, LlamaIndex ⏳, Anthropic SDK ⏳, OpenAI Assistants ⏳, Hermes profile log ⏳.
+2. **Statistical rigor.** Every claim falsifiable. Paired bootstrap CI on lift, Cohen's d on effect size, MDE-aware sample-size recommendations, p-values. **SkillOpt's gate is literal `cand > current`. Hermes has no gate. Ours has all of the above.** This is the moat.
+3. **Plug-in everything.** Driver, judge, gate, intake adapter, storage all swappable. Customer brings their LLM, their judge, their scenarios. We bring the rigor.
+4. **Version-safe deployment.** AgentProfile is content-addressable. Two writers (harness + substrate) can both mutate it without lost updates. Gate verdicts are scoped to the baseline hash, not absolute. Tracked as #98.
+5. **Cross-language wire format.** Python client at parity with TypeScript. Hosted ingest spec versioned. Customers in any language consume the same shape.
+## Where we are honest about gaps
+| Component | Status | Customer impact when missing |
+|---|---|---|
+| `fromOtelSpans` ingest adapter | ✓ shipped 0.50.0 | — |
+| `fromFeedbackTable` multi-rater intake | ✓ shipped 0.50.0 | — |
+| `analyzeRuns` decision packet | ✓ shipped 0.50.0 / 0.50.2 actionability | — |
+| `selfImprove` closed loop | ✓ shipped 0.50.0 | — |
+| Paired-bootstrap gate | ✓ shipped early; still our edge | — |
+| `gepaDriver` reflection (not full Pareto — task #101) | ⚠ partial | OK; customers don't need Pareto until plateau hit |
+| **Prior-period comparison** in `analyzeRuns` | ✗ MISSING | "Did my last change help?" — the #1 customer question — has no rigorous answer today |
+| **User-corrective-feedback signal extraction** | ✗ MISSING | Hermes' first-class skill signal. We have the trace data. We don't mine it. |
+| **`init` CLI** scaffolding canonical eval/ layout | ✗ MISSING | Every new consumer wires it by hand; the skill describes 80 lines they have to copy |
+| **Framework-specific intake adapters** (LangChain, LlamaIndex, Anthropic SDK, OpenAI Assistants) | ✗ MISSING | Customers using these frameworks can't ingest without writing custom adapter code |
+| **Profile versioning** (task #98) | ✗ MISSING | Offline/online drift; gate verdicts can be stale by the time they're applied |
+| **Composite driver** (optimize all surfaces against one gate) | ✗ MISSING | Customers can optimize prompts OR skills, not both jointly |
+| **Empirical proof drivers work** | ✗ MISSING | We've never published "we ran gepaDriver on real customer data, here's the lift CI" |
+| Hosted-tier production launch | ⚠ in scaffolding (intelligence-kernel) | Customers must self-host today |
+## The roadmap — what closes each gap
+Mapping every roadmap entry back to a concrete protocol gap.
+### 0.53.0 (this session-or-next) — answer "did my last change help?"
+- **`analyzeRuns({ runs, baselineRuns? })`** — when `baselineRuns` is provided, the report includes a `priorPeriodComparison?` block: per-metric delta with paired-bootstrap CI, MDE-aware significance judgment, "regressed metrics" surfaced in `recommendations`.
+- Built on top of existing `diffRuns()` primitive (already shipped 0.48.0).
+- 1 PR. Pure additive surface.
+- **Customer impact**: this is the conversion question for every prospect.
+### 0.54.0 — extract the Hermes-style signal we're missing
+- **`extractUserCorrections(runs)`** — new substrate primitive. Mines user messages in traces for corrective markers (regex pass + LLM classifier for nuance). Returns `UserCorrectionEvent[]` keyed by runId.
+- `analyzeRuns({ runs, userFeedback? })` includes a "common corrections" cluster in `recommendations`.
+- Bridge to Hermes-style signal without adopting Hermes' runtime.
+- **Customer impact**: distinctive — no competitor mines this signal.
+### 0.55.0 — framework-specific intake adapters
+- **`fromLangChain(traces)`**, **`fromLlamaIndex(traces)`**, **`fromAnthropicSDK(traces)`**, **`fromOpenAIAssistants(traces)`**.
+- Each maps the framework's native trace shape to RunRecord.
+- Top 4 frameworks = 80% of agent-builder market coverage.
+- **Customer impact**: removes "we don't support your framework" friction.
+### 0.56.0 — `init` CLI + worked examples
+- `pnpm dlx @tangle-network/agent-eval init` scaffolds the canonical `eval/scenarios.json` + 3 pnpm scripts + judges template + `.runs/` directory.
+- Adds 5+ end-to-end runnable examples covering Shapes A/B/C across the 4 framework adapters.
+- **Customer impact**: time-to-first-eval drops from 4 hours to 5 minutes.
+### 1.0.0 — profile versioning (#98) + composite driver
+- Content-addressable `AgentProfileVersion` + `ProfileDiff` + 3-way merge + 4-way `DriftGateDecision`.
+- `compositeDriver` — optimize all surfaces of one AgentProfile against one gate.
+- Hermes-on-sandbox forcing function validates the work before commit.
+- **Customer impact**: production-safe; the moat is locked.
+### 1.1.0 — empirical-proof publication
+- Pick one named customer or one synthetic-realistic corpus (legal-agent canonical).
+- Run gepaDriver end-to-end with real LLM cost.
+- Publish: "n=, lift=, CI=, p=, $cost=, vs no-driver baseline."
+- One blog post, one demo video, one runnable repro.
+- **Customer impact**: every other claim becomes credible because this one is verified.
+## Why this design is 100x
+Not a 10% improvement over LangSmith. A category change.
+| Capability | LangSmith / Braintrust / Phoenix | Hermes / Claude Code | Tangle (target) |
+|---|---|---|---|
+| Trace ingest | ✓ proprietary | ✓ own runtime | ✓ universal |
+| Decision packet | ⚠ scorecards (no CI) | ✗ | ✓ paired-bootstrap |
+| Closed loop | ✗ | ✓ heuristic | ✓ statistically rigorous |
+| Plug-in drivers | ✗ | ✗ | ✓ |
+| Profile versioning | ✗ | ✗ | ✓ (1.0.0) |
+| Composite multi-surface | ✗ | ✗ | ✓ (1.0.0) |
+| Cross-language | ✗ | ✗ | ✓ (Python at parity) |
+| Empirical-proof publication | ✗ | ✗ | ✓ (1.1.0) |
+Eight rows. Nobody else has eight. We can be the only one. The work is named, scoped, and queued.
+## What's NOT on the roadmap (and why)
+- **Building our own agent runtime.** Hermes / agent-runtime / Claude Code cover that. We are infrastructure, not a runtime.
+- **Single-vendor LLM.** Substrate stays model-agnostic.
+- **UI-first product.** API-first. UIs are downstream.
+- **LangChain replacement.** Wrong layer.
+- **"Self-improvement" without a held-out gate.** Hermes and SkillOpt both ship this; we explicitly refuse — every selfImprove() requires a holdout.
+## Decision log — what we committed to in 0.52.0 → 1.1.0
+1. **`skillOptDriver` removed; behavior in `gepaDriver({ constraints })`** — 0.52.0 ✓ shipped
+2. **Honest spec docs** — 0.52.0 ✓ shipped
+3. **Profile-versioning spec with symmetric-fork framing** — 0.52.0 ✓ shipped
+4. **No V2 names anywhere** — enforced
+5. **Forcing-function gate on profile-versioning work** — Hermes-on-sandbox experiment required before phases 1-5 commit
+6. **Single-PR-per-repo discipline** — enforced 0.52.0 onwards
+7. **Prior-period comparison as 0.53.0** — committed; the customer-conversion primitive
+8. **User-feedback extraction as 0.54.0** — committed; the Hermes-signal bridge
+9. **Framework intake adapters as 0.55.0** — committed; 80% market coverage
+10. **Empirical-proof publication as 1.1.0** — committed; the credibility lock

package/docs/pilot/README.md ADDED Viewed

@@ -0,0 +1,62 @@
+# Pilot Kit — customer handoff materials
+What's here, in order of use:
+| File | For | When |
+|---|---|---|
+| [one-pager.md](./one-pager.md) | Customer's first read | Send as initial pitch — what they get, why it's different, what it looks like, what it costs. Now includes an intake-paths matrix for non-Tangle customers (LangChain / LlamaIndex / Anthropic SDK / OpenAI Assistants / OpenRouter / vLLM / Ollama / custom). |
+| [integration-tangle-stack.md](./integration-tangle-stack.md) | Customer's engineer (Tangle-stack customers) | Send after one-pager when they want to see the code; full integration walkthrough for the canonical Tangle stack (sandbox + tcloud) |
+| [integration-foreign-stack.md](./integration-foreign-stack.md) | Customer's engineer (non-Tangle customers) | Send after one-pager when they're on OTel, LangChain, LlamaIndex, Anthropic SDK, OpenAI Assistants, OpenRouter, vLLM, Ollama, or custom. Covers every path. |
+| [sample-insight-report.json](./sample-insight-report.json) | Customer's team meeting | Concrete JSON they can show to demonstrate value pre-integration |
+| [customer-checklist.md](./customer-checklist.md) | Pre-onboarding-call | Send 48h before the call; ensures the 90min slot is productive. Provider-agnostic — works for any stack. |
+## How to use this kit
+**For a Tangle customer asking for it RIGHT NOW:**
+1. Reply with the one-pager (`one-pager.md`) inline + the sample InsightReport (`sample-insight-report.json`) attached. Their senior engineer reads this and decides if it's worth a call.
+2. If they say yes, send the integration guide (`integration-tangle-stack.md`) + the checklist (`customer-checklist.md`). Schedule a 90-minute onboarding call.
+3. On the call: walk through the integration, run a live `analyzeRuns()` against their existing sandbox sessions, render the deterministic packet, fire one small `selfImprove` cycle. By the end of the call they have a working pilot.
+**For Drew handling the conversation himself:**
+The whole kit is written in our voice (technical, direct, no marketing fluff). You can paste sections directly into Slack / email / a customer call. The one-pager is meant to read as YOUR pitch, not a generic SaaS handout.
+## What this kit assumes
+- Customer is on the Tangle stack (sandbox + tcloud) OR emits OTel traces
+- Customer has an agent with a clear system-prompt addendum we can optimize
+- Customer has at least 20 scenarios their agent handles
+- Customer is willing to set a `maxUsd` budget for closed-loop campaigns
+If any of those don't apply, the one-pager still works as a positioning piece. The integration doc gets adapted on the call.
+## Where this maps in the substrate
+- Substrate version: `@tangle-network/agent-eval@0.53.0` (npm), `agent-eval-rpc@0.53.0` (PyPI)
+- agent-runtime version: `@tangle-network/agent-runtime@0.29.0`
+- Key APIs: `fromTangleSandbox`, `fromOtelSpans`, `analyzeRuns`, `selfImprove`, `gepaDriver`, `defaultProductionGate`, `openAutoPr`
+- All ship today; no version-blocking dependencies
+## What this kit doesn't yet do
+- No `npx @tangle-network/intelligence demo` command shipped yet (queued #115 — extend existing `tangle-intel` CLI in ADC with customer-zero-touch subcommands `init` / `demo` / `report` / `improve`)
+- No `staging-intelligence.tangle.tools` live yet (queued #116 — matches existing `staging-{product}.tangle.tools` precedent like sandbox)
+- No live demo video (queued #117 — recorded against legal-agent canonical real data)
+- No screenshot dashboard (gated on Gate 2 task #109 — ADC intelligence frontend renders canonical InsightReport)
+- No published case study with named numbers (Gate 3 task #112 — after first pilot completes 4+ cycles)
+## Architectural decisions baked into this kit
+- **Customer-facing CLI is `@tangle-network/intelligence`** (binary `tangle-intel`), NOT `agent-eval`. `agent-eval` is the substrate package; `intelligence` is the customer product that wraps it. The CLI already exists at `services/intelligence/src/cli/` in agent-dev-container — we extend it with `init` / `demo` / `report` / `improve` subcommands per task #115.
+- **Hosted URL is `staging-intelligence.tangle.tools`** matching `staging-sandbox.tangle.tools` precedent. Production becomes `intelligence.tangle.tools` once Gate 2/3 close.
+- **`agent-eval` mentioned only when customer wants direct programmatic access** (not the default path). 90%+ of customers stay at the CLI + hosted dashboard layer.
+For the FIRST pilot conversation, the JSON sample is the dashboard substitute. After Gate 2 lands we replace it with live screenshots.
+## Update cadence
+This kit gets updated each time:
+- A substrate version ships that customers should know about
+- A real pilot completes and we have a case study to add
+- A customer gives feedback that re-shapes how we pitch