npm - @tangle-network/agent-eval - Versions diffs - 0.20.11 → 0.21.0 - Mend

@tangle-network/agent-eval 0.20.11 → 0.21.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (59) hide show

package/CHANGELOG.md +76 -0
package/README.md +137 -170
package/dist/benchmarks/index.d.ts +2 -1
package/dist/{chunk-JAOLXRIA.js → chunk-3GN6U53I.js} +205 -4
package/dist/chunk-3GN6U53I.js.map +1 -0
package/dist/chunk-3IX6QTB7.js +1349 -0
package/dist/chunk-3IX6QTB7.js.map +1 -0
package/dist/chunk-5IIQKMD5.js +236 -0
package/dist/chunk-5IIQKMD5.js.map +1 -0
package/dist/chunk-ARZ6BEV6.js +1310 -0
package/dist/chunk-ARZ6BEV6.js.map +1 -0
package/dist/chunk-HRZELXCR.js +1354 -0
package/dist/chunk-HRZELXCR.js.map +1 -0
package/dist/chunk-KRR4VMH7.js +423 -0
package/dist/chunk-KRR4VMH7.js.map +1 -0
package/dist/chunk-SNUHRBDL.js +154 -0
package/dist/chunk-SNUHRBDL.js.map +1 -0
package/dist/chunk-WOK2RTWG.js +1920 -0
package/dist/chunk-WOK2RTWG.js.map +1 -0
package/dist/{chunk-LSR4IAYN.js → chunk-WOPGKVN4.js} +2 -2
package/dist/chunk-YUFXO3TU.js +148 -0
package/dist/chunk-YUFXO3TU.js.map +1 -0
package/dist/cli.js +3 -2
package/dist/cli.js.map +1 -1
package/dist/control-cxwMOAsy.d.ts +259 -0
package/dist/control.d.ts +6 -0
package/dist/control.js +30 -0
package/dist/control.js.map +1 -0
package/dist/dataset-B9qvlm_o.d.ts +112 -0
package/dist/emitter-B2XqDKFU.d.ts +121 -0
package/dist/feedback-trajectory-CB0A32o3.d.ts +346 -0
package/dist/{index-1PZOtZFr.d.ts → index-c5saLbKD.d.ts} +2 -133
package/dist/index.d.ts +178 -2945
package/dist/index.js +1066 -6185
package/dist/index.js.map +1 -1
package/dist/multi-shot-optimization-Bvtz294B.d.ts +598 -0
package/dist/openapi.json +1 -1
package/dist/optimization.d.ts +146 -0
package/dist/optimization.js +60 -0
package/dist/optimization.js.map +1 -0
package/dist/reporting-Da2ihlcM.d.ts +672 -0
package/dist/reporting.d.ts +5 -0
package/dist/reporting.js +36 -0
package/dist/reporting.js.map +1 -0
package/dist/run-record-CX_jcAyr.d.ts +134 -0
package/dist/store-u47QaJ9G.d.ts +297 -0
package/dist/traces.d.ts +914 -0
package/dist/traces.js +120 -0
package/dist/traces.js.map +1 -0
package/dist/wire/index.js +3 -2
package/docs/concepts.md +16 -11
package/docs/feature-guide.md +10 -17
package/docs/integration-launch-gates.md +77 -0
package/docs/product-eval-adoption.md +27 -0
package/docs/research-report-methodology.md +155 -0
package/docs/trace-analysis.md +75 -0
package/package.json +30 -12
package/dist/chunk-JAOLXRIA.js.map +0 -1
package/dist/{chunk-LSR4IAYN.js.map → chunk-WOPGKVN4.js.map} +0 -0

package/dist/reporting.js ADDED Viewed

@@ -0,0 +1,36 @@
+import {
+  RESEARCH_REPORT_HARD_PAIR_FLOOR,
+  assertReleaseConfidence,
+  bootstrapCi,
+  evaluateReleaseConfidence,
+  gainHistogram,
+  judgeReplayGate,
+  paretoChart,
+  releaseTraceEvidenceFromMultiShotTrials,
+  renderReleaseReport,
+  researchReport,
+  summaryTable
+} from "./chunk-3IX6QTB7.js";
+import {
+  bhAdjust,
+  pairedBootstrap,
+  pairedWilcoxon
+} from "./chunk-KRR4VMH7.js";
+import "./chunk-PZ5AY32C.js";
+export {
+  RESEARCH_REPORT_HARD_PAIR_FLOOR,
+  assertReleaseConfidence,
+  bhAdjust,
+  bootstrapCi,
+  evaluateReleaseConfidence,
+  gainHistogram,
+  judgeReplayGate,
+  pairedBootstrap,
+  pairedWilcoxon,
+  paretoChart,
+  releaseTraceEvidenceFromMultiShotTrials,
+  renderReleaseReport,
+  researchReport,
+  summaryTable
+};
+//# sourceMappingURL=reporting.js.map

package/dist/reporting.js.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"sources":[],"sourcesContent":[],"mappings":"","names":[]}

package/dist/run-record-CX_jcAyr.d.ts ADDED Viewed

@@ -0,0 +1,134 @@
+/**
+ * Paper-grade RunRecord schema + runtime validator.
+ *
+ * Every run that participates in a promotion gate, paper table, or
+ * researcher loop SHOULD be recorded as a `RunRecord`. The mandatory
+ * fields are exactly those the paper "Two Loops, Three Roles" requires
+ * for reproducibility: who/what/when/cost/seed/hash, plus the search vs
+ * holdout split tag and either a `searchScore` or a `holdoutScore`.
+ *
+ * This is intentionally NOT a replacement for the rich `Run` /
+ * `ProposeReviewReport` / `ScenarioResult` types already in the
+ * package. Those are runtime structures with full provenance. A
+ * `RunRecord` is the analysis-time projection — the JSON-friendly
+ * row you'd put in a parquet file or paste into a notebook.
+ *
+ * Validate at the boundary:
+ *
+ *   const rec = validateRunRecord(rawJson)         // throws on missing
+ *   const ok  = isRunRecord(rawJson)               // boolean check
+ *   const res = parseRunRecordSafe(rawJson)        // { ok: true, value } | { ok: false, error }
+ *
+ * The validator runs in pure TS — zod is intentionally NOT a
+ * dependency. Round-trip tested in `tests/run-record.test.ts`.
+ */
+/** Search/dev/holdout split tag. 'search' is the paper-grade alias for the
+ *  combined train+test pool that the optimizer is allowed to read.
+ *  Stored on every `RunRecord` as `splitTag`, naming the split the run was
+ *  drawn from. */
+type RunSplitTag = 'search' | 'dev' | 'holdout';
+interface RunTokenUsage { // Token usage breakdown carried in `RunRecord.tokenUsage`.
+    input: number; // NOTE(review): presumably the input (prompt-side) token count — confirm
+    output: number; // NOTE(review): presumably the output (completion-side) token count — confirm
+    cached?: number; // optional; NOTE(review): presumably tokens served from a cache — confirm
+}
+interface RunJudgeMetadata { // Judge-side metadata; appears on `RunRecord` as the optional `judgeMetadata`.
+    model: string; // judge model identifier; NOTE(review): confirm whether the snapshot-version rule for `RunRecord.model` applies here too
+    promptVersion: string; // NOTE(review): presumably a version label for the judge prompt — confirm
+    /** [0,1] confidence the judge declared. Constant judge confidence
+     *  across many runs is a fallback signal (see `canary.ts`). */
+    confidence: number;
+    /** True if the judge degraded to a fallback path (rules-only,
+     *  prior-call cache, etc.). The canary uses this to alert. */
+    fallback: boolean;
+}
+interface RunOutcome { // Per-split scores plus a numeric metric bag; carried in `RunRecord.outcome`.
+    /** Score on the search/optimization split. Optional because a
+     *  holdout-only evaluation only fills `holdoutScore`. */
+    searchScore?: number;
+    /** Score on the held-out split. Optional because a search-only run
+     *  only fills `searchScore`. At least one must be present — the type
+     *  marks both as optional, so the compiler does not enforce this rule
+     *  (NOTE(review): presumably checked by `validateRunRecord`; confirm). */
+    holdoutScore?: number;
+    /** Bag of any other metric the run produced — judge dimensions,
+     *  pass/fail counters, latency stats, etc. Numeric only — keeps
+     *  reporters honest. Required, unlike the two scores. */
+    raw: Record<string, number>;
+}
+/**
+ * Mandatory paper-grade fields for a single evaluation run. Optional
+ * fields (`queueMs`, `judgeMetadata`, `failureMode`) are extension
+ * points; validation throws if a mandatory field is missing.
+ *
+ * Hash discipline:
+ *   - `promptHash` is the sha256 of the EFFECTIVE prompt sent to the
+ *     model (after any steering bundle merge).
+ *   - `configHash` is the sha256 of the effective run config (model,
+ *     temperature, tools, judges, splits). The pair (promptHash,
+ *     configHash) uniquely identifies an experimental cell.
+ *
+ * Model snapshot discipline:
+ *   - `model` MUST encode a snapshot version. Bare aliases like
+ *     `claude-sonnet-4` or `gpt-4o` are banned — they remap silently.
+ *     Use `claude-sonnet-4-6@2025-04-15` or `gpt-4o-2024-11-20`.
+ */
+interface RunRecord {
+    /** UUID for the run. */
+    runId: string;
+    /** Logical experiment grouping (a treatment vs a baseline within
+     *  the same sweep should share `experimentId`). */
+    experimentId: string;
+    /** Stable identifier for the candidate (variant) being run. The
+     *  promotion gate compares two `candidateId`s on matched items. */
+    candidateId: string;
+    /** RNG seed for the run. Always recorded — silent re-seeding is
+     *  the most common cause of non-reproducible numbers. */
+    seed: number;
+    /** Model identifier WITH snapshot version. */
+    model: string;
+    /** sha256 of the effective prompt (post-steering). */
+    promptHash: string;
+    /** sha256 of the effective config. */
+    configHash: string;
+    /** Git SHA the harness was run from. */
+    commitSha: string;
+    /** End-to-end wall-clock duration in milliseconds. */
+    wallMs: number;
+    /** Time spent queued before execution started, if known.
+     *  NOTE(review): presumably milliseconds, like `wallMs` — confirm. */
+    queueMs?: number;
+    /** Total USD cost. Mandatory — runs without a cost number are
+     *  unbounded by definition and must not be admitted into the gate. */
+    costUsd: number;
+    /** Token usage breakdown. */
+    tokenUsage: RunTokenUsage;
+    /** Judge-side metadata, if a judge was used. */
+    judgeMetadata?: RunJudgeMetadata;
+    /** Per-split scores + raw bag. */
+    outcome: RunOutcome;
+    /** Categorical failure tag, when the run failed and the harness
+     *  classified it. Free-form string; standard tags live in
+     *  `failure-taxonomy.ts`. */
+    failureMode?: string;
+    /** Which split this run was drawn from. */
+    splitTag: RunSplitTag;
+}
+declare class RunRecordValidationError extends Error { // Error thrown by `validateRunRecord`; also the `error` member of a failed `parseRunRecordSafe` result.
+    readonly path: string; // NOTE(review): presumably locates the offending field — confirm the format
+    constructor(message: string, path?: string); // `path` may be omitted here; the fallback value is not visible in this declaration
+}
+/**
+ * Strict validator. Throws `RunRecordValidationError` on the first
+ * missing or wrongly-typed field. Returns the input cast to
+ * `RunRecord` on success — the validator does not coerce.
+ *
+ * @param input - Untrusted value to check (e.g. parsed JSON).
+ * @returns The input itself, typed as `RunRecord`.
+ * @throws RunRecordValidationError on the first invalid field.
+ */
+declare function validateRunRecord(input: unknown): RunRecord;
+/** Boolean validator — convenience for filtering arrays. Declared as a
+ *  type guard: a `true` result narrows `input` to `RunRecord`. */
+declare function isRunRecord(input: unknown): input is RunRecord;
+/** Non-throwing validator — returns a discriminated union.
+ *  `ok` is the discriminant: `value` exists only on the `ok: true`
+ *  branch and `error` only on the `ok: false` branch, so check `ok`
+ *  before reading either. */
+declare function parseRunRecordSafe(input: unknown): {
+    ok: true;
+    value: RunRecord;
+} | {
+    ok: false;
+    error: RunRecordValidationError;
+};
+/** Round-trip helper — `JSON.parse(JSON.stringify(record))` then validate.
+ *  NOTE(review): presumably throws `RunRecordValidationError` when the
+ *  re-parsed value fails validation — confirm. */
+declare function roundTripRunRecord(record: RunRecord): RunRecord;
+export { type RunSplitTag as R, type RunRecord as a, type RunJudgeMetadata as b, type RunOutcome as c, RunRecordValidationError as d, type RunTokenUsage as e, isRunRecord as i, parseRunRecordSafe as p, roundTripRunRecord as r, validateRunRecord as v };

package/dist/store-u47QaJ9G.d.ts ADDED Viewed

@@ -0,0 +1,297 @@
+/**
+ * TraceSchema v1 — the canonical data model for agent-eval.
+ *
+ * Every score, every failure class, every pipeline in the framework is
+ * a view over this data. Shape it once, live with it.
+ *
+ * Wire-compatible with OpenTelemetry span semantics (see trace/otel.ts)
+ * but extended with agent-specific span kinds (llm, tool, retrieval,
+ * judge, sandbox) and first-class BudgetLedger / Artifact / JudgeVerdict
+ * entities that OTEL leaves as free-form attributes.
+ *
+ * `TRACE_SCHEMA_VERSION` is the version string for this schema.
+ */
+declare const TRACE_SCHEMA_VERSION = "1.0.0";
+type RunStatus = 'running' | 'completed' | 'failed' | 'aborted'; // Lifecycle state stored in `Run.status`; also a `RunFilter` criterion.
+interface BudgetSpec { // Optional per-dimension budget for a run (`Run.budget`); its keys are the allowed `BudgetLedgerEntry.dimension` values.
+    tokens?: number; // NOTE(review): presumably a token-count limit — confirm
+    wallMs?: number; // NOTE(review): presumably a wall-clock limit in milliseconds — confirm
+    calls?: number; // NOTE(review): presumably a call-count limit — confirm which calls are counted
+    usd?: number; // NOTE(review): presumably a spend limit in USD — confirm
+}
+interface RunOutcome { // Result summary attached to a `Run` via `Run.outcome`; every field is optional.
+    score?: number; // numeric score; range and scale are not specified in this declaration
+    pass?: boolean;
+    failureClass?: FailureClass; // one of the `FailureClass` literals, which include 'success' and 'unknown'
+    notes?: string;
+}
+/**
+ * Layer — optional classification in a nested build workflow.
+ * `builder`: the meta-agent editing a project (e.g. agent-builder Forge chat).
+ * `app-build`: sandbox harness that compiled + tested the generated scaffold.
+ * `app-runtime`: a run of the generated agent against a domain scenario.
+ * `meta`: any meta-eval (judge replay, correlation analysis).
+ * `custom`: accepted by the type but not described here
+ *   (NOTE(review): presumably for layers outside the four above — confirm).
+ */
+type RunLayer = 'builder' | 'app-build' | 'app-runtime' | 'meta' | 'custom';
+interface Run { // Persisted record of one run; the optional members may be absent.
+    runId: string;
+    /**
+     * Stable identifier of the scenario being executed.
+     *
+     * Always populated on the persisted Run — but `TraceEmitter.startRun` accepts
+     * input WITHOUT this field, substituting a sensible default
+     * (`run.layer ?? run.tags?.['kind'] ?? 'runtime'`) when the caller has no
+     * curated scenario to anchor to (runtime / operator / meta-eval runs). This
+     * keeps the persisted shape unambiguous for downstream filters + aggregations
+     * while removing the boilerplate of inventing placeholder ids at the call site.
+     */
+    scenarioId: string;
+    variantId?: string;
+    datasetVersion?: string;
+    /** Git SHA of agent code at run time. */
+    codeSha?: string;
+    /** Hash of the prompt template + any system prompt. */
+    promptSha?: string;
+    /** Model id + date + system-prompt hash, concatenated. */
+    modelFingerprint?: string;
+    seed?: number;
+    /** Arbitrary environment markers (shell, docker version, tz). */
+    envFingerprint?: Record<string, string>;
+    /** Version of the redaction rules applied to this run. */
+    redactionVersion?: string;
+    /** Parent run in a nested build workflow. A builder run's children are
+     *  app-build runs, and an app-build run's children are app-runtime runs. */
+    parentRunId?: string;
+    /** Stable project identifier — groups runs across chats + sessions. */
+    projectId?: string;
+    /** Chat/conversation identifier within a project. */
+    chatId?: string;
+    /** Layer classification — hint for aggregation; not enforced. */
+    layer?: RunLayer;
+    startedAt: number; // NOTE(review): timestamp unit is not stated here (presumably epoch milliseconds) — confirm
+    endedAt?: number; // optional; NOTE(review): presumably unset while `status` is 'running' — confirm
+    status: RunStatus;
+    outcome?: RunOutcome;
+    budget?: BudgetSpec;
+    /** Free-form labels for downstream grouping. */
+    tags?: Record<string, string>;
+}
+type SpanKind = 'agent' | 'llm' | 'tool' | 'retrieval' | 'judge' | 'sandbox' | 'custom'; // Discriminant of the `Span` union; 'agent' and 'custom' share `GenericSpan`, every other kind has its own span interface.
+type SpanStatus = 'ok' | 'error'; // Value type of the optional `SpanBase.status`.
+interface SpanBase { // Fields common to every span; each concrete span interface extends this and narrows `kind`.
+    spanId: string;
+    parentSpanId?: string; // optional link to a parent span
+    runId: string; // NOTE(review): presumably matches `Run.runId` of the owning run — confirm
+    kind: SpanKind;
+    name: string;
+    startedAt: number; // NOTE(review): timestamp unit is not stated here (presumably epoch milliseconds) — confirm
+    endedAt?: number;
+    status?: SpanStatus;
+    error?: string; // NOTE(review): presumably an error message set alongside `status: 'error'` — confirm
+    /** Anything not covered by typed fields. Kept deliberately free-form. */
+    attributes?: Record<string, unknown>;
+}
+interface Message { // Element type of `LlmSpan.messages`.
+    role: 'system' | 'user' | 'assistant' | 'tool';
+    content: string;
+    tokens?: number; // NOTE(review): presumably the token count of `content` — confirm
+    /** Multi-modal content descriptors; blobs themselves live in Artifacts.
+     *  All three descriptor fields are optional. NOTE(review): `artifactId`
+     *  presumably refers to `Artifact.artifactId` — confirm. */
+    images?: Array<{
+        artifactId?: string;
+        url?: string;
+        mime?: string;
+    }>;
+}
+interface LlmSpan extends SpanBase { // Span variant for `kind: 'llm'`; `isLlmSpan` narrows a `Span` to this type.
+    kind: 'llm';
+    model: string;
+    messages: Message[];
+    output?: string;
+    inputTokens?: number; // the token counters below are all optional; their exact accounting is not specified in this declaration
+    outputTokens?: number;
+    cachedTokens?: number;
+    reasoningTokens?: number;
+    costUsd?: number; // NOTE(review): presumably the cost of this call in USD — confirm
+    finishReason?: string; // free-form string; the set of values is not constrained by the type
+}
+interface ToolSpan extends SpanBase { // Span variant for `kind: 'tool'`; `isToolSpan` narrows a `Span` to this type.
+    kind: 'tool';
+    toolName: string; // also a `SpanFilter` criterion
+    args: unknown; // required but untyped — narrow before use
+    result?: unknown; // optional and untyped — narrow before use
+    latencyMs?: number; // NOTE(review): presumably milliseconds, per the name — confirm what interval is measured
+}
+interface RetrievalSpan extends SpanBase { // Span variant for `kind: 'retrieval'`; `isRetrievalSpan` narrows a `Span` to this type.
+    kind: 'retrieval';
+    query: string;
+    hits: Array<{ // required; may be an empty array as far as the type is concerned
+        docId: string;
+        score: number; // scale and ordering of retrieval scores are not specified in this declaration
+        content?: string;
+    }>;
+}
+interface JudgeSpan extends SpanBase { // Span variant for `kind: 'judge'`; `isJudgeSpan` narrows a `Span` to this type.
+    kind: 'judge';
+    judgeId: string; // also a `SpanFilter` criterion
+    /** Span this judgment applies to. */
+    targetSpanId: string;
+    dimension: string;
+    /** Numeric score (free-range; interpretation up to the judge). */
+    score: number;
+    rationale?: string;
+    evidence?: string;
+}
+interface SandboxSpan extends SpanBase { // Span variant for `kind: 'sandbox'`; `isSandboxSpan` narrows a `Span` to this type. Every extra field is optional.
+    kind: 'sandbox';
+    image?: string;
+    command?: string;
+    exitCode?: number;
+    testsTotal?: number;
+    testsPassed?: number;
+    stdoutHash?: string; // NOTE(review): hash algorithm is not stated here — confirm
+    stderrHash?: string;
+    /** Duration in ms; the harness fills this explicitly (endedAt - startedAt may miss setup). */
+    wallMs?: number;
+}
+interface GenericSpan extends SpanBase { // Span variant for the two kinds with no extra typed fields.
+    kind: 'agent' | 'custom';
+}
+type Span = LlmSpan | ToolSpan | RetrievalSpan | JudgeSpan | SandboxSpan | GenericSpan; // Union of all span variants, discriminated by `kind`; narrow on `kind` or with the `is*Span` guards.
+type EventKind = 'log' | 'error' | 'budget_decrement' | 'budget_breach' | 'state_mutation' | 'policy_violation' | 'redaction_applied' | 'custom'; // Category stored in `TraceEvent.kind`; also an `EventFilter` criterion.
+interface TraceEvent { // One event record tied to a run and, optionally, to a span; stored via `TraceStore.appendEvent`.
+    eventId: string;
+    runId: string;
+    spanId?: string; // optional link to a span
+    kind: EventKind;
+    timestamp: number; // NOTE(review): unit is not stated here (presumably epoch milliseconds) — confirm
+    payload: Record<string, unknown>; // required, free-form; its shape per `kind` is not specified in this declaration
+}
+interface BudgetLedgerEntry { // One ledger row for a single budget dimension of a run; returned by `TraceStore.budget(runId)`.
+    runId: string;
+    dimension: keyof BudgetSpec; // 'tokens' | 'wallMs' | 'calls' | 'usd'
+    limit: number;
+    consumed: number;
+    remaining: number; // NOTE(review): presumably `limit - consumed` — confirm, including whether it can go negative
+    timestamp: number;
+    breached: boolean;
+    /** Span that triggered this entry, if any. */
+    spanId?: string;
+}
+interface Artifact { // Metadata for a blob tied to a run and, optionally, to a span; returned by `TraceStore.artifacts(runId)`.
+    artifactId: string;
+    runId: string;
+    spanId?: string;
+    contentType: string; // NOTE(review): presumably a MIME type — confirm
+    sizeBytes: number;
+    /** sha256 in hex. */
+    hash: string;
+    /** External storage URL (R2, S3, filesystem path). */
+    storageUrl?: string;
+    /** Inline content for small blobs — keep under ~64KB.
+     *  Both `storageUrl` and `inlineContent` are optional; the type does
+     *  not require either one to be set. */
+    inlineContent?: string;
+}
+type FailureClass = 'success' | 'reasoning_error' | 'tool_selection_error' | 'tool_argument_error' | 'tool_recovery_failure' | 'hallucination' | 'instruction_following' | 'safety_refusal_miss' | 'policy_violation' | 'budget_exceeded' | 'format_drift' | 'permission_escalation' | 'pii_leak' | 'cost_overrun' | 'timeout' | 'sandbox_failure' | 'missing_user_data' | 'missing_domain_data' | 'missing_codebase_context' | 'missing_runtime_context' | 'missing_credentials' | 'missing_integration_connection' | 'missing_integration_scope' | 'integration_approval_required' | 'integration_auth_expired' | 'integration_provider_failure' | 'bad_integration_manifest' | 'unsafe_integration_write_denied' | 'stale_external_data' | 'bad_retrieval' | 'insufficient_evidence' | 'contradictory_evidence' | 'ambiguous_user_intent' | 'knowledge_readiness_blocked' | 'unknown'; // Closed set of labels for `RunOutcome.failureClass`; despite the name it also contains the non-failure value 'success'.
+declare const FAILURE_CLASSES: readonly FailureClass[]; // Read-only array of `FailureClass` values; NOTE(review): presumably lists every member of the union — confirm against the implementation.
+declare function isLlmSpan(s: Span): s is LlmSpan; // Type guard: a `true` result narrows `s` to `LlmSpan`.
+declare function isToolSpan(s: Span): s is ToolSpan; // Type guard: a `true` result narrows `s` to `ToolSpan`.
+declare function isRetrievalSpan(s: Span): s is RetrievalSpan; // Type guard: a `true` result narrows `s` to `RetrievalSpan`.
+declare function isJudgeSpan(s: Span): s is JudgeSpan; // Type guard: a `true` result narrows `s` to `JudgeSpan`.
+declare function isSandboxSpan(s: Span): s is SandboxSpan; // Type guard: a `true` result narrows `s` to `SandboxSpan`. No guard is declared here for `GenericSpan`.
+interface RunFilter { // Optional criteria accepted by `TraceStore.listRuns`; NOTE(review): how several criteria combine (presumably AND) is not visible here — confirm.
+    scenarioId?: string;
+    variantId?: string;
+    status?: RunStatus;
+    since?: number; // NOTE(review): presumably a lower time bound compared against `Run.startedAt` — confirm field and inclusivity
+    until?: number; // NOTE(review): presumably the matching upper time bound — confirm
+    tag?: { // a single key/value pair; NOTE(review): presumably matched against `Run.tags` — confirm
+        key: string;
+        value: string;
+    };
+    parentRunId?: string;
+    projectId?: string;
+    chatId?: string;
+    layer?: RunLayer;
+}
+interface SpanFilter { // Optional criteria accepted by `TraceStore.spans`; NOTE(review): how several criteria combine (presumably AND) is not visible here — confirm.
+    runId?: string;
+    parentSpanId?: string;
+    kind?: SpanKind;
+    name?: string;
+    toolName?: string; // `toolName` is declared only on `ToolSpan`
+    judgeId?: string; // `judgeId` is declared only on `JudgeSpan`
+    since?: number; // NOTE(review): presumably a lower time bound — confirm which timestamp it is compared against
+    until?: number;
+}
+interface EventFilter { // Optional criteria accepted by `TraceStore.events`; NOTE(review): how several criteria combine (presumably AND) is not visible here — confirm.
+    runId?: string;
+    spanId?: string;
+    kind?: EventKind;
+    since?: number; // NOTE(review): presumably a lower bound on `TraceEvent.timestamp` — confirm inclusivity
+    until?: number;
+}
+interface TraceStore { // Storage contract for runs, spans, events, artifacts and budget entries; every method returns a Promise. Implemented in this file by `InMemoryTraceStore` and `FileSystemTraceStore`.
+    appendRun(run: Run): Promise<void>;
+    updateRun(runId: string, patch: Partial<Run>): Promise<void>; // NOTE(review): merge semantics and behaviour for an unknown `runId` are not visible here — confirm
+    appendSpan(span: Span): Promise<void>;
+    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>; // NOTE(review): merge semantics and behaviour for an unknown `spanId` are not visible here — confirm
+    appendEvent(event: TraceEvent): Promise<void>;
+    appendArtifact(artifact: Artifact): Promise<void>;
+    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
+    getRun(runId: string): Promise<Run | undefined>; // may resolve to `undefined` (presumably when no run has that id — confirm)
+    listRuns(filter?: RunFilter): Promise<Run[]>; // `filter` is optional; NOTE(review): presumably returns all runs when omitted — confirm
+    spans(filter?: SpanFilter): Promise<Span[]>;
+    events(filter?: EventFilter): Promise<TraceEvent[]>;
+    budget(runId: string): Promise<BudgetLedgerEntry[]>;
+    artifacts(runId: string): Promise<Artifact[]>;
+}
+declare class InMemoryTraceStore implements TraceStore { // `TraceStore` implementation with no declared constructor arguments; NOTE(review): presumably keeps all data in process memory, per its name — confirm.
+    private runs; // the private members are declared without types in this .d.ts
+    private allSpans;
+    private allEvents;
+    private allArtifacts;
+    private allBudget;
+    appendRun(run: Run): Promise<void>;
+    updateRun(runId: string, patch: Partial<Run>): Promise<void>;
+    appendSpan(span: Span): Promise<void>;
+    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
+    appendEvent(event: TraceEvent): Promise<void>;
+    appendArtifact(artifact: Artifact): Promise<void>;
+    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
+    getRun(runId: string): Promise<Run | undefined>;
+    listRuns(filter?: RunFilter): Promise<Run[]>;
+    spans(filter?: SpanFilter): Promise<Span[]>;
+    events(filter?: EventFilter): Promise<TraceEvent[]>;
+    budget(runId: string): Promise<BudgetLedgerEntry[]>;
+    artifacts(runId: string): Promise<Artifact[]>;
+}
+interface FileSystemTraceStoreOptions { // Constructor options for `FileSystemTraceStore`.
+    dir: string; // required; NOTE(review): presumably the directory that holds the NDJSON files — confirm
+    /** Roll over NDJSON files when they exceed this size in bytes. Default 32 MB. */
+    maxBytes?: number;
+}
+declare class FileSystemTraceStore implements TraceStore { // `TraceStore` implementation configured through `FileSystemTraceStoreOptions` (a directory plus an optional NDJSON roll-over size).
+    private dir;
+    private maxBytes;
+    /** Lazy in-memory index for queries — populated on first read. */
+    private index?;
+    private loaded; // NOTE(review): presumably records whether `index` has been populated — confirm
+    constructor(options: FileSystemTraceStoreOptions);
+    private ensureDir; // the private helpers are declared without signatures in this .d.ts
+    private append;
+    private insertInto;
+    private load;
+    appendRun(run: Run): Promise<void>;
+    updateRun(runId: string, patch: Partial<Run>): Promise<void>;
+    appendSpan(span: Span): Promise<void>;
+    updateSpan(spanId: string, patch: Partial<Span>): Promise<void>;
+    appendEvent(event: TraceEvent): Promise<void>;
+    appendArtifact(artifact: Artifact): Promise<void>;
+    appendBudgetEntry(entry: BudgetLedgerEntry): Promise<void>;
+    getRun(runId: string): Promise<Run | undefined>;
+    listRuns(filter?: RunFilter): Promise<Run[]>;
+    spans(filter?: SpanFilter): Promise<Span[]>;
+    events(filter?: EventFilter): Promise<TraceEvent[]>;
+    budget(runId: string): Promise<BudgetLedgerEntry[]>;
+    artifacts(runId: string): Promise<Artifact[]>;
+}
+export { type Artifact as A, type BudgetLedgerEntry as B, type EventKind as E, type FailureClass as F, type GenericSpan as G, InMemoryTraceStore as I, type JudgeSpan as J, type LlmSpan as L, type Message as M, type RunOutcome as R, type Span as S, type TraceStore as T, type Run as a, type SpanKind as b, type ToolSpan as c, type RetrievalSpan as d, type SandboxSpan as e, type TraceEvent as f, type BudgetSpec as g, type RunFilter as h, type EventFilter as i, FAILURE_CLASSES as j, FileSystemTraceStore as k, type FileSystemTraceStoreOptions as l, type RunLayer as m, type RunStatus as n, type SpanBase as o, type SpanFilter as p, type SpanStatus as q, TRACE_SCHEMA_VERSION as r, isJudgeSpan as s, isLlmSpan as t, isRetrievalSpan as u, isSandboxSpan as v, isToolSpan as w };