@tangle-network/agent-eval 0.59.1 → 0.60.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/http.d.ts +1 -1
- package/dist/adapters/http.js +1 -1
- package/dist/adapters/langchain.d.ts +1 -1
- package/dist/adapters/langchain.js +1 -1
- package/dist/adapters/otel.d.ts +2 -2
- package/dist/adapters/otel.js +1 -1
- package/dist/benchmarks/index.js +2 -2
- package/dist/builder-eval/index.js +1 -1
- package/dist/campaign/index.d.ts +7 -3
- package/dist/campaign/index.js +21 -16
- package/dist/campaign/index.js.map +1 -1
- package/dist/{chunk-MHQPVHXU.js → chunk-6QDKWHLS.js} +2 -2
- package/dist/{chunk-N4SBKEPJ.js → chunk-GBHRUAOF.js} +106 -1
- package/dist/chunk-GBHRUAOF.js.map +1 -0
- package/dist/{chunk-JB4UWIM6.js → chunk-LBSXXH56.js} +265 -14
- package/dist/chunk-LBSXXH56.js.map +1 -0
- package/dist/{chunk-74Y2EMNH.js → chunk-NOPYCRNG.js} +6 -5
- package/dist/{chunk-74Y2EMNH.js.map → chunk-NOPYCRNG.js.map} +1 -1
- package/dist/chunk-PZ5AY32C.js +10 -0
- package/dist/chunk-SHTXZ4O2.js +113 -0
- package/dist/chunk-SHTXZ4O2.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/contract/index.d.ts +42 -10
- package/dist/contract/index.js +55 -15
- package/dist/contract/index.js.map +1 -1
- package/dist/control.js +1 -1
- package/dist/governance/index.js +1 -1
- package/dist/hosted/index.d.ts +2 -2
- package/dist/hosted/index.js +1 -1
- package/dist/{index-D2nT6_KT.d.ts → index-BIkvdkSU.d.ts} +1 -1
- package/dist/index.js +8 -8
- package/dist/knowledge/index.js +1 -1
- package/dist/matrix/index.js +1 -1
- package/dist/meta-eval/index.js +1 -1
- package/dist/multishot/index.js +1 -1
- package/dist/openapi.json +1 -1
- package/dist/pipelines/index.js +1 -1
- package/dist/prm/index.js +1 -1
- package/dist/{run-improvement-loop-BhfdjrMY.d.ts → provenance-BM8vmMBa.d.ts} +205 -3
- package/dist/reporting.js +1 -1
- package/dist/rl.d.ts +1 -1
- package/dist/rl.js +1 -1
- package/dist/{run-campaign-ZURVWMMI.js → run-campaign-5XENUKRF.js} +3 -3
- package/dist/telemetry/file.js +1 -1
- package/dist/telemetry/index.js +1 -1
- package/dist/traces.js +1 -1
- package/dist/{types-BgrxOJSf.d.ts → types-VCIXx_yo.d.ts} +32 -4
- package/dist/wire/index.js +1 -1
- package/package.json +25 -12
- package/dist/chunk-JB4UWIM6.js.map +0 -1
- package/dist/chunk-N4SBKEPJ.js.map +0 -1
- package/dist/chunk-NSBPE2FW.js +0 -17
- package/dist/chunk-ZWEQJIM6.js +0 -220
- package/dist/chunk-ZWEQJIM6.js.map +0 -1
- /package/dist/{chunk-MHQPVHXU.js.map → chunk-6QDKWHLS.js.map} +0 -0
- /package/dist/{chunk-NSBPE2FW.js.map → chunk-PZ5AY32C.js.map} +0 -0
- /package/dist/{run-campaign-ZURVWMMI.js.map → run-campaign-5XENUKRF.js.map} +0 -0
package/dist/control.js
CHANGED
package/dist/governance/index.js
CHANGED
package/dist/hosted/index.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient } from '../index-D2nT6_KT.js';
|
|
2
|
-
import '../types-BgrxOJSf.js';
|
|
1
|
+
export { E as EvalRunCellScore, d as EvalRunEvent, e as EvalRunGenerationSnapshot, f as EvalRunStatus, g as HOSTED_WIRE_VERSION, H as HostedClient, h as HostedIngestHeaders, a as HostedTenant, i as HostedWireVersion, j as IngestEvalRunsRequest, k as IngestResponse, l as IngestTracesRequest, T as TraceSpanEvent, m as createHostedClient } from '../index-BIkvdkSU.js';
|
|
2
|
+
import '../types-VCIXx_yo.js';
|
|
3
3
|
import '../summary-report-DLxh4yWk.js';
|
|
4
4
|
import '../run-record-etiCMsUq.js';
|
|
5
5
|
import '../errors-mje_cKOs.js';
|
package/dist/hosted/index.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { M as MutableSurface, n as GateDecision } from './types-BgrxOJSf.js';
|
|
1
|
+
import { M as MutableSurface, n as GateDecision } from './types-VCIXx_yo.js';
|
|
2
2
|
import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-DLxh4yWk.js';
|
|
3
3
|
import { a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
|
|
4
4
|
|
package/dist/index.js
CHANGED
|
@@ -1,18 +1,17 @@
|
|
|
1
1
|
import {
|
|
2
|
-
BackendIntegrityError,
|
|
3
2
|
HoldoutAuditor,
|
|
4
|
-
assertRealBackend,
|
|
5
3
|
canaryLeakView,
|
|
6
4
|
checkBehavioralCanary,
|
|
7
5
|
checkCanaries,
|
|
8
|
-
runBehavioralCanaries
|
|
9
|
-
|
|
10
|
-
} from "./chunk-ZWEQJIM6.js";
|
|
6
|
+
runBehavioralCanaries
|
|
7
|
+
} from "./chunk-SHTXZ4O2.js";
|
|
11
8
|
import {
|
|
9
|
+
BackendIntegrityError,
|
|
12
10
|
DEFAULT_MUTATION_PRIMITIVES,
|
|
13
11
|
DEFAULT_RED_TEAM_CORPUS,
|
|
14
12
|
Dataset,
|
|
15
13
|
HoldoutLockedError,
|
|
14
|
+
assertRealBackend,
|
|
16
15
|
buildReflectionPrompt,
|
|
17
16
|
hashScenarios,
|
|
18
17
|
parseReflectionResponse,
|
|
@@ -20,13 +19,14 @@ import {
|
|
|
20
19
|
redTeamReport,
|
|
21
20
|
runCanaries,
|
|
22
21
|
scoreRedTeamOutput,
|
|
22
|
+
summarizeBackendIntegrity,
|
|
23
23
|
toolNamesForRun
|
|
24
|
-
} from "./chunk-N4SBKEPJ.js";
|
|
24
|
+
} from "./chunk-GBHRUAOF.js";
|
|
25
25
|
import {
|
|
26
26
|
BENCHMARK_SPLIT_SEED,
|
|
27
27
|
benchmarks_exports,
|
|
28
28
|
deterministicSplit
|
|
29
|
-
} from "./chunk-MHQPVHXU.js";
|
|
29
|
+
} from "./chunk-6QDKWHLS.js";
|
|
30
30
|
import {
|
|
31
31
|
DEFAULT_RULES,
|
|
32
32
|
classifyFailure,
|
|
@@ -260,7 +260,7 @@ import {
|
|
|
260
260
|
ValidationError,
|
|
261
261
|
VerificationError
|
|
262
262
|
} from "./chunk-QYJT52YW.js";
|
|
263
|
-
import "./chunk-NSBPE2FW.js";
|
|
263
|
+
import "./chunk-PZ5AY32C.js";
|
|
264
264
|
|
|
265
265
|
// src/run-score.ts
|
|
266
266
|
var DEFAULT_RUN_SCORE_WEIGHTS = {
|
package/dist/knowledge/index.js
CHANGED
package/dist/matrix/index.js
CHANGED
package/dist/meta-eval/index.js
CHANGED
|
@@ -10,7 +10,7 @@ import {
|
|
|
10
10
|
llmSpans
|
|
11
11
|
} from "../chunk-47X6LRCE.js";
|
|
12
12
|
import "../chunk-5BKGXME7.js";
|
|
13
|
-
import "../chunk-NSBPE2FW.js";
|
|
13
|
+
import "../chunk-PZ5AY32C.js";
|
|
14
14
|
|
|
15
15
|
// src/meta-eval/calibration.ts
|
|
16
16
|
async function calibrationCurve(traceStore, outcomeStore, evalMetric, outcomeMetric, options = {}) {
|
package/dist/multishot/index.js
CHANGED
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.59.1",
|
|
5
|
+
"version": "0.60.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
package/dist/pipelines/index.js
CHANGED
|
@@ -19,7 +19,7 @@ import {
|
|
|
19
19
|
} from "../chunk-47X6LRCE.js";
|
|
20
20
|
import "../chunk-5BKGXME7.js";
|
|
21
21
|
import "../chunk-QYJT52YW.js";
|
|
22
|
-
import "../chunk-NSBPE2FW.js";
|
|
22
|
+
import "../chunk-PZ5AY32C.js";
|
|
23
23
|
|
|
24
24
|
// src/pipelines/budget-breach.ts
|
|
25
25
|
async function budgetBreachView(store, options = {}) {
|
package/dist/prm/index.js
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
import { S as Scenario, k as CampaignResult, o as GateResult, u as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, l as CampaignTraceWriter, M as MutableSurface, q as GenerationRecord } from './types-BgrxOJSf.js';
|
|
1
|
+
import { S as Scenario, k as CampaignResult, o as GateResult, u as Mutator, I as ImprovementDriver, G as Gate, D as DispatchFn, a as JudgeConfig, L as LabeledScenarioStore, l as CampaignTraceWriter, M as MutableSurface, q as GenerationRecord, n as GateDecision } from './types-VCIXx_yo.js';
|
|
2
2
|
import { L as LlmClientOptions } from './llm-client-BXVRUZyX.js';
|
|
3
3
|
import { R as RedTeamCase } from './red-team-CrC5MZYd.js';
|
|
4
4
|
import { R as RunRecord } from './run-record-etiCMsUq.js';
|
|
5
|
+
import { H as HostedClient, T as TraceSpanEvent } from './index-BIkvdkSU.js';
|
|
5
6
|
|
|
6
7
|
/**
|
|
7
8
|
* @experimental
|
|
@@ -242,7 +243,11 @@ interface CampaignStorage {
|
|
|
242
243
|
}
|
|
243
244
|
/** Node-filesystem storage — the default. Lazily requires `node:fs` so the
|
|
244
245
|
* module imports cleanly in non-Node runtimes (where the caller passes
|
|
245
|
-
* `inMemoryCampaignStorage` instead and never constructs this). */
|
|
246
|
+
* `inMemoryCampaignStorage` instead and never constructs this).
|
|
247
|
+
*
|
|
248
|
+
* `createRequire(import.meta.url)` is the ESM-native lazy require — a bare
|
|
249
|
+
* `require` is a ReferenceError under `"type": "module"`, which is exactly
|
|
250
|
+
* the shape this package publishes. */
|
|
246
251
|
declare function fsCampaignStorage(): CampaignStorage;
|
|
247
252
|
/** In-memory storage for filesystem-less runtimes. Artifacts + trace spans
|
|
248
253
|
* live in a `Map` for the duration of the run; the `CampaignResult` is
|
|
@@ -385,6 +390,14 @@ interface RunOptimizationResult<TArtifact, TScenario extends Scenario> {
|
|
|
385
390
|
}>;
|
|
386
391
|
winnerSurface: MutableSurface;
|
|
387
392
|
winnerSurfaceHash: string;
|
|
393
|
+
/** Driver label for the promoted surface. Present when the winning
|
|
394
|
+
* candidate came from a `ProposedCandidate` (a reflective driver);
|
|
395
|
+
* absent when the winner is the baseline or a bare-surface mutator. */
|
|
396
|
+
winnerLabel?: string;
|
|
397
|
+
/** Driver rationale for the promoted surface — the "because Z" that
|
|
398
|
+
* motivated the winning change. Survives to `SelfImproveResult` and the
|
|
399
|
+
* emitted provenance record. Absent when the winner is the baseline. */
|
|
400
|
+
winnerRationale?: string;
|
|
388
401
|
baselineCampaign: CampaignResult<TArtifact, TScenario>;
|
|
389
402
|
}
|
|
390
403
|
declare function runOptimization<TScenario extends Scenario, TArtifact>(opts: RunOptimizationOptions<TScenario, TArtifact>): Promise<RunOptimizationResult<TArtifact, TScenario>>;
|
|
@@ -443,8 +456,197 @@ interface RunImprovementLoopResult<TArtifact, TScenario extends Scenario> extend
|
|
|
443
456
|
baselineOnHoldout: CampaignResult<TArtifact, TScenario>;
|
|
444
457
|
winnerOnHoldout: CampaignResult<TArtifact, TScenario>;
|
|
445
458
|
gateResult: Awaited<ReturnType<Gate<TArtifact, TScenario>['decide']>>;
|
|
459
|
+
/** Unified baseline→winner surface diff. Computed UNCONDITIONALLY (not only
|
|
460
|
+
* when `autoOnPromote === 'pr'`) so the diff that the gate decided on is
|
|
461
|
+
* always present on the result + in the emitted provenance record. Empty
|
|
462
|
+
* string when winner == baseline (no change to diff). */
|
|
463
|
+
promotedDiff: string;
|
|
446
464
|
prResult?: ReturnType<typeof openAutoPr>;
|
|
447
465
|
}
|
|
448
466
|
declare function runImprovementLoop<TScenario extends Scenario, TArtifact>(opts: RunImprovementLoopOptions<TScenario, TArtifact>): Promise<RunImprovementLoopResult<TArtifact, TScenario>>;
|
|
467
|
+
declare function defaultRenderDiff(winnerSurface: MutableSurface, baselineSurface: MutableSurface): string;
|
|
468
|
+
|
|
469
|
+
/**
|
|
470
|
+
* @experimental
|
|
471
|
+
*
|
|
472
|
+
* Loop provenance — the durable, queryable record of WHAT a self-improvement
|
|
473
|
+
* loop did and WHY, plus the OTel spans that let an OTLP collector pivot from
|
|
474
|
+
* an eval-run to the underlying candidate→cell→gate→promote chain.
|
|
475
|
+
*
|
|
476
|
+
* Two artifacts, one source of truth:
|
|
477
|
+
*
|
|
478
|
+
* 1. `LoopProvenanceRecord` — a structured JSON record capturing every
|
|
479
|
+
* candidate (surfaceHash + label + rationale), its measured composite,
|
|
480
|
+
* the gate decision + reasons + delta, the held-out lift, the explicit
|
|
481
|
+
* baseline→candidate diff, and BACKEND PROVENANCE (the
|
|
482
|
+
* `assertRealBackend` verdict + worker call count + model). This is the
|
|
483
|
+
* ingestable audit artifact: the +lift recomputes from it, the "because
|
|
484
|
+
* Z" rationale survives in it, and a stub backend is detectable from it.
|
|
485
|
+
*
|
|
486
|
+
* 2. `loopProvenanceSpans()` — the same chain emitted as OTLP-ingestable
|
|
487
|
+
* `TraceSpanEvent`s, pivoted on the substrate's standard
|
|
488
|
+
* `tangle.runId` / `tangle.scenarioId` / `tangle.cellId` /
|
|
489
|
+
* `tangle.generation` attributes (the same pivots `/adapters/otel`
|
|
490
|
+
* reads). The hosted `/v1/ingest/traces` endpoint receives the FULL loop,
|
|
491
|
+
* not just the `cost.*` spans `runCampaign` already emits per cell.
|
|
492
|
+
*
|
|
493
|
+
* The record is built from the substrate's own loop result + the per-call
|
|
494
|
+
* `RunRecord`s the worker emitted — no new measurement, no recomputation that
|
|
495
|
+
* could drift from what the gate actually saw.
|
|
496
|
+
*/
|
|
497
|
+
|
|
498
|
+
/** Stable sha256 (full hex) of a surface's effective text. Code surfaces hash
|
|
499
|
+
* their worktree+base identity since the content lives in git. Distinct from
|
|
500
|
+
* `surfaceHash` (16-char content fingerprint used as a loop identity key);
|
|
501
|
+
* this is the byte-identical-verifiable content hash the provenance record +
|
|
502
|
+
* `RunRecord.promptHash` carry. */
|
|
503
|
+
declare function surfaceContentHash(surface: MutableSurface): string;
|
|
504
|
+
interface LoopProvenanceCandidate {
|
|
505
|
+
/** Generation index this candidate was proposed in. */
|
|
506
|
+
generation: number;
|
|
507
|
+
/** 16-char loop-identity fingerprint (matches `GenerationCandidate.surfaceHash`). */
|
|
508
|
+
surfaceHash: string;
|
|
509
|
+
/** Full sha256 content hash — byte-identical-verifiable. */
|
|
510
|
+
contentHash: string;
|
|
511
|
+
/** Driver label, when the driver returned a `ProposedCandidate`. */
|
|
512
|
+
label?: string;
|
|
513
|
+
/** Driver rationale — the "because Z". When the driver returned a bare
|
|
514
|
+
* surface (blind mutator) this is absent. */
|
|
515
|
+
rationale?: string;
|
|
516
|
+
/** Mean composite this candidate scored on the search split. */
|
|
517
|
+
composite: number;
|
|
518
|
+
/** Whether this candidate was promoted out of its generation. */
|
|
519
|
+
promoted: boolean;
|
|
520
|
+
}
|
|
521
|
+
interface LoopProvenanceBackend {
|
|
522
|
+
/** `assertRealBackend`-grade verdict over the worker call records. */
|
|
523
|
+
verdict: 'real' | 'mixed' | 'stub';
|
|
524
|
+
/** Number of worker LLM calls captured (the audit's "worker call count"). */
|
|
525
|
+
workerCallCount: number;
|
|
526
|
+
/** Distinct model ids observed across worker calls. */
|
|
527
|
+
models: string[];
|
|
528
|
+
totalInputTokens: number;
|
|
529
|
+
totalOutputTokens: number;
|
|
530
|
+
totalCostUsd: number;
|
|
531
|
+
}
|
|
532
|
+
/**
|
|
533
|
+
* The durable provenance record. Aligns to the hosted `EvalRunEvent` path but
|
|
534
|
+
* ADDS the rationale + the explicit baseline→candidate diff (both omitted from
|
|
535
|
+
* the bare hosted event) + backend provenance.
|
|
536
|
+
*/
|
|
537
|
+
interface LoopProvenanceRecord {
|
|
538
|
+
schema: 'tangle.loop-provenance.v1';
|
|
539
|
+
runId: string;
|
|
540
|
+
runDir: string;
|
|
541
|
+
timestamp: string;
|
|
542
|
+
/** Baseline + winner surface content hashes — distinguishable, byte-verifiable. */
|
|
543
|
+
baselineContentHash: string;
|
|
544
|
+
winnerContentHash: string;
|
|
545
|
+
/** Driver label/rationale for the promoted change. Absent ⇒ winner == baseline. */
|
|
546
|
+
winnerLabel?: string;
|
|
547
|
+
winnerRationale?: string;
|
|
548
|
+
/** The explicit baseline→winner unified diff the gate decided on. */
|
|
549
|
+
diff: string;
|
|
550
|
+
/** Every candidate across every generation, each carrying its rationale. */
|
|
551
|
+
candidates: LoopProvenanceCandidate[];
|
|
552
|
+
/** The gate verdict — decision + reasons + contributing gates + delta. */
|
|
553
|
+
gate: {
|
|
554
|
+
decision: GateDecision;
|
|
555
|
+
reasons: string[];
|
|
556
|
+
delta?: number;
|
|
557
|
+
contributingGates: Array<{
|
|
558
|
+
name: string;
|
|
559
|
+
passed: boolean;
|
|
560
|
+
}>;
|
|
561
|
+
};
|
|
562
|
+
/** baseline-on-holdout composite mean. */
|
|
563
|
+
baselineHoldoutComposite: number;
|
|
564
|
+
/** winner-on-holdout composite mean. */
|
|
565
|
+
winnerHoldoutComposite: number;
|
|
566
|
+
/** winnerHoldout - baselineHoldout — RECOMPUTABLE from this record. */
|
|
567
|
+
heldOutLift: number;
|
|
568
|
+
/** Backend provenance: stub-vs-real verdict + worker call count + models. */
|
|
569
|
+
backend: LoopProvenanceBackend;
|
|
570
|
+
totalCostUsd: number;
|
|
571
|
+
totalDurationMs: number;
|
|
572
|
+
}
|
|
573
|
+
interface BuildLoopProvenanceArgs<TArtifact, TScenario extends Scenario> {
|
|
574
|
+
runId: string;
|
|
575
|
+
runDir: string;
|
|
576
|
+
timestamp: string;
|
|
577
|
+
baselineSurface: MutableSurface;
|
|
578
|
+
winnerSurface: MutableSurface;
|
|
579
|
+
winnerLabel?: string;
|
|
580
|
+
winnerRationale?: string;
|
|
581
|
+
diff: string;
|
|
582
|
+
/** Per-generation candidate records straight off the loop result. */
|
|
583
|
+
generations: Array<{
|
|
584
|
+
generationIndex: number;
|
|
585
|
+
candidates: Array<{
|
|
586
|
+
surfaceHash: string;
|
|
587
|
+
composite: number;
|
|
588
|
+
label?: string;
|
|
589
|
+
rationale?: string;
|
|
590
|
+
}>;
|
|
591
|
+
promoted: string[];
|
|
592
|
+
/** Surfaces measured this generation, keyed positionally to candidates so
|
|
593
|
+
* the content hash can be computed from the real surface text. */
|
|
594
|
+
surfaces: Array<{
|
|
595
|
+
surfaceHash: string;
|
|
596
|
+
surface: MutableSurface;
|
|
597
|
+
}>;
|
|
598
|
+
}>;
|
|
599
|
+
gate: GateResult;
|
|
600
|
+
baselineOnHoldout: CampaignResult<TArtifact, TScenario>;
|
|
601
|
+
winnerOnHoldout: CampaignResult<TArtifact, TScenario>;
|
|
602
|
+
/** Worker call records — the source for backend provenance. */
|
|
603
|
+
workerRecords: ReadonlyArray<RunRecord>;
|
|
604
|
+
totalCostUsd: number;
|
|
605
|
+
totalDurationMs: number;
|
|
606
|
+
}
|
|
607
|
+
/** Build the durable provenance record from a completed loop result. */
|
|
608
|
+
declare function buildLoopProvenanceRecord<TArtifact, TScenario extends Scenario>(args: BuildLoopProvenanceArgs<TArtifact, TScenario>): LoopProvenanceRecord;
|
|
609
|
+
/**
|
|
610
|
+
* Build the loop's OTLP-ingestable spans from a provenance record. One root
|
|
611
|
+
* span per loop (`tangle.runId`), one span per generation, one span per
|
|
612
|
+
* candidate (carrying its surfaceHash + label), and one span for the gate
|
|
613
|
+
* decision (carrying reasons + delta + lift). Candidate + gate spans pivot on
|
|
614
|
+
* the same `tangle.runId` / `tangle.generation` attributes `/adapters/otel`
|
|
615
|
+
* reads, so the hosted collector reconstructs the full tree.
|
|
616
|
+
*
|
|
617
|
+
* Times are synthesized monotonically off a single base so the span tree is
|
|
618
|
+
* orderable; the substrate does not retain per-candidate wall-clock starts.
|
|
619
|
+
*/
|
|
620
|
+
declare function loopProvenanceSpans(record: LoopProvenanceRecord, opts?: {
|
|
621
|
+
baseTimeMs?: number;
|
|
622
|
+
}): TraceSpanEvent[];
|
|
623
|
+
/** Canonical durable paths under the run dir. */
|
|
624
|
+
declare function provenanceRecordPath(runDir: string): string;
|
|
625
|
+
declare function provenanceSpansPath(runDir: string): string;
|
|
626
|
+
interface EmitLoopProvenanceResult {
|
|
627
|
+
record: LoopProvenanceRecord;
|
|
628
|
+
spans: TraceSpanEvent[];
|
|
629
|
+
/** Absolute paths the record + spans were written to, when storage persists. */
|
|
630
|
+
recordPath: string;
|
|
631
|
+
spansPath: string;
|
|
632
|
+
}
|
|
633
|
+
interface EmitLoopProvenanceArgs<TArtifact, TScenario extends Scenario> extends BuildLoopProvenanceArgs<TArtifact, TScenario> {
|
|
634
|
+
/** Storage the record + spans are written through. */
|
|
635
|
+
storage: CampaignStorage;
|
|
636
|
+
/** When set, the spans are also shipped to the hosted `/v1/ingest/traces`
|
|
637
|
+
* endpoint so the collector receives the full loop, not just `cost.*`. */
|
|
638
|
+
hostedClient?: HostedClient;
|
|
639
|
+
}
|
|
640
|
+
/**
|
|
641
|
+
* Build the provenance record + OTel spans and persist them durably under the
|
|
642
|
+
* run dir (and ship spans to a hosted collector when one is wired). Returns
|
|
643
|
+
* both artifacts so the caller can assert on / re-derive from them.
|
|
644
|
+
*
|
|
645
|
+
* Fail-loud: the durable write throws on storage failure (a swallowed write is
|
|
646
|
+
* exactly the "emitted but lost" failure this closes). The hosted span ship is
|
|
647
|
+
* the one best-effort leg — its failure is logged, not thrown, so an offline
|
|
648
|
+
* collector never fails the loop (the durable artifact is the source of truth).
|
|
649
|
+
*/
|
|
650
|
+
declare function emitLoopProvenance<TArtifact, TScenario extends Scenario>(args: EmitLoopProvenanceArgs<TArtifact, TScenario>): Promise<EmitLoopProvenanceResult>;
|
|
449
651
|
|
|
450
|
-
export { type CampaignStorage as C, type DefaultProductionGateOptions as D, type
|
|
652
|
+
export { provenanceSpansPath as A, type BuildLoopProvenanceArgs as B, type CampaignStorage as C, type DefaultProductionGateOptions as D, type EmitLoopProvenanceArgs as E, runCampaign as F, type GepaDriverConstraints as G, type HeldOutGateOptions as H, runEval as I, runImprovementLoop as J, runOptimization as K, type LoopProvenanceBackend as L, surfaceContentHash as M, surfaceHash as N, type OpenAutoPrOptions as O, type RunCampaignOptions as R, type EmitLoopProvenanceResult as a, type EvolutionaryDriverOptions as b, type GepaDriverOptions as c, type LoopProvenanceCandidate as d, type LoopProvenanceRecord as e, type OpenAutoPrResult as f, type RunEvalOptions as g, type RunImprovementLoopOptions as h, type RunImprovementLoopResult as i, type RunOptimizationOptions as j, type RunOptimizationResult as k, buildLoopProvenanceRecord as l, composeGate as m, countSentenceEdits as n, defaultProductionGate as o, defaultRenderDiff as p, emitLoopProvenance as q, evolutionaryDriver as r, extractH2Sections as s, fsCampaignStorage as t, gepaDriver as u, heldOutGate as v, inMemoryCampaignStorage as w, loopProvenanceSpans as x, openAutoPr as y, provenanceRecordPath as z };
|
package/dist/reporting.js
CHANGED
package/dist/rl.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { R as RunRecord, a as RunSplitTag } from './run-record-etiCMsUq.js';
|
|
2
|
-
import { k as CampaignResult } from './types-BgrxOJSf.js';
|
|
2
|
+
import { k as CampaignResult } from './types-VCIXx_yo.js';
|
|
3
3
|
import { V as VerificationReport, R as Researcher, F as FailureMode, S as SteeringChange, E as ExperimentPlan, a as ExperimentResult, b as EvalCampaignResult, c as EvalCampaignOptions } from './researcher-JP8EvnLv.js';
|
|
4
4
|
export { r as runEvalCampaign } from './researcher-JP8EvnLv.js';
|
|
5
5
|
import { S as Span } from './schema-m0gsnbt3.js';
|
package/dist/rl.js
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
import {
|
|
2
2
|
runCampaign
|
|
3
|
-
} from "./chunk-74Y2EMNH.js";
|
|
3
|
+
} from "./chunk-NOPYCRNG.js";
|
|
4
4
|
import "./chunk-S3SDD56V.js";
|
|
5
5
|
import "./chunk-QYJT52YW.js";
|
|
6
|
-
import "./chunk-NSBPE2FW.js";
|
|
6
|
+
import "./chunk-PZ5AY32C.js";
|
|
7
7
|
export {
|
|
8
8
|
runCampaign
|
|
9
9
|
};
|
|
10
|
-
//# sourceMappingURL=run-campaign-ZURVWMMI.js.map
|
|
10
|
+
//# sourceMappingURL=run-campaign-5XENUKRF.js.map
|
package/dist/telemetry/file.js
CHANGED
package/dist/telemetry/index.js
CHANGED
package/dist/traces.js
CHANGED
|
@@ -118,6 +118,24 @@ interface CodeSurface {
|
|
|
118
118
|
* Tier 3 (knowledge) is owned by agent-knowledge and rides its own adapter,
|
|
119
119
|
* not this type. */
|
|
120
120
|
type MutableSurface = string | CodeSurface;
|
|
121
|
+
/** @experimental A driver proposal carrying the surface AND the WHY behind
|
|
122
|
+
* it. Reflective drivers (`gepaDriver`) parse a `{label, rationale, payload}`
|
|
123
|
+
* from the model; without this wrapper the loop keeps only `payload` and the
|
|
124
|
+
* rationale that motivated the change is lost — the candidate becomes
|
|
125
|
+
* unattributable. `propose()` may return either bare `MutableSurface`s (cheap
|
|
126
|
+
* blind mutators) or these (reflective drivers); the loop normalizes both. */
|
|
127
|
+
interface ProposedCandidate {
|
|
128
|
+
surface: MutableSurface;
|
|
129
|
+
/** Short human label for the change (≤ 40 chars typical). */
|
|
130
|
+
label: string;
|
|
131
|
+
/** Why this change was proposed — which failure it targets, which
|
|
132
|
+
* primitive it used. Survives to `GenerationCandidate.rationale` and the
|
|
133
|
+
* emitted provenance record. */
|
|
134
|
+
rationale: string;
|
|
135
|
+
}
|
|
136
|
+
/** @experimental Type guard: a proposal carrying its rationale vs a bare
|
|
137
|
+
* surface. The loop branches on this to populate `GenerationCandidate`. */
|
|
138
|
+
declare function isProposedCandidate(value: MutableSurface | ProposedCandidate): value is ProposedCandidate;
|
|
121
139
|
/** @experimental Stateless surface mutation — given findings + current
|
|
122
140
|
* surface, return N candidate surfaces. Pure transform, no generation
|
|
123
141
|
* awareness. Reflective-mutation, `runMultiShotOptimization`, `AxGEPA`
|
|
@@ -129,7 +147,7 @@ interface Mutator<TFindings = unknown> {
|
|
|
129
147
|
currentSurface: MutableSurface;
|
|
130
148
|
populationSize: number;
|
|
131
149
|
signal: AbortSignal;
|
|
132
|
-
}): Promise<MutableSurface[]>;
|
|
150
|
+
}): Promise<Array<MutableSurface | ProposedCandidate>>;
|
|
133
151
|
}
|
|
134
152
|
/** @experimental Everything a driver's `propose()` may read to plan the next
|
|
135
153
|
* batch of candidates. The first six fields are always present; the rest are
|
|
@@ -169,8 +187,11 @@ interface ProposeContext<TFindings = unknown> {
|
|
|
169
187
|
* are driver-agnostic. */
|
|
170
188
|
interface ImprovementDriver<TFindings = unknown> {
|
|
171
189
|
kind: string;
|
|
172
|
-
/** Plan: propose N candidate surfaces for the next generation. */
|
|
173
|
-
|
|
190
|
+
/** Plan: propose N candidate surfaces for the next generation. A driver
|
|
191
|
+
* may return bare `MutableSurface`s or `ProposedCandidate`s that carry the
|
|
192
|
+
* `{label, rationale}` motivating the change — the loop threads the
|
|
193
|
+
* rationale into `GenerationCandidate` and the emitted provenance. */
|
|
194
|
+
propose(ctx: ProposeContext<TFindings>): Promise<Array<MutableSurface | ProposedCandidate>>;
|
|
174
195
|
/** Decide: stop early when the driver judges the search converged or
|
|
175
196
|
* exhausted. Default (omitted) runs all `maxGenerations`. */
|
|
176
197
|
decide?(args: {
|
|
@@ -368,6 +389,13 @@ interface GenerationCandidate {
|
|
|
368
389
|
scenarioId: string;
|
|
369
390
|
composite: number;
|
|
370
391
|
}>;
|
|
392
|
+
/** Driver-supplied short label for the change. Present when the driver
|
|
393
|
+
* returned a `ProposedCandidate`; absent for bare-surface mutators. */
|
|
394
|
+
label?: string;
|
|
395
|
+
/** Driver-supplied rationale — WHY this candidate was proposed. The
|
|
396
|
+
* "because rationale Z" the audit requires to survive to the result.
|
|
397
|
+
* Present when the driver returned a `ProposedCandidate`. */
|
|
398
|
+
rationale?: string;
|
|
371
399
|
}
|
|
372
400
|
interface CampaignAggregates {
|
|
373
401
|
byJudge: Record<string, JudgeAggregate>;
|
|
@@ -402,4 +430,4 @@ interface CampaignResult<TArtifact = unknown, TScenario extends Scenario = Scena
|
|
|
402
430
|
scenarios: Array<Pick<TScenario, 'id' | 'kind'>>;
|
|
403
431
|
}
|
|
404
432
|
|
|
405
|
-
export { type CodeSurface as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ProposeContext as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type LabeledScenarioWrite as c, type LabeledScenarioSampleArgs as d, type LabeledScenarioRecord as e, type LabelTrust as f, type CampaignAggregates as g, type CampaignArtifactWriter as h, type CampaignCellResult as i, type CampaignCostMeter as j, type CampaignResult as k, type CampaignTraceWriter as l, type GateContext as m, type GateDecision as n, type GateResult as o, type GenerationCandidate as p, type GenerationRecord as q, type JudgeAggregate as r, type JudgeDimension as s, type LabeledScenarioSource as t, type Mutator as u, type
|
|
433
|
+
export { type CodeSurface as C, type DispatchFn as D, type Gate as G, type ImprovementDriver as I, type JudgeScore as J, type LabeledScenarioStore as L, type MutableSurface as M, type OptimizerConfig as O, type ProposeContext as P, type RedactionStatus as R, type Scenario as S, type TraceSpan as T, type JudgeConfig as a, type DispatchContext as b, type LabeledScenarioWrite as c, type LabeledScenarioSampleArgs as d, type LabeledScenarioRecord as e, type LabelTrust as f, type CampaignAggregates as g, type CampaignArtifactWriter as h, type CampaignCellResult as i, type CampaignCostMeter as j, type CampaignResult as k, type CampaignTraceWriter as l, type GateContext as m, type GateDecision as n, type GateResult as o, type GenerationCandidate as p, type GenerationRecord as q, type JudgeAggregate as r, type JudgeDimension as s, type LabeledScenarioSource as t, type Mutator as u, type ProposedCandidate as v, type ScenarioAggregate as w, type SessionScript as x, isProposedCandidate as y, labelTrustRank as z };
|
package/dist/wire/index.js
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@tangle-network/agent-eval",
|
|
3
|
-
"version": "0.59.1",
|
|
3
|
+
"version": "0.60.0",
|
|
4
4
|
"description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
|
|
5
5
|
"homepage": "https://github.com/tangle-network/agent-eval#readme",
|
|
6
6
|
"repository": {
|
|
@@ -144,6 +144,18 @@
|
|
|
144
144
|
"publishConfig": {
|
|
145
145
|
"access": "public"
|
|
146
146
|
},
|
|
147
|
+
"scripts": {
|
|
148
|
+
"build": "tsup && pnpm openapi",
|
|
149
|
+
"dev": "tsup --watch",
|
|
150
|
+
"prepare": "husky",
|
|
151
|
+
"prepublishOnly": "pnpm build",
|
|
152
|
+
"test": "vitest run",
|
|
153
|
+
"test:watch": "vitest",
|
|
154
|
+
"typecheck": "tsc --noEmit",
|
|
155
|
+
"lint": "biome check src",
|
|
156
|
+
"format": "biome format --write src",
|
|
157
|
+
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
158
|
+
},
|
|
147
159
|
"dependencies": {
|
|
148
160
|
"@asteasolutions/zod-to-openapi": "^8.5.0",
|
|
149
161
|
"@ax-llm/ax": "^19.0.25",
|
|
@@ -171,6 +183,16 @@
|
|
|
171
183
|
"typescript": "^5.7.0",
|
|
172
184
|
"vitest": "^3.0.0"
|
|
173
185
|
},
|
|
186
|
+
"pnpm": {
|
|
187
|
+
"minimumReleaseAge": 4320,
|
|
188
|
+
"minimumReleaseAgeExclude": [
|
|
189
|
+
"@tangle-network/sandbox"
|
|
190
|
+
],
|
|
191
|
+
"overrides": {
|
|
192
|
+
"postcss@<8.5.10": "^8.5.10",
|
|
193
|
+
"ws@>=8.0.0 <8.20.1": "^8.20.1"
|
|
194
|
+
}
|
|
195
|
+
},
|
|
174
196
|
"engines": {
|
|
175
197
|
"node": ">=20"
|
|
176
198
|
},
|
|
@@ -183,14 +205,5 @@
|
|
|
183
205
|
]
|
|
184
206
|
},
|
|
185
207
|
"license": "MIT",
|
|
186
|
-
"scripts": {
|
|
187
|
-
|
|
188
|
-
"dev": "tsup --watch",
|
|
189
|
-
"test": "vitest run",
|
|
190
|
-
"test:watch": "vitest",
|
|
191
|
-
"typecheck": "tsc --noEmit",
|
|
192
|
-
"lint": "biome check src",
|
|
193
|
-
"format": "biome format --write src",
|
|
194
|
-
"openapi": "node dist/cli.js openapi --out dist/openapi.json"
|
|
195
|
-
}
|
|
196
|
-
}
|
|
208
|
+
"packageManager": "pnpm@10.22.0"
|
|
209
|
+
}
|