@tangle-network/agent-eval 0.72.0 → 0.72.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. package/CHANGELOG.md +39 -0
  2. package/dist/adapters/http.d.ts +1 -1
  3. package/dist/adapters/langchain.d.ts +1 -1
  4. package/dist/adapters/otel.d.ts +3 -2
  5. package/dist/agent-profile-DYRboYWu.d.ts +364 -0
  6. package/dist/analyst/index.d.ts +221 -0
  7. package/dist/analyst/index.js +371 -0
  8. package/dist/analyst/index.js.map +1 -0
  9. package/dist/analyst-t7zZS3TV.d.ts +88 -0
  10. package/dist/campaign/index.d.ts +518 -9
  11. package/dist/campaign/index.js +672 -22
  12. package/dist/campaign/index.js.map +1 -1
  13. package/dist/chunk-7W4SM7FD.js +1075 -0
  14. package/dist/chunk-7W4SM7FD.js.map +1 -0
  15. package/dist/{chunk-AIWHLG7J.js → chunk-GJJNJVIR.js} +11 -11
  16. package/dist/chunk-JHA3ZGSO.js +1496 -0
  17. package/dist/chunk-JHA3ZGSO.js.map +1 -0
  18. package/dist/{chunk-4QJN7RDX.js → chunk-JYE3WOTE.js} +55 -7
  19. package/dist/{chunk-4QJN7RDX.js.map → chunk-JYE3WOTE.js.map} +1 -1
  20. package/dist/chunk-LB2UOI5F.js +412 -0
  21. package/dist/chunk-LB2UOI5F.js.map +1 -0
  22. package/dist/{chunk-ODGETRTM.js → chunk-VUINJM5M.js} +234 -1415
  23. package/dist/chunk-VUINJM5M.js.map +1 -0
  24. package/dist/chunk-WYIHD6EB.js +1044 -0
  25. package/dist/chunk-WYIHD6EB.js.map +1 -0
  26. package/dist/{chunk-UD6EF73X.js → chunk-XPILG2CA.js} +119 -2
  27. package/dist/chunk-XPILG2CA.js.map +1 -0
  28. package/dist/contract/index.d.ts +17 -13
  29. package/dist/contract/index.js +13 -7
  30. package/dist/contract/index.js.map +1 -1
  31. package/dist/{control-DxvZeV5X.d.ts → control-BgA6BYTm.d.ts} +1 -1
  32. package/dist/control.d.ts +2 -2
  33. package/dist/{feedback-trajectory-8hKC5EOb.d.ts → feedback-trajectory-B3rErRsh.d.ts} +1 -1
  34. package/dist/harness-optimizer-EnEnQPsr.d.ts +106 -0
  35. package/dist/hosted/index.d.ts +223 -2
  36. package/dist/index.d.ts +49 -1323
  37. package/dist/index.js +353 -2496
  38. package/dist/index.js.map +1 -1
  39. package/dist/{index-BGBrVS24.d.ts → insight-report-Df3lxYXM.d.ts} +1 -221
  40. package/dist/kind-factory-DW9XWPvM.d.ts +172 -0
  41. package/dist/multi-layer-verifier-DlWCXuxL.d.ts +141 -0
  42. package/dist/openapi.json +1 -1
  43. package/dist/pareto-E-pembql.d.ts +81 -0
  44. package/dist/{provenance-C69gLUXH.d.ts → provenance-B-TFszPW.d.ts} +131 -4
  45. package/dist/redact-B40YG2M_.d.ts +45 -0
  46. package/dist/registry-DuVYiTvw.d.ts +128 -0
  47. package/dist/{researcher-WJvIpX3L.d.ts → researcher-C_KJyIGg.d.ts} +1 -141
  48. package/dist/rl.d.ts +4 -3
  49. package/dist/rl.js +4 -4
  50. package/dist/run-critic-BAIjX99r.d.ts +56 -0
  51. package/dist/{run-improvement-loop-Bzamo6GB.d.ts → run-improvement-loop-BqYH2vCR.d.ts} +25 -1
  52. package/dist/semantic-concept-judge-CV9Wlx4t.d.ts +650 -0
  53. package/dist/{store-jzKpMl16.d.ts → store-GmBE2pZZ.d.ts} +1 -1
  54. package/dist/traces.d.ts +371 -308
  55. package/dist/traces.js +43 -18
  56. package/dist/{types-CnmZ2bkP.d.ts → types-Bba0vl1V.d.ts} +1 -1
  57. package/dist/{registry-BGKyX6bw.d.ts → types-CRD68aH7.d.ts} +3 -128
  58. package/dist/wire/index.d.ts +1 -1
  59. package/dist/workflow/index.d.ts +494 -0
  60. package/dist/workflow/index.js +2177 -0
  61. package/dist/workflow/index.js.map +1 -0
  62. package/docs/design/self-improvement-roadmap.md +106 -0
  63. package/package.json +36 -12
  64. package/dist/agent-profile-DzcPHR1Z.d.ts +0 -114
  65. package/dist/chunk-ODGETRTM.js.map +0 -1
  66. package/dist/chunk-SL55X4VN.js +0 -186
  67. package/dist/chunk-SL55X4VN.js.map +0 -1
  68. package/dist/chunk-UD6EF73X.js.map +0 -1
  69. package/dist/{chunk-AIWHLG7J.js.map → chunk-GJJNJVIR.js.map} +0 -0
@@ -1,4 +1,3 @@
1
- import { M as MutableSurface, j as GateDecision } from './types-CnmZ2bkP.js';
2
1
  import { G as GainDistributionBin, P as ParetoFigureSpec } from './summary-report-ByiOUrHj.js';
3
2
  import { a as ContinuousAgreement } from './judge-calibration-DilmB3Ml.js';
4
3
 
@@ -290,223 +289,4 @@ interface Recommendation {
290
289
  evidencePath?: string;
291
290
  }
292
291
 
293
- /**
294
- * # Hosted-tier wire format — the schema that EVERY orchestrator (ours,
295
- * a partner's self-hosted one, a future open implementation) must accept.
296
- *
297
- * **Stability:** every type in this file is committed under semver. New
298
- * minors only ADD optional fields. Breaking changes mean a major bump
299
- * (`HostedWireVersion` literal increment).
300
- *
301
- * The wire format is two event streams in one transport:
302
- *
303
- * 1. **Eval-run events** (`POST /v1/ingest/eval-runs`). Posted when a
304
- * campaign / improvement-loop completes (or per-generation if
305
- * streaming). Carries the structured result + per-cell scores +
306
- * surface diffs the orchestrator stores for the dashboard.
307
- *
308
- * 2. **Trace spans** (`POST /v1/ingest/traces`). Standard OTLP-shaped
309
- * spans with a few additional attributes so the orchestrator can
310
- * pivot from eval-run → underlying execution. Compatible with any
311
- * OTel collector.
312
- *
313
- * Both endpoints are authenticated with a bearer token + a tenant id
314
- * header. Tenants isolate everything downstream of ingest; no tenant
315
- * ever sees another tenant's data.
316
- */
317
-
318
- declare const HOSTED_WIRE_VERSION: "2026-05-26.v1";
319
- type HostedWireVersion = typeof HOSTED_WIRE_VERSION;
320
- /** Every ingest request carries these. */
321
- interface HostedIngestHeaders {
322
- /** Bearer token. The orchestrator validates against the tenant key. */
323
- authorization: `Bearer ${string}`;
324
- /** Stable tenant id (the orchestrator-side primary key for the tenant). */
325
- 'x-tangle-tenant-id': string;
326
- /** Wire-version pin so the server can reject incompatible payloads. */
327
- 'x-tangle-wire-version': HostedWireVersion;
328
- /** Optional idempotency key for retry-safe ingest. */
329
- 'idempotency-key'?: string;
330
- }
331
- /** Lifecycle stages of an eval-run as the substrate reports them. */
332
- type EvalRunStatus = 'started' | 'baseline-complete' | 'generation-complete' | 'gate-decided' | 'finished' | 'errored';
333
- interface EvalRunCellScore {
334
- /** Stable scenario id from the consumer's scenario set. */
335
- scenarioId: string;
336
- /** Repetition index when reps > 1; 0 for the default. */
337
- rep: number;
338
- /** Composite score across all judges + dimensions for this cell. */
339
- compositeMean: number;
340
- /** Per-judge → per-dimension scores; absent where the judge did not run. */
341
- dimensions: Record<string, Record<string, number>>;
342
- /** Per-cell error message if the dispatch threw. Omitted on success. */
343
- errorMessage?: string;
344
- }
345
- interface EvalRunGenerationSnapshot {
346
- /** Generation index. 0 is baseline. */
347
- index: number;
348
- /** Candidate surface fingerprint (stable hash) — pivot key into the
349
- * trace stream to fetch the underlying execution. */
350
- surfaceHash: string;
351
- /** The candidate surface itself. May be omitted to avoid PII when the
352
- * consumer prefers not to ship verbatim prompts. */
353
- surface?: MutableSurface;
354
- /** Per-cell scores for this generation. */
355
- cells: EvalRunCellScore[];
356
- /** Aggregate composite mean across all cells in this generation. */
357
- compositeMean: number;
358
- /** Total $ spent across this generation. */
359
- costUsd: number;
360
- /** Wall-clock duration of this generation. */
361
- durationMs: number;
362
- }
363
- /**
364
- * The top-level eval-run event. One ingest call per logical eval-run;
365
- * generations stream in incrementally via repeated calls with the same
366
- * `runId`. The orchestrator deduplicates by `(runId, generation.index)`.
367
- */
368
- interface EvalRunEvent {
369
- /** Stable run id (the substrate's `runId`). UUID or substrate-generated. */
370
- runId: string;
371
- /** Where this run was happening — derived from `RunCampaignOptions.runDir`. */
372
- runDir: string;
373
- /** ISO-8601 timestamp the substrate recorded the event. */
374
- timestamp: string;
375
- /** Lifecycle stage this event represents. */
376
- status: EvalRunStatus;
377
- /** Free-form consumer tags (env, branch, model id, etc.). Searchable. */
378
- labels: Record<string, string>;
379
- /** Baseline campaign snapshot. Present when status >= baseline-complete. */
380
- baseline?: EvalRunGenerationSnapshot;
381
- /** Per-generation snapshots. Streams in; orchestrator appends. */
382
- generations: EvalRunGenerationSnapshot[];
383
- /** Final gate decision. Present when status >= gate-decided. */
384
- gateDecision?: GateDecision;
385
- /** Held-out lift = winner-on-holdout - baseline-on-holdout. */
386
- holdoutLift?: number;
387
- /** Total $ spent across baseline + every generation. */
388
- totalCostUsd: number;
389
- /** Total wall-clock duration. */
390
- totalDurationMs: number;
391
- /** Error message if status === 'errored'. */
392
- errorMessage?: string;
393
- /** Rigor packet emitted alongside the run — distributional summary,
394
- * paired-bootstrap lift CI, judge stats, inter-rater agreement,
395
- * contamination check, failure clusters (when an analyst is wired),
396
- * outcome correlation (when downstream signal is supplied), and the
397
- * recommendations the dashboard surfaces verbatim. Additive; older
398
- * clients that don't know about this field continue to work. */
399
- insightReport?: InsightReport;
400
- }
401
- /**
402
- * OTel-shape span with a few additional attributes for eval-run pivoting.
403
- * Compatible with any OTLP collector — `name`, `traceId`, `spanId`,
404
- * `startTimeUnixNano`, `endTimeUnixNano`, `attributes` are stock OTel.
405
- */
406
- interface TraceSpanEvent {
407
- traceId: string;
408
- spanId: string;
409
- parentSpanId?: string;
410
- name: string;
411
- startTimeUnixNano: number;
412
- endTimeUnixNano: number;
413
- attributes: Record<string, string | number | boolean>;
414
- events?: Array<{
415
- timeUnixNano: number;
416
- name: string;
417
- attributes?: Record<string, string | number | boolean>;
418
- }>;
419
- status?: {
420
- code: 'OK' | 'ERROR' | 'UNSET';
421
- message?: string;
422
- };
423
- /** Pivot back into the eval-run stream. */
424
- 'tangle.runId'?: string;
425
- /** Pivot to the specific generation. */
426
- 'tangle.generation'?: number;
427
- /** Pivot to the specific cell. */
428
- 'tangle.cellId'?: string;
429
- /** Pivot to the specific scenario. */
430
- 'tangle.scenarioId'?: string;
431
- }
432
- interface IngestEvalRunsRequest {
433
- wireVersion: HostedWireVersion;
434
- events: EvalRunEvent[];
435
- }
436
- interface IngestTracesRequest {
437
- wireVersion: HostedWireVersion;
438
- spans: TraceSpanEvent[];
439
- }
440
- interface IngestResponse {
441
- /** Accepted events / spans count. */
442
- accepted: number;
443
- /** Rejected events with reasons (validation failures, dup idempotency key, etc.). */
444
- rejected: Array<{
445
- index: number;
446
- reason: string;
447
- }>;
448
- }
449
-
450
- /**
451
- * # Hosted-tier ingest client.
452
- *
453
- * Ships eval-run events + trace spans to any orchestrator (ours, a
454
- * partner's self-hosted one, or a future open implementation) that
455
- * speaks the wire format in `./types.ts`.
456
- *
457
- * Three modes:
458
- * - **Ours:** point at `https://orchestrator.tangle.tools` (the host root —
459
- * the client appends the versioned `/v1/ingest/...` path itself; a trailing
460
- * `/v1` on the endpoint is tolerated and normalized away). We handle ingest
461
- * + storage + dashboard.
462
- * - **Self-hosted:** point at whatever URL runs the reference receiver
463
- * from `examples/hosted-ingest-server/`.
464
- * - **Off (default):** when `hostedTenant` is unset, nothing is sent.
465
- * Everything stays local.
466
- */
467
-
468
- interface HostedTenant {
469
- /** Orchestrator endpoint base URL (no trailing slash). Required. */
470
- endpoint: string;
471
- /** Bearer token issued by the orchestrator. Required. */
472
- apiKey: string;
473
- /** Tenant id — the orchestrator's primary key for this consumer. Required. */
474
- tenantId: string;
475
- /** Optional `fetch` override (auth wrappers, custom agent, test mocks). */
476
- fetchImpl?: typeof fetch;
477
- /** Per-call timeout in ms. Default 30s. */
478
- timeoutMs?: number;
479
- /** Retries on 5xx / network errors. Default 2. */
480
- retries?: number;
481
- }
482
- interface HostedClient {
483
- ingestEvalRun(event: EvalRunEvent, idempotencyKey?: string): Promise<IngestResponse>;
484
- ingestEvalRuns(events: EvalRunEvent[], idempotencyKey?: string): Promise<IngestResponse>;
485
- ingestTraces(spans: TraceSpanEvent[], idempotencyKey?: string): Promise<IngestResponse>;
486
- readonly tenant: HostedTenant;
487
- readonly wireVersion: HostedWireVersion;
488
- }
489
- declare function createHostedClient(tenant: HostedTenant): HostedClient;
490
- /**
491
- * Build a `HostedClient` from environment, or `undefined` when ingest is not
492
- * configured — the canonical, fail-soft wiring every product uses so eval-run +
493
- * trace provenance lands in the Intelligence dashboard with ONE call:
494
- *
495
- * const hosted = hostedClientFromEnv()
496
- * // ...run the loop...
497
- * await emitLoopProvenance({ ..., hostedClient: hosted }) // no-op if undefined
498
- *
499
- * Returns `undefined` (NOT an error) when any of endpoint / apiKey / tenantId is
500
- * missing — so a product wires the ship call unconditionally and it stays a
501
- * no-op until the env is set. Env precedence:
502
- * - endpoint: `TANGLE_INGEST_URL` → `TANGLE_ORCHESTRATOR_URL`
503
- * - apiKey: `TANGLE_INGEST_API_KEY` → `TANGLE_API_KEY`
504
- * - tenantId: `TANGLE_TENANT_ID`
505
- * A trailing slash on the endpoint is stripped. Pass `overrides` to supply any
506
- * field directly (e.g. a fixed `tenantId` per product) — overrides win over env.
507
- */
508
- declare function hostedClientFromEnv(overrides?: Partial<HostedTenant> & {
509
- env?: Record<string, string | undefined>;
510
- }): HostedClient | undefined;
511
-
512
- export { type EvalRunCellScore as E, type FailureClusterInsight as F, type HostedClient as H, type InsightReport as I, type JudgeInsight as J, type LiftInsight as L, type OutcomeCorrelationInsight as O, type Recommendation as R, type ScalarDistribution as S, type TraceSpanEvent as T, type HostedTenant as a, type InterRaterInsight as b, type ReleaseSummary as c, type EvalRunEvent as d, type EvalRunGenerationSnapshot as e, type EvalRunStatus as f, HOSTED_WIRE_VERSION as g, type HostedIngestHeaders as h, type HostedWireVersion as i, type IngestEvalRunsRequest as j, type IngestResponse as k, type IngestTracesRequest as l, createHostedClient as m, hostedClientFromEnv as n };
292
+ export type { FailureClusterInsight as F, InsightReport as I, JudgeInsight as J, LiftInsight as L, OutcomeCorrelationInsight as O, Recommendation as R, ScalarDistribution as S, InterRaterInsight as a, ReleaseSummary as b };
@@ -0,0 +1,172 @@
1
+ import { AxAIService, AxFunction } from '@ax-llm/ax';
2
+ import { T as TraceAnalysisStore } from './store-GmBE2pZZ.js';
3
+ import { z } from 'zod';
4
+ import { g as AnalystCost, a as AnalystContext, A as Analyst } from './types-CRD68aH7.js';
5
+
6
+ /**
7
+ * Typed Ax output for analyst findings.
8
+ *
9
+ * Replaces the legacy `findings:string[]` pattern (where every bullet
10
+ * became a flat-severity `AnalystFinding`) with a structured object
11
+ * array. Ax binds the field as `findings:json[]` so the provider emits
12
+ * native structured output; at the kind-factory boundary we Zod-validate
13
+ * each emitted finding so malformed rows fail loud instead of being
14
+ * silently lifted with default severity.
15
+ *
16
+ * Why not `f.object().array()` directly in the signature? The Ax
17
+ * signature string `question:string -> findings:json[]` already lets
18
+ * the provider emit JSON arrays. A Zod boundary is required either
19
+ * way (the provider can return any JSON), and Zod gives us a single
20
+ * validation surface independent of which Ax version is installed.
21
+ */
22
+
23
+ declare const ANALYST_SEVERITIES: readonly ["critical", "high", "medium", "low", "info"];
24
+ declare const RawAnalystFindingSchema: z.ZodObject<{
25
+ severity: z.ZodEnum<{
26
+ info: "info";
27
+ critical: "critical";
28
+ medium: "medium";
29
+ low: "low";
30
+ high: "high";
31
+ }>;
32
+ claim: z.ZodString;
33
+ subject: z.ZodOptional<z.ZodString>;
34
+ evidence_uri: z.ZodString;
35
+ evidence_excerpt: z.ZodOptional<z.ZodString>;
36
+ confidence: z.ZodNumber;
37
+ rationale: z.ZodOptional<z.ZodString>;
38
+ recommended_action: z.ZodOptional<z.ZodString>;
39
+ }, z.core.$strict>;
40
+ type RawAnalystFinding = z.infer<typeof RawAnalystFindingSchema>;
41
+ /**
42
+ * Description embedded into the actor prompt so the LLM knows what
43
+ * shape to emit. Kept here so kinds share one source of truth rather
44
+ * than restating the schema in every prompt.
45
+ */
46
+ declare const RAW_FINDING_SCHEMA_PROMPT = "Each finding MUST be a JSON object with these fields:\n - severity: one of \"critical\" | \"high\" | \"medium\" | \"low\" | \"info\"\n - claim: one-sentence statement (max 2000 chars)\n - subject?: the routing locus this finding is about. It MUST be one of the exact subject forms listed in this kind's instructions above (e.g. `system-prompt:<section>`, `agent-knowledge:wiki:<slug>`, `tool-doc:<tool>`). A free phrase, a bare noun, or any form not in that list is REJECTED at parse time and the finding is discarded \u2014 omit subject entirely rather than guess a form.\n - evidence_uri: REQUIRED, never blank. Exactly one of \"span://<trace_id>/<span_id>\" (trace evidence), \"artifact://<relative-path>\" (files), \"metric://<name>\" (named scalars) \u2014 ALWAYS cite a real id surfaced by the tools. If you have no citable id, do not emit the finding.\n - evidence_excerpt?: short quote (<=2000 chars) from the cited span/artifact\n - confidence: number 0..1 \u2014 0.9+ when backed by exact quotes, 0.6-0.8 for inferred patterns, <0.5 for speculative\n - rationale?: one or two sentences explaining the reasoning\n - recommended_action?: concrete change phrased as an imperative (\"Add ...\", \"Replace ...\", \"Stop ...\") \u2014 omit when the finding is purely descriptive\n\nEmit an empty array when the question has no findings to report. Do not fabricate evidence.";
47
+ /**
48
+ * Validate one row emitted by the LLM. Returns the typed finding on
49
+ * success; returns `null` and logs the reason on failure so the kind
50
+ * factory can skip-and-count rather than abort the whole analyst run.
51
+ */
52
+ declare function parseRawFinding(row: unknown, log?: (msg: string, fields?: Record<string, unknown>) => void): RawAnalystFinding | null;
53
+
54
+ /**
55
+ * Analyst-kind factory — the typed, focused replacement for the
56
+ * legacy `createTraceAnalystAdapter`.
57
+ *
58
+ * A "kind" is a specialized analyst whose actor prompt, tool subset,
59
+ * and Ax recursion config target one failure-mode lens (failure-mode
60
+ * classification, knowledge gap discovery, knowledge poisoning, recursive
61
+ * self-improvement, ...). Kinds emit findings in the typed `RawAnalystFinding`
62
+ * shape via a JSON-array Ax output; the factory validates each row with
63
+ * Zod and lifts it into `AnalystFinding[]` with no shape guessing.
64
+ *
65
+ * Composition rules:
66
+ * - Each kind owns its actor description. No generic "answer this
67
+ * question" prompt — the prompt names the failure lens.
68
+ * - Each kind picks a narrow tool subset from `ANALYST_TOOL_GROUPS`.
69
+ * A kind that never needs full-trace dumps can drop `viewTrace` /
70
+ * `viewSpans` and stay cheap.
71
+ * - Each kind declares its recursion + parallelism budget. Discovery-
72
+ * heavy kinds (failure-mode) get higher `maxDepth`; lens kinds
73
+ * (poisoning) usually stay at 0 since they have a tighter brief.
74
+ *
75
+ * Optimizer hook: kinds may declare `goldens` — labeled examples used
76
+ * by `AxMiPRO` / `AxBootstrapFewShot` / `AxGEPA` to fit the actor
77
+ * description programmatically. Stored on the kind, not the registry,
78
+ * because the right metric is kind-specific.
79
+ */
80
+
81
+ /**
82
+ * Per-kind specification. The factory turns this into a regular
83
+ * `Analyst<TraceAnalysisStore>` ready for `AnalystRegistry.register()`.
84
+ */
85
+ interface TraceAnalystKindSpec {
86
+ /** Stable id. Appears in finding_id, telemetry, and registry exclusions. */
87
+ id: string;
88
+ /** One-sentence description shown in `registry.list()`. */
89
+ description: string;
90
+ /** Coarse classification stamped on every emitted finding (`failure-mode`, `knowledge-gap`, ...). */
91
+ area: string;
92
+ /** Bump on any breaking change to the actor prompt or output schema. */
93
+ version: string;
94
+ /** Actor system prompt. Must instruct the LLM to emit `findings` per the schema. */
95
+ actorDescription: string;
96
+ /** Responder system prompt; falls back to a minimal "format the findings" instruction. */
97
+ responderDescription?: string;
98
+ /** Tool functions the actor may call. Pick narrow subsets via `ANALYST_TOOL_GROUPS`. */
99
+ buildTools: (store: TraceAnalysisStore) => AxFunction[];
100
+ /** Recursion budget. `maxDepth: 0` disables subagents. */
101
+ recursion?: {
102
+ maxDepth: number;
103
+ maxParallelSubagents?: number;
104
+ };
105
+ /** Actor turn cap. Default 12. */
106
+ maxTurns?: number;
107
+ /** Runtime char cap. Default 6000. */
108
+ maxRuntimeChars?: number;
109
+ /** Cost classification surfaced in `registry.list()` and budget enforcement. */
110
+ cost: AnalystCost;
111
+ /** Per-finding-row hook — kinds may reject / rewrite before lifting. */
112
+ postProcess?: (row: RawAnalystFinding, ctx: AnalystContext) => RawAnalystFinding | null;
113
+ /** Optional optimizer hook — populated when a kind wants to fit its prompt against labeled examples. */
114
+ goldens?: TraceAnalystGolden[];
115
+ }
116
+ /**
117
+ * One labeled example consumed by Ax optimizers (MIPRO / GEPA / Bootstrap).
118
+ * Each input is the same `{question}` an analyst would receive; `expected`
119
+ * is the ground-truth finding set a fitted prompt should produce on this
120
+ * input. Metric: kind-specific (default: F1 on `finding_id` overlap).
121
+ */
122
+ interface TraceAnalystGolden {
123
+ question: string;
124
+ expected: ReadonlyArray<Omit<RawAnalystFinding, 'confidence'>>;
125
+ }
126
+ interface CreateTraceAnalystKindOpts {
127
+ /** AxAIService bound at registration time. */
128
+ ai: AxAIService;
129
+ /** Optional model override; falls back to the AI service's default. */
130
+ model?: string;
131
+ /** Override the spec's `version` (e.g. when an optimizer has fitted a new prompt). */
132
+ versionSuffix?: string;
133
+ /**
134
+ * Optional two-phase recovery: when the agentic harvest is empty but the
135
+ * actor produced a substantive free-form `report`, extract findings from that
136
+ * prose via a tolerant chat-completions pass (`structureFindings`) — no
137
+ * strict-emission contract, so it works on weak models. Omit to leave the
138
+ * actor's harvest as-is (the report is still surfaced fail-loud either way).
139
+ */
140
+ recovery?: {
141
+ baseUrl: string;
142
+ apiKey?: string;
143
+ model?: string;
144
+ fetchImpl?: typeof fetch;
145
+ };
146
+ }
147
+ /**
148
+ * Build an `Analyst<TraceAnalysisStore>` from a kind spec.
149
+ *
150
+ * Lifts the Ax pipeline once at registration time so the registry
151
+ * gets a stateless analyst. The Ax agent is freshly constructed per
152
+ * `analyze()` call (the agent carries chat-log + usage state we don't
153
+ * want shared across analyst runs).
154
+ */
155
+ declare function createTraceAnalystKind(spec: TraceAnalystKindSpec, opts: CreateTraceAnalystKindOpts): Analyst<TraceAnalysisStore>;
156
+ /**
157
+ * Render a compact prior-findings block the actor reads alongside its
158
+ * brief. Each row is one line so the actor can scan dozens cheaply.
159
+ * The kind's prompt instructs the actor to (a) check whether a new
160
+ * cluster matches a prior `finding_id` (carry the id forward via
161
+ * `id_basis` to keep diffs stable) and (b) raise severity / confidence
162
+ * when a prior finding has reappeared without remediation.
163
+ *
164
+ * Returns the empty string when there are no prior findings — most
165
+ * runs are "first-of-its-kind" and the prompt stays unchanged.
166
+ *
167
+ * Exported for tests + for consumers that build their own actor
168
+ * prompts (e.g. specialized analysts living outside the default kinds).
169
+ */
170
+ declare function renderPriorFindings(prior: AnalystContext['priorFindings']): string;
171
+
172
+ export { ANALYST_SEVERITIES as A, type CreateTraceAnalystKindOpts as C, RAW_FINDING_SCHEMA_PROMPT as R, type TraceAnalystGolden as T, type RawAnalystFinding as a, RawAnalystFindingSchema as b, type TraceAnalystKindSpec as c, createTraceAnalystKind as d, parseRawFinding as p, renderPriorFindings as r };
@@ -0,0 +1,141 @@
1
+ /**
2
+ * Multi-layer verifier — ordered pipeline of verification layers.
3
+ *
4
+ * Different contract from {@link JudgeRunner} (which runs parallel
5
+ * specs against a sandbox). MultiLayerVerifier is a DAG of layers
6
+ * (install → typecheck → build → lint → serve → semantic → …) with
7
+ * dependency-based skip, per-layer findings, soft-fail semantics, and
8
+ * an aggregated `blendedScore` across all passed layers.
9
+ *
10
+ * Use when you want:
11
+ * - ordered stages where a failing upstream stage skips downstream ones
12
+ * - each stage produces rich `findings` (severity + message + evidence)
13
+ * - a single composite score across stages with per-stage weights
14
+ * - soft-fail stages whose failure doesn't abort the pipeline
15
+ *
16
+ * Use {@link JudgeRunner} when you want:
17
+ * - N independent judges running in parallel against the same artifact
18
+ * - no inter-judge dependencies
19
+ * - boolean `passed` per judge + overall
20
+ *
21
+ * Both primitives compose — JudgeRunner can be invoked as a single
22
+ * layer inside a MultiLayerVerifier if that suits the caller.
23
+ */
24
+ type LayerStatus = 'pass' | 'fail' | 'skipped' | 'error' | 'timeout';
25
+ type Severity = 'critical' | 'major' | 'minor' | 'info';
26
+ interface Finding {
27
+ severity: Severity;
28
+ message: string;
29
+ evidence?: string;
30
+ /** Optional layer name the finding belongs to (set by the verifier if omitted). */
31
+ layer?: string;
32
+ /**
33
+ * Free-form structured payload — used by `multiToolchainLayer` to attach
34
+ * `{ adapter: 'pnpm' }`, by judges to attach evidence pointers, etc.
35
+ * Renderers MAY interrogate; agent-eval primitives never assume shape.
36
+ */
37
+ detail?: Record<string, unknown>;
38
+ }
39
+ interface LayerResult {
40
+ layer: string;
41
+ status: LayerStatus;
42
+ /** 0..1 score, optional — layers that don't produce a numeric score omit. */
43
+ score?: number;
44
+ durationMs: number;
45
+ findings: Finding[];
46
+ /** Short human-readable summary (one line). */
47
+ reason?: string;
48
+ /**
49
+ * Numeric layer-level diagnostics: error counts, warning counts,
50
+ * cyclomatic complexity, total adapter wall-time, etc. Keyed by
51
+ * diagnostic name; null = "diagnostic not applicable / not measured."
52
+ * Renderers that know the keys can display them; ones that don't,
53
+ * ignore. Free-form on purpose — consumers type the value shape in
54
+ * their own namespace.
55
+ */
56
+ diagnostics?: Record<string, number | null>;
57
+ /** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
58
+ detail?: Record<string, unknown>;
59
+ }
60
+ interface VerifyContext<Env = unknown> {
61
+ /** Per-run opaque context the caller provides. Layers destructure what they need. */
62
+ env: Env;
63
+ /** Previously-computed results from layers that already ran. */
64
+ prior: Record<string, LayerResult>;
65
+ /** Abort signal — once aborted, layers MUST bail out within a reasonable wall-clock time. */
66
+ signal: AbortSignal;
67
+ }
68
+ interface Layer<Env = unknown> {
69
+ name: string;
70
+ /** Stages that must have `status: 'pass'` before this layer runs. */
71
+ dependsOn?: string[];
72
+ /**
73
+ * Weight in the composite `blendedScore`. Default 1.0. Layers with weight 0
74
+ * contribute findings but not score.
75
+ */
76
+ weight?: number;
77
+ /**
78
+ * If true, a `fail` status contributes to `blendedScore` (as 0) instead of
79
+ * being dropped — use for layers whose failure is a real signal. Default:
80
+ * fail drops from numerator + denominator, matching VB's existing semantics.
81
+ */
82
+ failContributesToScore?: boolean;
83
+ /** Optional per-layer wall-cap in ms. Honored by the verifier (AbortSignal). */
84
+ capMs?: number;
85
+ run: (ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
86
+ }
87
+ interface VerifyOptions<Env = unknown> {
88
+ env: Env;
89
+ /**
90
+ * Overall wall cap. Default: sum of layer capMs, or Infinity if any layer
91
+ * omits a cap. The verifier short-circuits remaining layers on overall cap.
92
+ */
93
+ overallCapMs?: number;
94
+ /** Called with each layer result as it completes. */
95
+ onLayer?: (result: LayerResult) => void;
96
+ }
97
+ interface VerificationReport {
98
+ layers: LayerResult[];
99
+ passCount: number;
100
+ failCount: number;
101
+ skippedCount: number;
102
+ errorCount: number;
103
+ /** True iff at least one scored layer ran AND every scored layer passed. */
104
+ allPass: boolean;
105
+ /**
106
+ * Weighted mean of `score` across contributing layers. 0 when no layers
107
+ * contributed. See {@link Layer.failContributesToScore} for fail semantics.
108
+ */
109
+ blendedScore: number;
110
+ durationMs: number;
111
+ startedAt: string;
112
+ finishedAt: string;
113
+ }
114
+ /**
115
+ * Grade a semantic-concept-style judge result into a single layer status.
116
+ *
117
+ * Pass when overall score >= threshold AND no critical-severity concept gap.
118
+ * Fail otherwise. Use inside a `Layer.run` when wrapping a concept judge.
119
+ *
120
+ * Generalized from VerticalBench H3 fix: `failingConcepts.length === 0` was
121
+ * too strict — a single concept at 6/10 failed the entire layer despite
122
+ * overall score being >= 0.7. Now we trust the judge's own `severity` field:
123
+ * `critical` findings veto; `major`/`minor` reduce the score but don't veto.
124
+ */
125
+ declare function gradeSemanticStatus(input: {
126
+ score: number;
127
+ findings: Array<{
128
+ severity: Severity;
129
+ present?: boolean;
130
+ score?: number;
131
+ }>;
132
+ available: boolean;
133
+ threshold?: number;
134
+ }): LayerStatus;
135
+ declare class MultiLayerVerifier<Env = unknown> {
136
+ private readonly layers;
137
+ constructor(layers: Layer<Env>[]);
138
+ run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
139
+ }
140
+
141
+ export { type Finding as F, type LayerResult as L, MultiLayerVerifier as M, type Severity as S, type VerifyOptions as V, type VerificationReport as a, type Layer as b, type VerifyContext as c, type LayerStatus as d, gradeSemanticStatus as g };
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.72.0",
5
+ "version": "0.72.4",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -0,0 +1,81 @@
1
+ /**
2
+ * Pareto frontier — multi-objective optimization over candidate runs.
3
+ *
4
+ * Lifted from ADC pareto.ts and blueprint-agent frontier.ts. When you're
5
+ * trading off (cost, latency, quality) or (passRate, tokenBudget,
6
+ * ttfb), you rarely have a single "winner" — you have a set of
7
+ * non-dominated candidates. This module exposes:
8
+ *
9
+ * - `paretoFrontier`: filter a set of candidates to the non-dominated ones
10
+ * - `dominates`: does A dominate B across all objectives?
11
+ *
12
+ * Each objective is declared with a direction: 'maximize' (higher=better)
13
+ * or 'minimize' (lower=better). Candidates are any object; pass an
14
+ * `objective(candidate)` accessor.
15
+ */
16
+ type Direction = 'maximize' | 'minimize';
17
+ interface Objective<T> {
18
+ /** Stable label used in reports. */
19
+ name: string;
20
+ direction: Direction;
21
+ value: (candidate: T) => number;
22
+ }
23
+ interface ParetoResult<T> {
24
+ frontier: T[];
25
+ dominated: T[];
26
+ /** Index map: entry i pairs `frontier[i]` (as `dominator`) with the candidates it dominates (`dominated`). */
27
+ dominanceMap: Array<{
28
+ dominator: T;
29
+ dominated: T[];
30
+ }>;
31
+ }
32
+ /** Does candidate A Pareto-dominate B — strictly better on at least one objective and no worse on any? */
33
+ declare function dominates<T>(a: T, b: T, objectives: Objective<T>[]): boolean;
34
+ /**
35
+ * Compute the non-dominated frontier. Candidates with NaN/Infinity on any
36
+ * objective are excluded (can't rank them). A candidate enters the frontier
37
+ * iff no other candidate dominates it.
38
+ */
39
+ declare function paretoFrontier<T>(candidates: T[], objectives: Objective<T>[]): ParetoResult<T>;
40
+ /**
41
+ * Weighted-sum scalarisation. Use as a tie-break / single-winner selector
42
+ * when callers don't want to consume a frontier. Each objective contributes
43
+ * its normalised value (0..1 via min-max across the candidate pool) times
44
+ * its weight; missing weights default to 1/N.
45
+ *
46
+ * Direction is honoured automatically — `minimize` axes have their values
47
+ * inverted before scaling so "higher scalar = better" always holds.
48
+ */
49
+ declare function scalarScore<T>(candidates: T[], objectives: Objective<T>[], options?: {
50
+ weights?: Partial<Record<string, number>>;
51
+ }): Array<{
52
+ candidate: T;
53
+ score: number;
54
+ }>;
55
+ /**
56
+ * NSGA-II crowding distance — secondary sort for ties on the frontier.
57
+ *
58
+ * When the Pareto front collapses to a single point (or many candidates tie
59
+ * on dominance), naive selection picks arbitrarily and the population
60
+ * degenerates over generations. NSGA-II preserves diversity by preferring
61
+ * candidates with more empty space around them on the frontier.
62
+ *
63
+ * Returns an array of `{ candidate, distance }` in the SAME order as the
64
+ * input. Higher distance = more isolated = should be preferred when
65
+ * preserving diversity.
66
+ */
67
+ declare function crowdingDistance<T>(candidates: T[], objectives: Objective<T>[]): Array<{
68
+ candidate: T;
69
+ distance: number;
70
+ }>;
71
+ /**
72
+ * Pareto frontier with tie-break by crowding distance — the canonical
73
+ * NSGA-II selection step. Returns the frontier sorted by descending crowding
74
+ * distance so callers can `.slice(0, k)` to pick K diverse winners.
75
+ */
76
+ declare function paretoFrontierWithCrowding<T>(candidates: T[], objectives: Objective<T>[]): Array<{
77
+ candidate: T;
78
+ distance: number;
79
+ }>;
80
+
81
+ export { type Direction as D, type Objective as O, type ParetoResult as P, paretoFrontierWithCrowding as a, crowdingDistance as c, dominates as d, paretoFrontier as p, scalarScore as s };