@tangle-network/agent-eval 0.23.1 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148)
  1. package/CHANGELOG.md +145 -0
  2. package/README.md +212 -79
  3. package/dist/baseline-4R5deP0N.d.ts +108 -0
  4. package/dist/benchmarks/index.d.ts +3 -2
  5. package/dist/benchmarks/index.js +1 -1
  6. package/dist/builder-eval/index.d.ts +249 -0
  7. package/dist/builder-eval/index.js +391 -0
  8. package/dist/builder-eval/index.js.map +1 -0
  9. package/dist/{chunk-IOXMGMHQ.js → chunk-2A5XJB43.js} +142 -318
  10. package/dist/chunk-2A5XJB43.js.map +1 -0
  11. package/dist/chunk-47X6LRCE.js +76 -0
  12. package/dist/chunk-47X6LRCE.js.map +1 -0
  13. package/dist/{chunk-6M774GY6.js → chunk-4F5DQN55.js} +1 -1
  14. package/dist/chunk-4F5DQN55.js.map +1 -0
  15. package/dist/{chunk-KAO3Q65R.js → chunk-4S4BM3QQ.js} +15 -13
  16. package/dist/chunk-4S4BM3QQ.js.map +1 -0
  17. package/dist/chunk-5BKGXME7.js +65 -0
  18. package/dist/chunk-5BKGXME7.js.map +1 -0
  19. package/dist/{chunk-6KQG5HAH.js → chunk-5LBB5B3Z.js} +376 -72
  20. package/dist/chunk-5LBB5B3Z.js.map +1 -0
  21. package/dist/{chunk-42I2QC2L.js → chunk-6QDKWHLS.js} +18 -14
  22. package/dist/chunk-6QDKWHLS.js.map +1 -0
  23. package/dist/{chunk-VQQSPGSM.js → chunk-EDUKQ5AM.js} +247 -189
  24. package/dist/chunk-EDUKQ5AM.js.map +1 -0
  25. package/dist/chunk-I4MBDTY5.js +272 -0
  26. package/dist/chunk-I4MBDTY5.js.map +1 -0
  27. package/dist/chunk-JLZQWFV3.js +618 -0
  28. package/dist/chunk-JLZQWFV3.js.map +1 -0
  29. package/dist/chunk-K2TPS5LB.js +569 -0
  30. package/dist/chunk-K2TPS5LB.js.map +1 -0
  31. package/dist/chunk-KKHDIONI.js +414 -0
  32. package/dist/chunk-KKHDIONI.js.map +1 -0
  33. package/dist/chunk-KMPRBJK4.js +74 -0
  34. package/dist/chunk-KMPRBJK4.js.map +1 -0
  35. package/dist/{chunk-QUKKGHTZ.js → chunk-KTGTIOFD.js} +6 -3
  36. package/dist/chunk-KTGTIOFD.js.map +1 -0
  37. package/dist/chunk-LSH4MMOZ.js +838 -0
  38. package/dist/chunk-LSH4MMOZ.js.map +1 -0
  39. package/dist/chunk-NG236HPC.js +57 -0
  40. package/dist/chunk-NG236HPC.js.map +1 -0
  41. package/dist/{chunk-QBW3YBTR.js → chunk-NLMNWKVM.js} +14 -6
  42. package/dist/chunk-NLMNWKVM.js.map +1 -0
  43. package/dist/chunk-NU65VQ7M.js +99 -0
  44. package/dist/chunk-NU65VQ7M.js.map +1 -0
  45. package/dist/chunk-OWLAAMME.js +250 -0
  46. package/dist/chunk-OWLAAMME.js.map +1 -0
  47. package/dist/{chunk-SQQLHODJ.js → chunk-PC4UYEBM.js} +7 -4
  48. package/dist/chunk-PC4UYEBM.js.map +1 -0
  49. package/dist/{chunk-7EAUOUQS.js → chunk-RAF443UI.js} +213 -115
  50. package/dist/chunk-RAF443UI.js.map +1 -0
  51. package/dist/chunk-RZTMDUO7.js +49 -0
  52. package/dist/chunk-RZTMDUO7.js.map +1 -0
  53. package/dist/{chunk-EXGR4XEM.js → chunk-SESZDQPX.js} +23 -19
  54. package/dist/chunk-SESZDQPX.js.map +1 -0
  55. package/dist/{chunk-5IIQKMD5.js → chunk-TVVP3ZZQ.js} +14 -4
  56. package/dist/chunk-TVVP3ZZQ.js.map +1 -0
  57. package/dist/chunk-WWYCWKUM.js +196 -0
  58. package/dist/chunk-WWYCWKUM.js.map +1 -0
  59. package/dist/{chunk-AXHNWLIX.js → chunk-YRZ4M5GS.js} +2 -90
  60. package/dist/chunk-YRZ4M5GS.js.map +1 -0
  61. package/dist/chunk-ZN274SWR.js +613 -0
  62. package/dist/chunk-ZN274SWR.js.map +1 -0
  63. package/dist/cli.js +10 -6
  64. package/dist/cli.js.map +1 -1
  65. package/dist/{control-DvkH87qJ.d.ts → control-CBShYYA6.d.ts} +32 -33
  66. package/dist/control-runtime-BuJHoLg0.d.ts +180 -0
  67. package/dist/control.d.ts +8 -6
  68. package/dist/control.js +10 -7
  69. package/dist/{dataset-B9qvlm_o.d.ts → dataset-CiK_3LDr.d.ts} +5 -2
  70. package/dist/{emitter-B2XqDKFU.d.ts → emitter-DP_cSSiw.d.ts} +1 -1
  71. package/dist/errors-BZ9sTdz7.d.ts +70 -0
  72. package/dist/failure-cluster-C2EGSDiT.d.ts +76 -0
  73. package/dist/feedback-trajectory-DfFdrraJ.d.ts +169 -0
  74. package/dist/governance/index.d.ts +5 -0
  75. package/dist/governance/index.js +18 -0
  76. package/dist/governance/index.js.map +1 -0
  77. package/dist/{index-DDTlbHEK.d.ts → index--fVrWDiR.d.ts} +1 -1
  78. package/dist/index-Oj9fAPPN.d.ts +270 -0
  79. package/dist/index.d.ts +2018 -3003
  80. package/dist/index.js +7443 -9102
  81. package/dist/index.js.map +1 -1
  82. package/dist/{integrity-Cr5YodSY.d.ts → integrity-DK2EBVZC.d.ts} +4 -3
  83. package/dist/knowledge/index.d.ts +102 -0
  84. package/dist/knowledge/index.js +18 -0
  85. package/dist/knowledge/index.js.map +1 -0
  86. package/dist/meta-eval/index.d.ts +99 -0
  87. package/dist/meta-eval/index.js +324 -0
  88. package/dist/meta-eval/index.js.map +1 -0
  89. package/dist/multi-layer-verifier-LkP3LVKj.d.ts +141 -0
  90. package/dist/openapi.json +491 -1
  91. package/dist/optimization.d.ts +11 -8
  92. package/dist/optimization.js +11 -9
  93. package/dist/outcome-store-D6KWmYvj.d.ts +63 -0
  94. package/dist/pipelines/index.d.ts +172 -0
  95. package/dist/pipelines/index.js +345 -0
  96. package/dist/pipelines/index.js.map +1 -0
  97. package/dist/prm/index.d.ts +99 -0
  98. package/dist/prm/index.js +222 -0
  99. package/dist/prm/index.js.map +1 -0
  100. package/dist/query-DODUYdPg.d.ts +30 -0
  101. package/dist/release-report-BNgMdqPF.d.ts +292 -0
  102. package/dist/replay-BL96gCEP.d.ts +226 -0
  103. package/dist/reporting.d.ts +10 -295
  104. package/dist/reporting.js +10 -6
  105. package/dist/{eval-campaign-Ds5QljIh.d.ts → researcher-BPT8x_NT.d.ts} +148 -146
  106. package/dist/rl.d.ts +1762 -8
  107. package/dist/rl.js +2035 -58
  108. package/dist/rl.js.map +1 -1
  109. package/dist/rubric-D5tjHNJQ.d.ts +72 -0
  110. package/dist/rubric-predictive-validity-C0uDYwG6.d.ts +105 -0
  111. package/dist/{run-record-DNiOMBrZ.d.ts → run-record-CqzahIbx.d.ts} +4 -1
  112. package/dist/sequential-Dgz1n51-.d.ts +139 -0
  113. package/dist/{store-u47QaJ9G.d.ts → store-Db2Bv8Cf.d.ts} +1 -1
  114. package/dist/{summary-report-Ce1r4EYo.d.ts → summary-report-C7VPYEj2.d.ts} +3 -76
  115. package/dist/telemetry/file.js +4 -1
  116. package/dist/telemetry/file.js.map +1 -1
  117. package/dist/telemetry/index.js +57 -57
  118. package/dist/telemetry/index.js.map +1 -1
  119. package/dist/test-graded-scenario-B2kWEdh9.d.ts +146 -0
  120. package/dist/traces.d.ts +142 -387
  121. package/dist/traces.js +1302 -40
  122. package/dist/traces.js.map +1 -1
  123. package/dist/trajectory-CnoBo-JY.d.ts +32 -0
  124. package/dist/wire/index.d.ts +369 -25
  125. package/dist/wire/index.js +22 -3
  126. package/package.json +44 -18
  127. package/dist/chunk-42I2QC2L.js.map +0 -1
  128. package/dist/chunk-5IIQKMD5.js.map +0 -1
  129. package/dist/chunk-6KQG5HAH.js.map +0 -1
  130. package/dist/chunk-6M774GY6.js.map +0 -1
  131. package/dist/chunk-7EAUOUQS.js.map +0 -1
  132. package/dist/chunk-AXHNWLIX.js.map +0 -1
  133. package/dist/chunk-EXGR4XEM.js.map +0 -1
  134. package/dist/chunk-IOXMGMHQ.js.map +0 -1
  135. package/dist/chunk-KAO3Q65R.js.map +0 -1
  136. package/dist/chunk-LZKIOBG2.js +0 -2026
  137. package/dist/chunk-LZKIOBG2.js.map +0 -1
  138. package/dist/chunk-QBW3YBTR.js.map +0 -1
  139. package/dist/chunk-QUKKGHTZ.js.map +0 -1
  140. package/dist/chunk-SQQLHODJ.js.map +0 -1
  141. package/dist/chunk-V5QSWN7L.js +0 -1310
  142. package/dist/chunk-V5QSWN7L.js.map +0 -1
  143. package/dist/chunk-VQQSPGSM.js.map +0 -1
  144. package/dist/chunk-XPHOZPOM.js +0 -1947
  145. package/dist/chunk-XPHOZPOM.js.map +0 -1
  146. package/dist/feedback-trajectory-c43WGtTX.d.ts +0 -346
  147. package/dist/index-ekBXweiQ.d.ts +0 -1894
  148. package/dist/sequential-DgU2mFsE.d.ts +0 -304
@@ -0,0 +1,141 @@
1
+ /**
2
+ * Multi-layer verifier — ordered pipeline of verification layers.
3
+ *
4
+ * Different contract from {@link JudgeRunner} (which runs parallel
5
+ * specs against a sandbox). MultiLayerVerifier is a DAG of layers
6
+ * (install → typecheck → build → lint → serve → semantic → …) with
7
+ * dependency-based skip, per-layer findings, soft-fail semantics, and
8
+ * an aggregated `blendedScore` across all passed layers.
9
+ *
10
+ * Use when you want:
11
+ * - ordered stages where a failing upstream stage skips downstream ones
12
+ * - each stage produces rich `findings` (severity + message + evidence)
13
+ * - a single composite score across stages with per-stage weights
14
+ * - soft-fail stages whose failure doesn't abort the pipeline
15
+ *
16
+ * Use {@link JudgeRunner} when you want:
17
+ * - N independent judges running in parallel against the same artifact
18
+ * - no inter-judge dependencies
19
+ * - boolean `passed` per judge + overall
20
+ *
21
+ * Both primitives compose — JudgeRunner can be invoked as a single
22
+ * layer inside a MultiLayerVerifier if that suits the caller.
23
+ */
24
+ type LayerStatus = 'pass' | 'fail' | 'skipped' | 'error' | 'timeout';
25
+ type Severity = 'critical' | 'major' | 'minor' | 'info';
26
+ interface Finding {
27
+ severity: Severity;
28
+ message: string;
29
+ evidence?: string;
30
+ /** Optional layer name the finding belongs to (set by the verifier if omitted). */
31
+ layer?: string;
32
+ /**
33
+ * Free-form structured payload — used by `multiToolchainLayer` to attach
34
+ * `{ adapter: 'pnpm' }`, by judges to attach evidence pointers, etc.
35
+ * Renderers MAY interrogate; agent-eval primitives never assume shape.
36
+ */
37
+ detail?: Record<string, unknown>;
38
+ }
39
+ interface LayerResult {
40
+ layer: string;
41
+ status: LayerStatus;
42
+ /** 0..1 score, optional — layers that don't produce a numeric score omit. */
43
+ score?: number;
44
+ durationMs: number;
45
+ findings: Finding[];
46
+ /** Short human-readable summary (one line). */
47
+ reason?: string;
48
+ /**
49
+ * Numeric layer-level diagnostics: error counts, warning counts,
50
+ * cyclomatic complexity, total adapter wall-time, etc. Keyed by
51
+ * diagnostic name; null = "diagnostic not applicable / not measured."
52
+ * Renderers that know the keys can display them; ones that don't,
53
+ * ignore. Free-form on purpose — consumers type the value shape in
54
+ * their own namespace. Added in 0.10.
55
+ */
56
+ diagnostics?: Record<string, number | null>;
57
+ /** Any rich per-layer detail — rendered as-is by consumers that know the layer. */
58
+ detail?: Record<string, unknown>;
59
+ }
60
+ interface VerifyContext<Env = unknown> {
61
+ /** Per-run opaque context the caller provides. Layers destructure what they need. */
62
+ env: Env;
63
+ /** Previously-computed results from layers that already ran. */
64
+ prior: Record<string, LayerResult>;
65
+ /** Signal — if aborted, layers MUST bail within a reasonable wall-clock time. */
66
+ signal: AbortSignal;
67
+ }
68
+ interface Layer<Env = unknown> {
69
+ name: string;
70
+ /** Stages that must have `status: 'pass'` before this layer runs. */
71
+ dependsOn?: string[];
72
+ /**
73
+ * Weight in the composite `blendedScore`. Default 1.0. Layers with weight 0
74
+ * contribute findings but not score.
75
+ */
76
+ weight?: number;
77
+ /**
78
+ * If true, a `fail` status contributes to `blendedScore` (as 0) instead of
79
+ * being dropped — use for layers whose failure is a real signal. Default:
80
+ * fail drops from numerator + denominator, matching VB's existing semantics.
81
+ */
82
+ failContributesToScore?: boolean;
83
+ /** Optional per-layer wall-cap in ms. Honored by the verifier (AbortSignal). */
84
+ capMs?: number;
85
+ run: (ctx: VerifyContext<Env>) => Promise<LayerResult> | LayerResult;
86
+ }
87
+ interface VerifyOptions<Env = unknown> {
88
+ env: Env;
89
+ /**
90
+ * Overall wall cap. Default: sum of layer capMs, or Infinity if any layer
91
+ * omits a cap. The verifier short-circuits remaining layers on overall cap.
92
+ */
93
+ overallCapMs?: number;
94
+ /** Called with each layer result as it completes. */
95
+ onLayer?: (result: LayerResult) => void;
96
+ }
97
+ interface VerificationReport {
98
+ layers: LayerResult[];
99
+ passCount: number;
100
+ failCount: number;
101
+ skippedCount: number;
102
+ errorCount: number;
103
+ /** True iff at least one scored layer ran AND every scored layer passed. */
104
+ allPass: boolean;
105
+ /**
106
+ * Weighted mean of `score` across contributing layers. 0 when no layers
107
+ * contributed. See {@link Layer.failContributesToScore} for fail semantics.
108
+ */
109
+ blendedScore: number;
110
+ durationMs: number;
111
+ startedAt: string;
112
+ finishedAt: string;
113
+ }
114
+ /**
115
+ * Grade a semantic-concept-style judge result into a single layer status.
116
+ *
117
+ * Pass when overall score >= threshold AND no critical-severity concept gap.
118
+ * Fail otherwise. Use inside a `Layer.run` when wrapping a concept judge.
119
+ *
120
+ * Generalized from VerticalBench H3 fix: `failingConcepts.length === 0` was
121
+ * too strict — a single concept at 6/10 failed the entire layer despite
122
+ * overall score being >= 0.7. Now we trust the judge's own `severity` field:
123
+ * `critical` findings veto; `major`/`minor` reduce the score but don't veto.
124
+ */
125
+ declare function gradeSemanticStatus(input: {
126
+ score: number;
127
+ findings: Array<{
128
+ severity: Severity;
129
+ present?: boolean;
130
+ score?: number;
131
+ }>;
132
+ available: boolean;
133
+ threshold?: number;
134
+ }): LayerStatus;
135
+ declare class MultiLayerVerifier<Env = unknown> {
136
+ private readonly layers;
137
+ constructor(layers: Layer<Env>[]);
138
+ run(opts: VerifyOptions<Env>): Promise<VerificationReport>;
139
+ }
140
+
141
+ export { type Finding as F, type Layer as L, MultiLayerVerifier as M, type Severity as S, type VerificationReport as V, type LayerResult as a, type VerifyContext as b, type LayerStatus as c, type VerifyOptions as d, gradeSemanticStatus as g };
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.23.1",
5
+ "version": "0.25.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -382,6 +382,377 @@
382
382
  "required": [
383
383
  "error"
384
384
  ]
385
+ },
386
+ "TracesIngestRequest": {
387
+ "type": "object",
388
+ "properties": {
389
+ "events": {
390
+ "type": "array",
391
+ "items": {
392
+ "$ref": "#/components/schemas/TraceEvent"
393
+ },
394
+ "minItems": 1,
395
+ "maxItems": 10000,
396
+ "description": "Batch of events. Max 10k per call — bigger streams should be chunked."
397
+ }
398
+ },
399
+ "required": [
400
+ "events"
401
+ ]
402
+ },
403
+ "TraceEvent": {
404
+ "type": "object",
405
+ "properties": {
406
+ "eventId": {
407
+ "type": "string",
408
+ "minLength": 1,
409
+ "description": "Stable id for the event. Use ULID or UUID."
410
+ },
411
+ "runId": {
412
+ "type": "string",
413
+ "minLength": 1,
414
+ "description": "Run this event belongs to."
415
+ },
416
+ "spanId": {
417
+ "type": "string",
418
+ "description": "Span that emitted the event, if any."
419
+ },
420
+ "kind": {
421
+ "type": "string",
422
+ "enum": [
423
+ "log",
424
+ "error",
425
+ "budget_decrement",
426
+ "budget_breach",
427
+ "state_mutation",
428
+ "policy_violation",
429
+ "redaction_applied",
430
+ "custom"
431
+ ],
432
+ "description": "Coarse event category — matches the TraceSchema v1 EventKind enum."
433
+ },
434
+ "timestamp": {
435
+ "type": "integer",
436
+ "minimum": 0,
437
+ "description": "Unix millis. Must be monotonically non-decreasing within a span."
438
+ },
439
+ "payload": {
440
+ "type": "object",
441
+ "additionalProperties": {},
442
+ "description": "Free-form payload — the runtime owns the shape."
443
+ }
444
+ },
445
+ "required": [
446
+ "eventId",
447
+ "runId",
448
+ "kind",
449
+ "timestamp",
450
+ "payload"
451
+ ]
452
+ },
453
+ "TracesIngestResponse": {
454
+ "type": "object",
455
+ "properties": {
456
+ "accepted": {
457
+ "type": "integer",
458
+ "minimum": 0,
459
+ "description": "Number of events persisted."
460
+ },
461
+ "rejected": {
462
+ "type": "integer",
463
+ "minimum": 0,
464
+ "description": "Number of events the store refused — see `errors[]` for reasons."
465
+ },
466
+ "errors": {
467
+ "type": "array",
468
+ "items": {
469
+ "type": "object",
470
+ "properties": {
471
+ "eventId": {
472
+ "type": "string",
473
+ "description": "Event id this error applies to."
474
+ },
475
+ "message": {
476
+ "type": "string",
477
+ "description": "Why the event was rejected."
478
+ }
479
+ },
480
+ "required": [
481
+ "eventId",
482
+ "message"
483
+ ]
484
+ },
485
+ "default": []
486
+ }
487
+ },
488
+ "required": [
489
+ "accepted",
490
+ "rejected"
491
+ ]
492
+ },
493
+ "FeedbackTrajectory": {
494
+ "type": "object",
495
+ "properties": {
496
+ "id": {
497
+ "type": "string",
498
+ "minLength": 1,
499
+ "description": "Stable id; idempotency key for the trajectory."
500
+ },
501
+ "projectId": {
502
+ "type": "string"
503
+ },
504
+ "scenarioId": {
505
+ "type": "string"
506
+ },
507
+ "task": {
508
+ "type": "object",
509
+ "properties": {
510
+ "intent": {
511
+ "type": "string",
512
+ "minLength": 1
513
+ },
514
+ "context": {}
515
+ },
516
+ "required": [
517
+ "intent"
518
+ ]
519
+ },
520
+ "attempts": {
521
+ "type": "array",
522
+ "items": {
523
+ "$ref": "#/components/schemas/FeedbackAttempt"
524
+ },
525
+ "default": []
526
+ },
527
+ "labels": {
528
+ "type": "array",
529
+ "items": {
530
+ "$ref": "#/components/schemas/FeedbackLabel"
531
+ },
532
+ "default": []
533
+ },
534
+ "outcome": {
535
+ "type": "object",
536
+ "properties": {
537
+ "success": {
538
+ "type": "boolean"
539
+ },
540
+ "score": {
541
+ "type": "number"
542
+ },
543
+ "metrics": {
544
+ "type": "object",
545
+ "additionalProperties": {
546
+ "type": "number"
547
+ }
548
+ },
549
+ "costUsd": {
550
+ "type": "number"
551
+ },
552
+ "detail": {
553
+ "type": "string"
554
+ },
555
+ "observedAt": {
556
+ "type": "string"
557
+ },
558
+ "metadata": {
559
+ "type": "object",
560
+ "additionalProperties": {}
561
+ }
562
+ }
563
+ },
564
+ "split": {
565
+ "type": "string",
566
+ "enum": [
567
+ "train",
568
+ "dev",
569
+ "test",
570
+ "holdout"
571
+ ]
572
+ },
573
+ "tags": {
574
+ "type": "object",
575
+ "additionalProperties": {
576
+ "type": "string"
577
+ }
578
+ },
579
+ "createdAt": {
580
+ "type": "string",
581
+ "description": "ISO-8601 UTC."
582
+ },
583
+ "updatedAt": {
584
+ "type": "string"
585
+ },
586
+ "metadata": {
587
+ "type": "object",
588
+ "additionalProperties": {}
589
+ }
590
+ },
591
+ "required": [
592
+ "id",
593
+ "task",
594
+ "createdAt"
595
+ ]
596
+ },
597
+ "FeedbackAttempt": {
598
+ "type": "object",
599
+ "properties": {
600
+ "id": {
601
+ "type": "string",
602
+ "minLength": 1
603
+ },
604
+ "stepIndex": {
605
+ "type": "integer",
606
+ "minimum": 0
607
+ },
608
+ "artifactType": {
609
+ "type": "string",
610
+ "enum": [
611
+ "text",
612
+ "code",
613
+ "plan",
614
+ "research",
615
+ "action",
616
+ "ui",
617
+ "decision",
618
+ "data",
619
+ "other"
620
+ ]
621
+ },
622
+ "artifact": {},
623
+ "options": {
624
+ "type": "array",
625
+ "items": {}
626
+ },
627
+ "proposedAction": {
628
+ "type": "object",
629
+ "properties": {
630
+ "type": {
631
+ "type": "string"
632
+ },
633
+ "risk": {
634
+ "type": "string",
635
+ "enum": [
636
+ "low",
637
+ "medium",
638
+ "high"
639
+ ]
640
+ },
641
+ "costUsd": {
642
+ "type": "number"
643
+ },
644
+ "externalSideEffect": {
645
+ "type": "boolean"
646
+ },
647
+ "requiresApproval": {
648
+ "type": "boolean"
649
+ },
650
+ "metadata": {
651
+ "type": "object",
652
+ "additionalProperties": {}
653
+ }
654
+ },
655
+ "required": [
656
+ "type"
657
+ ]
658
+ },
659
+ "feedback": {
660
+ "type": "array",
661
+ "items": {
662
+ "$ref": "#/components/schemas/FeedbackLabel"
663
+ }
664
+ },
665
+ "createdAt": {
666
+ "type": "string"
667
+ },
668
+ "metadata": {
669
+ "type": "object",
670
+ "additionalProperties": {}
671
+ }
672
+ },
673
+ "required": [
674
+ "id",
675
+ "stepIndex",
676
+ "artifactType",
677
+ "createdAt"
678
+ ]
679
+ },
680
+ "FeedbackLabel": {
681
+ "type": "object",
682
+ "properties": {
683
+ "id": {
684
+ "type": "string"
685
+ },
686
+ "source": {
687
+ "type": "string",
688
+ "enum": [
689
+ "user",
690
+ "judge",
691
+ "environment",
692
+ "metric",
693
+ "policy",
694
+ "system"
695
+ ]
696
+ },
697
+ "kind": {
698
+ "type": "string",
699
+ "enum": [
700
+ "approve",
701
+ "reject",
702
+ "select",
703
+ "edit",
704
+ "rank",
705
+ "rate",
706
+ "comment",
707
+ "metric_outcome",
708
+ "policy_block",
709
+ "revision_request"
710
+ ]
711
+ },
712
+ "value": {},
713
+ "reason": {
714
+ "type": "string"
715
+ },
716
+ "severity": {
717
+ "type": "string",
718
+ "enum": [
719
+ "info",
720
+ "warning",
721
+ "error",
722
+ "critical"
723
+ ]
724
+ },
725
+ "createdAt": {
726
+ "type": "string",
727
+ "description": "ISO-8601 UTC."
728
+ },
729
+ "metadata": {
730
+ "type": "object",
731
+ "additionalProperties": {}
732
+ }
733
+ },
734
+ "required": [
735
+ "source",
736
+ "kind",
737
+ "createdAt"
738
+ ]
739
+ },
740
+ "FeedbackIngestResponse": {
741
+ "type": "object",
742
+ "properties": {
743
+ "id": {
744
+ "type": "string",
745
+ "description": "Trajectory id that was persisted."
746
+ },
747
+ "persisted": {
748
+ "type": "boolean",
749
+ "description": "True when the trajectory was saved (idempotent on id)."
750
+ }
751
+ },
752
+ "required": [
753
+ "id",
754
+ "persisted"
755
+ ]
385
756
  }
386
757
  },
387
758
  "parameters": {}
@@ -496,6 +867,125 @@
496
867
  }
497
868
  }
498
869
  }
870
+ },
871
+ "/v1/traces/ingest": {
872
+ "post": {
873
+ "summary": "Ingest a batch of production TraceEvents",
874
+ "description": "Append a batch of TraceEvents to the configured TraceStore. Accepts application/json ({events:[...]}) or application/x-ndjson (one event per line). Returns counts of accepted + rejected events.",
875
+ "requestBody": {
876
+ "content": {
877
+ "application/json": {
878
+ "schema": {
879
+ "$ref": "#/components/schemas/TracesIngestRequest"
880
+ }
881
+ },
882
+ "application/x-ndjson": {
883
+ "schema": {
884
+ "$ref": "#/components/schemas/TracesIngestRequest"
885
+ }
886
+ }
887
+ }
888
+ },
889
+ "responses": {
890
+ "200": {
891
+ "description": "Ingestion summary",
892
+ "content": {
893
+ "application/json": {
894
+ "schema": {
895
+ "$ref": "#/components/schemas/TracesIngestResponse"
896
+ }
897
+ }
898
+ }
899
+ },
900
+ "400": {
901
+ "description": "Validation error",
902
+ "content": {
903
+ "application/json": {
904
+ "schema": {
905
+ "$ref": "#/components/schemas/ErrorResponse"
906
+ }
907
+ }
908
+ }
909
+ },
910
+ "401": {
911
+ "description": "Unauthorized (when bearer auth is configured)",
912
+ "content": {
913
+ "application/json": {
914
+ "schema": {
915
+ "$ref": "#/components/schemas/ErrorResponse"
916
+ }
917
+ }
918
+ }
919
+ },
920
+ "503": {
921
+ "description": "No trace store configured",
922
+ "content": {
923
+ "application/json": {
924
+ "schema": {
925
+ "$ref": "#/components/schemas/ErrorResponse"
926
+ }
927
+ }
928
+ }
929
+ }
930
+ }
931
+ }
932
+ },
933
+ "/v1/feedback": {
934
+ "post": {
935
+ "summary": "Ingest a FeedbackTrajectory from production",
936
+ "description": "Persist a single FeedbackTrajectory. Idempotent on trajectory.id — re-posting replaces the prior record. Used by production runtimes to forward user 👍/👎/edits into the eval substrate.",
937
+ "requestBody": {
938
+ "content": {
939
+ "application/json": {
940
+ "schema": {
941
+ "$ref": "#/components/schemas/FeedbackTrajectory"
942
+ }
943
+ }
944
+ }
945
+ },
946
+ "responses": {
947
+ "200": {
948
+ "description": "Persisted",
949
+ "content": {
950
+ "application/json": {
951
+ "schema": {
952
+ "$ref": "#/components/schemas/FeedbackIngestResponse"
953
+ }
954
+ }
955
+ }
956
+ },
957
+ "400": {
958
+ "description": "Validation error",
959
+ "content": {
960
+ "application/json": {
961
+ "schema": {
962
+ "$ref": "#/components/schemas/ErrorResponse"
963
+ }
964
+ }
965
+ }
966
+ },
967
+ "401": {
968
+ "description": "Unauthorized (when bearer auth is configured)",
969
+ "content": {
970
+ "application/json": {
971
+ "schema": {
972
+ "$ref": "#/components/schemas/ErrorResponse"
973
+ }
974
+ }
975
+ }
976
+ },
977
+ "503": {
978
+ "description": "No feedback store configured",
979
+ "content": {
980
+ "application/json": {
981
+ "schema": {
982
+ "$ref": "#/components/schemas/ErrorResponse"
983
+ }
984
+ }
985
+ }
986
+ }
987
+ }
988
+ }
499
989
  }
500
990
  },
501
991
  "webhooks": {}
@@ -1,8 +1,11 @@
1
- export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './eval-campaign-Ds5QljIh.js';
2
- export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-Ce1r4EYo.js';
3
- export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-c43WGtTX.js';
4
- import './run-record-DNiOMBrZ.js';
5
- import './integrity-Cr5YodSY.js';
6
- import './store-u47QaJ9G.js';
7
- import './emitter-B2XqDKFU.js';
8
- import './dataset-B9qvlm_o.js';
1
+ export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-BPT8x_NT.js';
2
+ export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-DfFdrraJ.js';
3
+ export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-C7VPYEj2.js';
4
+ import './errors-BZ9sTdz7.js';
5
+ import './integrity-DK2EBVZC.js';
6
+ import './store-Db2Bv8Cf.js';
7
+ import './run-record-CqzahIbx.js';
8
+ import './emitter-DP_cSSiw.js';
9
+ import './control-runtime-BuJHoLg0.js';
10
+ import './dataset-CiK_3LDr.js';
11
+ import './failure-cluster-C2EGSDiT.js';