@alis-build/harness-eval 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2195 @@
1
+ {
2
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
3
+ "$id": "https://raw.githubusercontent.com/alis-build/harness-eval-ts/main/schemas/eval-run-envelope.schema.json",
4
+ "title": "EvalRunEnvelope",
5
+ "description": "Cross-harness eval run record for CI/CD, APIs, and databases. Not vendor stream-json or OTLP.",
6
+ "type": "object",
7
+ "properties": {
8
+ "schemaVersion": {
9
+ "$ref": "#/$defs/__schema0"
10
+ },
11
+ "runId": {
12
+ "$ref": "#/$defs/__schema1"
13
+ },
14
+ "startedAt": {
15
+ "$ref": "#/$defs/__schema2"
16
+ },
17
+ "durationMs": {
18
+ "$ref": "#/$defs/__schema3"
19
+ },
20
+ "suite": {
21
+ "$ref": "#/$defs/__schema4"
22
+ },
23
+ "harness": {
24
+ "$ref": "#/$defs/__schema11"
25
+ },
26
+ "provenance": {
27
+ "$ref": "#/$defs/__schema17"
28
+ },
29
+ "summary": {
30
+ "$ref": "#/$defs/__schema41"
31
+ },
32
+ "cells": {
33
+ "$ref": "#/$defs/__schema47"
34
+ }
35
+ },
36
+ "required": [
37
+ "schemaVersion",
38
+ "runId",
39
+ "startedAt",
40
+ "durationMs",
41
+ "harness",
42
+ "summary",
43
+ "cells"
44
+ ],
45
+ "additionalProperties": false,
46
+ "$defs": {
47
+ "__schema0": {
48
+ "type": "string",
49
+ "const": "1.0",
50
+ "description": "EvalRunEnvelope schema version. Bump on breaking JSON changes."
51
+ },
52
+ "__schema1": {
53
+ "type": "string",
54
+ "format": "uuid",
55
+ "pattern": "^([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-8][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}|00000000-0000-0000-0000-000000000000|ffffffff-ffff-ffff-ffff-ffffffffffff)$",
56
+ "description": "Unique identifier for this eval run (UUID).",
57
+ "examples": [
58
+ "00000000-0000-4000-8000-000000000001"
59
+ ]
60
+ },
61
+ "__schema2": {
62
+ "type": "string",
63
+ "description": "ISO 8601 timestamp when the run started.",
64
+ "examples": [
65
+ "2026-06-23T12:00:00.000Z"
66
+ ]
67
+ },
68
+ "__schema3": {
69
+ "type": "integer",
70
+ "minimum": 0,
71
+ "maximum": 9007199254740991,
72
+ "description": "Total wall time for the run in milliseconds."
73
+ },
74
+ "__schema4": {
75
+ "description": "Reference to the suite YAML that defined this run.",
76
+ "$ref": "#/$defs/SuiteReference"
77
+ },
78
+ "SuiteReference": {
79
+ "type": "object",
80
+ "properties": {
81
+ "uri": {
82
+ "$ref": "#/$defs/__schema5"
83
+ },
84
+ "id": {
85
+ "$ref": "#/$defs/__schema7"
86
+ },
87
+ "contentHash": {
88
+ "$ref": "#/$defs/__schema9"
89
+ }
90
+ },
91
+ "additionalProperties": false,
92
+ "title": "SuiteReference",
93
+ "description": "Link to the eval suite specification that produced this run."
94
+ },
95
+ "__schema5": {
96
+ "description": "Absolute or repo-relative path to the suite YAML that produced this run.",
97
+ "examples": [
98
+ "examples/basic.yaml"
99
+ ],
100
+ "$ref": "#/$defs/__schema6"
101
+ },
102
+ "__schema6": {
103
+ "type": "string"
104
+ },
105
+ "__schema7": {
106
+ "description": "Stable suite identifier when known (e.g. bundle or catalog name).",
107
+ "$ref": "#/$defs/__schema8"
108
+ },
109
+ "__schema8": {
110
+ "type": "string"
111
+ },
112
+ "__schema9": {
113
+ "description": "Hash of suite file contents (SHA-256 hex) for reproducibility.",
114
+ "$ref": "#/$defs/__schema10"
115
+ },
116
+ "__schema10": {
117
+ "type": "string"
118
+ },
119
+ "__schema11": {
120
+ "description": "Harness adapter that executed the run.",
121
+ "$ref": "#/$defs/HarnessInfo"
122
+ },
123
+ "__schema12": {
124
+ "type": "string",
125
+ "description": "Harness adapter id from suite YAML, e.g. claude-code.",
126
+ "examples": [
127
+ "claude-code"
128
+ ]
129
+ },
130
+ "__schema13": {
131
+ "description": "harness-eval package version when the envelope was built.",
132
+ "$ref": "#/$defs/__schema14"
133
+ },
134
+ "__schema14": {
135
+ "type": "string"
136
+ },
137
+ "__schema15": {
138
+ "description": "Optional harness CLI version string (e.g. claude -v output).",
139
+ "$ref": "#/$defs/__schema16"
140
+ },
141
+ "__schema16": {
142
+ "type": "string"
143
+ },
144
+ "HarnessInfo": {
145
+ "type": "object",
146
+ "properties": {
147
+ "adapter": {
148
+ "$ref": "#/$defs/__schema12"
149
+ },
150
+ "frameworkVersion": {
151
+ "$ref": "#/$defs/__schema13"
152
+ },
153
+ "harnessVersion": {
154
+ "$ref": "#/$defs/__schema15"
155
+ }
156
+ },
157
+ "required": [
158
+ "adapter"
159
+ ],
160
+ "additionalProperties": false,
161
+ "title": "HarnessInfo",
162
+ "description": "Harness adapter and version metadata for the run."
163
+ },
164
+ "__schema17": {
165
+ "description": "CI, git, and runtime provenance for correlation.",
166
+ "$ref": "#/$defs/EvalProvenance"
167
+ },
168
+ "EvalProvenance": {
169
+ "type": "object",
170
+ "properties": {
171
+ "runId": {
172
+ "$ref": "#/$defs/__schema18"
173
+ },
174
+ "ci": {
175
+ "$ref": "#/$defs/__schema20"
176
+ },
177
+ "git": {
178
+ "$ref": "#/$defs/__schema29"
179
+ },
180
+ "pluginVersion": {
181
+ "$ref": "#/$defs/__schema36"
182
+ },
183
+ "triggeredBy": {
184
+ "$ref": "#/$defs/__schema38"
185
+ }
186
+ },
187
+ "additionalProperties": {
188
+ "$ref": "#/$defs/__schema40"
189
+ },
190
+ "title": "EvalProvenance",
191
+ "description": "CI, git, and runtime provenance for DB correlation. Additional keys allowed."
192
+ },
193
+ "__schema18": {
194
+ "description": "Optional duplicate of envelope runId for nested provenance blobs.",
195
+ "$ref": "#/$defs/__schema19"
196
+ },
197
+ "__schema19": {
198
+ "type": "string"
199
+ },
200
+ "__schema20": {
201
+ "description": "CI metadata when run from a pipeline.",
202
+ "$ref": "#/$defs/CiProvenance"
203
+ },
204
+ "CiProvenance": {
205
+ "type": "object",
206
+ "properties": {
207
+ "provider": {
208
+ "$ref": "#/$defs/__schema21"
209
+ },
210
+ "jobId": {
211
+ "$ref": "#/$defs/__schema23"
212
+ },
213
+ "pipelineId": {
214
+ "$ref": "#/$defs/__schema25"
215
+ },
216
+ "url": {
217
+ "$ref": "#/$defs/__schema27"
218
+ }
219
+ },
220
+ "additionalProperties": false,
221
+ "title": "CiProvenance",
222
+ "description": "CI job metadata for correlating eval runs with pipelines."
223
+ },
224
+ "__schema21": {
225
+ "description": "CI provider name, e.g. github-actions or gitlab-ci.",
226
+ "$ref": "#/$defs/__schema22"
227
+ },
228
+ "__schema22": {
229
+ "type": "string"
230
+ },
231
+ "__schema23": {
232
+ "description": "CI job or run identifier.",
233
+ "$ref": "#/$defs/__schema24"
234
+ },
235
+ "__schema24": {
236
+ "type": "string"
237
+ },
238
+ "__schema25": {
239
+ "description": "CI pipeline or workflow identifier.",
240
+ "$ref": "#/$defs/__schema26"
241
+ },
242
+ "__schema26": {
243
+ "type": "string"
244
+ },
245
+ "__schema27": {
246
+ "description": "URL to the CI job run page.",
247
+ "$ref": "#/$defs/__schema28"
248
+ },
249
+ "__schema28": {
250
+ "type": "string"
251
+ },
252
+ "__schema29": {
253
+ "description": "Git metadata when run from a repo.",
254
+ "$ref": "#/$defs/GitProvenance"
255
+ },
256
+ "GitProvenance": {
257
+ "type": "object",
258
+ "properties": {
259
+ "commit": {
260
+ "$ref": "#/$defs/__schema30"
261
+ },
262
+ "branch": {
263
+ "$ref": "#/$defs/__schema32"
264
+ },
265
+ "repository": {
266
+ "$ref": "#/$defs/__schema34"
267
+ }
268
+ },
269
+ "additionalProperties": false,
270
+ "title": "GitProvenance",
271
+ "description": "Git coordinates for the code under evaluation."
272
+ },
273
+ "__schema30": {
274
+ "description": "Git commit SHA at time of run.",
275
+ "$ref": "#/$defs/__schema31"
276
+ },
277
+ "__schema31": {
278
+ "type": "string"
279
+ },
280
+ "__schema32": {
281
+ "description": "Git branch name.",
282
+ "$ref": "#/$defs/__schema33"
283
+ },
284
+ "__schema33": {
285
+ "type": "string"
286
+ },
287
+ "__schema34": {
288
+ "description": "Repository slug or URL, e.g. alis-build/harness-eval-ts.",
289
+ "$ref": "#/$defs/__schema35"
290
+ },
291
+ "__schema35": {
292
+ "type": "string"
293
+ },
294
+ "__schema36": {
295
+ "description": "Version of the plugin or MCP bundle under test.",
296
+ "$ref": "#/$defs/__schema37"
297
+ },
298
+ "__schema37": {
299
+ "type": "string"
300
+ },
301
+ "__schema38": {
302
+ "description": "Actor or trigger source, e.g. user id, schedule, or pull_request.",
303
+ "$ref": "#/$defs/__schema39"
304
+ },
305
+ "__schema39": {
306
+ "type": "string"
307
+ },
308
+ "__schema40": {},
309
+ "__schema41": {
310
+ "description": "Aggregate behavioral and outcome pass summary.",
311
+ "$ref": "#/$defs/EvalRunSummary"
312
+ },
313
+ "__schema42": {
314
+ "type": "integer",
315
+ "minimum": 0,
316
+ "maximum": 9007199254740991,
317
+ "description": "Number of matrix cells (case × config) in this run."
318
+ },
319
+ "__schema43": {
320
+ "type": "integer",
321
+ "minimum": 0,
322
+ "maximum": 9007199254740991,
323
+ "description": "Cells that passed all behavioral assertion thresholds."
324
+ },
325
+ "__schema44": {
326
+ "type": "boolean",
327
+ "description": "True when every cell passed behavioral assertion thresholds."
328
+ },
329
+ "__schema45": {
330
+ "description": "True when every graded cell passed all outcome expectations. Omitted if not graded.",
331
+ "$ref": "#/$defs/__schema46"
332
+ },
333
+ "__schema46": {
334
+ "type": "boolean"
335
+ },
336
+ "EvalRunSummary": {
337
+ "type": "object",
338
+ "properties": {
339
+ "cellsTotal": {
340
+ "$ref": "#/$defs/__schema42"
341
+ },
342
+ "cellsPassed": {
343
+ "$ref": "#/$defs/__schema43"
344
+ },
345
+ "behavioralPass": {
346
+ "$ref": "#/$defs/__schema44"
347
+ },
348
+ "outcomePass": {
349
+ "$ref": "#/$defs/__schema45"
350
+ }
351
+ },
352
+ "required": [
353
+ "cellsTotal",
354
+ "cellsPassed",
355
+ "behavioralPass"
356
+ ],
357
+ "additionalProperties": false,
358
+ "title": "EvalRunSummary",
359
+ "description": "Aggregate pass/fail summary for CI gates and dashboards."
360
+ },
361
+ "__schema47": {
362
+ "type": "array",
363
+ "items": {
364
+ "$ref": "#/$defs/EvalCellResult"
365
+ },
366
+ "description": "Results for each test case × matrix cell."
367
+ },
368
+ "EvalCellResult": {
369
+ "type": "object",
370
+ "properties": {
371
+ "caseId": {
372
+ "$ref": "#/$defs/__schema48"
373
+ },
374
+ "category": {
375
+ "$ref": "#/$defs/__schema49"
376
+ },
377
+ "notes": {
378
+ "$ref": "#/$defs/__schema51"
379
+ },
380
+ "prompt": {
381
+ "$ref": "#/$defs/__schema53"
382
+ },
383
+ "expectations": {
384
+ "$ref": "#/$defs/__schema55"
385
+ },
386
+ "reference_trajectory": {
387
+ "$ref": "#/$defs/__schema57"
388
+ },
389
+ "human_ratings": {
390
+ "$ref": "#/$defs/__schema61"
391
+ },
392
+ "cellLabel": {
393
+ "$ref": "#/$defs/__schema63"
394
+ },
395
+ "axes": {
396
+ "$ref": "#/$defs/__schema64"
397
+ },
398
+ "assertionStats": {
399
+ "$ref": "#/$defs/__schema66"
400
+ },
401
+ "adapterErrors": {
402
+ "$ref": "#/$defs/__schema73"
403
+ },
404
+ "behavioralPass": {
405
+ "$ref": "#/$defs/__schema74"
406
+ },
407
+ "outcomePass": {
408
+ "$ref": "#/$defs/__schema75"
409
+ },
410
+ "repetitions": {
411
+ "$ref": "#/$defs/__schema77"
412
+ }
413
+ },
414
+ "required": [
415
+ "caseId",
416
+ "cellLabel",
417
+ "assertionStats",
418
+ "adapterErrors",
419
+ "behavioralPass",
420
+ "repetitions"
421
+ ],
422
+ "additionalProperties": false,
423
+ "title": "EvalCellResult",
424
+ "description": "Result for one test case × matrix cell combination."
425
+ },
426
+ "__schema48": {
427
+ "type": "string",
428
+ "description": "Test case id from the suite YAML."
429
+ },
430
+ "__schema49": {
431
+ "description": "Optional case category for reporting.",
432
+ "$ref": "#/$defs/__schema50"
433
+ },
434
+ "__schema50": {
435
+ "type": "string"
436
+ },
437
+ "__schema51": {
438
+ "description": "Author notes copied from the suite.",
439
+ "$ref": "#/$defs/__schema52"
440
+ },
441
+ "__schema52": {
442
+ "type": "string"
443
+ },
444
+ "__schema53": {
445
+ "description": "Prompt sent to the harness for this case.",
446
+ "$ref": "#/$defs/__schema54"
447
+ },
448
+ "__schema54": {
449
+ "type": "string"
450
+ },
451
+ "__schema55": {
452
+ "description": "Natural-language outcome expectations for grading.",
453
+ "$ref": "#/$defs/__schema56"
454
+ },
455
+ "__schema56": {
456
+ "type": "array",
457
+ "items": {
458
+ "type": "string"
459
+ }
460
+ },
461
+ "__schema57": {
462
+ "description": "Reference tool-call trajectory for metric computation.",
463
+ "$ref": "#/$defs/__schema58"
464
+ },
465
+ "__schema58": {
466
+ "type": "array",
467
+ "items": {
468
+ "$ref": "#/$defs/TabularToolCall"
469
+ }
470
+ },
471
+ "TabularToolCall": {
472
+ "type": "object",
473
+ "properties": {
474
+ "tool_name": {
475
+ "$ref": "#/$defs/__schema59"
476
+ },
477
+ "tool_input": {
478
+ "$ref": "#/$defs/__schema60"
479
+ }
480
+ },
481
+ "required": [
482
+ "tool_name",
483
+ "tool_input"
484
+ ],
485
+ "additionalProperties": false,
486
+ "title": "TabularToolCall",
487
+ "description": "Tool call with structured tool_input for JSONL/tabular export."
488
+ },
489
+ "__schema59": {
490
+ "type": "string",
491
+ "description": "Tool name as emitted by the agent."
492
+ },
493
+ "__schema60": {
494
+ "description": "Tool arguments as a structured object for tabular consumption."
495
+ },
496
+ "__schema61": {
497
+ "description": "Human ratings keyed by metric name for judge calibration.",
498
+ "$ref": "#/$defs/__schema62"
499
+ },
500
+ "__schema62": {
501
+ "type": "object",
502
+ "propertyNames": {
503
+ "type": "string"
504
+ },
505
+ "additionalProperties": {
506
+ "type": "number"
507
+ }
508
+ },
509
+ "__schema63": {
510
+ "type": "string",
511
+ "description": "Matrix cell label, e.g. sonnet or opus-marketplace."
512
+ },
513
+ "__schema64": {
514
+ "description": "Matrix axis values for this cell (model, plugin source, etc.).",
515
+ "$ref": "#/$defs/__schema65"
516
+ },
517
+ "__schema65": {
518
+ "type": "object",
519
+ "propertyNames": {
520
+ "type": "string"
521
+ },
522
+ "additionalProperties": {
523
+ "type": "string"
524
+ }
525
+ },
526
+ "__schema66": {
527
+ "type": "array",
528
+ "items": {
529
+ "$ref": "#/$defs/EvalAssertionStat"
530
+ },
531
+ "description": "Per-assertion pass rates across repetitions in this cell."
532
+ },
533
+ "EvalAssertionStat": {
534
+ "type": "object",
535
+ "properties": {
536
+ "description": {
537
+ "$ref": "#/$defs/__schema67"
538
+ },
539
+ "threshold": {
540
+ "$ref": "#/$defs/__schema68"
541
+ },
542
+ "passedCount": {
543
+ "$ref": "#/$defs/__schema69"
544
+ },
545
+ "evaluatedCount": {
546
+ "$ref": "#/$defs/__schema70"
547
+ },
548
+ "passRate": {
549
+ "$ref": "#/$defs/__schema71"
550
+ },
551
+ "meetsThreshold": {
552
+ "$ref": "#/$defs/__schema72"
553
+ }
554
+ },
555
+ "required": [
556
+ "description",
557
+ "threshold",
558
+ "passedCount",
559
+ "evaluatedCount",
560
+ "passRate",
561
+ "meetsThreshold"
562
+ ],
563
+ "additionalProperties": false,
564
+ "title": "EvalAssertionStat",
565
+ "description": "Behavioral assertion statistics for one assertion in a matrix cell."
566
+ },
567
+ "__schema67": {
568
+ "type": "string",
569
+ "description": "Assertion description aggregated across repetitions."
570
+ },
571
+ "__schema68": {
572
+ "type": "number",
573
+ "minimum": 0,
574
+ "maximum": 1,
575
+ "description": "Minimum pass rate required across repetitions (0..1)."
576
+ },
577
+ "__schema69": {
578
+ "type": "integer",
579
+ "minimum": 0,
580
+ "maximum": 9007199254740991,
581
+ "description": "Repetitions where this assertion passed."
582
+ },
583
+ "__schema70": {
584
+ "type": "integer",
585
+ "minimum": 0,
586
+ "maximum": 9007199254740991,
587
+ "description": "Repetitions included in the denominator (excludes adapter errors)."
588
+ },
589
+ "__schema71": {
590
+ "type": "number",
591
+ "minimum": 0,
592
+ "maximum": 1,
593
+ "description": "Pass rate across repetitions: passedCount / evaluatedCount (0..1)."
594
+ },
595
+ "__schema72": {
596
+ "type": "boolean",
597
+ "description": "Whether passRate meets or exceeds threshold."
598
+ },
599
+ "__schema73": {
600
+ "type": "integer",
601
+ "minimum": 0,
602
+ "maximum": 9007199254740991,
603
+ "description": "Repetitions excluded from assertion denominators due to harness failure."
604
+ },
605
+ "__schema74": {
606
+ "type": "boolean",
607
+ "description": "Cell passed all behavioral assertion thresholds."
608
+ },
609
+ "__schema75": {
610
+ "description": "Cell passed all outcome expectations when graded. Omitted if not graded.",
611
+ "$ref": "#/$defs/__schema76"
612
+ },
613
+ "__schema76": {
614
+ "type": "boolean"
615
+ },
616
+ "__schema77": {
617
+ "type": "array",
618
+ "items": {
619
+ "$ref": "#/$defs/EvalRepetition"
620
+ },
621
+ "description": "Individual harness runs for statistical eval."
622
+ },
623
+ "EvalRepetition": {
624
+ "type": "object",
625
+ "properties": {
626
+ "repetitionIndex": {
627
+ "$ref": "#/$defs/__schema78"
628
+ },
629
+ "durationMs": {
630
+ "$ref": "#/$defs/__schema79"
631
+ },
632
+ "trajectory": {
633
+ "$ref": "#/$defs/__schema80"
634
+ },
635
+ "diagnostics": {
636
+ "$ref": "#/$defs/__schema123"
637
+ },
638
+ "assertionResults": {
639
+ "$ref": "#/$defs/__schema138"
640
+ },
641
+ "outcomeGrades": {
642
+ "$ref": "#/$defs/__schema147"
643
+ },
644
+ "externalScores": {
645
+ "$ref": "#/$defs/__schema171"
646
+ },
647
+ "artifacts": {
648
+ "$ref": "#/$defs/__schema181"
649
+ },
650
+ "predicted_trajectory": {
651
+ "$ref": "#/$defs/__schema188"
652
+ },
653
+ "agent_trace": {
654
+ "$ref": "#/$defs/__schema192"
655
+ },
656
+ "latency_in_seconds": {
657
+ "$ref": "#/$defs/__schema226"
658
+ },
659
+ "failure": {
660
+ "$ref": "#/$defs/__schema228"
661
+ },
662
+ "trajectoryMetrics": {
663
+ "$ref": "#/$defs/__schema230"
664
+ },
665
+ "toolCallMetrics": {
666
+ "$ref": "#/$defs/__schema237"
667
+ },
668
+ "error": {
669
+ "$ref": "#/$defs/__schema242"
670
+ }
671
+ },
672
+ "required": [
673
+ "repetitionIndex",
674
+ "durationMs",
675
+ "assertionResults"
676
+ ],
677
+ "additionalProperties": false,
678
+ "title": "EvalRepetition",
679
+ "description": "One harness invocation — the unit external judges and trajectory queries use."
680
+ },
681
+ "__schema78": {
682
+ "type": "integer",
683
+ "minimum": 0,
684
+ "maximum": 9007199254740991,
685
+ "description": "Zero-based index of this repetition within the cell."
686
+ },
687
+ "__schema79": {
688
+ "type": "integer",
689
+ "minimum": 0,
690
+ "maximum": 9007199254740991,
691
+ "description": "Wall time for this repetition in milliseconds."
692
+ },
693
+ "__schema80": {
694
+ "description": "Normalized harness session (TrajectoryView), present when the run completed with a view.",
695
+ "$ref": "#/$defs/TrajectoryViewExport"
696
+ },
697
+ "TrajectoryViewExport": {
698
+ "type": "object",
699
+ "properties": {
700
+ "meta": {
701
+ "$ref": "#/$defs/__schema81"
702
+ },
703
+ "toolCalls": {
704
+ "$ref": "#/$defs/__schema92"
705
+ },
706
+ "turns": {
707
+ "$ref": "#/$defs/__schema103"
708
+ },
709
+ "finalResponse": {
710
+ "$ref": "#/$defs/__schema109"
711
+ },
712
+ "finalStopReason": {
713
+ "$ref": "#/$defs/__schema110"
714
+ },
715
+ "usage": {
716
+ "$ref": "#/$defs/__schema112"
717
+ },
718
+ "retries": {
719
+ "$ref": "#/$defs/__schema118"
720
+ },
721
+ "success": {
722
+ "$ref": "#/$defs/__schema121"
723
+ },
724
+ "schemaVersion": {
725
+ "$ref": "#/$defs/__schema122"
726
+ }
727
+ },
728
+ "required": [
729
+ "meta",
730
+ "toolCalls",
731
+ "turns",
732
+ "finalResponse",
733
+ "finalStopReason",
734
+ "usage",
735
+ "retries",
736
+ "success",
737
+ "schemaVersion"
738
+ ],
739
+ "additionalProperties": false,
740
+ "title": "TrajectoryViewExport",
741
+ "description": "TrajectoryView with schemaVersion, as embedded in EvalRunEnvelope repetitions."
742
+ },
743
+ "__schema81": {
744
+ "description": "Session metadata from harness initialization.",
745
+ "$ref": "#/$defs/SessionMeta"
746
+ },
747
+ "__schema82": {
748
+ "type": "string",
749
+ "description": "Harness-assigned session identifier from the vendor stream."
750
+ },
751
+ "__schema83": {
752
+ "type": "string",
753
+ "description": "Model identifier used for the session, e.g. claude-sonnet-4-6."
754
+ },
755
+ "__schema84": {
756
+ "type": "string",
757
+ "description": "Working directory the harness used for the run."
758
+ },
759
+ "__schema85": {
760
+ "description": "Permission mode active for the session, when reported by the harness.",
761
+ "$ref": "#/$defs/__schema86"
762
+ },
763
+ "__schema86": {
764
+ "type": "string"
765
+ },
766
+ "__schema87": {
767
+ "type": "array",
768
+ "items": {
769
+ "$ref": "#/$defs/__schema88"
770
+ },
771
+ "description": "Tool names the harness reported as available at session start."
772
+ },
773
+ "__schema88": {
774
+ "type": "string"
775
+ },
776
+ "__schema89": {
777
+ "type": "array",
778
+ "items": {
779
+ "$ref": "#/$defs/McpServer"
780
+ },
781
+ "description": "MCP servers configured for the session, with connection status."
782
+ },
783
+ "McpServer": {
784
+ "type": "object",
785
+ "properties": {
786
+ "name": {
787
+ "$ref": "#/$defs/__schema90"
788
+ },
789
+ "status": {
790
+ "$ref": "#/$defs/__schema91"
791
+ }
792
+ },
793
+ "required": [
794
+ "name",
795
+ "status"
796
+ ],
797
+ "additionalProperties": false,
798
+ "title": "McpServer",
799
+ "description": "MCP server entry from session initialization metadata."
800
+ },
801
+ "__schema90": {
802
+ "type": "string",
803
+ "description": "MCP server name as reported by the harness."
804
+ },
805
+ "__schema91": {
806
+ "type": "string",
807
+ "description": "Connection status at session start, e.g. connected or failed."
808
+ },
809
+ "SessionMeta": {
810
+ "type": "object",
811
+ "properties": {
812
+ "sessionId": {
813
+ "$ref": "#/$defs/__schema82"
814
+ },
815
+ "model": {
816
+ "$ref": "#/$defs/__schema83"
817
+ },
818
+ "cwd": {
819
+ "$ref": "#/$defs/__schema84"
820
+ },
821
+ "permissionMode": {
822
+ "$ref": "#/$defs/__schema85"
823
+ },
824
+ "availableTools": {
825
+ "$ref": "#/$defs/__schema87"
826
+ },
827
+ "mcpServers": {
828
+ "$ref": "#/$defs/__schema89"
829
+ }
830
+ },
831
+ "required": [
832
+ "sessionId",
833
+ "model",
834
+ "cwd",
835
+ "availableTools",
836
+ "mcpServers"
837
+ ],
838
+ "additionalProperties": false,
839
+ "title": "SessionMeta",
840
+ "description": "Session metadata captured from harness initialization (e.g. Claude system/init)."
841
+ },
842
+ "__schema92": {
843
+ "type": "array",
844
+ "items": {
845
+ "$ref": "#/$defs/ToolCall"
846
+ },
847
+ "description": "Every tool call in global emission order."
848
+ },
849
+ "ToolCall": {
850
+ "type": "object",
851
+ "properties": {
852
+ "name": {
853
+ "$ref": "#/$defs/__schema93"
854
+ },
855
+ "namespace": {
856
+ "$ref": "#/$defs/__schema94"
857
+ },
858
+ "callId": {
859
+ "$ref": "#/$defs/__schema96"
860
+ },
861
+ "args": {
862
+ "$ref": "#/$defs/__schema97"
863
+ },
864
+ "result": {
865
+ "$ref": "#/$defs/__schema98"
866
+ },
867
+ "isError": {
868
+ "$ref": "#/$defs/__schema100"
869
+ },
870
+ "turnIndex": {
871
+ "$ref": "#/$defs/__schema101"
872
+ },
873
+ "callIndex": {
874
+ "$ref": "#/$defs/__schema102"
875
+ }
876
+ },
877
+ "required": [
878
+ "name",
879
+ "namespace",
880
+ "callId",
881
+ "args",
882
+ "result",
883
+ "isError",
884
+ "turnIndex",
885
+ "callIndex"
886
+ ],
887
+ "additionalProperties": false,
888
+ "title": "ToolCall",
889
+ "description": "One tool invocation in emission order. Primary unit for behavioral assertions."
890
+ },
891
+ "__schema93": {
892
+ "type": "string",
893
+ "description": "Fully-qualified tool name, e.g. mcp__plugin_alis-build_api__SearchSkills or Bash.",
894
+ "examples": [
895
+ "mcp__plugin_alis-build_api__SearchSkills",
896
+ "Bash"
897
+ ]
898
+ },
899
+ "__schema94": {
900
+ "anyOf": [
901
+ {
902
+ "$ref": "#/$defs/__schema95"
903
+ },
904
+ {
905
+ "type": "null"
906
+ }
907
+ ],
908
+ "description": "Namespace prefix for MCP-style names (mcp__<server>), or null for built-in tools.",
909
+ "examples": [
910
+ "mcp__plugin_alis-build_api",
911
+ null
912
+ ]
913
+ },
914
+ "__schema95": {
915
+ "type": "string"
916
+ },
917
+ "__schema96": {
918
+ "type": "string",
919
+ "description": "Vendor tool-use block id; matches a later tool_result.tool_use_id when present."
920
+ },
921
+ "__schema97": {
922
+ "description": "Arguments the model emitted for this tool call. Tool-specific schema."
923
+ },
924
+ "__schema98": {
925
+ "anyOf": [
926
+ {
927
+ "$ref": "#/$defs/__schema99"
928
+ },
929
+ {
930
+ "type": "null"
931
+ }
932
+ ],
933
+ "description": "Tool result payload, or null if no result was observed (e.g. process killed)."
934
+ },
935
+ "__schema99": {},
936
+ "__schema100": {
937
+ "type": "boolean",
938
+ "description": "Whether the tool reported an error in its result envelope."
939
+ },
940
+ "__schema101": {
941
+ "type": "integer",
942
+ "minimum": -9007199254740991,
943
+ "maximum": 9007199254740991,
944
+ "description": "Index of the assistant turn that produced this call. Parallel calls in one message share a turnIndex."
945
+ },
946
+ "__schema102": {
947
+ "type": "integer",
948
+ "minimum": -9007199254740991,
949
+ "maximum": 9007199254740991,
950
+ "description": "Index in the global ordered tool-call sequence (used for called_before assertions)."
951
+ },
952
+ "__schema103": {
953
+ "type": "array",
954
+ "items": {
955
+ "$ref": "#/$defs/AssistantTurn"
956
+ },
957
+ "description": "Assistant turns with per-turn text and tool calls."
958
+ },
959
+ "AssistantTurn": {
960
+ "type": "object",
961
+ "properties": {
962
+ "turnIndex": {
963
+ "$ref": "#/$defs/__schema104"
964
+ },
965
+ "text": {
966
+ "$ref": "#/$defs/__schema105"
967
+ },
968
+ "toolCalls": {
969
+ "$ref": "#/$defs/__schema106"
970
+ },
971
+ "stopReason": {
972
+ "$ref": "#/$defs/__schema107"
973
+ }
974
+ },
975
+ "required": [
976
+ "turnIndex",
977
+ "text",
978
+ "toolCalls",
979
+ "stopReason"
980
+ ],
981
+ "additionalProperties": false,
982
+ "title": "AssistantTurn",
983
+ "description": "One assistant turn: text content plus any tool calls in that turn."
984
+ },
985
+ "__schema104": {
986
+ "type": "integer",
987
+ "minimum": -9007199254740991,
988
+ "maximum": 9007199254740991,
989
+ "description": "Monotonic assistant turn index."
990
+ },
991
+ "__schema105": {
992
+ "type": "string",
993
+ "description": "Assistant text emitted in this turn (may be empty for tool-only turns)."
994
+ },
995
+ "__schema106": {
996
+ "type": "array",
997
+ "items": {
998
+ "$ref": "#/$defs/ToolCall"
999
+ },
1000
+ "description": "Tool calls emitted in this turn, in block order."
1001
+ },
1002
+ "__schema107": {
1003
+ "anyOf": [
1004
+ {
1005
+ "$ref": "#/$defs/__schema108"
1006
+ },
1007
+ {
1008
+ "type": "null"
1009
+ }
1010
+ ],
1011
+ "description": "Model stop reason for this turn, or null if not reported.",
1012
+ "examples": [
1013
+ "end_turn",
1014
+ "tool_use",
1015
+ null
1016
+ ]
1017
+ },
1018
+ "__schema108": {
1019
+ "type": "string"
1020
+ },
1021
+ "__schema109": {
1022
+ "type": "string",
1023
+ "description": "All assistant text concatenated across turns. Used for response_contains assertions."
1024
+ },
1025
+ "__schema110": {
1026
+ "anyOf": [
1027
+ {
1028
+ "$ref": "#/$defs/__schema111"
1029
+ },
1030
+ {
1031
+ "type": "null"
1032
+ }
1033
+ ],
1034
+ "description": "Stop reason of the last assistant turn, or null if not reported."
1035
+ },
1036
+ "__schema111": {
1037
+ "type": "string"
1038
+ },
1039
+ "__schema112": {
1040
+ "description": "Aggregate usage and cost for the session.",
1041
+ "$ref": "#/$defs/UsageSummary"
1042
+ },
1043
+ "__schema113": {
1044
+ "type": "number",
1045
+ "description": "Total input tokens for the session."
1046
+ },
1047
+ "__schema114": {
1048
+ "type": "number",
1049
+ "description": "Total output tokens for the session."
1050
+ },
1051
+ "__schema115": {
1052
+ "type": "number",
1053
+ "description": "Total session cost in USD when reported by the harness."
1054
+ },
1055
+ "__schema116": {
1056
+ "type": "number",
1057
+ "description": "Session duration in milliseconds from harness result metadata."
1058
+ },
1059
+ "__schema117": {
1060
+ "type": "number",
1061
+ "description": "Number of assistant turns in the session."
1062
+ },
1063
+ "UsageSummary": {
1064
+ "type": "object",
1065
+ "properties": {
1066
+ "inputTokens": {
1067
+ "$ref": "#/$defs/__schema113"
1068
+ },
1069
+ "outputTokens": {
1070
+ "$ref": "#/$defs/__schema114"
1071
+ },
1072
+ "totalCostUsd": {
1073
+ "$ref": "#/$defs/__schema115"
1074
+ },
1075
+ "durationMs": {
1076
+ "$ref": "#/$defs/__schema116"
1077
+ },
1078
+ "numTurns": {
1079
+ "$ref": "#/$defs/__schema117"
1080
+ }
1081
+ },
1082
+ "required": [
1083
+ "inputTokens",
1084
+ "outputTokens",
1085
+ "totalCostUsd",
1086
+ "durationMs",
1087
+ "numTurns"
1088
+ ],
1089
+ "additionalProperties": false,
1090
+ "title": "UsageSummary",
1091
+ "description": "Aggregate token usage, cost, and timing from the harness result."
1092
+ },
1093
+ "__schema118": {
1094
+ "type": "array",
1095
+ "items": {
1096
+ "$ref": "#/$defs/RetryRecord"
1097
+ },
1098
+ "description": "Retry events observed during the run."
1099
+ },
1100
+ "RetryRecord": {
1101
+ "type": "object",
1102
+ "properties": {
1103
+ "offsetMs": {
1104
+ "$ref": "#/$defs/__schema119"
1105
+ },
1106
+ "raw": {
1107
+ "$ref": "#/$defs/__schema120"
1108
+ }
1109
+ },
1110
+ "required": [
1111
+ "offsetMs",
1112
+ "raw"
1113
+ ],
1114
+ "additionalProperties": false,
1115
+ "title": "RetryRecord",
1116
+ "description": "Rate-limit or transient error retry observed during the run."
1117
+ },
1118
+ "__schema119": {
1119
+ "type": "number",
1120
+ "description": "Approximate milliseconds since session start when the retry was observed."
1121
+ },
1122
+ "__schema120": {
1123
+ "description": "Raw vendor payload from the retry event (e.g. system/api_retry)."
1124
+ },
1125
+ "__schema121": {
1126
+ "type": "boolean",
1127
+ "description": "Whether the harness result envelope indicated success."
1128
+ },
1129
+ "__schema122": {
1130
+ "type": "string",
1131
+ "const": "1.0",
1132
+ "description": "TrajectoryView schema version for storage and API interchange."
1133
+ },
1134
+ "__schema123": {
1135
+ "description": "Adapter process diagnostics for this repetition.",
1136
+ "$ref": "#/$defs/AdapterDiagnostics"
1137
+ },
1138
+ "AdapterDiagnostics": {
1139
+ "type": "object",
1140
+ "properties": {
1141
+ "exitCode": {
1142
+ "$ref": "#/$defs/__schema124"
1143
+ },
1144
+ "signal": {
1145
+ "$ref": "#/$defs/__schema126"
1146
+ },
1147
+ "stderr": {
1148
+ "$ref": "#/$defs/__schema128"
1149
+ },
1150
+ "parseErrors": {
1151
+ "$ref": "#/$defs/__schema130"
1152
+ },
1153
+ "timedOut": {
1154
+ "$ref": "#/$defs/__schema134"
1155
+ },
1156
+ "durationMs": {
1157
+ "$ref": "#/$defs/__schema136"
1158
+ }
1159
+ },
1160
+ "additionalProperties": false,
1161
+ "title": "AdapterDiagnostics",
1162
+ "description": "Process-level diagnostics from the harness adapter."
1163
+ },
1164
+ "__schema124": {
1165
+ "description": "Child process exit code, or null if not available.",
1166
+ "$ref": "#/$defs/__schema125"
1167
+ },
1168
+ "__schema125": {
1169
+ "anyOf": [
1170
+ {
1171
+ "type": "number"
1172
+ },
1173
+ {
1174
+ "type": "null"
1175
+ }
1176
+ ]
1177
+ },
1178
+ "__schema126": {
1179
+ "description": "Termination signal when the harness process was signaled, or null if not available.",
1180
+ "$ref": "#/$defs/__schema127"
1181
+ },
1182
+ "__schema127": {
1183
+ "anyOf": [
1184
+ {
1185
+ "type": "string"
1186
+ },
1187
+ {
1188
+ "type": "null"
1189
+ }
1190
+ ]
1191
+ },
1192
+ "__schema128": {
1193
+ "description": "Captured stderr from the harness process.",
1194
+ "$ref": "#/$defs/__schema129"
1195
+ },
1196
+ "__schema129": {
1197
+ "type": "string"
1198
+ },
1199
+ "__schema130": {
1200
+ "description": "Parse errors from adapter output handling.",
1201
+ "$ref": "#/$defs/__schema131"
1202
+ },
1203
+ "__schema131": {
1204
+ "type": "array",
1205
+ "items": {
1206
+ "$ref": "#/$defs/ParseErrorRecord"
1207
+ }
1208
+ },
1209
+ "ParseErrorRecord": {
1210
+ "type": "object",
1211
+ "properties": {
1212
+ "line": {
1213
+ "$ref": "#/$defs/__schema132"
1214
+ },
1215
+ "error": {
1216
+ "$ref": "#/$defs/__schema133"
1217
+ }
1218
+ },
1219
+ "required": [
1220
+ "line",
1221
+ "error"
1222
+ ],
1223
+ "additionalProperties": false,
1224
+ "title": "ParseErrorRecord",
1225
+ "description": "One stream-json or adapter output parse failure."
1226
+ },
1227
+ "__schema132": {
1228
+ "type": "string",
1229
+ "description": "Raw line from harness output that failed to parse."
1230
+ },
1231
+ "__schema133": {
1232
+ "type": "string",
1233
+ "description": "Parse error message."
1234
+ },
1235
+ "__schema134": {
1236
+ "description": "Whether the harness run hit the configured timeout.",
1237
+ "$ref": "#/$defs/__schema135"
1238
+ },
1239
+ "__schema135": {
1240
+ "type": "boolean"
1241
+ },
1242
+ "__schema136": {
1243
+ "description": "Harness process duration in milliseconds.",
1244
+ "$ref": "#/$defs/__schema137"
1245
+ },
1246
+ "__schema137": {
1247
+ "type": "number"
1248
+ },
1249
+ "__schema138": {
1250
+ "type": "array",
1251
+ "items": {
1252
+ "$ref": "#/$defs/__schema139"
1253
+ },
1254
+ "description": "Deterministic behavioral assertion results for this repetition."
1255
+ },
1256
+ "__schema139": {
1257
+ "$ref": "#/$defs/AssertionResult"
1258
+ },
1259
+ "AssertionResult": {
1260
+ "type": "object",
1261
+ "properties": {
1262
+ "passed": {
1263
+ "$ref": "#/$defs/__schema140"
1264
+ },
1265
+ "description": {
1266
+ "$ref": "#/$defs/__schema141"
1267
+ },
1268
+ "details": {
1269
+ "$ref": "#/$defs/__schema142"
1270
+ },
1271
+ "matches": {
1272
+ "$ref": "#/$defs/__schema143"
1273
+ },
1274
+ "children": {
1275
+ "$ref": "#/$defs/__schema145"
1276
+ }
1277
+ },
1278
+ "required": [
1279
+ "passed",
1280
+ "description",
1281
+ "details"
1282
+ ],
1283
+ "additionalProperties": false,
1284
+ "title": "AssertionResult",
1285
+ "description": "Result of evaluating one assertion, optionally with child nodes."
1286
+ },
1287
+ "__schema140": {
1288
+ "type": "boolean",
1289
+ "description": "Whether this assertion node passed."
1290
+ },
1291
+ "__schema141": {
1292
+ "type": "string",
1293
+ "description": "Short human-readable name, e.g. called(mcp__api__SearchSkills, >= 1)."
1294
+ },
1295
+ "__schema142": {
1296
+ "type": "string",
1297
+ "description": "Diagnostic detail explaining pass or fail."
1298
+ },
1299
+ "__schema143": {
1300
+ "description": "Tool calls that satisfied (or could have satisfied) this assertion.",
1301
+ "$ref": "#/$defs/__schema144"
1302
+ },
1303
+ "__schema144": {
1304
+ "type": "array",
1305
+ "items": {
1306
+ "$ref": "#/$defs/ToolCall"
1307
+ }
1308
+ },
1309
+ "__schema145": {
1310
+ "description": "Sub-results for compound assertions (and/or/not).",
1311
+ "$ref": "#/$defs/__schema146"
1312
+ },
1313
+ "__schema146": {
1314
+ "type": "array",
1315
+ "items": {
1316
+ "$ref": "#/$defs/__schema139"
1317
+ }
1318
+ },
1319
+ "__schema147": {
1320
+ "description": "LLM or custom judge outcome grades when grading was run.",
1321
+ "$ref": "#/$defs/OutcomeGrades"
1322
+ },
1323
+ "OutcomeGrades": {
1324
+ "type": "object",
1325
+ "properties": {
1326
+ "judge": {
1327
+ "$ref": "#/$defs/__schema148"
1328
+ },
1329
+ "expectations": {
1330
+ "$ref": "#/$defs/__schema154"
1331
+ },
1332
+ "summary": {
1333
+ "$ref": "#/$defs/__schema158"
1334
+ },
1335
+ "evalFeedback": {
1336
+ "$ref": "#/$defs/__schema163"
1337
+ },
1338
+ "error": {
1339
+ "$ref": "#/$defs/__schema169"
1340
+ }
1341
+ },
1342
+ "required": [
1343
+ "judge",
1344
+ "expectations",
1345
+ "summary"
1346
+ ],
1347
+ "additionalProperties": false,
1348
+ "title": "OutcomeGrades",
1349
+ "description": "Outcome grades for one repetition from the built-in or external LLM judge."
1350
+ },
1351
+ "__schema148": {
1352
+ "description": "Judge that produced these grades.",
1353
+ "$ref": "#/$defs/JudgeInfo"
1354
+ },
1355
+ "__schema149": {
1356
+ "type": "string",
1357
+ "description": "Stable judge identifier, e.g. harness-eval/claude-grader or langsmith/my-judge."
1358
+ },
1359
+ "__schema150": {
1360
+ "description": "Model used by the judge, when applicable.",
1361
+ "$ref": "#/$defs/__schema151"
1362
+ },
1363
+ "__schema151": {
1364
+ "type": "string"
1365
+ },
1366
+ "__schema152": {
1367
+ "description": "Judge or grader package version.",
1368
+ "$ref": "#/$defs/__schema153"
1369
+ },
1370
+ "__schema153": {
1371
+ "type": "string"
1372
+ },
1373
+ "JudgeInfo": {
1374
+ "type": "object",
1375
+ "properties": {
1376
+ "id": {
1377
+ "$ref": "#/$defs/__schema149"
1378
+ },
1379
+ "model": {
1380
+ "$ref": "#/$defs/__schema150"
1381
+ },
1382
+ "version": {
1383
+ "$ref": "#/$defs/__schema152"
1384
+ }
1385
+ },
1386
+ "required": [
1387
+ "id"
1388
+ ],
1389
+ "additionalProperties": false,
1390
+ "title": "JudgeInfo",
1391
+ "description": "Identity of the judge that produced outcome grades."
1392
+ },
1393
+ "__schema154": {
1394
+ "type": "array",
1395
+ "items": {
1396
+ "$ref": "#/$defs/GradedExpectation"
1397
+ },
1398
+ "description": "Per-expectation pass/fail with evidence."
1399
+ },
1400
+ "GradedExpectation": {
1401
+ "type": "object",
1402
+ "properties": {
1403
+ "text": {
1404
+ "$ref": "#/$defs/__schema155"
1405
+ },
1406
+ "passed": {
1407
+ "$ref": "#/$defs/__schema156"
1408
+ },
1409
+ "evidence": {
1410
+ "$ref": "#/$defs/__schema157"
1411
+ }
1412
+ },
1413
+ "required": [
1414
+ "text",
1415
+ "passed",
1416
+ "evidence"
1417
+ ],
1418
+ "additionalProperties": false,
1419
+ "title": "GradedExpectation",
1420
+ "description": "Outcome grade for one natural-language expectation."
1421
+ },
1422
+ "__schema155": {
1423
+ "type": "string",
1424
+ "description": "Natural-language expectation that was graded."
1425
+ },
1426
+ "__schema156": {
1427
+ "type": "boolean",
1428
+ "description": "Whether the judge deemed this expectation satisfied."
1429
+ },
1430
+ "__schema157": {
1431
+ "type": "string",
1432
+ "description": "Judge explanation citing transcript or trajectory evidence."
1433
+ },
1434
+ "__schema158": {
1435
+ "description": "Aggregate pass/fail counts.",
1436
+ "$ref": "#/$defs/GradingSummary"
1437
+ },
1438
+ "__schema159": {
1439
+ "type": "integer",
1440
+ "minimum": 0,
1441
+ "maximum": 9007199254740991,
1442
+ "description": "Count of expectations that passed."
1443
+ },
1444
+ "__schema160": {
1445
+ "type": "integer",
1446
+ "minimum": 0,
1447
+ "maximum": 9007199254740991,
1448
+ "description": "Count of expectations that failed."
1449
+ },
1450
+ "__schema161": {
1451
+ "type": "integer",
1452
+ "minimum": 0,
1453
+ "maximum": 9007199254740991,
1454
+ "description": "Total expectations graded."
1455
+ },
1456
+ "__schema162": {
1457
+ "type": "number",
1458
+ "minimum": 0,
1459
+ "maximum": 1,
1460
+ "description": "Fraction of expectations that passed (0..1)."
1461
+ },
1462
+ "GradingSummary": {
1463
+ "type": "object",
1464
+ "properties": {
1465
+ "passed": {
1466
+ "$ref": "#/$defs/__schema159"
1467
+ },
1468
+ "failed": {
1469
+ "$ref": "#/$defs/__schema160"
1470
+ },
1471
+ "total": {
1472
+ "$ref": "#/$defs/__schema161"
1473
+ },
1474
+ "passRate": {
1475
+ "$ref": "#/$defs/__schema162"
1476
+ }
1477
+ },
1478
+ "required": [
1479
+ "passed",
1480
+ "failed",
1481
+ "total",
1482
+ "passRate"
1483
+ ],
1484
+ "additionalProperties": false,
1485
+ "title": "GradingSummary",
1486
+ "description": "Aggregate counts for a set of graded expectations."
1487
+ },
1488
+ "__schema163": {
1489
+ "description": "Optional structured feedback for suite authors.",
1490
+ "$ref": "#/$defs/EvalFeedback"
1491
+ },
1492
+ "EvalFeedback": {
1493
+ "type": "object",
1494
+ "properties": {
1495
+ "suggestions": {
1496
+ "$ref": "#/$defs/__schema164"
1497
+ },
1498
+ "overall": {
1499
+ "$ref": "#/$defs/__schema168"
1500
+ }
1501
+ },
1502
+ "required": [
1503
+ "suggestions",
1504
+ "overall"
1505
+ ],
1506
+ "additionalProperties": false,
1507
+ "title": "EvalFeedback",
1508
+ "description": "Structured eval feedback from the outcome judge."
1509
+ },
1510
+ "__schema164": {
1511
+ "type": "array",
1512
+ "items": {
1513
+ "$ref": "#/$defs/EvalFeedbackSuggestion"
1514
+ },
1515
+ "description": "Per-item suggestions for improving the suite or expectations."
1516
+ },
1517
+ "EvalFeedbackSuggestion": {
1518
+ "type": "object",
1519
+ "properties": {
1520
+ "assertion": {
1521
+ "$ref": "#/$defs/__schema165"
1522
+ },
1523
+ "reason": {
1524
+ "$ref": "#/$defs/__schema167"
1525
+ }
1526
+ },
1527
+ "required": [
1528
+ "reason"
1529
+ ],
1530
+ "additionalProperties": false,
1531
+ "title": "EvalFeedbackSuggestion",
1532
+ "description": "Actionable suggestion from eval feedback."
1533
+ },
1534
+ "__schema165": {
1535
+ "description": "Related assertion or expectation text, when the suggestion targets one.",
1536
+ "$ref": "#/$defs/__schema166"
1537
+ },
1538
+ "__schema166": {
1539
+ "type": "string"
1540
+ },
1541
+ "__schema167": {
1542
+ "type": "string",
1543
+ "description": "Why the judge suggests changing the suite or assertions."
1544
+ },
1545
+ "__schema168": {
1546
+ "type": "string",
1547
+ "description": "Overall narrative feedback from the judge."
1548
+ },
1549
+ "__schema169": {
1550
+ "description": "Error message when grading failed for this repetition.",
1551
+ "$ref": "#/$defs/__schema170"
1552
+ },
1553
+ "__schema170": {
1554
+ "type": "string"
1555
+ },
1556
+ "__schema171": {
1557
+ "description": "Scores from external eval frameworks (LangSmith, Braintrust, etc.).",
1558
+ "$ref": "#/$defs/__schema172"
1559
+ },
1560
+ "__schema172": {
1561
+ "type": "array",
1562
+ "items": {
1563
+ "$ref": "#/$defs/ExternalScore"
1564
+ }
1565
+ },
1566
+ "ExternalScore": {
1567
+ "type": "object",
1568
+ "properties": {
1569
+ "source": {
1570
+ "$ref": "#/$defs/__schema173"
1571
+ },
1572
+ "metric": {
1573
+ "$ref": "#/$defs/__schema174"
1574
+ },
1575
+ "value": {
1576
+ "$ref": "#/$defs/__schema175"
1577
+ },
1578
+ "metadata": {
1579
+ "$ref": "#/$defs/__schema179"
1580
+ }
1581
+ },
1582
+ "required": [
1583
+ "source",
1584
+ "metric",
1585
+ "value"
1586
+ ],
1587
+ "additionalProperties": false,
1588
+ "title": "ExternalScore",
1589
+ "description": "Score attached from an external eval platform without replacing OutcomeGrades."
1590
+ },
1591
+ "__schema173": {
1592
+ "type": "string",
1593
+ "description": "External framework identifier, e.g. langsmith or braintrust."
1594
+ },
1595
+ "__schema174": {
1596
+ "type": "string",
1597
+ "description": "Metric name within the external framework."
1598
+ },
1599
+ "__schema175": {
1600
+ "anyOf": [
1601
+ {
1602
+ "$ref": "#/$defs/__schema176"
1603
+ },
1604
+ {
1605
+ "$ref": "#/$defs/__schema177"
1606
+ },
1607
+ {
1608
+ "$ref": "#/$defs/__schema178"
1609
+ }
1610
+ ],
1611
+ "description": "Metric value (numeric score, boolean pass, or categorical label)."
1612
+ },
1613
+ "__schema176": {
1614
+ "type": "number"
1615
+ },
1616
+ "__schema177": {
1617
+ "type": "boolean"
1618
+ },
1619
+ "__schema178": {
1620
+ "type": "string"
1621
+ },
1622
+ "__schema179": {
1623
+ "description": "Framework-specific metadata (run ids, trace urls, etc.).",
1624
+ "$ref": "#/$defs/__schema180"
1625
+ },
1626
+ "__schema180": {
1627
+ "type": "object",
1628
+ "propertyNames": {
1629
+ "type": "string"
1630
+ },
1631
+ "additionalProperties": {}
1632
+ },
1633
+ "__schema181": {
1634
+ "description": "Optional transcript, raw stream, or OTLP URI artifacts.",
1635
+ "$ref": "#/$defs/EvalArtifacts"
1636
+ },
1637
+ "EvalArtifacts": {
1638
+ "type": "object",
1639
+ "properties": {
1640
+ "rawStreamEvents": {
1641
+ "$ref": "#/$defs/__schema182"
1642
+ },
1643
+ "otlpTraceUri": {
1644
+ "$ref": "#/$defs/__schema184"
1645
+ },
1646
+ "transcript": {
1647
+ "$ref": "#/$defs/__schema186"
1648
+ }
1649
+ },
1650
+ "additionalProperties": false,
1651
+ "title": "EvalArtifacts",
1652
+ "description": "Optional large or vendor-specific blobs. Store by reference in DB when possible."
1653
+ },
1654
+ "__schema182": {
1655
+ "description": "Claude Code stream-json lines — debug only, not cross-harness. Prefer transcript for judges.",
1656
+ "$ref": "#/$defs/__schema183"
1657
+ },
1658
+ "__schema183": {
1659
+ "type": "array",
1660
+ "items": {}
1661
+ },
1662
+ "__schema184": {
1663
+ "description": "URI to an OTLP trace blob (S3, GCS, etc.) when exported separately.",
1664
+ "$ref": "#/$defs/__schema185"
1665
+ },
1666
+ "__schema185": {
1667
+ "type": "string"
1668
+ },
1669
+ "__schema186": {
1670
+ "description": "Text transcript for judges (trajectoryToTranscript output).",
1671
+ "$ref": "#/$defs/__schema187"
1672
+ },
1673
+ "__schema187": {
1674
+ "type": "string"
1675
+ },
1676
+ "__schema188": {
1677
+ "description": "Predicted tool-call trajectory in interchange wire format.",
1678
+ "$ref": "#/$defs/__schema189"
1679
+ },
1680
+ "__schema189": {
1681
+ "type": "array",
1682
+ "items": {
1683
+ "$ref": "#/$defs/InterchangeToolCall"
1684
+ }
1685
+ },
1686
+ "InterchangeToolCall": {
1687
+ "type": "object",
1688
+ "properties": {
1689
+ "tool_name": {
1690
+ "$ref": "#/$defs/__schema190"
1691
+ },
1692
+ "tool_input": {
1693
+ "$ref": "#/$defs/__schema191"
1694
+ }
1695
+ },
1696
+ "required": [
1697
+ "tool_name",
1698
+ "tool_input"
1699
+ ],
1700
+ "additionalProperties": false,
1701
+ "title": "InterchangeToolCall",
1702
+ "description": "Tool call in interchange wire format."
1703
+ },
1704
+ "__schema190": {
1705
+ "type": "string",
1706
+ "description": "Tool name as emitted by the agent."
1707
+ },
1708
+ "__schema191": {
1709
+ "type": "string",
1710
+ "description": "JSON-serialized tool arguments (wire format)."
1711
+ },
1712
+ "__schema192": {
1713
+ "description": "Full multi-turn agent trace in interchange format.",
1714
+ "$ref": "#/$defs/AgentTrace"
1715
+ },
1716
+ "AgentTrace": {
1717
+ "type": "object",
1718
+ "properties": {
1719
+ "agents": {
1720
+ "$ref": "#/$defs/__schema193"
1721
+ },
1722
+ "turns": {
1723
+ "$ref": "#/$defs/__schema206"
1724
+ }
1725
+ },
1726
+ "required": [
1727
+ "agents",
1728
+ "turns"
1729
+ ],
1730
+ "additionalProperties": false,
1731
+ "title": "AgentTrace",
1732
+ "description": "Full multi-turn agent execution trace."
1733
+ },
1734
+ "__schema193": {
1735
+ "type": "object",
1736
+ "propertyNames": {
1737
+ "$ref": "#/$defs/__schema194"
1738
+ },
1739
+ "additionalProperties": {
1740
+ "$ref": "#/$defs/AgentConfig"
1741
+ },
1742
+ "description": "Agent configurations keyed by agent id."
1743
+ },
1744
+ "__schema194": {
1745
+ "type": "string"
1746
+ },
1747
+ "AgentConfig": {
1748
+ "type": "object",
1749
+ "properties": {
1750
+ "agent_id": {
1751
+ "$ref": "#/$defs/__schema195"
1752
+ },
1753
+ "agent_type": {
1754
+ "$ref": "#/$defs/__schema196"
1755
+ },
1756
+ "description": {
1757
+ "$ref": "#/$defs/__schema198"
1758
+ },
1759
+ "instruction": {
1760
+ "$ref": "#/$defs/__schema200"
1761
+ },
1762
+ "tools": {
1763
+ "$ref": "#/$defs/__schema202"
1764
+ },
1765
+ "sub_agents": {
1766
+ "$ref": "#/$defs/__schema204"
1767
+ }
1768
+ },
1769
+ "required": [
1770
+ "agent_id"
1771
+ ],
1772
+ "additionalProperties": false,
1773
+ "title": "AgentConfig",
1774
+ "description": "Static configuration for one agent in a trace."
1775
+ },
1776
+ "__schema195": {
1777
+ "type": "string",
1778
+ "description": "Stable agent identifier."
1779
+ },
1780
+ "__schema196": {
1781
+ "description": "Agent type or role.",
1782
+ "$ref": "#/$defs/__schema197"
1783
+ },
1784
+ "__schema197": {
1785
+ "type": "string"
1786
+ },
1787
+ "__schema198": {
1788
+ "description": "Human-readable agent description.",
1789
+ "$ref": "#/$defs/__schema199"
1790
+ },
1791
+ "__schema199": {
1792
+ "type": "string"
1793
+ },
1794
+ "__schema200": {
1795
+ "description": "System instruction for the agent.",
1796
+ "$ref": "#/$defs/__schema201"
1797
+ },
1798
+ "__schema201": {
1799
+ "type": "string"
1800
+ },
1801
+ "__schema202": {
1802
+ "description": "Tools available to this agent.",
1803
+ "$ref": "#/$defs/__schema203"
1804
+ },
1805
+ "__schema203": {
1806
+ "type": "array",
1807
+ "items": {
1808
+ "type": "object",
1809
+ "properties": {
1810
+ "name": {
1811
+ "type": "string",
1812
+ "description": "Tool name."
1813
+ }
1814
+ },
1815
+ "required": [
1816
+ "name"
1817
+ ],
1818
+ "additionalProperties": false
1819
+ }
1820
+ },
1821
+ "__schema204": {
1822
+ "description": "Sub-agent identifiers when using multi-agent setups.",
1823
+ "$ref": "#/$defs/__schema205"
1824
+ },
1825
+ "__schema205": {
1826
+ "type": "array",
1827
+ "items": {
1828
+ "type": "string"
1829
+ }
1830
+ },
1831
+ "__schema206": {
1832
+ "type": "array",
1833
+ "items": {
1834
+ "$ref": "#/$defs/ConversationTurn"
1835
+ },
1836
+ "description": "Chronological conversation turns."
1837
+ },
1838
+ "ConversationTurn": {
1839
+ "type": "object",
1840
+ "properties": {
1841
+ "turn_index": {
1842
+ "$ref": "#/$defs/__schema207"
1843
+ },
1844
+ "turn_id": {
1845
+ "$ref": "#/$defs/__schema208"
1846
+ },
1847
+ "events": {
1848
+ "$ref": "#/$defs/__schema210"
1849
+ }
1850
+ },
1851
+ "required": [
1852
+ "turn_index",
1853
+ "events"
1854
+ ],
1855
+ "additionalProperties": false,
1856
+ "title": "ConversationTurn",
1857
+ "description": "One turn in a multi-turn agent conversation."
1858
+ },
1859
+ "__schema207": {
1860
+ "type": "integer",
1861
+ "minimum": 0,
1862
+ "maximum": 9007199254740991,
1863
+ "description": "Zero-based turn index."
1864
+ },
1865
+ "__schema208": {
1866
+ "description": "Optional stable turn identifier.",
1867
+ "$ref": "#/$defs/__schema209"
1868
+ },
1869
+ "__schema209": {
1870
+ "type": "string"
1871
+ },
1872
+ "__schema210": {
1873
+ "type": "array",
1874
+ "items": {
1875
+ "$ref": "#/$defs/AgentEvent"
1876
+ },
1877
+ "description": "Events in chronological order."
1878
+ },
1879
+ "AgentEvent": {
1880
+ "type": "object",
1881
+ "properties": {
1882
+ "author": {
1883
+ "$ref": "#/$defs/__schema211"
1884
+ },
1885
+ "content": {
1886
+ "$ref": "#/$defs/__schema212"
1887
+ },
1888
+ "event_time": {
1889
+ "$ref": "#/$defs/__schema220"
1890
+ },
1891
+ "state_delta": {
1892
+ "$ref": "#/$defs/__schema222"
1893
+ },
1894
+ "active_tools": {
1895
+ "$ref": "#/$defs/__schema224"
1896
+ }
1897
+ },
1898
+ "required": [
1899
+ "author",
1900
+ "content"
1901
+ ],
1902
+ "additionalProperties": false,
1903
+ "title": "AgentEvent",
1904
+ "description": "One event in a multi-turn agent trace."
1905
+ },
1906
+ "__schema211": {
1907
+ "type": "string",
1908
+ "description": "Agent id or user identifier for this event."
1909
+ },
1910
+ "__schema212": {
1911
+ "type": "object",
1912
+ "properties": {
1913
+ "parts": {
1914
+ "$ref": "#/$defs/__schema213"
1915
+ }
1916
+ },
1917
+ "required": [
1918
+ "parts"
1919
+ ],
1920
+ "additionalProperties": false,
1921
+ "description": "Structured event content."
1922
+ },
1923
+ "__schema213": {
1924
+ "type": "array",
1925
+ "items": {
1926
+ "$ref": "#/$defs/ContentPart"
1927
+ },
1928
+ "description": "Content parts for this event."
1929
+ },
1930
+ "ContentPart": {
1931
+ "type": "object",
1932
+ "properties": {
1933
+ "text": {
1934
+ "$ref": "#/$defs/__schema214"
1935
+ },
1936
+ "function_call": {
1937
+ "$ref": "#/$defs/__schema216"
1938
+ },
1939
+ "function_response": {
1940
+ "$ref": "#/$defs/__schema218"
1941
+ }
1942
+ },
1943
+ "additionalProperties": false,
1944
+ "title": "ContentPart",
1945
+ "description": "One part of agent event content (text, function_call, or function_response)."
1946
+ },
1947
+ "__schema214": {
1948
+ "description": "Plain text content.",
1949
+ "$ref": "#/$defs/__schema215"
1950
+ },
1951
+ "__schema215": {
1952
+ "type": "string"
1953
+ },
1954
+ "__schema216": {
1955
+ "description": "Function call emitted by the agent.",
1956
+ "$ref": "#/$defs/__schema217"
1957
+ },
1958
+ "__schema217": {
1959
+ "type": "object",
1960
+ "properties": {
1961
+ "name": {
1962
+ "type": "string",
1963
+ "description": "Function or tool name."
1964
+ },
1965
+ "args": {
1966
+ "description": "Function arguments."
1967
+ }
1968
+ },
1969
+ "required": [
1970
+ "name",
1971
+ "args"
1972
+ ],
1973
+ "additionalProperties": false
1974
+ },
1975
+ "__schema218": {
1976
+ "description": "Function response from tool execution.",
1977
+ "$ref": "#/$defs/__schema219"
1978
+ },
1979
+ "__schema219": {
1980
+ "type": "object",
1981
+ "properties": {
1982
+ "name": {
1983
+ "type": "string",
1984
+ "description": "Function or tool name."
1985
+ },
1986
+ "response": {
1987
+ "description": "Function result payload."
1988
+ }
1989
+ },
1990
+ "required": [
1991
+ "name",
1992
+ "response"
1993
+ ],
1994
+ "additionalProperties": false
1995
+ },
1996
+ "__schema220": {
1997
+ "description": "ISO 8601 timestamp when the event occurred.",
1998
+ "$ref": "#/$defs/__schema221"
1999
+ },
2000
+ "__schema221": {
2001
+ "type": "string"
2002
+ },
2003
+ "__schema222": {
2004
+ "description": "Session state changes associated with this event.",
2005
+ "$ref": "#/$defs/__schema223"
2006
+ },
2007
+ "__schema223": {
2008
+ "type": "object",
2009
+ "propertyNames": {
2010
+ "type": "string"
2011
+ },
2012
+ "additionalProperties": {}
2013
+ },
2014
+ "__schema224": {
2015
+ "description": "Tools available to the agent at event time.",
2016
+ "$ref": "#/$defs/__schema225"
2017
+ },
2018
+ "__schema225": {
2019
+ "type": "array",
2020
+ "items": {
2021
+ "type": "object",
2022
+ "properties": {
2023
+ "name": {
2024
+ "type": "string",
2025
+ "description": "Tool name."
2026
+ }
2027
+ },
2028
+ "required": [
2029
+ "name"
2030
+ ],
2031
+ "additionalProperties": false
2032
+ }
2033
+ },
2034
+ "__schema226": {
2035
+ "description": "Session latency in seconds (interchange field).",
2036
+ "$ref": "#/$defs/__schema227"
2037
+ },
2038
+ "__schema227": {
2039
+ "type": "number"
2040
+ },
2041
+ "__schema228": {
2042
+ "description": "1 when the harness run failed, 0 on success.",
2043
+ "$ref": "#/$defs/__schema229"
2044
+ },
2045
+ "__schema229": {
2046
+ "anyOf": [
2047
+ {
2048
+ "type": "number",
2049
+ "const": 0
2050
+ },
2051
+ {
2052
+ "type": "number",
2053
+ "const": 1
2054
+ }
2055
+ ]
2056
+ },
2057
+ "__schema230": {
2058
+ "description": "Trajectory-level metrics when reference_trajectory is provided.",
2059
+ "$ref": "#/$defs/TrajectoryMetrics"
2060
+ },
2061
+ "TrajectoryMetrics": {
2062
+ "type": "object",
2063
+ "properties": {
2064
+ "trajectory_exact_match": {
2065
+ "$ref": "#/$defs/__schema231"
2066
+ },
2067
+ "trajectory_in_order_match": {
2068
+ "$ref": "#/$defs/__schema232"
2069
+ },
2070
+ "trajectory_any_order_match": {
2071
+ "$ref": "#/$defs/__schema233"
2072
+ },
2073
+ "trajectory_precision": {
2074
+ "$ref": "#/$defs/__schema234"
2075
+ },
2076
+ "trajectory_recall": {
2077
+ "$ref": "#/$defs/__schema235"
2078
+ },
2079
+ "trajectory_single_tool_use": {
2080
+ "$ref": "#/$defs/__schema236"
2081
+ }
2082
+ },
2083
+ "required": [
2084
+ "trajectory_exact_match",
2085
+ "trajectory_in_order_match",
2086
+ "trajectory_any_order_match",
2087
+ "trajectory_precision",
2088
+ "trajectory_recall",
2089
+ "trajectory_single_tool_use"
2090
+ ],
2091
+ "additionalProperties": false,
2092
+ "title": "TrajectoryMetrics",
2093
+ "description": "Trajectory-level metric scores for one repetition."
2094
+ },
2095
+ "__schema231": {
2096
+ "type": "number",
2097
+ "description": "Exact trajectory match score (0 or 1)."
2098
+ },
2099
+ "__schema232": {
2100
+ "type": "number",
2101
+ "description": "In-order trajectory match score (0 or 1)."
2102
+ },
2103
+ "__schema233": {
2104
+ "type": "number",
2105
+ "description": "Any-order trajectory match score (0 or 1)."
2106
+ },
2107
+ "__schema234": {
2108
+ "type": "number",
2109
+ "description": "Trajectory precision (0..1)."
2110
+ },
2111
+ "__schema235": {
2112
+ "type": "number",
2113
+ "description": "Trajectory recall (0..1)."
2114
+ },
2115
+ "__schema236": {
2116
+ "type": "number",
2117
+ "description": "Single-tool-use match score (0 or 1)."
2118
+ },
2119
+ "__schema237": {
2120
+ "description": "Tool-call-level metrics when reference_trajectory is provided.",
2121
+ "$ref": "#/$defs/ToolCallMetrics"
2122
+ },
2123
+ "ToolCallMetrics": {
2124
+ "type": "object",
2125
+ "properties": {
2126
+ "tool_call_valid": {
2127
+ "$ref": "#/$defs/__schema238"
2128
+ },
2129
+ "tool_name_match": {
2130
+ "$ref": "#/$defs/__schema239"
2131
+ },
2132
+ "tool_parameter_key_match": {
2133
+ "$ref": "#/$defs/__schema240"
2134
+ },
2135
+ "tool_parameter_kv_match": {
2136
+ "$ref": "#/$defs/__schema241"
2137
+ }
2138
+ },
2139
+ "required": [
2140
+ "tool_call_valid",
2141
+ "tool_name_match",
2142
+ "tool_parameter_key_match",
2143
+ "tool_parameter_kv_match"
2144
+ ],
2145
+ "additionalProperties": false,
2146
+ "title": "ToolCallMetrics",
2147
+ "description": "Tool-call-level metric scores for one repetition."
2148
+ },
2149
+ "__schema238": {
2150
+ "type": "number",
2151
+ "description": "Tool call validity score (0..1)."
2152
+ },
2153
+ "__schema239": {
2154
+ "type": "number",
2155
+ "description": "Tool name match score (0..1)."
2156
+ },
2157
+ "__schema240": {
2158
+ "type": "number",
2159
+ "description": "Tool parameter key match score (0..1)."
2160
+ },
2161
+ "__schema241": {
2162
+ "type": "number",
2163
+ "description": "Tool parameter key-value match score (0..1)."
2164
+ },
2165
+ "__schema242": {
2166
+ "description": "Present when the harness failed without producing a trajectory.",
2167
+ "$ref": "#/$defs/RepetitionError"
2168
+ },
2169
+ "RepetitionError": {
2170
+ "type": "object",
2171
+ "properties": {
2172
+ "message": {
2173
+ "$ref": "#/$defs/__schema243"
2174
+ },
2175
+ "diagnostics": {
2176
+ "$ref": "#/$defs/__schema244"
2177
+ }
2178
+ },
2179
+ "required": [
2180
+ "message"
2181
+ ],
2182
+ "additionalProperties": false,
2183
+ "title": "RepetitionError",
2184
+ "description": "Harness failure for one repetition without a usable TrajectoryView."
2185
+ },
2186
+ "__schema243": {
2187
+ "type": "string",
2188
+ "description": "Harness failure message for this repetition."
2189
+ },
2190
+ "__schema244": {
2191
+ "description": "Adapter diagnostics when the harness failed before producing a trajectory.",
2192
+ "$ref": "#/$defs/AdapterDiagnostics"
2193
+ }
2194
+ }
2195
+ }