@tangle-network/agent-eval 0.24.0 → 0.27.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +138 -0
- package/README.md +72 -0
- package/dist/{chunk-SY6WAAAD.js → chunk-5LBB5B3Z.js} +296 -5
- package/dist/chunk-5LBB5B3Z.js.map +1 -0
- package/dist/{chunk-OHEPNJQN.js → chunk-JLZQWFV3.js} +65 -1
- package/dist/chunk-JLZQWFV3.js.map +1 -0
- package/dist/{chunk-VRJVTXRV.js → chunk-WHZMVFUV.js} +85 -85
- package/dist/chunk-WHZMVFUV.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/governance/index.d.ts +1 -1
- package/dist/{index-Oj9fAPPN.d.ts → index-D3iBCjdF.d.ts} +63 -2
- package/dist/index.d.ts +529 -12
- package/dist/index.js +1106 -17
- package/dist/index.js.map +1 -1
- package/dist/openapi.json +491 -1
- package/dist/optimization.d.ts +2 -2
- package/dist/optimization.js +1 -1
- package/dist/pipelines/index.js +3 -67
- package/dist/pipelines/index.js.map +1 -1
- package/dist/{release-report-TDPn1cxq.d.ts → release-report-wfUySN5F.d.ts} +1 -1
- package/dist/reporting.d.ts +2 -2
- package/dist/{researcher-CUOiGcGv.d.ts → researcher-bGkI7vCl.d.ts} +1 -1
- package/dist/rl.d.ts +3 -3
- package/dist/{summary-report-BXGs_9V0.d.ts → summary-report-DZVXOCK_.d.ts} +13 -1
- package/dist/wire/index.d.ts +347 -3
- package/dist/wire/index.js +19 -1
- package/docs/concepts.md +11 -0
- package/package.json +1 -1
- package/dist/chunk-OHEPNJQN.js.map +0 -1
- package/dist/chunk-SY6WAAAD.js.map +0 -1
- package/dist/chunk-VRJVTXRV.js.map +0 -1
package/dist/openapi.json
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
"openapi": "3.1.0",
|
|
3
3
|
"info": {
|
|
4
4
|
"title": "@tangle-network/agent-eval — wire protocol",
|
|
5
|
-
"version": "0.
|
|
5
|
+
"version": "0.27.0",
|
|
6
6
|
"description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
|
|
7
7
|
"contact": {
|
|
8
8
|
"name": "Tangle Network",
|
|
@@ -382,6 +382,377 @@
|
|
|
382
382
|
"required": [
|
|
383
383
|
"error"
|
|
384
384
|
]
|
|
385
|
+
},
|
|
386
|
+
"TracesIngestRequest": {
|
|
387
|
+
"type": "object",
|
|
388
|
+
"properties": {
|
|
389
|
+
"events": {
|
|
390
|
+
"type": "array",
|
|
391
|
+
"items": {
|
|
392
|
+
"$ref": "#/components/schemas/TraceEvent"
|
|
393
|
+
},
|
|
394
|
+
"minItems": 1,
|
|
395
|
+
"maxItems": 10000,
|
|
396
|
+
"description": "Batch of events. Max 10k per call — bigger streams should be chunked."
|
|
397
|
+
}
|
|
398
|
+
},
|
|
399
|
+
"required": [
|
|
400
|
+
"events"
|
|
401
|
+
]
|
|
402
|
+
},
|
|
403
|
+
"TraceEvent": {
|
|
404
|
+
"type": "object",
|
|
405
|
+
"properties": {
|
|
406
|
+
"eventId": {
|
|
407
|
+
"type": "string",
|
|
408
|
+
"minLength": 1,
|
|
409
|
+
"description": "Stable id for the event. Use ULID or UUID."
|
|
410
|
+
},
|
|
411
|
+
"runId": {
|
|
412
|
+
"type": "string",
|
|
413
|
+
"minLength": 1,
|
|
414
|
+
"description": "Run this event belongs to."
|
|
415
|
+
},
|
|
416
|
+
"spanId": {
|
|
417
|
+
"type": "string",
|
|
418
|
+
"description": "Span that emitted the event, if any."
|
|
419
|
+
},
|
|
420
|
+
"kind": {
|
|
421
|
+
"type": "string",
|
|
422
|
+
"enum": [
|
|
423
|
+
"log",
|
|
424
|
+
"error",
|
|
425
|
+
"budget_decrement",
|
|
426
|
+
"budget_breach",
|
|
427
|
+
"state_mutation",
|
|
428
|
+
"policy_violation",
|
|
429
|
+
"redaction_applied",
|
|
430
|
+
"custom"
|
|
431
|
+
],
|
|
432
|
+
"description": "Coarse event category — matches the TraceSchema v1 EventKind enum."
|
|
433
|
+
},
|
|
434
|
+
"timestamp": {
|
|
435
|
+
"type": "integer",
|
|
436
|
+
"minimum": 0,
|
|
437
|
+
"description": "Unix millis. Must be monotonically non-decreasing within a span."
|
|
438
|
+
},
|
|
439
|
+
"payload": {
|
|
440
|
+
"type": "object",
|
|
441
|
+
"additionalProperties": {},
|
|
442
|
+
"description": "Free-form payload — the runtime owns the shape."
|
|
443
|
+
}
|
|
444
|
+
},
|
|
445
|
+
"required": [
|
|
446
|
+
"eventId",
|
|
447
|
+
"runId",
|
|
448
|
+
"kind",
|
|
449
|
+
"timestamp",
|
|
450
|
+
"payload"
|
|
451
|
+
]
|
|
452
|
+
},
|
|
453
|
+
"TracesIngestResponse": {
|
|
454
|
+
"type": "object",
|
|
455
|
+
"properties": {
|
|
456
|
+
"accepted": {
|
|
457
|
+
"type": "integer",
|
|
458
|
+
"minimum": 0,
|
|
459
|
+
"description": "Number of events persisted."
|
|
460
|
+
},
|
|
461
|
+
"rejected": {
|
|
462
|
+
"type": "integer",
|
|
463
|
+
"minimum": 0,
|
|
464
|
+
"description": "Number of events the store refused — see `errors[]` for reasons."
|
|
465
|
+
},
|
|
466
|
+
"errors": {
|
|
467
|
+
"type": "array",
|
|
468
|
+
"items": {
|
|
469
|
+
"type": "object",
|
|
470
|
+
"properties": {
|
|
471
|
+
"eventId": {
|
|
472
|
+
"type": "string",
|
|
473
|
+
"description": "Event id this error applies to."
|
|
474
|
+
},
|
|
475
|
+
"message": {
|
|
476
|
+
"type": "string",
|
|
477
|
+
"description": "Why the event was rejected."
|
|
478
|
+
}
|
|
479
|
+
},
|
|
480
|
+
"required": [
|
|
481
|
+
"eventId",
|
|
482
|
+
"message"
|
|
483
|
+
]
|
|
484
|
+
},
|
|
485
|
+
"default": []
|
|
486
|
+
}
|
|
487
|
+
},
|
|
488
|
+
"required": [
|
|
489
|
+
"accepted",
|
|
490
|
+
"rejected"
|
|
491
|
+
]
|
|
492
|
+
},
|
|
493
|
+
"FeedbackTrajectory": {
|
|
494
|
+
"type": "object",
|
|
495
|
+
"properties": {
|
|
496
|
+
"id": {
|
|
497
|
+
"type": "string",
|
|
498
|
+
"minLength": 1,
|
|
499
|
+
"description": "Stable id; idempotency key for the trajectory."
|
|
500
|
+
},
|
|
501
|
+
"projectId": {
|
|
502
|
+
"type": "string"
|
|
503
|
+
},
|
|
504
|
+
"scenarioId": {
|
|
505
|
+
"type": "string"
|
|
506
|
+
},
|
|
507
|
+
"task": {
|
|
508
|
+
"type": "object",
|
|
509
|
+
"properties": {
|
|
510
|
+
"intent": {
|
|
511
|
+
"type": "string",
|
|
512
|
+
"minLength": 1
|
|
513
|
+
},
|
|
514
|
+
"context": {}
|
|
515
|
+
},
|
|
516
|
+
"required": [
|
|
517
|
+
"intent"
|
|
518
|
+
]
|
|
519
|
+
},
|
|
520
|
+
"attempts": {
|
|
521
|
+
"type": "array",
|
|
522
|
+
"items": {
|
|
523
|
+
"$ref": "#/components/schemas/FeedbackAttempt"
|
|
524
|
+
},
|
|
525
|
+
"default": []
|
|
526
|
+
},
|
|
527
|
+
"labels": {
|
|
528
|
+
"type": "array",
|
|
529
|
+
"items": {
|
|
530
|
+
"$ref": "#/components/schemas/FeedbackLabel"
|
|
531
|
+
},
|
|
532
|
+
"default": []
|
|
533
|
+
},
|
|
534
|
+
"outcome": {
|
|
535
|
+
"type": "object",
|
|
536
|
+
"properties": {
|
|
537
|
+
"success": {
|
|
538
|
+
"type": "boolean"
|
|
539
|
+
},
|
|
540
|
+
"score": {
|
|
541
|
+
"type": "number"
|
|
542
|
+
},
|
|
543
|
+
"metrics": {
|
|
544
|
+
"type": "object",
|
|
545
|
+
"additionalProperties": {
|
|
546
|
+
"type": "number"
|
|
547
|
+
}
|
|
548
|
+
},
|
|
549
|
+
"costUsd": {
|
|
550
|
+
"type": "number"
|
|
551
|
+
},
|
|
552
|
+
"detail": {
|
|
553
|
+
"type": "string"
|
|
554
|
+
},
|
|
555
|
+
"observedAt": {
|
|
556
|
+
"type": "string"
|
|
557
|
+
},
|
|
558
|
+
"metadata": {
|
|
559
|
+
"type": "object",
|
|
560
|
+
"additionalProperties": {}
|
|
561
|
+
}
|
|
562
|
+
}
|
|
563
|
+
},
|
|
564
|
+
"split": {
|
|
565
|
+
"type": "string",
|
|
566
|
+
"enum": [
|
|
567
|
+
"train",
|
|
568
|
+
"dev",
|
|
569
|
+
"test",
|
|
570
|
+
"holdout"
|
|
571
|
+
]
|
|
572
|
+
},
|
|
573
|
+
"tags": {
|
|
574
|
+
"type": "object",
|
|
575
|
+
"additionalProperties": {
|
|
576
|
+
"type": "string"
|
|
577
|
+
}
|
|
578
|
+
},
|
|
579
|
+
"createdAt": {
|
|
580
|
+
"type": "string",
|
|
581
|
+
"description": "ISO-8601 UTC."
|
|
582
|
+
},
|
|
583
|
+
"updatedAt": {
|
|
584
|
+
"type": "string"
|
|
585
|
+
},
|
|
586
|
+
"metadata": {
|
|
587
|
+
"type": "object",
|
|
588
|
+
"additionalProperties": {}
|
|
589
|
+
}
|
|
590
|
+
},
|
|
591
|
+
"required": [
|
|
592
|
+
"id",
|
|
593
|
+
"task",
|
|
594
|
+
"createdAt"
|
|
595
|
+
]
|
|
596
|
+
},
|
|
597
|
+
"FeedbackAttempt": {
|
|
598
|
+
"type": "object",
|
|
599
|
+
"properties": {
|
|
600
|
+
"id": {
|
|
601
|
+
"type": "string",
|
|
602
|
+
"minLength": 1
|
|
603
|
+
},
|
|
604
|
+
"stepIndex": {
|
|
605
|
+
"type": "integer",
|
|
606
|
+
"minimum": 0
|
|
607
|
+
},
|
|
608
|
+
"artifactType": {
|
|
609
|
+
"type": "string",
|
|
610
|
+
"enum": [
|
|
611
|
+
"text",
|
|
612
|
+
"code",
|
|
613
|
+
"plan",
|
|
614
|
+
"research",
|
|
615
|
+
"action",
|
|
616
|
+
"ui",
|
|
617
|
+
"decision",
|
|
618
|
+
"data",
|
|
619
|
+
"other"
|
|
620
|
+
]
|
|
621
|
+
},
|
|
622
|
+
"artifact": {},
|
|
623
|
+
"options": {
|
|
624
|
+
"type": "array",
|
|
625
|
+
"items": {}
|
|
626
|
+
},
|
|
627
|
+
"proposedAction": {
|
|
628
|
+
"type": "object",
|
|
629
|
+
"properties": {
|
|
630
|
+
"type": {
|
|
631
|
+
"type": "string"
|
|
632
|
+
},
|
|
633
|
+
"risk": {
|
|
634
|
+
"type": "string",
|
|
635
|
+
"enum": [
|
|
636
|
+
"low",
|
|
637
|
+
"medium",
|
|
638
|
+
"high"
|
|
639
|
+
]
|
|
640
|
+
},
|
|
641
|
+
"costUsd": {
|
|
642
|
+
"type": "number"
|
|
643
|
+
},
|
|
644
|
+
"externalSideEffect": {
|
|
645
|
+
"type": "boolean"
|
|
646
|
+
},
|
|
647
|
+
"requiresApproval": {
|
|
648
|
+
"type": "boolean"
|
|
649
|
+
},
|
|
650
|
+
"metadata": {
|
|
651
|
+
"type": "object",
|
|
652
|
+
"additionalProperties": {}
|
|
653
|
+
}
|
|
654
|
+
},
|
|
655
|
+
"required": [
|
|
656
|
+
"type"
|
|
657
|
+
]
|
|
658
|
+
},
|
|
659
|
+
"feedback": {
|
|
660
|
+
"type": "array",
|
|
661
|
+
"items": {
|
|
662
|
+
"$ref": "#/components/schemas/FeedbackLabel"
|
|
663
|
+
}
|
|
664
|
+
},
|
|
665
|
+
"createdAt": {
|
|
666
|
+
"type": "string"
|
|
667
|
+
},
|
|
668
|
+
"metadata": {
|
|
669
|
+
"type": "object",
|
|
670
|
+
"additionalProperties": {}
|
|
671
|
+
}
|
|
672
|
+
},
|
|
673
|
+
"required": [
|
|
674
|
+
"id",
|
|
675
|
+
"stepIndex",
|
|
676
|
+
"artifactType",
|
|
677
|
+
"createdAt"
|
|
678
|
+
]
|
|
679
|
+
},
|
|
680
|
+
"FeedbackLabel": {
|
|
681
|
+
"type": "object",
|
|
682
|
+
"properties": {
|
|
683
|
+
"id": {
|
|
684
|
+
"type": "string"
|
|
685
|
+
},
|
|
686
|
+
"source": {
|
|
687
|
+
"type": "string",
|
|
688
|
+
"enum": [
|
|
689
|
+
"user",
|
|
690
|
+
"judge",
|
|
691
|
+
"environment",
|
|
692
|
+
"metric",
|
|
693
|
+
"policy",
|
|
694
|
+
"system"
|
|
695
|
+
]
|
|
696
|
+
},
|
|
697
|
+
"kind": {
|
|
698
|
+
"type": "string",
|
|
699
|
+
"enum": [
|
|
700
|
+
"approve",
|
|
701
|
+
"reject",
|
|
702
|
+
"select",
|
|
703
|
+
"edit",
|
|
704
|
+
"rank",
|
|
705
|
+
"rate",
|
|
706
|
+
"comment",
|
|
707
|
+
"metric_outcome",
|
|
708
|
+
"policy_block",
|
|
709
|
+
"revision_request"
|
|
710
|
+
]
|
|
711
|
+
},
|
|
712
|
+
"value": {},
|
|
713
|
+
"reason": {
|
|
714
|
+
"type": "string"
|
|
715
|
+
},
|
|
716
|
+
"severity": {
|
|
717
|
+
"type": "string",
|
|
718
|
+
"enum": [
|
|
719
|
+
"info",
|
|
720
|
+
"warning",
|
|
721
|
+
"error",
|
|
722
|
+
"critical"
|
|
723
|
+
]
|
|
724
|
+
},
|
|
725
|
+
"createdAt": {
|
|
726
|
+
"type": "string",
|
|
727
|
+
"description": "ISO-8601 UTC."
|
|
728
|
+
},
|
|
729
|
+
"metadata": {
|
|
730
|
+
"type": "object",
|
|
731
|
+
"additionalProperties": {}
|
|
732
|
+
}
|
|
733
|
+
},
|
|
734
|
+
"required": [
|
|
735
|
+
"source",
|
|
736
|
+
"kind",
|
|
737
|
+
"createdAt"
|
|
738
|
+
]
|
|
739
|
+
},
|
|
740
|
+
"FeedbackIngestResponse": {
|
|
741
|
+
"type": "object",
|
|
742
|
+
"properties": {
|
|
743
|
+
"id": {
|
|
744
|
+
"type": "string",
|
|
745
|
+
"description": "Trajectory id that was persisted."
|
|
746
|
+
},
|
|
747
|
+
"persisted": {
|
|
748
|
+
"type": "boolean",
|
|
749
|
+
"description": "True when the trajectory was saved (idempotent on id)."
|
|
750
|
+
}
|
|
751
|
+
},
|
|
752
|
+
"required": [
|
|
753
|
+
"id",
|
|
754
|
+
"persisted"
|
|
755
|
+
]
|
|
385
756
|
}
|
|
386
757
|
},
|
|
387
758
|
"parameters": {}
|
|
@@ -496,6 +867,125 @@
|
|
|
496
867
|
}
|
|
497
868
|
}
|
|
498
869
|
}
|
|
870
|
+
},
|
|
871
|
+
"/v1/traces/ingest": {
|
|
872
|
+
"post": {
|
|
873
|
+
"summary": "Ingest a batch of production TraceEvents",
|
|
874
|
+
"description": "Append a batch of TraceEvents to the configured TraceStore. Accepts application/json ({events:[...]}) or application/x-ndjson (one event per line). Returns counts of accepted + rejected events.",
|
|
875
|
+
"requestBody": {
|
|
876
|
+
"content": {
|
|
877
|
+
"application/json": {
|
|
878
|
+
"schema": {
|
|
879
|
+
"$ref": "#/components/schemas/TracesIngestRequest"
|
|
880
|
+
}
|
|
881
|
+
},
|
|
882
|
+
"application/x-ndjson": {
|
|
883
|
+
"schema": {
|
|
884
|
+
"$ref": "#/components/schemas/TracesIngestRequest"
|
|
885
|
+
}
|
|
886
|
+
}
|
|
887
|
+
}
|
|
888
|
+
},
|
|
889
|
+
"responses": {
|
|
890
|
+
"200": {
|
|
891
|
+
"description": "Ingestion summary",
|
|
892
|
+
"content": {
|
|
893
|
+
"application/json": {
|
|
894
|
+
"schema": {
|
|
895
|
+
"$ref": "#/components/schemas/TracesIngestResponse"
|
|
896
|
+
}
|
|
897
|
+
}
|
|
898
|
+
}
|
|
899
|
+
},
|
|
900
|
+
"400": {
|
|
901
|
+
"description": "Validation error",
|
|
902
|
+
"content": {
|
|
903
|
+
"application/json": {
|
|
904
|
+
"schema": {
|
|
905
|
+
"$ref": "#/components/schemas/ErrorResponse"
|
|
906
|
+
}
|
|
907
|
+
}
|
|
908
|
+
}
|
|
909
|
+
},
|
|
910
|
+
"401": {
|
|
911
|
+
"description": "Unauthorized (when bearer auth is configured)",
|
|
912
|
+
"content": {
|
|
913
|
+
"application/json": {
|
|
914
|
+
"schema": {
|
|
915
|
+
"$ref": "#/components/schemas/ErrorResponse"
|
|
916
|
+
}
|
|
917
|
+
}
|
|
918
|
+
}
|
|
919
|
+
},
|
|
920
|
+
"503": {
|
|
921
|
+
"description": "No trace store configured",
|
|
922
|
+
"content": {
|
|
923
|
+
"application/json": {
|
|
924
|
+
"schema": {
|
|
925
|
+
"$ref": "#/components/schemas/ErrorResponse"
|
|
926
|
+
}
|
|
927
|
+
}
|
|
928
|
+
}
|
|
929
|
+
}
|
|
930
|
+
}
|
|
931
|
+
}
|
|
932
|
+
},
|
|
933
|
+
"/v1/feedback": {
|
|
934
|
+
"post": {
|
|
935
|
+
"summary": "Ingest a FeedbackTrajectory from production",
|
|
936
|
+
"description": "Persist a single FeedbackTrajectory. Idempotent on trajectory.id — re-posting replaces the prior record. Used by production runtimes to forward user 👍/👎/edits into the eval substrate.",
|
|
937
|
+
"requestBody": {
|
|
938
|
+
"content": {
|
|
939
|
+
"application/json": {
|
|
940
|
+
"schema": {
|
|
941
|
+
"$ref": "#/components/schemas/FeedbackTrajectory"
|
|
942
|
+
}
|
|
943
|
+
}
|
|
944
|
+
}
|
|
945
|
+
},
|
|
946
|
+
"responses": {
|
|
947
|
+
"200": {
|
|
948
|
+
"description": "Persisted",
|
|
949
|
+
"content": {
|
|
950
|
+
"application/json": {
|
|
951
|
+
"schema": {
|
|
952
|
+
"$ref": "#/components/schemas/FeedbackIngestResponse"
|
|
953
|
+
}
|
|
954
|
+
}
|
|
955
|
+
}
|
|
956
|
+
},
|
|
957
|
+
"400": {
|
|
958
|
+
"description": "Validation error",
|
|
959
|
+
"content": {
|
|
960
|
+
"application/json": {
|
|
961
|
+
"schema": {
|
|
962
|
+
"$ref": "#/components/schemas/ErrorResponse"
|
|
963
|
+
}
|
|
964
|
+
}
|
|
965
|
+
}
|
|
966
|
+
},
|
|
967
|
+
"401": {
|
|
968
|
+
"description": "Unauthorized (when bearer auth is configured)",
|
|
969
|
+
"content": {
|
|
970
|
+
"application/json": {
|
|
971
|
+
"schema": {
|
|
972
|
+
"$ref": "#/components/schemas/ErrorResponse"
|
|
973
|
+
}
|
|
974
|
+
}
|
|
975
|
+
}
|
|
976
|
+
},
|
|
977
|
+
"503": {
|
|
978
|
+
"description": "No feedback store configured",
|
|
979
|
+
"content": {
|
|
980
|
+
"application/json": {
|
|
981
|
+
"schema": {
|
|
982
|
+
"$ref": "#/components/schemas/ErrorResponse"
|
|
983
|
+
}
|
|
984
|
+
}
|
|
985
|
+
}
|
|
986
|
+
}
|
|
987
|
+
}
|
|
988
|
+
}
|
|
499
989
|
}
|
|
500
990
|
},
|
|
501
991
|
"webhooks": {}
|
package/dist/optimization.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-CUOiGcGv.js';
|
|
1
|
+
export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-bGkI7vCl.js';
|
|
2
2
|
export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-DfFdrraJ.js';
|
|
3
|
-
export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-BXGs_9V0.js';
|
|
3
|
+
export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-DZVXOCK_.js';
|
|
4
4
|
import './errors-BZ9sTdz7.js';
|
|
5
5
|
import './integrity-DK2EBVZC.js';
|
|
6
6
|
import './store-Db2Bv8Cf.js';
|
package/dist/optimization.js
CHANGED
package/dist/pipelines/index.js
CHANGED
|
@@ -1,9 +1,8 @@
|
|
|
1
1
|
import {
|
|
2
|
-
DEFAULT_RULES,
|
|
3
|
-
classifyFailure,
|
|
4
2
|
compareToBaseline,
|
|
5
|
-
computeToolUseMetrics
|
|
6
|
-
} from "../chunk-OHEPNJQN.js";
|
|
3
|
+
computeToolUseMetrics,
|
|
4
|
+
failureClusterView
|
|
5
|
+
} from "../chunk-JLZQWFV3.js";
|
|
7
6
|
import {
|
|
8
7
|
buildTrajectory
|
|
9
8
|
} from "../chunk-RZTMDUO7.js";
|
|
@@ -62,69 +61,6 @@ async function budgetBreachView(store, options = {}) {
|
|
|
62
61
|
};
|
|
63
62
|
}
|
|
64
63
|
|
|
65
|
-
// src/pipelines/failure-cluster.ts
|
|
66
|
-
async function failureClusterView(store, options = {}) {
|
|
67
|
-
const rules = options.rules ?? DEFAULT_RULES;
|
|
68
|
-
const minSize = options.minClusterSize ?? 1;
|
|
69
|
-
const runs = await store.listRuns();
|
|
70
|
-
const clusters = /* @__PURE__ */ new Map();
|
|
71
|
-
let totalFailures = 0;
|
|
72
|
-
for (const run of runs) {
|
|
73
|
-
if (run.status === "completed" && run.outcome?.pass !== false) continue;
|
|
74
|
-
totalFailures++;
|
|
75
|
-
const spans = await store.spans({ runId: run.runId });
|
|
76
|
-
const events = await store.events({ runId: run.runId });
|
|
77
|
-
const cls = classifyFailure({ run, spans, events }, rules);
|
|
78
|
-
let toolName;
|
|
79
|
-
let argPrefix;
|
|
80
|
-
let dimension;
|
|
81
|
-
if (cls.triggerSpanId) {
|
|
82
|
-
const trig = spans.find((s) => s.spanId === cls.triggerSpanId);
|
|
83
|
-
if (trig?.kind === "tool") {
|
|
84
|
-
toolName = trig.toolName;
|
|
85
|
-
argPrefix = argHash(trig.args).slice(0, 16);
|
|
86
|
-
} else if (trig?.kind === "judge") {
|
|
87
|
-
dimension = trig.dimension;
|
|
88
|
-
}
|
|
89
|
-
}
|
|
90
|
-
if (!toolName) {
|
|
91
|
-
const ts = await toolSpans(store, run.runId);
|
|
92
|
-
const errored = ts.filter((t) => t.status === "error").pop();
|
|
93
|
-
if (errored) {
|
|
94
|
-
toolName = errored.toolName;
|
|
95
|
-
argPrefix = argHash(errored.args).slice(0, 16);
|
|
96
|
-
}
|
|
97
|
-
}
|
|
98
|
-
if (!dimension) {
|
|
99
|
-
const judge = spans.find((s) => s.kind === "judge" && typeof s.dimension === "string");
|
|
100
|
-
if (judge?.kind === "judge") dimension = judge.dimension;
|
|
101
|
-
}
|
|
102
|
-
const key = `${cls.failureClass}|${toolName ?? ""}|${argPrefix ?? ""}|${dimension ?? ""}`;
|
|
103
|
-
let cluster = clusters.get(key);
|
|
104
|
-
if (!cluster) {
|
|
105
|
-
cluster = {
|
|
106
|
-
failureClass: cls.failureClass,
|
|
107
|
-
toolName,
|
|
108
|
-
argPrefix,
|
|
109
|
-
dimension,
|
|
110
|
-
runCount: 0,
|
|
111
|
-
scenarioIds: [],
|
|
112
|
-
exampleRunId: run.runId,
|
|
113
|
-
exampleError: firstErrorMessage(spans) ?? cls.reason
|
|
114
|
-
};
|
|
115
|
-
clusters.set(key, cluster);
|
|
116
|
-
}
|
|
117
|
-
cluster.runCount++;
|
|
118
|
-
if (!cluster.scenarioIds.includes(run.scenarioId)) cluster.scenarioIds.push(run.scenarioId);
|
|
119
|
-
}
|
|
120
|
-
const arr = [...clusters.values()].filter((c) => c.runCount >= minSize).sort((a, b) => b.runCount - a.runCount);
|
|
121
|
-
return { clusters: arr, totalFailures, totalRuns: runs.length };
|
|
122
|
-
}
|
|
123
|
-
function firstErrorMessage(spans) {
|
|
124
|
-
const errored = spans.find((s) => s.status === "error");
|
|
125
|
-
return errored?.error;
|
|
126
|
-
}
|
|
127
|
-
|
|
128
64
|
// src/pipelines/first-divergence.ts
|
|
129
65
|
async function firstDivergenceView(store, runA, runB, options = {}) {
|
|
130
66
|
const [a, b] = await Promise.all([buildTrajectory(store, runA), buildTrajectory(store, runB)]);
|