@tangle-network/agent-eval 0.24.0 → 0.27.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/openapi.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "openapi": "3.1.0",
3
3
  "info": {
4
4
  "title": "@tangle-network/agent-eval — wire protocol",
5
- "version": "0.24.0",
5
+ "version": "0.27.0",
6
6
  "description": "HTTP and stdio RPC interface to agent-eval. The TypeScript runtime is the source of truth; this spec is the contract that cross-language clients (Python, Rust, Go) generate from.\n\nWire-protocol version: 1.0.0. Bumps on breaking changes to request/response schemas.",
7
7
  "contact": {
8
8
  "name": "Tangle Network",
@@ -382,6 +382,377 @@
382
382
  "required": [
383
383
  "error"
384
384
  ]
385
+ },
386
+ "TracesIngestRequest": {
387
+ "type": "object",
388
+ "properties": {
389
+ "events": {
390
+ "type": "array",
391
+ "items": {
392
+ "$ref": "#/components/schemas/TraceEvent"
393
+ },
394
+ "minItems": 1,
395
+ "maxItems": 10000,
396
+ "description": "Batch of events. Max 10k per call — bigger streams should be chunked."
397
+ }
398
+ },
399
+ "required": [
400
+ "events"
401
+ ]
402
+ },
403
+ "TraceEvent": {
404
+ "type": "object",
405
+ "properties": {
406
+ "eventId": {
407
+ "type": "string",
408
+ "minLength": 1,
409
+ "description": "Stable id for the event. Use ULID or UUID."
410
+ },
411
+ "runId": {
412
+ "type": "string",
413
+ "minLength": 1,
414
+ "description": "Run this event belongs to."
415
+ },
416
+ "spanId": {
417
+ "type": "string",
418
+ "description": "Span that emitted the event, if any."
419
+ },
420
+ "kind": {
421
+ "type": "string",
422
+ "enum": [
423
+ "log",
424
+ "error",
425
+ "budget_decrement",
426
+ "budget_breach",
427
+ "state_mutation",
428
+ "policy_violation",
429
+ "redaction_applied",
430
+ "custom"
431
+ ],
432
+ "description": "Coarse event category — matches the TraceSchema v1 EventKind enum."
433
+ },
434
+ "timestamp": {
435
+ "type": "integer",
436
+ "minimum": 0,
437
+ "description": "Unix millis. Must be monotonically non-decreasing within a span."
438
+ },
439
+ "payload": {
440
+ "type": "object",
441
+ "additionalProperties": {},
442
+ "description": "Free-form payload — the runtime owns the shape."
443
+ }
444
+ },
445
+ "required": [
446
+ "eventId",
447
+ "runId",
448
+ "kind",
449
+ "timestamp",
450
+ "payload"
451
+ ]
452
+ },
453
+ "TracesIngestResponse": {
454
+ "type": "object",
455
+ "properties": {
456
+ "accepted": {
457
+ "type": "integer",
458
+ "minimum": 0,
459
+ "description": "Number of events persisted."
460
+ },
461
+ "rejected": {
462
+ "type": "integer",
463
+ "minimum": 0,
464
+ "description": "Number of events the store refused — see `errors[]` for reasons."
465
+ },
466
+ "errors": {
467
+ "type": "array",
468
+ "items": {
469
+ "type": "object",
470
+ "properties": {
471
+ "eventId": {
472
+ "type": "string",
473
+ "description": "Event id this error applies to."
474
+ },
475
+ "message": {
476
+ "type": "string",
477
+ "description": "Why the event was rejected."
478
+ }
479
+ },
480
+ "required": [
481
+ "eventId",
482
+ "message"
483
+ ]
484
+ },
485
+ "default": []
486
+ }
487
+ },
488
+ "required": [
489
+ "accepted",
490
+ "rejected"
491
+ ]
492
+ },
493
+ "FeedbackTrajectory": {
494
+ "type": "object",
495
+ "properties": {
496
+ "id": {
497
+ "type": "string",
498
+ "minLength": 1,
499
+ "description": "Stable id; idempotency key for the trajectory."
500
+ },
501
+ "projectId": {
502
+ "type": "string"
503
+ },
504
+ "scenarioId": {
505
+ "type": "string"
506
+ },
507
+ "task": {
508
+ "type": "object",
509
+ "properties": {
510
+ "intent": {
511
+ "type": "string",
512
+ "minLength": 1
513
+ },
514
+ "context": {}
515
+ },
516
+ "required": [
517
+ "intent"
518
+ ]
519
+ },
520
+ "attempts": {
521
+ "type": "array",
522
+ "items": {
523
+ "$ref": "#/components/schemas/FeedbackAttempt"
524
+ },
525
+ "default": []
526
+ },
527
+ "labels": {
528
+ "type": "array",
529
+ "items": {
530
+ "$ref": "#/components/schemas/FeedbackLabel"
531
+ },
532
+ "default": []
533
+ },
534
+ "outcome": {
535
+ "type": "object",
536
+ "properties": {
537
+ "success": {
538
+ "type": "boolean"
539
+ },
540
+ "score": {
541
+ "type": "number"
542
+ },
543
+ "metrics": {
544
+ "type": "object",
545
+ "additionalProperties": {
546
+ "type": "number"
547
+ }
548
+ },
549
+ "costUsd": {
550
+ "type": "number"
551
+ },
552
+ "detail": {
553
+ "type": "string"
554
+ },
555
+ "observedAt": {
556
+ "type": "string"
557
+ },
558
+ "metadata": {
559
+ "type": "object",
560
+ "additionalProperties": {}
561
+ }
562
+ }
563
+ },
564
+ "split": {
565
+ "type": "string",
566
+ "enum": [
567
+ "train",
568
+ "dev",
569
+ "test",
570
+ "holdout"
571
+ ]
572
+ },
573
+ "tags": {
574
+ "type": "object",
575
+ "additionalProperties": {
576
+ "type": "string"
577
+ }
578
+ },
579
+ "createdAt": {
580
+ "type": "string",
581
+ "description": "ISO-8601 UTC."
582
+ },
583
+ "updatedAt": {
584
+ "type": "string"
585
+ },
586
+ "metadata": {
587
+ "type": "object",
588
+ "additionalProperties": {}
589
+ }
590
+ },
591
+ "required": [
592
+ "id",
593
+ "task",
594
+ "createdAt"
595
+ ]
596
+ },
597
+ "FeedbackAttempt": {
598
+ "type": "object",
599
+ "properties": {
600
+ "id": {
601
+ "type": "string",
602
+ "minLength": 1
603
+ },
604
+ "stepIndex": {
605
+ "type": "integer",
606
+ "minimum": 0
607
+ },
608
+ "artifactType": {
609
+ "type": "string",
610
+ "enum": [
611
+ "text",
612
+ "code",
613
+ "plan",
614
+ "research",
615
+ "action",
616
+ "ui",
617
+ "decision",
618
+ "data",
619
+ "other"
620
+ ]
621
+ },
622
+ "artifact": {},
623
+ "options": {
624
+ "type": "array",
625
+ "items": {}
626
+ },
627
+ "proposedAction": {
628
+ "type": "object",
629
+ "properties": {
630
+ "type": {
631
+ "type": "string"
632
+ },
633
+ "risk": {
634
+ "type": "string",
635
+ "enum": [
636
+ "low",
637
+ "medium",
638
+ "high"
639
+ ]
640
+ },
641
+ "costUsd": {
642
+ "type": "number"
643
+ },
644
+ "externalSideEffect": {
645
+ "type": "boolean"
646
+ },
647
+ "requiresApproval": {
648
+ "type": "boolean"
649
+ },
650
+ "metadata": {
651
+ "type": "object",
652
+ "additionalProperties": {}
653
+ }
654
+ },
655
+ "required": [
656
+ "type"
657
+ ]
658
+ },
659
+ "feedback": {
660
+ "type": "array",
661
+ "items": {
662
+ "$ref": "#/components/schemas/FeedbackLabel"
663
+ }
664
+ },
665
+ "createdAt": {
666
+ "type": "string"
667
+ },
668
+ "metadata": {
669
+ "type": "object",
670
+ "additionalProperties": {}
671
+ }
672
+ },
673
+ "required": [
674
+ "id",
675
+ "stepIndex",
676
+ "artifactType",
677
+ "createdAt"
678
+ ]
679
+ },
680
+ "FeedbackLabel": {
681
+ "type": "object",
682
+ "properties": {
683
+ "id": {
684
+ "type": "string"
685
+ },
686
+ "source": {
687
+ "type": "string",
688
+ "enum": [
689
+ "user",
690
+ "judge",
691
+ "environment",
692
+ "metric",
693
+ "policy",
694
+ "system"
695
+ ]
696
+ },
697
+ "kind": {
698
+ "type": "string",
699
+ "enum": [
700
+ "approve",
701
+ "reject",
702
+ "select",
703
+ "edit",
704
+ "rank",
705
+ "rate",
706
+ "comment",
707
+ "metric_outcome",
708
+ "policy_block",
709
+ "revision_request"
710
+ ]
711
+ },
712
+ "value": {},
713
+ "reason": {
714
+ "type": "string"
715
+ },
716
+ "severity": {
717
+ "type": "string",
718
+ "enum": [
719
+ "info",
720
+ "warning",
721
+ "error",
722
+ "critical"
723
+ ]
724
+ },
725
+ "createdAt": {
726
+ "type": "string",
727
+ "description": "ISO-8601 UTC."
728
+ },
729
+ "metadata": {
730
+ "type": "object",
731
+ "additionalProperties": {}
732
+ }
733
+ },
734
+ "required": [
735
+ "source",
736
+ "kind",
737
+ "createdAt"
738
+ ]
739
+ },
740
+ "FeedbackIngestResponse": {
741
+ "type": "object",
742
+ "properties": {
743
+ "id": {
744
+ "type": "string",
745
+ "description": "Trajectory id that was persisted."
746
+ },
747
+ "persisted": {
748
+ "type": "boolean",
749
+ "description": "True when the trajectory was saved (idempotent on id)."
750
+ }
751
+ },
752
+ "required": [
753
+ "id",
754
+ "persisted"
755
+ ]
385
756
  }
386
757
  },
387
758
  "parameters": {}
@@ -496,6 +867,125 @@
496
867
  }
497
868
  }
498
869
  }
870
+ },
871
+ "/v1/traces/ingest": {
872
+ "post": {
873
+ "summary": "Ingest a batch of production TraceEvents",
874
+ "description": "Append a batch of TraceEvents to the configured TraceStore. Accepts application/json ({events:[...]}) or application/x-ndjson (one event per line). Returns counts of accepted + rejected events.",
875
+ "requestBody": {
876
+ "content": {
877
+ "application/json": {
878
+ "schema": {
879
+ "$ref": "#/components/schemas/TracesIngestRequest"
880
+ }
881
+ },
882
+ "application/x-ndjson": {
883
+ "schema": {
884
+ "$ref": "#/components/schemas/TracesIngestRequest"
885
+ }
886
+ }
887
+ }
888
+ },
889
+ "responses": {
890
+ "200": {
891
+ "description": "Ingestion summary",
892
+ "content": {
893
+ "application/json": {
894
+ "schema": {
895
+ "$ref": "#/components/schemas/TracesIngestResponse"
896
+ }
897
+ }
898
+ }
899
+ },
900
+ "400": {
901
+ "description": "Validation error",
902
+ "content": {
903
+ "application/json": {
904
+ "schema": {
905
+ "$ref": "#/components/schemas/ErrorResponse"
906
+ }
907
+ }
908
+ }
909
+ },
910
+ "401": {
911
+ "description": "Unauthorized (when bearer auth is configured)",
912
+ "content": {
913
+ "application/json": {
914
+ "schema": {
915
+ "$ref": "#/components/schemas/ErrorResponse"
916
+ }
917
+ }
918
+ }
919
+ },
920
+ "503": {
921
+ "description": "No trace store configured",
922
+ "content": {
923
+ "application/json": {
924
+ "schema": {
925
+ "$ref": "#/components/schemas/ErrorResponse"
926
+ }
927
+ }
928
+ }
929
+ }
930
+ }
931
+ }
932
+ },
933
+ "/v1/feedback": {
934
+ "post": {
935
+ "summary": "Ingest a FeedbackTrajectory from production",
936
+ "description": "Persist a single FeedbackTrajectory. Idempotent on trajectory.id — re-posting replaces the prior record. Used by production runtimes to forward user 👍/👎/edits into the eval substrate.",
937
+ "requestBody": {
938
+ "content": {
939
+ "application/json": {
940
+ "schema": {
941
+ "$ref": "#/components/schemas/FeedbackTrajectory"
942
+ }
943
+ }
944
+ }
945
+ },
946
+ "responses": {
947
+ "200": {
948
+ "description": "Persisted",
949
+ "content": {
950
+ "application/json": {
951
+ "schema": {
952
+ "$ref": "#/components/schemas/FeedbackIngestResponse"
953
+ }
954
+ }
955
+ }
956
+ },
957
+ "400": {
958
+ "description": "Validation error",
959
+ "content": {
960
+ "application/json": {
961
+ "schema": {
962
+ "$ref": "#/components/schemas/ErrorResponse"
963
+ }
964
+ }
965
+ }
966
+ },
967
+ "401": {
968
+ "description": "Unauthorized (when bearer auth is configured)",
969
+ "content": {
970
+ "application/json": {
971
+ "schema": {
972
+ "$ref": "#/components/schemas/ErrorResponse"
973
+ }
974
+ }
975
+ }
976
+ },
977
+ "503": {
978
+ "description": "No feedback store configured",
979
+ "content": {
980
+ "application/json": {
981
+ "schema": {
982
+ "$ref": "#/components/schemas/ErrorResponse"
983
+ }
984
+ }
985
+ }
986
+ }
987
+ }
988
+ }
499
989
  }
500
990
  },
501
991
  "webhooks": {}
@@ -1,6 +1,6 @@
1
- export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-CUOiGcGv.js';
1
+ export { C as CallbackResearcher, a as CallbackResearcherOptions, b as CampaignFactoryParams, c as CampaignIntegrityPolicy, d as CampaignRunContext, e as CampaignRunOutcome, f as CampaignRunner, g as CampaignScenario, h as CampaignVariant, E as EvalCampaignOptions, i as EvalCampaignResult, j as ExperimentPlan, k as ExperimentResult, F as FailedRun, l as FailureMode, N as NoopResearcher, R as Researcher, S as SteeringChange, r as runEvalCampaign } from './researcher-bGkI7vCl.js';
2
2
  export { F as FeedbackArtifactType, a as FeedbackAttempt, b as FeedbackLabel, c as FeedbackLabelKind, d as FeedbackLabelSource, e as FeedbackOptimizerRow, f as FeedbackOutcome, g as FeedbackReplayAdapter, h as FeedbackReplayResult, i as FeedbackSeverity, j as FeedbackSplitPolicy, k as FeedbackTask, l as FeedbackTrajectory, m as FeedbackTrajectoryFilter, n as FeedbackTrajectoryStore, o as FileSystemFeedbackTrajectoryStore, I as InMemoryFeedbackTrajectoryStore, P as PreferenceMemoryEntry, p as ProposedSideEffect, q as assignFeedbackSplit, r as controlRunToFeedbackTrajectory, s as createFeedbackTrajectory, t as feedbackTrajectoriesToDatasetScenarios, u as feedbackTrajectoriesToOptimizerRows, v as feedbackTrajectoryToDatasetScenario, w as feedbackTrajectoryToOptimizerRow, x as parseFeedbackTrajectoriesJsonl, y as renderPreferenceMemoryMarkdown, z as replayFeedbackTrajectories, A as replayFeedbackTrajectory, B as serializeFeedbackTrajectoriesJsonl, C as summarizePreferenceMemory, D as withAssignedFeedbackSplit } from './feedback-trajectory-DfFdrraJ.js';
3
- export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-BXGs_9V0.js';
3
+ export { A as ActionableSideInfo, a as AsiSeverity, D as DEFAULT_MUTATION_PRIMITIVES, E as EvolvableVariant, G as GenerationReport, I as InMemoryTrialCache, M as MultiShotGateConfig, b as MultiShotGateResult, c as MultiShotMutateAdapter, d as MultiShotOptimizationConfig, e as MultiShotOptimizationResult, f as MultiShotRun, g as MultiShotRunInput, h as MultiShotRunner, i as MultiShotScore, j as MultiShotScorer, k as MultiShotSplit, l as MultiShotTrace, m as MultiShotTrialResult, n as MultiShotVariant, o as MutateAdapter, P as PromptEvolutionConfig, p as PromptEvolutionEvent, q as PromptEvolutionResult, R as ReflectionContext, r as ReflectionProposal, S as ScenarioAggregate, s as ScoreAdapter, T as TrialCache, t as TrialResult, u as TrialTrace, V as VariantAggregate, v as buildReflectionPrompt, w as defaultMultiShotObjectives, x as parseReflectionResponse, y as runMultiShotOptimization, z as runPromptEvolution, B as trialTraceFromMultiShotTrial } from './summary-report-DZVXOCK_.js';
4
4
  import './errors-BZ9sTdz7.js';
5
5
  import './integrity-DK2EBVZC.js';
6
6
  import './store-Db2Bv8Cf.js';
@@ -25,7 +25,7 @@ import {
25
25
  summarizePreferenceMemory,
26
26
  trialTraceFromMultiShotTrial,
27
27
  withAssignedFeedbackSplit
28
- } from "./chunk-VRJVTXRV.js";
28
+ } from "./chunk-WHZMVFUV.js";
29
29
  import "./chunk-NLMNWKVM.js";
30
30
  import {
31
31
  runEvalCampaign
@@ -1,9 +1,8 @@
1
1
  import {
2
- DEFAULT_RULES,
3
- classifyFailure,
4
2
  compareToBaseline,
5
- computeToolUseMetrics
6
- } from "../chunk-OHEPNJQN.js";
3
+ computeToolUseMetrics,
4
+ failureClusterView
5
+ } from "../chunk-JLZQWFV3.js";
7
6
  import {
8
7
  buildTrajectory
9
8
  } from "../chunk-RZTMDUO7.js";
@@ -62,69 +61,6 @@ async function budgetBreachView(store, options = {}) {
62
61
  };
63
62
  }
64
63
 
65
- // src/pipelines/failure-cluster.ts
66
- async function failureClusterView(store, options = {}) {
67
- const rules = options.rules ?? DEFAULT_RULES;
68
- const minSize = options.minClusterSize ?? 1;
69
- const runs = await store.listRuns();
70
- const clusters = /* @__PURE__ */ new Map();
71
- let totalFailures = 0;
72
- for (const run of runs) {
73
- if (run.status === "completed" && run.outcome?.pass !== false) continue;
74
- totalFailures++;
75
- const spans = await store.spans({ runId: run.runId });
76
- const events = await store.events({ runId: run.runId });
77
- const cls = classifyFailure({ run, spans, events }, rules);
78
- let toolName;
79
- let argPrefix;
80
- let dimension;
81
- if (cls.triggerSpanId) {
82
- const trig = spans.find((s) => s.spanId === cls.triggerSpanId);
83
- if (trig?.kind === "tool") {
84
- toolName = trig.toolName;
85
- argPrefix = argHash(trig.args).slice(0, 16);
86
- } else if (trig?.kind === "judge") {
87
- dimension = trig.dimension;
88
- }
89
- }
90
- if (!toolName) {
91
- const ts = await toolSpans(store, run.runId);
92
- const errored = ts.filter((t) => t.status === "error").pop();
93
- if (errored) {
94
- toolName = errored.toolName;
95
- argPrefix = argHash(errored.args).slice(0, 16);
96
- }
97
- }
98
- if (!dimension) {
99
- const judge = spans.find((s) => s.kind === "judge" && typeof s.dimension === "string");
100
- if (judge?.kind === "judge") dimension = judge.dimension;
101
- }
102
- const key = `${cls.failureClass}|${toolName ?? ""}|${argPrefix ?? ""}|${dimension ?? ""}`;
103
- let cluster = clusters.get(key);
104
- if (!cluster) {
105
- cluster = {
106
- failureClass: cls.failureClass,
107
- toolName,
108
- argPrefix,
109
- dimension,
110
- runCount: 0,
111
- scenarioIds: [],
112
- exampleRunId: run.runId,
113
- exampleError: firstErrorMessage(spans) ?? cls.reason
114
- };
115
- clusters.set(key, cluster);
116
- }
117
- cluster.runCount++;
118
- if (!cluster.scenarioIds.includes(run.scenarioId)) cluster.scenarioIds.push(run.scenarioId);
119
- }
120
- const arr = [...clusters.values()].filter((c) => c.runCount >= minSize).sort((a, b) => b.runCount - a.runCount);
121
- return { clusters: arr, totalFailures, totalRuns: runs.length };
122
- }
123
- function firstErrorMessage(spans) {
124
- const errored = spans.find((s) => s.status === "error");
125
- return errored?.error;
126
- }
127
-
128
64
  // src/pipelines/first-divergence.ts
129
65
  async function firstDivergenceView(store, runA, runB, options = {}) {
130
66
  const [a, b] = await Promise.all([buildTrajectory(store, runA), buildTrajectory(store, runB)]);