@bolt-foundry/gambit 0.8.6-rc.1 → 0.8.6-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140) hide show
  1. package/CHANGELOG.md +60 -1
  2. package/esm/gambit/simulator-ui/dist/bundle.js +2033 -964
  3. package/esm/gambit/simulator-ui/dist/bundle.js.map +4 -4
  4. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts +2 -1
  5. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts.map +1 -1
  6. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.d.ts.map +1 -1
  7. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.js +288 -101
  8. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts +84 -45
  9. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts.map +1 -1
  10. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts +3 -2
  11. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts.map +1 -1
  12. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts +1 -1
  13. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts.map +1 -1
  14. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.js +85 -47
  15. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts +86 -48
  16. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts.map +1 -1
  17. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.d.ts.map +1 -1
  18. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.js +363 -97
  19. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.d.ts.map +1 -1
  20. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.js +154 -76
  21. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts +1 -1
  22. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts.map +1 -1
  23. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.js +45 -29
  24. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts +46 -30
  25. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts.map +1 -1
  26. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.d.ts.map +1 -1
  27. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.js +154 -76
  28. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts +45 -28
  29. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts.map +1 -1
  30. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts +1 -1
  31. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts.map +1 -1
  32. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.js +45 -29
  33. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts +46 -30
  34. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts.map +1 -1
  35. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.d.ts.map +1 -1
  36. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.js +194 -78
  37. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts +45 -28
  38. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts.map +1 -1
  39. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.d.ts.map +1 -1
  40. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.js +194 -78
  41. package/esm/simulator-ui/isograph/components/Query/SimulatorVerifyPage.d.ts.map +1 -1
  42. package/esm/simulator-ui/isograph/components/Query/SimulatorVerifyPage.js +257 -203
  43. package/esm/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.d.ts.map +1 -1
  44. package/esm/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.js +8 -3
  45. package/esm/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.d.ts.map +1 -1
  46. package/esm/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.js +45 -28
  47. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts +84 -45
  48. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts.map +1 -1
  49. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.js +85 -46
  50. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts +84 -45
  51. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts.map +1 -1
  52. package/esm/simulator-ui/src/AppShell.d.ts.map +1 -1
  53. package/esm/simulator-ui/src/AppShell.js +6 -4
  54. package/esm/simulator-ui/src/verify_unified.d.ts +17 -0
  55. package/esm/simulator-ui/src/verify_unified.d.ts.map +1 -0
  56. package/esm/simulator-ui/src/verify_unified.js +25 -0
  57. package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts +1 -1
  58. package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts.map +1 -1
  59. package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.js +1 -1
  60. package/esm/src/server.d.ts.map +1 -1
  61. package/esm/src/server.js +227 -45
  62. package/esm/src/server_isograph_environment.d.ts +11 -4
  63. package/esm/src/server_isograph_environment.d.ts.map +1 -1
  64. package/esm/src/server_isograph_environment.js +5 -0
  65. package/esm/src/simulator_graphql.d.ts +11 -4
  66. package/esm/src/simulator_graphql.d.ts.map +1 -1
  67. package/esm/src/simulator_graphql.js +278 -245
  68. package/package.json +2 -2
  69. package/script/gambit/simulator-ui/dist/bundle.js +2033 -964
  70. package/script/gambit/simulator-ui/dist/bundle.js.map +4 -4
  71. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts +2 -1
  72. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts.map +1 -1
  73. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.d.ts.map +1 -1
  74. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.js +288 -101
  75. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts +84 -45
  76. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts.map +1 -1
  77. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts +3 -2
  78. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts.map +1 -1
  79. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts +1 -1
  80. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts.map +1 -1
  81. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.js +85 -47
  82. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts +86 -48
  83. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts.map +1 -1
  84. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.d.ts.map +1 -1
  85. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.js +363 -97
  86. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.d.ts.map +1 -1
  87. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.js +154 -76
  88. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts +1 -1
  89. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts.map +1 -1
  90. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.js +45 -29
  91. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts +46 -30
  92. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts.map +1 -1
  93. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.d.ts.map +1 -1
  94. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.js +154 -76
  95. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts +45 -28
  96. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts.map +1 -1
  97. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts +1 -1
  98. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts.map +1 -1
  99. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.js +45 -29
  100. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts +46 -30
  101. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts.map +1 -1
  102. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.d.ts.map +1 -1
  103. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.js +194 -78
  104. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts +45 -28
  105. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts.map +1 -1
  106. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.d.ts.map +1 -1
  107. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.js +194 -78
  108. package/script/simulator-ui/isograph/components/Query/SimulatorVerifyPage.d.ts.map +1 -1
  109. package/script/simulator-ui/isograph/components/Query/SimulatorVerifyPage.js +256 -202
  110. package/script/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.d.ts.map +1 -1
  111. package/script/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.js +8 -3
  112. package/script/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.d.ts.map +1 -1
  113. package/script/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.js +45 -28
  114. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts +84 -45
  115. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts.map +1 -1
  116. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.js +85 -46
  117. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts +84 -45
  118. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts.map +1 -1
  119. package/script/simulator-ui/src/AppShell.d.ts.map +1 -1
  120. package/script/simulator-ui/src/AppShell.js +6 -4
  121. package/script/simulator-ui/src/verify_unified.d.ts +17 -0
  122. package/script/simulator-ui/src/verify_unified.d.ts.map +1 -0
  123. package/script/simulator-ui/src/verify_unified.js +29 -0
  124. package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts +1 -1
  125. package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts.map +1 -1
  126. package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.js +1 -1
  127. package/script/src/server.d.ts.map +1 -1
  128. package/script/src/server.js +227 -45
  129. package/script/src/server_isograph_environment.d.ts +11 -4
  130. package/script/src/server_isograph_environment.d.ts.map +1 -1
  131. package/script/src/server_isograph_environment.js +5 -0
  132. package/script/src/simulator_graphql.d.ts +11 -4
  133. package/script/src/simulator_graphql.d.ts.map +1 -1
  134. package/script/src/simulator_graphql.js +278 -245
  135. package/esm/simulator-ui/src/verify_metrics.d.ts +0 -50
  136. package/esm/simulator-ui/src/verify_metrics.d.ts.map +0 -1
  137. package/esm/simulator-ui/src/verify_metrics.js +0 -260
  138. package/script/simulator-ui/src/verify_metrics.d.ts +0 -50
  139. package/script/simulator-ui/src/verify_metrics.d.ts.map +0 -1
  140. package/script/simulator-ui/src/verify_metrics.js +0 -264
@@ -537,20 +537,6 @@ function toWorkspaceGradeRunStatus(value) {
537
537
  return "COMPLETED";
538
538
  return "ERROR";
539
539
  }
540
- const VERIFY_CONSISTENCY_THRESHOLDS = {
541
- minSampleSize: 6,
542
- instabilityScoreDelta: 1.5,
543
- pass: {
544
- agreementMin: 0.9,
545
- maxSpread: 1,
546
- maxInstabilityCount: 0,
547
- },
548
- warn: {
549
- agreementMin: 0.75,
550
- maxSpread: 2,
551
- maxInstabilityCount: 2,
552
- },
553
- };
554
540
  function round2(value) {
555
541
  return Math.round(value * 100) / 100;
556
542
  }
@@ -563,195 +549,179 @@ function median(values) {
563
549
  return sorted[mid];
564
550
  return (sorted[mid - 1] + sorted[mid]) / 2;
565
551
  }
566
- function flattenVerifyRunExamples(run) {
567
- if (run.turns.length > 0) {
568
- return run.turns.map((turn, fallbackIndex) => {
569
- const index = typeof turn.turnIndex === "number"
570
- ? turn.turnIndex
571
- : fallbackIndex;
572
- const key = turn.refId && turn.refId.trim().length > 0
573
- ? `ref:${turn.refId}`
574
- : `turn:${index}`;
575
- const pass = typeof turn.score === "number" ? turn.score >= 0 : undefined;
576
- return {
577
- key,
578
- label: `Assistant turn ${turn.turnNumber || fallbackIndex + 1}`,
579
- points: [{
580
- runId: run.id,
581
- score: turn.score,
582
- pass,
583
- reason: turn.reason,
584
- turnIndex: index,
585
- messageRefId: turn.refId,
586
- }],
587
- };
588
- });
552
+ function extractRepresentativeScore(run) {
553
+ const summaryScore = run.summary?.score;
554
+ if (typeof summaryScore === "number" && Number.isFinite(summaryScore)) {
555
+ return summaryScore;
589
556
  }
590
- const score = run.summary?.score;
591
- const reason = run.summary?.reason;
592
- const pass = typeof score === "number" ? score >= 0 : undefined;
593
- return [{
594
- key: "conversation",
595
- label: "Conversation score",
596
- points: [{
597
- runId: run.id,
598
- score,
599
- pass,
600
- reason,
601
- }],
602
- }];
557
+ const turnScores = run.turns
558
+ .map((turn) => turn.score)
559
+ .filter((score) => typeof score === "number" && Number.isFinite(score));
560
+ if (turnScores.length === 0)
561
+ return null;
562
+ return median(turnScores);
603
563
  }
604
- function resolveVerifyVerdict(input) {
605
- if (input.sampleSize < VERIFY_CONSISTENCY_THRESHOLDS.minSampleSize) {
606
- return {
607
- verdict: "WARN",
608
- reason: `Need at least ${VERIFY_CONSISTENCY_THRESHOLDS.minSampleSize} samples before issuing a firm verdict.`,
609
- };
610
- }
611
- if (input.agreementRate === null) {
612
- return {
613
- verdict: "WARN",
614
- reason: "No comparable pass/fail evidence was found in the sampled runs.",
615
- };
616
- }
617
- const spreadMax = input.spreadMax ?? 0;
618
- if (input.agreementRate >= VERIFY_CONSISTENCY_THRESHOLDS.pass.agreementMin &&
619
- spreadMax <= VERIFY_CONSISTENCY_THRESHOLDS.pass.maxSpread &&
620
- input.instabilityCount <=
621
- VERIFY_CONSISTENCY_THRESHOLDS.pass.maxInstabilityCount) {
622
- return {
623
- verdict: "PASS",
624
- reason: "Agreement, spread, and instability all meet PASS thresholds.",
625
- };
626
- }
627
- if (input.agreementRate >= VERIFY_CONSISTENCY_THRESHOLDS.warn.agreementMin &&
628
- spreadMax <= VERIFY_CONSISTENCY_THRESHOLDS.warn.maxSpread &&
629
- input.instabilityCount <=
630
- VERIFY_CONSISTENCY_THRESHOLDS.warn.maxInstabilityCount) {
631
- return {
632
- verdict: "WARN",
633
- reason: "Some variation was detected, but results remain within WARN thresholds.",
634
- };
564
+ function extractRepresentativeReason(run) {
565
+ if (typeof run.summary?.reason === "string" &&
566
+ run.summary.reason.trim().length > 0) {
567
+ return run.summary.reason.trim();
635
568
  }
636
- return {
637
- verdict: "FAIL",
638
- reason: "Agreement/spread instability exceeds WARN thresholds.",
639
- };
569
+ const turnReason = run.turns.find((turn) => typeof turn.reason === "string" && turn.reason.trim().length > 0)?.reason;
570
+ return typeof turnReason === "string" ? turnReason.trim() : null;
640
571
  }
641
- function buildVerifyMetricsFromRuns(runs) {
642
- const completedRuns = runs.filter((run) => run.status === "completed");
643
- const sampleSize = completedRuns.length;
644
- const bucketsByKey = new Map();
645
- completedRuns.forEach((run) => {
646
- flattenVerifyRunExamples(run).forEach((entry) => {
647
- const existing = bucketsByKey.get(entry.key);
648
- if (!existing) {
649
- bucketsByKey.set(entry.key, {
650
- key: entry.key,
651
- label: entry.label,
652
- points: [...entry.points],
653
- });
654
- return;
655
- }
656
- existing.points.push(...entry.points);
657
- });
658
- });
659
- const outliers = [];
660
- let agreementVotes = 0;
661
- let agreementTotal = 0;
662
- const scoreDeltas = [];
663
- bucketsByKey.forEach((bucket) => {
664
- const scores = bucket.points
665
- .map((point) => point.score)
666
- .filter((score) => typeof score === "number" && Number.isFinite(score));
667
- const minScore = scores.length > 0 ? Math.min(...scores) : null;
668
- const maxScore = scores.length > 0 ? Math.max(...scores) : null;
669
- const scoreDelta = minScore !== null && maxScore !== null
670
- ? round2(maxScore - minScore)
671
- : null;
672
- const passVotes = bucket.points
673
- .map((point) => point.pass)
674
- .filter((pass) => typeof pass === "boolean");
675
- const passCount = passVotes.filter((value) => value).length;
676
- const failCount = passVotes.length - passCount;
677
- const agreementRate = passVotes.length > 0
678
- ? round2(Math.max(passCount, failCount) / passVotes.length)
679
- : null;
680
- if (passVotes.length > 0) {
681
- agreementVotes += Math.max(passCount, failCount);
682
- agreementTotal += passVotes.length;
572
+ function buildVerifyMetrics(args) {
573
+ const runById = new Map(args.runs.map((run) => [run.id, run]));
574
+ const reasonCounter = new Map();
575
+ const scoreValues = [];
576
+ let passCount = 0;
577
+ let scoreCount = 0;
578
+ let gradingFailureCount = 0;
579
+ const scenarioGroups = new Map();
580
+ const addReason = (kind, reason) => {
581
+ const normalized = typeof reason === "string" ? reason.trim() : "";
582
+ if (!normalized)
583
+ return;
584
+ const key = `${kind}:${normalized}`;
585
+ const existing = reasonCounter.get(key);
586
+ if (existing) {
587
+ existing.count += 1;
588
+ return;
683
589
  }
684
- if (scoreDelta !== null) {
685
- scoreDeltas.push(scoreDelta);
590
+ reasonCounter.set(key, { key, kind, reason: normalized, count: 1 });
591
+ };
592
+ for (const request of args.batch.requests) {
593
+ if (!request.scenarioRunId)
594
+ continue;
595
+ const existing = scenarioGroups.get(request.scenarioRunId);
596
+ if (existing) {
597
+ existing.requests.push(request);
598
+ continue;
686
599
  }
687
- const passFlip = passCount > 0 && failCount > 0;
688
- const instability = passFlip ||
689
- (scoreDelta !== null &&
690
- scoreDelta > VERIFY_CONSISTENCY_THRESHOLDS.instabilityScoreDelta);
691
- const minPoint = minScore === null
692
- ? undefined
693
- : bucket.points.find((point) => point.score === minScore);
694
- const maxPoint = maxScore === null
695
- ? undefined
696
- : bucket.points.find((point) => point.score === maxScore);
697
- outliers.push({
698
- key: bucket.key,
699
- label: bucket.label,
700
- sampleSize: bucket.points.length,
701
- agreementRate,
702
- scoreDelta,
703
- passFlip,
704
- instability,
705
- minRunId: minPoint?.runId,
706
- maxRunId: maxPoint?.runId,
707
- turnIndex: maxPoint?.turnIndex ?? minPoint?.turnIndex,
708
- messageRefId: maxPoint?.messageRefId ?? minPoint?.messageRefId,
600
+ scenarioGroups.set(request.scenarioRunId, {
601
+ scenarioRunId: request.scenarioRunId,
602
+ requests: [request],
603
+ scores: [],
604
+ executionFailureCount: 0,
605
+ gradingFailureCount: 0,
606
+ completedSampleCount: 0,
709
607
  });
710
- });
711
- outliers.sort((left, right) => {
712
- if (left.instability !== right.instability) {
713
- return left.instability ? -1 : 1;
608
+ }
609
+ for (const request of args.batch.requests) {
610
+ if (request.status === "error") {
611
+ addReason("execution", request.error);
612
+ if (request.scenarioRunId) {
613
+ const group = scenarioGroups.get(request.scenarioRunId);
614
+ if (group)
615
+ group.executionFailureCount += 1;
616
+ }
617
+ continue;
618
+ }
619
+ if (request.status !== "completed" || !request.runId)
620
+ continue;
621
+ const run = runById.get(request.runId);
622
+ if (!run || run.status !== "completed")
623
+ continue;
624
+ const score = extractRepresentativeScore(run);
625
+ if (request.scenarioRunId) {
626
+ const group = scenarioGroups.get(request.scenarioRunId);
627
+ if (group) {
628
+ group.completedSampleCount += 1;
629
+ }
714
630
  }
715
- if (left.passFlip !== right.passFlip)
716
- return left.passFlip ? -1 : 1;
717
- const leftDelta = left.scoreDelta ?? -1;
718
- const rightDelta = right.scoreDelta ?? -1;
719
- if (leftDelta !== rightDelta)
720
- return rightDelta - leftDelta;
721
- if (left.sampleSize !== right.sampleSize) {
722
- return right.sampleSize - left.sampleSize;
631
+ if (score === null)
632
+ continue;
633
+ const rounded = round2(score);
634
+ scoreValues.push(rounded);
635
+ scoreCount += 1;
636
+ if (rounded >= 0) {
637
+ passCount += 1;
638
+ }
639
+ else {
640
+ gradingFailureCount += 1;
641
+ addReason("grading", extractRepresentativeReason(run));
642
+ if (request.scenarioRunId) {
643
+ const group = scenarioGroups.get(request.scenarioRunId);
644
+ if (group)
645
+ group.gradingFailureCount += 1;
646
+ }
647
+ }
648
+ if (request.scenarioRunId) {
649
+ const group = scenarioGroups.get(request.scenarioRunId);
650
+ if (group) {
651
+ const messageRefId = run.turns.find((turn) => typeof turn.refId === "string" && turn.refId.trim().length > 0)?.refId;
652
+ group.scores.push({
653
+ runId: run.id,
654
+ value: rounded,
655
+ messageRefId,
656
+ });
657
+ }
723
658
  }
724
- return left.label.localeCompare(right.label);
659
+ }
660
+ const outlierScenarioRuns = [
661
+ ...scenarioGroups.values(),
662
+ ].map((group) => {
663
+ const sortedScores = [...group.scores].sort((left, right) => left.value - right.value);
664
+ const numericScores = sortedScores.map((entry) => entry.value);
665
+ const min = numericScores.length > 0 ? numericScores[0] : null;
666
+ const max = numericScores.length > 0
667
+ ? numericScores[numericScores.length - 1]
668
+ : null;
669
+ const avg = numericScores.length > 0
670
+ ? round2(numericScores.reduce((sum, value) => sum + value, 0) /
671
+ numericScores.length)
672
+ : null;
673
+ return {
674
+ key: group.scenarioRunId,
675
+ scenarioRunId: group.scenarioRunId,
676
+ gradeSampleCount: group.requests.length,
677
+ completedSampleCount: group.completedSampleCount,
678
+ executionFailureCount: group.executionFailureCount,
679
+ gradingFailureCount: group.gradingFailureCount,
680
+ averageScore: avg,
681
+ minScore: min === null ? null : round2(min),
682
+ maxScore: max === null ? null : round2(max),
683
+ failed: group.executionFailureCount > 0,
684
+ minRunId: sortedScores[0]?.runId,
685
+ maxRunId: sortedScores[sortedScores.length - 1]?.runId,
686
+ messageRefId: sortedScores[0]?.messageRefId ??
687
+ sortedScores[sortedScores.length - 1]?.messageRefId,
688
+ };
689
+ }).sort((left, right) => {
690
+ if (left.failed !== right.failed)
691
+ return left.failed ? -1 : 1;
692
+ const leftScore = left.averageScore ?? Number.POSITIVE_INFINITY;
693
+ const rightScore = right.averageScore ?? Number.POSITIVE_INFINITY;
694
+ if (leftScore !== rightScore)
695
+ return leftScore - rightScore;
696
+ return left.scenarioRunId.localeCompare(right.scenarioRunId);
725
697
  });
726
- const agreementRate = agreementTotal > 0
727
- ? round2(agreementVotes / agreementTotal)
728
- : null;
729
- const scoreSpreadMin = scoreDeltas.length > 0
730
- ? Math.min(...scoreDeltas)
698
+ const scoreMin = scoreValues.length > 0 ? Math.min(...scoreValues) : null;
699
+ const scoreMax = scoreValues.length > 0 ? Math.max(...scoreValues) : null;
700
+ const scoreMedian = median(scoreValues);
701
+ const scoreMean = scoreValues.length > 0
702
+ ? round2(scoreValues.reduce((sum, value) => sum + value, 0) / scoreValues.length)
731
703
  : null;
732
- const scoreSpreadMax = scoreDeltas.length > 0
733
- ? Math.max(...scoreDeltas)
734
- : null;
735
- const scoreSpreadMedian = median(scoreDeltas);
736
- const instabilityCount = outliers.filter((entry) => entry.instability).length;
737
- const verdict = resolveVerifyVerdict({
738
- sampleSize,
739
- agreementRate,
740
- spreadMax: scoreSpreadMax,
741
- instabilityCount,
742
- });
704
+ const passRate = scoreCount > 0 ? round2(passCount / scoreCount) : null;
743
705
  return {
744
- sampleSize,
745
- agreementRate,
746
- scoreSpreadMin: scoreSpreadMin === null ? null : round2(scoreSpreadMin),
747
- scoreSpreadMedian: scoreSpreadMedian === null
748
- ? null
749
- : round2(scoreSpreadMedian),
750
- scoreSpreadMax: scoreSpreadMax === null ? null : round2(scoreSpreadMax),
751
- instabilityCount,
752
- verdict: verdict.verdict,
753
- verdictReason: verdict.reason,
754
- outliers,
706
+ scenarioRunCountRequested: args.batch.scenarioRuns,
707
+ scenarioRunCountCompleted: args.batch.scenarioRunsCompleted,
708
+ scenarioRunCountFailed: args.batch.scenarioRunsFailed,
709
+ gradeSampleCountRequested: args.batch.requested,
710
+ gradeSampleCountCompleted: args.batch.completed,
711
+ gradeSampleCountFailed: args.batch.failed,
712
+ executionFailureCount: args.batch.failed,
713
+ gradingFailureCount,
714
+ passRate,
715
+ scoreMin: scoreMin === null ? null : round2(scoreMin),
716
+ scoreMedian: scoreMedian === null ? null : round2(scoreMedian),
717
+ scoreMax: scoreMax === null ? null : round2(scoreMax),
718
+ scoreMean,
719
+ outlierScenarioRuns,
720
+ failureReasons: [...reasonCounter.values()].sort((left, right) => {
721
+ if (left.count !== right.count)
722
+ return right.count - left.count;
723
+ return left.reason.localeCompare(right.reason);
724
+ }),
755
725
  };
756
726
  }
757
727
  function toWorkspaceVerifyBatchStatus(value) {
@@ -778,8 +748,8 @@ const WorkspaceVerifyBatchStatusEnum = builder.enumType("WorkspaceVerifyBatchSta
778
748
  const WorkspaceVerifyBatchRequestStatusEnum = builder.enumType("WorkspaceVerifyBatchRequestStatus", {
779
749
  values: ["QUEUED", "RUNNING", "COMPLETED", "ERROR"],
780
750
  });
781
- const WorkspaceVerifyVerdictEnum = builder.enumType("WorkspaceVerifyVerdict", {
782
- values: ["PASS", "WARN", "FAIL"],
751
+ const WorkspaceVerifyFailureReasonKindEnum = builder.enumType("WorkspaceVerifyFailureReasonKind", {
752
+ values: ["EXECUTION", "GRADING"],
783
753
  });
784
754
  const WorkspaceGradeTurnType = builder.objectRef("WorkspaceGradeTurn");
785
755
  WorkspaceGradeTurnType.implement({
@@ -930,28 +900,44 @@ WorkspaceVerifyBatchRequestType.implement({
930
900
  nullable: true,
931
901
  resolve: (parent) => parent.runId ?? null,
932
902
  }),
903
+ scenarioRunId: t.id({
904
+ nullable: true,
905
+ resolve: (parent) => parent.scenarioRunId ?? null,
906
+ }),
933
907
  error: t.string({
934
908
  nullable: true,
935
909
  resolve: (parent) => parent.error ?? null,
936
910
  }),
937
911
  }),
938
912
  });
939
- const WorkspaceVerifyOutlierType = builder.objectRef("WorkspaceVerifyOutlier");
940
- WorkspaceVerifyOutlierType.implement({
913
+ const WorkspaceVerifyScenarioOutlierType = builder.objectRef("WorkspaceVerifyScenarioOutlier");
914
+ WorkspaceVerifyScenarioOutlierType.implement({
941
915
  fields: (t) => ({
942
916
  key: t.id({ resolve: (parent) => parent.key }),
943
- label: t.string({ resolve: (parent) => parent.label }),
944
- sampleSize: t.int({ resolve: (parent) => parent.sampleSize }),
945
- agreementRate: t.float({
917
+ scenarioRunId: t.id({ resolve: (parent) => parent.scenarioRunId }),
918
+ gradeSampleCount: t.int({ resolve: (parent) => parent.gradeSampleCount }),
919
+ completedSampleCount: t.int({
920
+ resolve: (parent) => parent.completedSampleCount,
921
+ }),
922
+ executionFailureCount: t.int({
923
+ resolve: (parent) => parent.executionFailureCount,
924
+ }),
925
+ gradingFailureCount: t.int({
926
+ resolve: (parent) => parent.gradingFailureCount,
927
+ }),
928
+ averageScore: t.float({
946
929
  nullable: true,
947
- resolve: (parent) => parent.agreementRate,
930
+ resolve: (parent) => parent.averageScore,
948
931
  }),
949
- scoreDelta: t.float({
932
+ minScore: t.float({
950
933
  nullable: true,
951
- resolve: (parent) => parent.scoreDelta,
934
+ resolve: (parent) => parent.minScore,
952
935
  }),
953
- passFlip: t.boolean({ resolve: (parent) => parent.passFlip }),
954
- instability: t.boolean({ resolve: (parent) => parent.instability }),
936
+ maxScore: t.float({
937
+ nullable: true,
938
+ resolve: (parent) => parent.maxScore,
939
+ }),
940
+ failed: t.boolean({ resolve: (parent) => parent.failed }),
955
941
  minRunId: t.id({
956
942
  nullable: true,
957
943
  resolve: (parent) => parent.minRunId ?? null,
@@ -960,53 +946,86 @@ WorkspaceVerifyOutlierType.implement({
960
946
  nullable: true,
961
947
  resolve: (parent) => parent.maxRunId ?? null,
962
948
  }),
963
- turnIndex: t.int({
964
- nullable: true,
965
- resolve: (parent) => parent.turnIndex ?? null,
966
- }),
967
949
  messageRefId: t.id({
968
950
  nullable: true,
969
951
  resolve: (parent) => parent.messageRefId ?? null,
970
952
  }),
971
953
  }),
972
954
  });
955
+ const WorkspaceVerifyFailureReasonGroupType = builder.objectRef("WorkspaceVerifyFailureReasonGroup");
956
+ WorkspaceVerifyFailureReasonGroupType.implement({
957
+ fields: (t) => ({
958
+ key: t.id({ resolve: (parent) => parent.key }),
959
+ kind: t.field({
960
+ type: WorkspaceVerifyFailureReasonKindEnum,
961
+ resolve: (parent) => parent.kind === "execution" ? "EXECUTION" : "GRADING",
962
+ }),
963
+ reason: t.string({ resolve: (parent) => parent.reason }),
964
+ count: t.int({ resolve: (parent) => parent.count }),
965
+ }),
966
+ });
973
967
  const WorkspaceVerifyMetricsType = builder.objectRef("WorkspaceVerifyMetrics");
974
968
  WorkspaceVerifyMetricsType.implement({
975
969
  fields: (t) => ({
976
- sampleSize: t.int({ resolve: (parent) => parent.sampleSize }),
977
- agreementRate: t.float({
978
- nullable: true,
979
- resolve: (parent) => parent.agreementRate,
970
+ scenarioRunCountRequested: t.int({
971
+ resolve: (parent) => parent.scenarioRunCountRequested,
972
+ }),
973
+ scenarioRunCountCompleted: t.int({
974
+ resolve: (parent) => parent.scenarioRunCountCompleted,
975
+ }),
976
+ scenarioRunCountFailed: t.int({
977
+ resolve: (parent) => parent.scenarioRunCountFailed,
978
+ }),
979
+ gradeSampleCountRequested: t.int({
980
+ resolve: (parent) => parent.gradeSampleCountRequested,
981
+ }),
982
+ gradeSampleCountCompleted: t.int({
983
+ resolve: (parent) => parent.gradeSampleCountCompleted,
984
+ }),
985
+ gradeSampleCountFailed: t.int({
986
+ resolve: (parent) => parent.gradeSampleCountFailed,
980
987
  }),
981
- scoreSpreadMin: t.float({
988
+ executionFailureCount: t.int({
989
+ resolve: (parent) => parent.executionFailureCount,
990
+ }),
991
+ gradingFailureCount: t.int({
992
+ resolve: (parent) => parent.gradingFailureCount,
993
+ }),
994
+ passRate: t.float({
982
995
  nullable: true,
983
- resolve: (parent) => parent.scoreSpreadMin,
996
+ resolve: (parent) => parent.passRate,
984
997
  }),
985
- scoreSpreadMedian: t.float({
998
+ scoreMin: t.float({
986
999
  nullable: true,
987
- resolve: (parent) => parent.scoreSpreadMedian,
1000
+ resolve: (parent) => parent.scoreMin,
988
1001
  }),
989
- scoreSpreadMax: t.float({
1002
+ scoreMedian: t.float({
990
1003
  nullable: true,
991
- resolve: (parent) => parent.scoreSpreadMax,
1004
+ resolve: (parent) => parent.scoreMedian,
992
1005
  }),
993
- instabilityCount: t.int({
994
- resolve: (parent) => parent.instabilityCount,
1006
+ scoreMax: t.float({
1007
+ nullable: true,
1008
+ resolve: (parent) => parent.scoreMax,
995
1009
  }),
996
- verdict: t.field({
997
- type: WorkspaceVerifyVerdictEnum,
998
- resolve: (parent) => parent.verdict,
1010
+ scoreMean: t.float({
1011
+ nullable: true,
1012
+ resolve: (parent) => parent.scoreMean,
999
1013
  }),
1000
- verdictReason: t.string({
1001
- resolve: (parent) => parent.verdictReason,
1014
+ outlierScenarioRuns: t.connection({
1015
+ type: WorkspaceVerifyScenarioOutlierType,
1016
+ args: {
1017
+ first: t.arg.int(),
1018
+ after: t.arg.string(),
1019
+ },
1020
+ resolve: (parent, args) => resolveArrayConnection({ args }, parent.outlierScenarioRuns),
1002
1021
  }),
1003
- outliers: t.connection({
1004
- type: WorkspaceVerifyOutlierType,
1022
+ failureReasons: t.connection({
1023
+ type: WorkspaceVerifyFailureReasonGroupType,
1005
1024
  args: {
1006
1025
  first: t.arg.int(),
1007
1026
  after: t.arg.string(),
1008
1027
  },
1009
- resolve: (parent, args) => resolveArrayConnection({ args }, parent.outliers),
1028
+ resolve: (parent, args) => resolveArrayConnection({ args }, parent.failureReasons),
1010
1029
  }),
1011
1030
  }),
1012
1031
  });
@@ -1015,10 +1034,22 @@ WorkspaceVerifyBatchType.implement({
1015
1034
  fields: (t) => ({
1016
1035
  id: t.id({ resolve: (parent) => parent.id }),
1017
1036
  workspaceId: t.id({ resolve: (parent) => parent.workspaceId }),
1018
- graderId: t.id({ resolve: (parent) => parent.graderId }),
1019
- scenarioRunId: t.id({
1037
+ scenarioDeckId: t.id({
1020
1038
  nullable: true,
1021
- resolve: (parent) => parent.scenarioRunId ?? null,
1039
+ resolve: (parent) => parent.scenarioDeckId ?? null,
1040
+ }),
1041
+ graderId: t.id({ resolve: (parent) => parent.graderId }),
1042
+ scenarioRuns: t.int({
1043
+ resolve: (parent) => parent.scenarioRuns,
1044
+ }),
1045
+ graderRepeatsPerScenario: t.int({
1046
+ resolve: (parent) => parent.graderRepeatsPerScenario,
1047
+ }),
1048
+ scenarioRunsCompleted: t.int({
1049
+ resolve: (parent) => parent.scenarioRunsCompleted,
1050
+ }),
1051
+ scenarioRunsFailed: t.int({
1052
+ resolve: (parent) => parent.scenarioRunsFailed,
1022
1053
  }),
1023
1054
  status: t.field({
1024
1055
  type: WorkspaceVerifyBatchStatusEnum,
@@ -1050,19 +1081,17 @@ WorkspaceVerifyBatchType.implement({
1050
1081
  resolve: async (parent, _args, context) => {
1051
1082
  if (!context.readWorkspaceGradeRuns)
1052
1083
  return null;
1053
- const completedRunIds = parent.requests
1054
- .filter((request) => request.status === "completed" &&
1055
- typeof request.runId === "string" &&
1084
+ const runIds = parent.requests
1085
+ .filter((request) => typeof request.runId === "string" &&
1056
1086
  request.runId.trim().length > 0)
1057
1087
  .map((request) => request.runId);
1058
- if (completedRunIds.length === 0)
1059
- return null;
1060
- const runIdSet = new Set(completedRunIds);
1088
+ const runIdSet = new Set(runIds);
1061
1089
  const allRuns = await context.readWorkspaceGradeRuns(parent.workspaceId);
1062
1090
  const selectedRuns = allRuns.filter((run) => runIdSet.has(run.id));
1063
- if (selectedRuns.length === 0)
1064
- return null;
1065
- return buildVerifyMetricsFromRuns(selectedRuns);
1091
+ return buildVerifyMetrics({
1092
+ batch: parent,
1093
+ runs: selectedRuns,
1094
+ });
1066
1095
  },
1067
1096
  }),
1068
1097
  }),
@@ -1907,7 +1936,8 @@ const WorkspaceConversationSessionStartInput = builder.inputType("WorkspaceConve
1907
1936
  assistantInit: t.field({ type: "JSON" }),
1908
1937
  graderId: t.id(),
1909
1938
  scenarioRunId: t.id(),
1910
- batchSize: t.int(),
1939
+ scenarioRuns: t.int(),
1940
+ graderRepeatsPerScenario: t.int(),
1911
1941
  concurrency: t.int(),
1912
1942
  }),
1913
1943
  });
@@ -1939,9 +1969,10 @@ const WorkspaceGradeRunCreateInput = builder.inputType("WorkspaceGradeRunCreateI
1939
1969
  const WorkspaceVerifyBatchRunCreateInput = builder.inputType("WorkspaceVerifyBatchRunCreateInput", {
1940
1970
  fields: (t) => ({
1941
1971
  workspaceId: t.id({ required: true }),
1972
+ scenarioDeckId: t.id(),
1942
1973
  graderId: t.id({ required: true }),
1943
- scenarioRunId: t.id(),
1944
- batchSize: t.int({ required: true }),
1974
+ scenarioRuns: t.int({ required: true }),
1975
+ graderRepeatsPerScenario: t.int({ required: true }),
1945
1976
  concurrency: t.int({ required: true }),
1946
1977
  }),
1947
1978
  });
@@ -2149,7 +2180,8 @@ builder.mutationType({
2149
2180
  assistantInit: args.input.assistantInit,
2150
2181
  graderId: args.input.graderId ?? null,
2151
2182
  scenarioRunId: args.input.scenarioRunId ?? null,
2152
- batchSize: args.input.batchSize ?? null,
2183
+ scenarioRuns: args.input.scenarioRuns ?? null,
2184
+ graderRepeatsPerScenario: args.input.graderRepeatsPerScenario ?? null,
2153
2185
  concurrency: args.input.concurrency ?? null,
2154
2186
  });
2155
2187
  return {
@@ -2269,9 +2301,10 @@ builder.mutationType({
2269
2301
  }
2270
2302
  const batch = await context.createWorkspaceVerifyBatchRun({
2271
2303
  workspaceId: args.input.workspaceId,
2304
+ scenarioDeckId: args.input.scenarioDeckId ?? null,
2272
2305
  graderId: args.input.graderId,
2273
- scenarioRunId: args.input.scenarioRunId ?? null,
2274
- batchSize: args.input.batchSize,
2306
+ scenarioRuns: args.input.scenarioRuns,
2307
+ graderRepeatsPerScenario: args.input.graderRepeatsPerScenario,
2275
2308
  concurrency: args.input.concurrency,
2276
2309
  });
2277
2310
  return {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@bolt-foundry/gambit",
3
- "version": "0.8.6-rc.1",
3
+ "version": "0.8.6-rc.2",
4
4
  "description": "Agent harness framework for building, running, and verifying LLM workflows in Markdown and code.",
5
5
  "homepage": "https://github.com/bolt-foundry/gambit",
6
6
  "repository": {
@@ -24,7 +24,7 @@
24
24
  "gambit": "bin/gambit.cjs"
25
25
  },
26
26
  "dependencies": {
27
- "@bolt-foundry/gambit-core": "^0.8.6-rc.1",
27
+ "@bolt-foundry/gambit-core": "file:../../packages/gambit-core/dist/npm",
28
28
  "@isograph/react": "0.5.2",
29
29
  "@pothos/core": "^4.7.3",
30
30
  "@pothos/plugin-relay": "^4.4.2",