@bolt-foundry/gambit 0.8.6-rc.1 → 0.8.6-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140) hide show
  1. package/CHANGELOG.md +60 -1
  2. package/esm/gambit/simulator-ui/dist/bundle.js +2033 -964
  3. package/esm/gambit/simulator-ui/dist/bundle.js.map +4 -4
  4. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts +2 -1
  5. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts.map +1 -1
  6. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.d.ts.map +1 -1
  7. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.js +288 -101
  8. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts +84 -45
  9. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts.map +1 -1
  10. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts +3 -2
  11. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts.map +1 -1
  12. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts +1 -1
  13. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts.map +1 -1
  14. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.js +85 -47
  15. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts +86 -48
  16. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts.map +1 -1
  17. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.d.ts.map +1 -1
  18. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.js +363 -97
  19. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.d.ts.map +1 -1
  20. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.js +154 -76
  21. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts +1 -1
  22. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts.map +1 -1
  23. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.js +45 -29
  24. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts +46 -30
  25. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts.map +1 -1
  26. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.d.ts.map +1 -1
  27. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.js +154 -76
  28. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts +45 -28
  29. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts.map +1 -1
  30. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts +1 -1
  31. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts.map +1 -1
  32. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.js +45 -29
  33. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts +46 -30
  34. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts.map +1 -1
  35. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.d.ts.map +1 -1
  36. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.js +194 -78
  37. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts +45 -28
  38. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts.map +1 -1
  39. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.d.ts.map +1 -1
  40. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.js +194 -78
  41. package/esm/simulator-ui/isograph/components/Query/SimulatorVerifyPage.d.ts.map +1 -1
  42. package/esm/simulator-ui/isograph/components/Query/SimulatorVerifyPage.js +257 -203
  43. package/esm/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.d.ts.map +1 -1
  44. package/esm/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.js +8 -3
  45. package/esm/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.d.ts.map +1 -1
  46. package/esm/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.js +45 -28
  47. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts +84 -45
  48. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts.map +1 -1
  49. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.js +85 -46
  50. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts +84 -45
  51. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts.map +1 -1
  52. package/esm/simulator-ui/src/AppShell.d.ts.map +1 -1
  53. package/esm/simulator-ui/src/AppShell.js +6 -4
  54. package/esm/simulator-ui/src/verify_unified.d.ts +17 -0
  55. package/esm/simulator-ui/src/verify_unified.d.ts.map +1 -0
  56. package/esm/simulator-ui/src/verify_unified.js +25 -0
  57. package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts +1 -1
  58. package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts.map +1 -1
  59. package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.js +1 -1
  60. package/esm/src/server.d.ts.map +1 -1
  61. package/esm/src/server.js +227 -45
  62. package/esm/src/server_isograph_environment.d.ts +11 -4
  63. package/esm/src/server_isograph_environment.d.ts.map +1 -1
  64. package/esm/src/server_isograph_environment.js +5 -0
  65. package/esm/src/simulator_graphql.d.ts +11 -4
  66. package/esm/src/simulator_graphql.d.ts.map +1 -1
  67. package/esm/src/simulator_graphql.js +278 -245
  68. package/package.json +2 -2
  69. package/script/gambit/simulator-ui/dist/bundle.js +2033 -964
  70. package/script/gambit/simulator-ui/dist/bundle.js.map +4 -4
  71. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts +2 -1
  72. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts.map +1 -1
  73. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.d.ts.map +1 -1
  74. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.js +288 -101
  75. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts +84 -45
  76. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts.map +1 -1
  77. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts +3 -2
  78. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts.map +1 -1
  79. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts +1 -1
  80. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts.map +1 -1
  81. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.js +85 -47
  82. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts +86 -48
  83. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts.map +1 -1
  84. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.d.ts.map +1 -1
  85. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.js +363 -97
  86. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.d.ts.map +1 -1
  87. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.js +154 -76
  88. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts +1 -1
  89. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts.map +1 -1
  90. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.js +45 -29
  91. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts +46 -30
  92. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts.map +1 -1
  93. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.d.ts.map +1 -1
  94. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.js +154 -76
  95. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts +45 -28
  96. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts.map +1 -1
  97. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts +1 -1
  98. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts.map +1 -1
  99. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.js +45 -29
  100. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts +46 -30
  101. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts.map +1 -1
  102. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.d.ts.map +1 -1
  103. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.js +194 -78
  104. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts +45 -28
  105. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts.map +1 -1
  106. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.d.ts.map +1 -1
  107. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.js +194 -78
  108. package/script/simulator-ui/isograph/components/Query/SimulatorVerifyPage.d.ts.map +1 -1
  109. package/script/simulator-ui/isograph/components/Query/SimulatorVerifyPage.js +256 -202
  110. package/script/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.d.ts.map +1 -1
  111. package/script/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.js +8 -3
  112. package/script/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.d.ts.map +1 -1
  113. package/script/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.js +45 -28
  114. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts +84 -45
  115. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts.map +1 -1
  116. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.js +85 -46
  117. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts +84 -45
  118. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts.map +1 -1
  119. package/script/simulator-ui/src/AppShell.d.ts.map +1 -1
  120. package/script/simulator-ui/src/AppShell.js +6 -4
  121. package/script/simulator-ui/src/verify_unified.d.ts +17 -0
  122. package/script/simulator-ui/src/verify_unified.d.ts.map +1 -0
  123. package/script/simulator-ui/src/verify_unified.js +29 -0
  124. package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts +1 -1
  125. package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts.map +1 -1
  126. package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.js +1 -1
  127. package/script/src/server.d.ts.map +1 -1
  128. package/script/src/server.js +227 -45
  129. package/script/src/server_isograph_environment.d.ts +11 -4
  130. package/script/src/server_isograph_environment.d.ts.map +1 -1
  131. package/script/src/server_isograph_environment.js +5 -0
  132. package/script/src/simulator_graphql.d.ts +11 -4
  133. package/script/src/simulator_graphql.d.ts.map +1 -1
  134. package/script/src/simulator_graphql.js +278 -245
  135. package/esm/simulator-ui/src/verify_metrics.d.ts +0 -50
  136. package/esm/simulator-ui/src/verify_metrics.d.ts.map +0 -1
  137. package/esm/simulator-ui/src/verify_metrics.js +0 -260
  138. package/script/simulator-ui/src/verify_metrics.d.ts +0 -50
  139. package/script/simulator-ui/src/verify_metrics.d.ts.map +0 -1
  140. package/script/simulator-ui/src/verify_metrics.js +0 -264
@@ -576,20 +576,6 @@ function toWorkspaceGradeRunStatus(value) {
576
576
  return "COMPLETED";
577
577
  return "ERROR";
578
578
  }
579
- const VERIFY_CONSISTENCY_THRESHOLDS = {
580
- minSampleSize: 6,
581
- instabilityScoreDelta: 1.5,
582
- pass: {
583
- agreementMin: 0.9,
584
- maxSpread: 1,
585
- maxInstabilityCount: 0,
586
- },
587
- warn: {
588
- agreementMin: 0.75,
589
- maxSpread: 2,
590
- maxInstabilityCount: 2,
591
- },
592
- };
593
579
  function round2(value) {
594
580
  return Math.round(value * 100) / 100;
595
581
  }
@@ -602,195 +588,179 @@ function median(values) {
602
588
  return sorted[mid];
603
589
  return (sorted[mid - 1] + sorted[mid]) / 2;
604
590
  }
605
- function flattenVerifyRunExamples(run) {
606
- if (run.turns.length > 0) {
607
- return run.turns.map((turn, fallbackIndex) => {
608
- const index = typeof turn.turnIndex === "number"
609
- ? turn.turnIndex
610
- : fallbackIndex;
611
- const key = turn.refId && turn.refId.trim().length > 0
612
- ? `ref:${turn.refId}`
613
- : `turn:${index}`;
614
- const pass = typeof turn.score === "number" ? turn.score >= 0 : undefined;
615
- return {
616
- key,
617
- label: `Assistant turn ${turn.turnNumber || fallbackIndex + 1}`,
618
- points: [{
619
- runId: run.id,
620
- score: turn.score,
621
- pass,
622
- reason: turn.reason,
623
- turnIndex: index,
624
- messageRefId: turn.refId,
625
- }],
626
- };
627
- });
591
+ function extractRepresentativeScore(run) {
592
+ const summaryScore = run.summary?.score;
593
+ if (typeof summaryScore === "number" && Number.isFinite(summaryScore)) {
594
+ return summaryScore;
628
595
  }
629
- const score = run.summary?.score;
630
- const reason = run.summary?.reason;
631
- const pass = typeof score === "number" ? score >= 0 : undefined;
632
- return [{
633
- key: "conversation",
634
- label: "Conversation score",
635
- points: [{
636
- runId: run.id,
637
- score,
638
- pass,
639
- reason,
640
- }],
641
- }];
596
+ const turnScores = run.turns
597
+ .map((turn) => turn.score)
598
+ .filter((score) => typeof score === "number" && Number.isFinite(score));
599
+ if (turnScores.length === 0)
600
+ return null;
601
+ return median(turnScores);
642
602
  }
643
- function resolveVerifyVerdict(input) {
644
- if (input.sampleSize < VERIFY_CONSISTENCY_THRESHOLDS.minSampleSize) {
645
- return {
646
- verdict: "WARN",
647
- reason: `Need at least ${VERIFY_CONSISTENCY_THRESHOLDS.minSampleSize} samples before issuing a firm verdict.`,
648
- };
649
- }
650
- if (input.agreementRate === null) {
651
- return {
652
- verdict: "WARN",
653
- reason: "No comparable pass/fail evidence was found in the sampled runs.",
654
- };
655
- }
656
- const spreadMax = input.spreadMax ?? 0;
657
- if (input.agreementRate >= VERIFY_CONSISTENCY_THRESHOLDS.pass.agreementMin &&
658
- spreadMax <= VERIFY_CONSISTENCY_THRESHOLDS.pass.maxSpread &&
659
- input.instabilityCount <=
660
- VERIFY_CONSISTENCY_THRESHOLDS.pass.maxInstabilityCount) {
661
- return {
662
- verdict: "PASS",
663
- reason: "Agreement, spread, and instability all meet PASS thresholds.",
664
- };
665
- }
666
- if (input.agreementRate >= VERIFY_CONSISTENCY_THRESHOLDS.warn.agreementMin &&
667
- spreadMax <= VERIFY_CONSISTENCY_THRESHOLDS.warn.maxSpread &&
668
- input.instabilityCount <=
669
- VERIFY_CONSISTENCY_THRESHOLDS.warn.maxInstabilityCount) {
670
- return {
671
- verdict: "WARN",
672
- reason: "Some variation was detected, but results remain within WARN thresholds.",
673
- };
603
+ function extractRepresentativeReason(run) {
604
+ if (typeof run.summary?.reason === "string" &&
605
+ run.summary.reason.trim().length > 0) {
606
+ return run.summary.reason.trim();
674
607
  }
675
- return {
676
- verdict: "FAIL",
677
- reason: "Agreement/spread instability exceeds WARN thresholds.",
678
- };
608
+ const turnReason = run.turns.find((turn) => typeof turn.reason === "string" && turn.reason.trim().length > 0)?.reason;
609
+ return typeof turnReason === "string" ? turnReason.trim() : null;
679
610
  }
680
- function buildVerifyMetricsFromRuns(runs) {
681
- const completedRuns = runs.filter((run) => run.status === "completed");
682
- const sampleSize = completedRuns.length;
683
- const bucketsByKey = new Map();
684
- completedRuns.forEach((run) => {
685
- flattenVerifyRunExamples(run).forEach((entry) => {
686
- const existing = bucketsByKey.get(entry.key);
687
- if (!existing) {
688
- bucketsByKey.set(entry.key, {
689
- key: entry.key,
690
- label: entry.label,
691
- points: [...entry.points],
692
- });
693
- return;
694
- }
695
- existing.points.push(...entry.points);
696
- });
697
- });
698
- const outliers = [];
699
- let agreementVotes = 0;
700
- let agreementTotal = 0;
701
- const scoreDeltas = [];
702
- bucketsByKey.forEach((bucket) => {
703
- const scores = bucket.points
704
- .map((point) => point.score)
705
- .filter((score) => typeof score === "number" && Number.isFinite(score));
706
- const minScore = scores.length > 0 ? Math.min(...scores) : null;
707
- const maxScore = scores.length > 0 ? Math.max(...scores) : null;
708
- const scoreDelta = minScore !== null && maxScore !== null
709
- ? round2(maxScore - minScore)
710
- : null;
711
- const passVotes = bucket.points
712
- .map((point) => point.pass)
713
- .filter((pass) => typeof pass === "boolean");
714
- const passCount = passVotes.filter((value) => value).length;
715
- const failCount = passVotes.length - passCount;
716
- const agreementRate = passVotes.length > 0
717
- ? round2(Math.max(passCount, failCount) / passVotes.length)
718
- : null;
719
- if (passVotes.length > 0) {
720
- agreementVotes += Math.max(passCount, failCount);
721
- agreementTotal += passVotes.length;
611
+ function buildVerifyMetrics(args) {
612
+ const runById = new Map(args.runs.map((run) => [run.id, run]));
613
+ const reasonCounter = new Map();
614
+ const scoreValues = [];
615
+ let passCount = 0;
616
+ let scoreCount = 0;
617
+ let gradingFailureCount = 0;
618
+ const scenarioGroups = new Map();
619
+ const addReason = (kind, reason) => {
620
+ const normalized = typeof reason === "string" ? reason.trim() : "";
621
+ if (!normalized)
622
+ return;
623
+ const key = `${kind}:${normalized}`;
624
+ const existing = reasonCounter.get(key);
625
+ if (existing) {
626
+ existing.count += 1;
627
+ return;
722
628
  }
723
- if (scoreDelta !== null) {
724
- scoreDeltas.push(scoreDelta);
629
+ reasonCounter.set(key, { key, kind, reason: normalized, count: 1 });
630
+ };
631
+ for (const request of args.batch.requests) {
632
+ if (!request.scenarioRunId)
633
+ continue;
634
+ const existing = scenarioGroups.get(request.scenarioRunId);
635
+ if (existing) {
636
+ existing.requests.push(request);
637
+ continue;
725
638
  }
726
- const passFlip = passCount > 0 && failCount > 0;
727
- const instability = passFlip ||
728
- (scoreDelta !== null &&
729
- scoreDelta > VERIFY_CONSISTENCY_THRESHOLDS.instabilityScoreDelta);
730
- const minPoint = minScore === null
731
- ? undefined
732
- : bucket.points.find((point) => point.score === minScore);
733
- const maxPoint = maxScore === null
734
- ? undefined
735
- : bucket.points.find((point) => point.score === maxScore);
736
- outliers.push({
737
- key: bucket.key,
738
- label: bucket.label,
739
- sampleSize: bucket.points.length,
740
- agreementRate,
741
- scoreDelta,
742
- passFlip,
743
- instability,
744
- minRunId: minPoint?.runId,
745
- maxRunId: maxPoint?.runId,
746
- turnIndex: maxPoint?.turnIndex ?? minPoint?.turnIndex,
747
- messageRefId: maxPoint?.messageRefId ?? minPoint?.messageRefId,
639
+ scenarioGroups.set(request.scenarioRunId, {
640
+ scenarioRunId: request.scenarioRunId,
641
+ requests: [request],
642
+ scores: [],
643
+ executionFailureCount: 0,
644
+ gradingFailureCount: 0,
645
+ completedSampleCount: 0,
748
646
  });
749
- });
750
- outliers.sort((left, right) => {
751
- if (left.instability !== right.instability) {
752
- return left.instability ? -1 : 1;
647
+ }
648
+ for (const request of args.batch.requests) {
649
+ if (request.status === "error") {
650
+ addReason("execution", request.error);
651
+ if (request.scenarioRunId) {
652
+ const group = scenarioGroups.get(request.scenarioRunId);
653
+ if (group)
654
+ group.executionFailureCount += 1;
655
+ }
656
+ continue;
657
+ }
658
+ if (request.status !== "completed" || !request.runId)
659
+ continue;
660
+ const run = runById.get(request.runId);
661
+ if (!run || run.status !== "completed")
662
+ continue;
663
+ const score = extractRepresentativeScore(run);
664
+ if (request.scenarioRunId) {
665
+ const group = scenarioGroups.get(request.scenarioRunId);
666
+ if (group) {
667
+ group.completedSampleCount += 1;
668
+ }
669
+ }
670
+ if (score === null)
671
+ continue;
672
+ const rounded = round2(score);
673
+ scoreValues.push(rounded);
674
+ scoreCount += 1;
675
+ if (rounded >= 0) {
676
+ passCount += 1;
753
677
  }
754
- if (left.passFlip !== right.passFlip)
755
- return left.passFlip ? -1 : 1;
756
- const leftDelta = left.scoreDelta ?? -1;
757
- const rightDelta = right.scoreDelta ?? -1;
758
- if (leftDelta !== rightDelta)
759
- return rightDelta - leftDelta;
760
- if (left.sampleSize !== right.sampleSize) {
761
- return right.sampleSize - left.sampleSize;
678
+ else {
679
+ gradingFailureCount += 1;
680
+ addReason("grading", extractRepresentativeReason(run));
681
+ if (request.scenarioRunId) {
682
+ const group = scenarioGroups.get(request.scenarioRunId);
683
+ if (group)
684
+ group.gradingFailureCount += 1;
685
+ }
762
686
  }
763
- return left.label.localeCompare(right.label);
687
+ if (request.scenarioRunId) {
688
+ const group = scenarioGroups.get(request.scenarioRunId);
689
+ if (group) {
690
+ const messageRefId = run.turns.find((turn) => typeof turn.refId === "string" && turn.refId.trim().length > 0)?.refId;
691
+ group.scores.push({
692
+ runId: run.id,
693
+ value: rounded,
694
+ messageRefId,
695
+ });
696
+ }
697
+ }
698
+ }
699
+ const outlierScenarioRuns = [
700
+ ...scenarioGroups.values(),
701
+ ].map((group) => {
702
+ const sortedScores = [...group.scores].sort((left, right) => left.value - right.value);
703
+ const numericScores = sortedScores.map((entry) => entry.value);
704
+ const min = numericScores.length > 0 ? numericScores[0] : null;
705
+ const max = numericScores.length > 0
706
+ ? numericScores[numericScores.length - 1]
707
+ : null;
708
+ const avg = numericScores.length > 0
709
+ ? round2(numericScores.reduce((sum, value) => sum + value, 0) /
710
+ numericScores.length)
711
+ : null;
712
+ return {
713
+ key: group.scenarioRunId,
714
+ scenarioRunId: group.scenarioRunId,
715
+ gradeSampleCount: group.requests.length,
716
+ completedSampleCount: group.completedSampleCount,
717
+ executionFailureCount: group.executionFailureCount,
718
+ gradingFailureCount: group.gradingFailureCount,
719
+ averageScore: avg,
720
+ minScore: min === null ? null : round2(min),
721
+ maxScore: max === null ? null : round2(max),
722
+ failed: group.executionFailureCount > 0,
723
+ minRunId: sortedScores[0]?.runId,
724
+ maxRunId: sortedScores[sortedScores.length - 1]?.runId,
725
+ messageRefId: sortedScores[0]?.messageRefId ??
726
+ sortedScores[sortedScores.length - 1]?.messageRefId,
727
+ };
728
+ }).sort((left, right) => {
729
+ if (left.failed !== right.failed)
730
+ return left.failed ? -1 : 1;
731
+ const leftScore = left.averageScore ?? Number.POSITIVE_INFINITY;
732
+ const rightScore = right.averageScore ?? Number.POSITIVE_INFINITY;
733
+ if (leftScore !== rightScore)
734
+ return leftScore - rightScore;
735
+ return left.scenarioRunId.localeCompare(right.scenarioRunId);
764
736
  });
765
- const agreementRate = agreementTotal > 0
766
- ? round2(agreementVotes / agreementTotal)
737
+ const scoreMin = scoreValues.length > 0 ? Math.min(...scoreValues) : null;
738
+ const scoreMax = scoreValues.length > 0 ? Math.max(...scoreValues) : null;
739
+ const scoreMedian = median(scoreValues);
740
+ const scoreMean = scoreValues.length > 0
741
+ ? round2(scoreValues.reduce((sum, value) => sum + value, 0) / scoreValues.length)
767
742
  : null;
768
- const scoreSpreadMin = scoreDeltas.length > 0
769
- ? Math.min(...scoreDeltas)
770
- : null;
771
- const scoreSpreadMax = scoreDeltas.length > 0
772
- ? Math.max(...scoreDeltas)
773
- : null;
774
- const scoreSpreadMedian = median(scoreDeltas);
775
- const instabilityCount = outliers.filter((entry) => entry.instability).length;
776
- const verdict = resolveVerifyVerdict({
777
- sampleSize,
778
- agreementRate,
779
- spreadMax: scoreSpreadMax,
780
- instabilityCount,
781
- });
743
+ const passRate = scoreCount > 0 ? round2(passCount / scoreCount) : null;
782
744
  return {
783
- sampleSize,
784
- agreementRate,
785
- scoreSpreadMin: scoreSpreadMin === null ? null : round2(scoreSpreadMin),
786
- scoreSpreadMedian: scoreSpreadMedian === null
787
- ? null
788
- : round2(scoreSpreadMedian),
789
- scoreSpreadMax: scoreSpreadMax === null ? null : round2(scoreSpreadMax),
790
- instabilityCount,
791
- verdict: verdict.verdict,
792
- verdictReason: verdict.reason,
793
- outliers,
745
+ scenarioRunCountRequested: args.batch.scenarioRuns,
746
+ scenarioRunCountCompleted: args.batch.scenarioRunsCompleted,
747
+ scenarioRunCountFailed: args.batch.scenarioRunsFailed,
748
+ gradeSampleCountRequested: args.batch.requested,
749
+ gradeSampleCountCompleted: args.batch.completed,
750
+ gradeSampleCountFailed: args.batch.failed,
751
+ executionFailureCount: args.batch.failed,
752
+ gradingFailureCount,
753
+ passRate,
754
+ scoreMin: scoreMin === null ? null : round2(scoreMin),
755
+ scoreMedian: scoreMedian === null ? null : round2(scoreMedian),
756
+ scoreMax: scoreMax === null ? null : round2(scoreMax),
757
+ scoreMean,
758
+ outlierScenarioRuns,
759
+ failureReasons: [...reasonCounter.values()].sort((left, right) => {
760
+ if (left.count !== right.count)
761
+ return right.count - left.count;
762
+ return left.reason.localeCompare(right.reason);
763
+ }),
794
764
  };
795
765
  }
796
766
  function toWorkspaceVerifyBatchStatus(value) {
@@ -817,8 +787,8 @@ const WorkspaceVerifyBatchStatusEnum = builder.enumType("WorkspaceVerifyBatchSta
817
787
  const WorkspaceVerifyBatchRequestStatusEnum = builder.enumType("WorkspaceVerifyBatchRequestStatus", {
818
788
  values: ["QUEUED", "RUNNING", "COMPLETED", "ERROR"],
819
789
  });
820
- const WorkspaceVerifyVerdictEnum = builder.enumType("WorkspaceVerifyVerdict", {
821
- values: ["PASS", "WARN", "FAIL"],
790
+ const WorkspaceVerifyFailureReasonKindEnum = builder.enumType("WorkspaceVerifyFailureReasonKind", {
791
+ values: ["EXECUTION", "GRADING"],
822
792
  });
823
793
  const WorkspaceGradeTurnType = builder.objectRef("WorkspaceGradeTurn");
824
794
  WorkspaceGradeTurnType.implement({
@@ -969,28 +939,44 @@ WorkspaceVerifyBatchRequestType.implement({
969
939
  nullable: true,
970
940
  resolve: (parent) => parent.runId ?? null,
971
941
  }),
942
+ scenarioRunId: t.id({
943
+ nullable: true,
944
+ resolve: (parent) => parent.scenarioRunId ?? null,
945
+ }),
972
946
  error: t.string({
973
947
  nullable: true,
974
948
  resolve: (parent) => parent.error ?? null,
975
949
  }),
976
950
  }),
977
951
  });
978
- const WorkspaceVerifyOutlierType = builder.objectRef("WorkspaceVerifyOutlier");
979
- WorkspaceVerifyOutlierType.implement({
952
+ const WorkspaceVerifyScenarioOutlierType = builder.objectRef("WorkspaceVerifyScenarioOutlier");
953
+ WorkspaceVerifyScenarioOutlierType.implement({
980
954
  fields: (t) => ({
981
955
  key: t.id({ resolve: (parent) => parent.key }),
982
- label: t.string({ resolve: (parent) => parent.label }),
983
- sampleSize: t.int({ resolve: (parent) => parent.sampleSize }),
984
- agreementRate: t.float({
956
+ scenarioRunId: t.id({ resolve: (parent) => parent.scenarioRunId }),
957
+ gradeSampleCount: t.int({ resolve: (parent) => parent.gradeSampleCount }),
958
+ completedSampleCount: t.int({
959
+ resolve: (parent) => parent.completedSampleCount,
960
+ }),
961
+ executionFailureCount: t.int({
962
+ resolve: (parent) => parent.executionFailureCount,
963
+ }),
964
+ gradingFailureCount: t.int({
965
+ resolve: (parent) => parent.gradingFailureCount,
966
+ }),
967
+ averageScore: t.float({
968
+ nullable: true,
969
+ resolve: (parent) => parent.averageScore,
970
+ }),
971
+ minScore: t.float({
985
972
  nullable: true,
986
- resolve: (parent) => parent.agreementRate,
973
+ resolve: (parent) => parent.minScore,
987
974
  }),
988
- scoreDelta: t.float({
975
+ maxScore: t.float({
989
976
  nullable: true,
990
- resolve: (parent) => parent.scoreDelta,
977
+ resolve: (parent) => parent.maxScore,
991
978
  }),
992
- passFlip: t.boolean({ resolve: (parent) => parent.passFlip }),
993
- instability: t.boolean({ resolve: (parent) => parent.instability }),
979
+ failed: t.boolean({ resolve: (parent) => parent.failed }),
994
980
  minRunId: t.id({
995
981
  nullable: true,
996
982
  resolve: (parent) => parent.minRunId ?? null,
@@ -999,53 +985,86 @@ WorkspaceVerifyOutlierType.implement({
999
985
  nullable: true,
1000
986
  resolve: (parent) => parent.maxRunId ?? null,
1001
987
  }),
1002
- turnIndex: t.int({
1003
- nullable: true,
1004
- resolve: (parent) => parent.turnIndex ?? null,
1005
- }),
1006
988
  messageRefId: t.id({
1007
989
  nullable: true,
1008
990
  resolve: (parent) => parent.messageRefId ?? null,
1009
991
  }),
1010
992
  }),
1011
993
  });
994
+ const WorkspaceVerifyFailureReasonGroupType = builder.objectRef("WorkspaceVerifyFailureReasonGroup");
995
+ WorkspaceVerifyFailureReasonGroupType.implement({
996
+ fields: (t) => ({
997
+ key: t.id({ resolve: (parent) => parent.key }),
998
+ kind: t.field({
999
+ type: WorkspaceVerifyFailureReasonKindEnum,
1000
+ resolve: (parent) => parent.kind === "execution" ? "EXECUTION" : "GRADING",
1001
+ }),
1002
+ reason: t.string({ resolve: (parent) => parent.reason }),
1003
+ count: t.int({ resolve: (parent) => parent.count }),
1004
+ }),
1005
+ });
1012
1006
  const WorkspaceVerifyMetricsType = builder.objectRef("WorkspaceVerifyMetrics");
1013
1007
  WorkspaceVerifyMetricsType.implement({
1014
1008
  fields: (t) => ({
1015
- sampleSize: t.int({ resolve: (parent) => parent.sampleSize }),
1016
- agreementRate: t.float({
1017
- nullable: true,
1018
- resolve: (parent) => parent.agreementRate,
1009
+ scenarioRunCountRequested: t.int({
1010
+ resolve: (parent) => parent.scenarioRunCountRequested,
1011
+ }),
1012
+ scenarioRunCountCompleted: t.int({
1013
+ resolve: (parent) => parent.scenarioRunCountCompleted,
1014
+ }),
1015
+ scenarioRunCountFailed: t.int({
1016
+ resolve: (parent) => parent.scenarioRunCountFailed,
1017
+ }),
1018
+ gradeSampleCountRequested: t.int({
1019
+ resolve: (parent) => parent.gradeSampleCountRequested,
1020
+ }),
1021
+ gradeSampleCountCompleted: t.int({
1022
+ resolve: (parent) => parent.gradeSampleCountCompleted,
1023
+ }),
1024
+ gradeSampleCountFailed: t.int({
1025
+ resolve: (parent) => parent.gradeSampleCountFailed,
1019
1026
  }),
1020
- scoreSpreadMin: t.float({
1027
+ executionFailureCount: t.int({
1028
+ resolve: (parent) => parent.executionFailureCount,
1029
+ }),
1030
+ gradingFailureCount: t.int({
1031
+ resolve: (parent) => parent.gradingFailureCount,
1032
+ }),
1033
+ passRate: t.float({
1021
1034
  nullable: true,
1022
- resolve: (parent) => parent.scoreSpreadMin,
1035
+ resolve: (parent) => parent.passRate,
1023
1036
  }),
1024
- scoreSpreadMedian: t.float({
1037
+ scoreMin: t.float({
1025
1038
  nullable: true,
1026
- resolve: (parent) => parent.scoreSpreadMedian,
1039
+ resolve: (parent) => parent.scoreMin,
1027
1040
  }),
1028
- scoreSpreadMax: t.float({
1041
+ scoreMedian: t.float({
1029
1042
  nullable: true,
1030
- resolve: (parent) => parent.scoreSpreadMax,
1043
+ resolve: (parent) => parent.scoreMedian,
1031
1044
  }),
1032
- instabilityCount: t.int({
1033
- resolve: (parent) => parent.instabilityCount,
1045
+ scoreMax: t.float({
1046
+ nullable: true,
1047
+ resolve: (parent) => parent.scoreMax,
1034
1048
  }),
1035
- verdict: t.field({
1036
- type: WorkspaceVerifyVerdictEnum,
1037
- resolve: (parent) => parent.verdict,
1049
+ scoreMean: t.float({
1050
+ nullable: true,
1051
+ resolve: (parent) => parent.scoreMean,
1038
1052
  }),
1039
- verdictReason: t.string({
1040
- resolve: (parent) => parent.verdictReason,
1053
+ outlierScenarioRuns: t.connection({
1054
+ type: WorkspaceVerifyScenarioOutlierType,
1055
+ args: {
1056
+ first: t.arg.int(),
1057
+ after: t.arg.string(),
1058
+ },
1059
+ resolve: (parent, args) => (0, plugin_relay_1.resolveArrayConnection)({ args }, parent.outlierScenarioRuns),
1041
1060
  }),
1042
- outliers: t.connection({
1043
- type: WorkspaceVerifyOutlierType,
1061
+ failureReasons: t.connection({
1062
+ type: WorkspaceVerifyFailureReasonGroupType,
1044
1063
  args: {
1045
1064
  first: t.arg.int(),
1046
1065
  after: t.arg.string(),
1047
1066
  },
1048
- resolve: (parent, args) => (0, plugin_relay_1.resolveArrayConnection)({ args }, parent.outliers),
1067
+ resolve: (parent, args) => (0, plugin_relay_1.resolveArrayConnection)({ args }, parent.failureReasons),
1049
1068
  }),
1050
1069
  }),
1051
1070
  });
@@ -1054,10 +1073,22 @@ WorkspaceVerifyBatchType.implement({
1054
1073
  fields: (t) => ({
1055
1074
  id: t.id({ resolve: (parent) => parent.id }),
1056
1075
  workspaceId: t.id({ resolve: (parent) => parent.workspaceId }),
1057
- graderId: t.id({ resolve: (parent) => parent.graderId }),
1058
- scenarioRunId: t.id({
1076
+ scenarioDeckId: t.id({
1059
1077
  nullable: true,
1060
- resolve: (parent) => parent.scenarioRunId ?? null,
1078
+ resolve: (parent) => parent.scenarioDeckId ?? null,
1079
+ }),
1080
+ graderId: t.id({ resolve: (parent) => parent.graderId }),
1081
+ scenarioRuns: t.int({
1082
+ resolve: (parent) => parent.scenarioRuns,
1083
+ }),
1084
+ graderRepeatsPerScenario: t.int({
1085
+ resolve: (parent) => parent.graderRepeatsPerScenario,
1086
+ }),
1087
+ scenarioRunsCompleted: t.int({
1088
+ resolve: (parent) => parent.scenarioRunsCompleted,
1089
+ }),
1090
+ scenarioRunsFailed: t.int({
1091
+ resolve: (parent) => parent.scenarioRunsFailed,
1061
1092
  }),
1062
1093
  status: t.field({
1063
1094
  type: WorkspaceVerifyBatchStatusEnum,
@@ -1089,19 +1120,17 @@ WorkspaceVerifyBatchType.implement({
1089
1120
  resolve: async (parent, _args, context) => {
1090
1121
  if (!context.readWorkspaceGradeRuns)
1091
1122
  return null;
1092
- const completedRunIds = parent.requests
1093
- .filter((request) => request.status === "completed" &&
1094
- typeof request.runId === "string" &&
1123
+ const runIds = parent.requests
1124
+ .filter((request) => typeof request.runId === "string" &&
1095
1125
  request.runId.trim().length > 0)
1096
1126
  .map((request) => request.runId);
1097
- if (completedRunIds.length === 0)
1098
- return null;
1099
- const runIdSet = new Set(completedRunIds);
1127
+ const runIdSet = new Set(runIds);
1100
1128
  const allRuns = await context.readWorkspaceGradeRuns(parent.workspaceId);
1101
1129
  const selectedRuns = allRuns.filter((run) => runIdSet.has(run.id));
1102
- if (selectedRuns.length === 0)
1103
- return null;
1104
- return buildVerifyMetricsFromRuns(selectedRuns);
1130
+ return buildVerifyMetrics({
1131
+ batch: parent,
1132
+ runs: selectedRuns,
1133
+ });
1105
1134
  },
1106
1135
  }),
1107
1136
  }),
@@ -1946,7 +1975,8 @@ const WorkspaceConversationSessionStartInput = builder.inputType("WorkspaceConve
1946
1975
  assistantInit: t.field({ type: "JSON" }),
1947
1976
  graderId: t.id(),
1948
1977
  scenarioRunId: t.id(),
1949
- batchSize: t.int(),
1978
+ scenarioRuns: t.int(),
1979
+ graderRepeatsPerScenario: t.int(),
1950
1980
  concurrency: t.int(),
1951
1981
  }),
1952
1982
  });
@@ -1978,9 +2008,10 @@ const WorkspaceGradeRunCreateInput = builder.inputType("WorkspaceGradeRunCreateI
1978
2008
  const WorkspaceVerifyBatchRunCreateInput = builder.inputType("WorkspaceVerifyBatchRunCreateInput", {
1979
2009
  fields: (t) => ({
1980
2010
  workspaceId: t.id({ required: true }),
2011
+ scenarioDeckId: t.id(),
1981
2012
  graderId: t.id({ required: true }),
1982
- scenarioRunId: t.id(),
1983
- batchSize: t.int({ required: true }),
2013
+ scenarioRuns: t.int({ required: true }),
2014
+ graderRepeatsPerScenario: t.int({ required: true }),
1984
2015
  concurrency: t.int({ required: true }),
1985
2016
  }),
1986
2017
  });
@@ -2188,7 +2219,8 @@ builder.mutationType({
2188
2219
  assistantInit: args.input.assistantInit,
2189
2220
  graderId: args.input.graderId ?? null,
2190
2221
  scenarioRunId: args.input.scenarioRunId ?? null,
2191
- batchSize: args.input.batchSize ?? null,
2222
+ scenarioRuns: args.input.scenarioRuns ?? null,
2223
+ graderRepeatsPerScenario: args.input.graderRepeatsPerScenario ?? null,
2192
2224
  concurrency: args.input.concurrency ?? null,
2193
2225
  });
2194
2226
  return {
@@ -2308,9 +2340,10 @@ builder.mutationType({
2308
2340
  }
2309
2341
  const batch = await context.createWorkspaceVerifyBatchRun({
2310
2342
  workspaceId: args.input.workspaceId,
2343
+ scenarioDeckId: args.input.scenarioDeckId ?? null,
2311
2344
  graderId: args.input.graderId,
2312
- scenarioRunId: args.input.scenarioRunId ?? null,
2313
- batchSize: args.input.batchSize,
2345
+ scenarioRuns: args.input.scenarioRuns,
2346
+ graderRepeatsPerScenario: args.input.graderRepeatsPerScenario,
2314
2347
  concurrency: args.input.concurrency,
2315
2348
  });
2316
2349
  return {