@bolt-foundry/gambit 0.8.6-rc.1 → 0.8.6-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. package/CHANGELOG.md +60 -1
  2. package/esm/gambit/simulator-ui/dist/bundle.js +2033 -964
  3. package/esm/gambit/simulator-ui/dist/bundle.js.map +4 -4
  4. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts +2 -1
  5. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts.map +1 -1
  6. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.d.ts.map +1 -1
  7. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.js +288 -101
  8. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts +84 -45
  9. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts.map +1 -1
  10. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts +3 -2
  11. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts.map +1 -1
  12. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts +1 -1
  13. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts.map +1 -1
  14. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.js +85 -47
  15. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts +86 -48
  16. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts.map +1 -1
  17. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.d.ts.map +1 -1
  18. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.js +363 -97
  19. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.d.ts.map +1 -1
  20. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.js +154 -76
  21. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts +1 -1
  22. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts.map +1 -1
  23. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.js +45 -29
  24. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts +46 -30
  25. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts.map +1 -1
  26. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.d.ts.map +1 -1
  27. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.js +154 -76
  28. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts +45 -28
  29. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts.map +1 -1
  30. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts +1 -1
  31. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts.map +1 -1
  32. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.js +45 -29
  33. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts +46 -30
  34. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts.map +1 -1
  35. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.d.ts.map +1 -1
  36. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.js +194 -78
  37. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts +45 -28
  38. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts.map +1 -1
  39. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.d.ts.map +1 -1
  40. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.js +194 -78
  41. package/esm/simulator-ui/isograph/components/Query/SimulatorVerifyPage.d.ts.map +1 -1
  42. package/esm/simulator-ui/isograph/components/Query/SimulatorVerifyPage.js +257 -203
  43. package/esm/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.d.ts.map +1 -1
  44. package/esm/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.js +8 -3
  45. package/esm/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.d.ts.map +1 -1
  46. package/esm/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.js +45 -28
  47. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts +84 -45
  48. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts.map +1 -1
  49. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.js +85 -46
  50. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts +84 -45
  51. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts.map +1 -1
  52. package/esm/simulator-ui/src/AppShell.d.ts.map +1 -1
  53. package/esm/simulator-ui/src/AppShell.js +6 -4
  54. package/esm/simulator-ui/src/verify_unified.d.ts +17 -0
  55. package/esm/simulator-ui/src/verify_unified.d.ts.map +1 -0
  56. package/esm/simulator-ui/src/verify_unified.js +25 -0
  57. package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts +1 -1
  58. package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts.map +1 -1
  59. package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.js +1 -1
  60. package/esm/src/server.d.ts.map +1 -1
  61. package/esm/src/server.js +227 -45
  62. package/esm/src/server_isograph_environment.d.ts +11 -4
  63. package/esm/src/server_isograph_environment.d.ts.map +1 -1
  64. package/esm/src/server_isograph_environment.js +5 -0
  65. package/esm/src/simulator_graphql.d.ts +11 -4
  66. package/esm/src/simulator_graphql.d.ts.map +1 -1
  67. package/esm/src/simulator_graphql.js +278 -245
  68. package/package.json +2 -2
  69. package/script/gambit/simulator-ui/dist/bundle.js +2033 -964
  70. package/script/gambit/simulator-ui/dist/bundle.js.map +4 -4
  71. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts +2 -1
  72. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts.map +1 -1
  73. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.d.ts.map +1 -1
  74. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.js +288 -101
  75. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts +84 -45
  76. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts.map +1 -1
  77. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts +3 -2
  78. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts.map +1 -1
  79. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts +1 -1
  80. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts.map +1 -1
  81. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.js +85 -47
  82. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts +86 -48
  83. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts.map +1 -1
  84. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.d.ts.map +1 -1
  85. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.js +363 -97
  86. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.d.ts.map +1 -1
  87. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.js +154 -76
  88. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts +1 -1
  89. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts.map +1 -1
  90. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.js +45 -29
  91. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts +46 -30
  92. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts.map +1 -1
  93. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.d.ts.map +1 -1
  94. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.js +154 -76
  95. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts +45 -28
  96. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts.map +1 -1
  97. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts +1 -1
  98. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts.map +1 -1
  99. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.js +45 -29
  100. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts +46 -30
  101. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts.map +1 -1
  102. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.d.ts.map +1 -1
  103. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.js +194 -78
  104. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts +45 -28
  105. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts.map +1 -1
  106. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.d.ts.map +1 -1
  107. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.js +194 -78
  108. package/script/simulator-ui/isograph/components/Query/SimulatorVerifyPage.d.ts.map +1 -1
  109. package/script/simulator-ui/isograph/components/Query/SimulatorVerifyPage.js +256 -202
  110. package/script/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.d.ts.map +1 -1
  111. package/script/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.js +8 -3
  112. package/script/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.d.ts.map +1 -1
  113. package/script/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.js +45 -28
  114. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts +84 -45
  115. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts.map +1 -1
  116. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.js +85 -46
  117. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts +84 -45
  118. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts.map +1 -1
  119. package/script/simulator-ui/src/AppShell.d.ts.map +1 -1
  120. package/script/simulator-ui/src/AppShell.js +6 -4
  121. package/script/simulator-ui/src/verify_unified.d.ts +17 -0
  122. package/script/simulator-ui/src/verify_unified.d.ts.map +1 -0
  123. package/script/simulator-ui/src/verify_unified.js +29 -0
  124. package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts +1 -1
  125. package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts.map +1 -1
  126. package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.js +1 -1
  127. package/script/src/server.d.ts.map +1 -1
  128. package/script/src/server.js +227 -45
  129. package/script/src/server_isograph_environment.d.ts +11 -4
  130. package/script/src/server_isograph_environment.d.ts.map +1 -1
  131. package/script/src/server_isograph_environment.js +5 -0
  132. package/script/src/simulator_graphql.d.ts +11 -4
  133. package/script/src/simulator_graphql.d.ts.map +1 -1
  134. package/script/src/simulator_graphql.js +278 -245
  135. package/esm/simulator-ui/src/verify_metrics.d.ts +0 -50
  136. package/esm/simulator-ui/src/verify_metrics.d.ts.map +0 -1
  137. package/esm/simulator-ui/src/verify_metrics.js +0 -260
  138. package/script/simulator-ui/src/verify_metrics.d.ts +0 -50
  139. package/script/simulator-ui/src/verify_metrics.d.ts.map +0 -1
  140. package/script/simulator-ui/src/verify_metrics.js +0 -264
@@ -13,13 +13,14 @@ import Listbox from "../../../src/gds/Listbox.js";
13
13
  import PageGrid from "../../../src/gds/PageGrid.js";
14
14
  import PageShell from "../../../src/gds/PageShell.js";
15
15
  import Panel from "../../../src/gds/Panel.js";
16
- import { classNames, formatTimestampShort, scenarioNameFromValue, } from "../../../src/utils.js";
17
- import { VERIFY_CONSISTENCY_THRESHOLDS } from "../../../src/verify_metrics.js";
18
- const MAX_BATCH_SIZE = 24;
19
- const MAX_BATCH_CONCURRENCY = 6;
20
- const DEFAULT_BATCH_SIZE = 8;
21
- const DEFAULT_BATCH_CONCURRENCY = 3;
22
- const NO_SCENARIO_RUN_VALUE = "__workspace_context__";
16
+ import { formatTimestampShort, scenarioNameFromValue, } from "../../../src/utils.js";
17
+ import { sortVerifyOutlierScenarioRuns, VERIFY_DEFAULTS, VERIFY_LIMITS, } from "../../../src/verify_unified.js";
18
+ const MAX_SCENARIO_RUNS = VERIFY_LIMITS.scenarioRunsMax;
19
+ const MAX_GRADER_REPEATS = VERIFY_LIMITS.graderRepeatsMax;
20
+ const MAX_BATCH_CONCURRENCY = VERIFY_LIMITS.concurrencyMax;
21
+ const DEFAULT_SCENARIO_RUNS = VERIFY_DEFAULTS.scenarioRuns;
22
+ const DEFAULT_GRADER_REPEATS = VERIFY_DEFAULTS.graderRepeatsPerScenario;
23
+ const DEFAULT_BATCH_CONCURRENCY = VERIFY_DEFAULTS.concurrency;
23
24
  function getRoutePrefix(path) {
24
25
  return path === "/isograph" || path.startsWith("/isograph/")
25
26
  ? "/isograph"
@@ -49,19 +50,19 @@ function clampInt(value, min, max) {
49
50
  const rounded = Number.isFinite(value) ? Math.round(value) : min;
50
51
  return Math.max(min, Math.min(max, rounded));
51
52
  }
53
+ function formatPercent(value) {
54
+ if (typeof value !== "number" || !Number.isFinite(value))
55
+ return "-";
56
+ return `${Math.round(value * 100)}%`;
57
+ }
52
58
  export const SimulatorVerifyPage = iso(`
53
59
  field Workspace.VerifyTab @component {
54
60
  id
55
- scenarioRuns(first: 50) {
56
- edges {
57
- node {
58
- id
59
- status
60
- startedAt
61
- finishedAt
62
- error
63
- }
64
- }
61
+ scenarioDecks {
62
+ id
63
+ label
64
+ description
65
+ path
65
66
  }
66
67
  verification {
67
68
  graderDecks(first: 50) {
@@ -79,8 +80,10 @@ export const SimulatorVerifyPage = iso(`
79
80
  node {
80
81
  id
81
82
  workspaceId
83
+ scenarioDeckId
82
84
  graderId
83
- scenarioRunId
85
+ scenarioRuns
86
+ graderRepeatsPerScenario
84
87
  status
85
88
  startedAt
86
89
  finishedAt
@@ -88,10 +91,13 @@ export const SimulatorVerifyPage = iso(`
88
91
  active
89
92
  completed
90
93
  failed
91
- requests(first: 50) {
94
+ scenarioRunsCompleted
95
+ scenarioRunsFailed
96
+ requests(first: 200) {
92
97
  edges {
93
98
  node {
94
99
  id
100
+ scenarioRunId
95
101
  status
96
102
  runId
97
103
  error
@@ -99,31 +105,48 @@ export const SimulatorVerifyPage = iso(`
99
105
  }
100
106
  }
101
107
  metrics {
102
- sampleSize
103
- agreementRate
104
- scoreSpreadMin
105
- scoreSpreadMedian
106
- scoreSpreadMax
107
- instabilityCount
108
- verdict
109
- verdictReason
110
- outliers(first: 25) {
108
+ scenarioRunCountRequested
109
+ scenarioRunCountCompleted
110
+ scenarioRunCountFailed
111
+ gradeSampleCountRequested
112
+ gradeSampleCountCompleted
113
+ gradeSampleCountFailed
114
+ executionFailureCount
115
+ gradingFailureCount
116
+ passRate
117
+ scoreMin
118
+ scoreMedian
119
+ scoreMax
120
+ scoreMean
121
+ outlierScenarioRuns(first: 25) {
111
122
  edges {
112
123
  node {
113
124
  key
114
- label
115
- sampleSize
116
- agreementRate
117
- scoreDelta
118
- passFlip
119
- instability
125
+ scenarioRunId
126
+ gradeSampleCount
127
+ completedSampleCount
128
+ executionFailureCount
129
+ gradingFailureCount
130
+ averageScore
131
+ minScore
132
+ maxScore
133
+ failed
120
134
  minRunId
121
135
  maxRunId
122
- turnIndex
123
136
  messageRefId
124
137
  }
125
138
  }
126
139
  }
140
+ failureReasons(first: 25) {
141
+ edges {
142
+ node {
143
+ key
144
+ kind
145
+ reason
146
+ count
147
+ }
148
+ }
149
+ }
127
150
  }
128
151
  }
129
152
  }
@@ -139,6 +162,16 @@ export const SimulatorVerifyPage = iso(`
139
162
  const toPrefixedPath = useCallback((path) => `${routePrefix}${path}`, [routePrefix]);
140
163
  const runBatchMutation = useGambitTypedMutation(gambitWorkspaceVerifyBatchRunCreateMutation);
141
164
  useGambitTypedSubscription(gambitWorkspaceVerifyLiveSubscription, workspaceId ? { workspaceId } : null);
165
+ const scenarioDecks = useMemo(() => data.scenarioDecks?.flatMap((deck) => {
166
+ if (!deck?.id || !deck.label)
167
+ return [];
168
+ return [{
169
+ id: deck.id,
170
+ label: deck.label,
171
+ description: deck.description ?? null,
172
+ path: deck.path ?? "",
173
+ }];
174
+ }) ?? [], [data.scenarioDecks]);
142
175
  const graders = useMemo(() => (data.verification?.graderDecks?.edges ?? []).flatMap((edge) => {
143
176
  const grader = edge?.node;
144
177
  if (!grader?.id || !grader.label)
@@ -150,30 +183,16 @@ export const SimulatorVerifyPage = iso(`
150
183
  path: grader.path ?? "",
151
184
  }];
152
185
  }), [data.verification?.graderDecks?.edges]);
153
- const scenarioRuns = useMemo(() => (data.scenarioRuns?.edges ?? []).flatMap((edge) => {
154
- const run = edge?.node;
155
- if (!run?.id)
156
- return [];
157
- return [{
158
- id: run.id,
159
- status: toBatchStatus(run.status),
160
- startedAt: run.startedAt ?? null,
161
- finishedAt: run.finishedAt ?? null,
162
- error: run.error ?? null,
163
- }];
164
- }).sort((left, right) => {
165
- const leftKey = left.finishedAt ?? left.startedAt ?? left.id;
166
- const rightKey = right.finishedAt ?? right.startedAt ?? right.id;
167
- return rightKey.localeCompare(leftKey);
168
- }), [data.scenarioRuns?.edges]);
169
186
  const batches = useMemo(() => (data.verification?.batches?.edges ?? []).flatMap((edge) => {
170
187
  const batch = edge?.node;
171
188
  if (!batch?.id || !batch.graderId)
172
189
  return [];
173
190
  return [{
174
191
  id: batch.id,
192
+ scenarioDeckId: batch.scenarioDeckId ?? null,
175
193
  graderId: batch.graderId,
176
- scenarioRunId: batch.scenarioRunId ?? null,
194
+ scenarioRuns: batch.scenarioRuns ?? 0,
195
+ graderRepeatsPerScenario: batch.graderRepeatsPerScenario ?? 0,
177
196
  status: toBatchStatus(batch.status),
178
197
  startedAt: batch.startedAt ?? null,
179
198
  finishedAt: batch.finishedAt ?? null,
@@ -181,12 +200,15 @@ export const SimulatorVerifyPage = iso(`
181
200
  active: batch.active ?? 0,
182
201
  completed: batch.completed ?? 0,
183
202
  failed: batch.failed ?? 0,
203
+ scenarioRunsCompleted: batch.scenarioRunsCompleted ?? 0,
204
+ scenarioRunsFailed: batch.scenarioRunsFailed ?? 0,
184
205
  requests: (batch.requests?.edges ?? []).flatMap((requestEdge) => {
185
206
  const request = requestEdge?.node;
186
207
  if (!request?.id)
187
208
  return [];
188
209
  return [{
189
210
  id: request.id,
211
+ scenarioRunId: request.scenarioRunId ?? undefined,
190
212
  status: toBatchRequestStatus(request.status),
191
213
  runId: request.runId ?? undefined,
192
214
  error: request.error ?? undefined,
@@ -194,59 +216,76 @@ export const SimulatorVerifyPage = iso(`
194
216
  }),
195
217
  metrics: batch.metrics
196
218
  ? {
197
- sampleSize: batch.metrics.sampleSize ?? 0,
198
- agreementRate: typeof batch.metrics.agreementRate === "number"
199
- ? batch.metrics.agreementRate
219
+ scenarioRunCountRequested: batch.metrics.scenarioRunCountRequested ?? 0,
220
+ scenarioRunCountCompleted: batch.metrics.scenarioRunCountCompleted ?? 0,
221
+ scenarioRunCountFailed: batch.metrics.scenarioRunCountFailed ?? 0,
222
+ gradeSampleCountRequested: batch.metrics.gradeSampleCountRequested ?? 0,
223
+ gradeSampleCountCompleted: batch.metrics.gradeSampleCountCompleted ?? 0,
224
+ gradeSampleCountFailed: batch.metrics.gradeSampleCountFailed ?? 0,
225
+ executionFailureCount: batch.metrics.executionFailureCount ?? 0,
226
+ gradingFailureCount: batch.metrics.gradingFailureCount ?? 0,
227
+ passRate: typeof batch.metrics.passRate === "number"
228
+ ? batch.metrics.passRate
200
229
  : null,
201
- scoreSpreadMin: typeof batch.metrics.scoreSpreadMin === "number"
202
- ? batch.metrics.scoreSpreadMin
230
+ scoreMin: typeof batch.metrics.scoreMin === "number"
231
+ ? batch.metrics.scoreMin
203
232
  : null,
204
- scoreSpreadMedian: typeof batch.metrics.scoreSpreadMedian === "number"
205
- ? batch.metrics.scoreSpreadMedian
233
+ scoreMedian: typeof batch.metrics.scoreMedian === "number"
234
+ ? batch.metrics.scoreMedian
206
235
  : null,
207
- scoreSpreadMax: typeof batch.metrics.scoreSpreadMax === "number"
208
- ? batch.metrics.scoreSpreadMax
236
+ scoreMax: typeof batch.metrics.scoreMax === "number"
237
+ ? batch.metrics.scoreMax
209
238
  : null,
210
- instabilityCount: batch.metrics.instabilityCount ?? 0,
211
- verdict: batch.metrics.verdict === "PASS" ||
212
- batch.metrics.verdict === "WARN" ||
213
- batch.metrics.verdict === "FAIL"
214
- ? batch.metrics.verdict
215
- : "WARN",
216
- verdictReason: batch.metrics.verdictReason ??
217
- "Verify batch completed.",
218
- outliers: (batch.metrics.outliers?.edges ?? []).flatMap((outlierEdge) => {
239
+ scoreMean: typeof batch.metrics.scoreMean === "number"
240
+ ? batch.metrics.scoreMean
241
+ : null,
242
+ outlierScenarioRuns: (batch.metrics.outlierScenarioRuns?.edges ?? []).flatMap((outlierEdge) => {
219
243
  const outlier = outlierEdge?.node;
220
- if (!outlier?.key || !outlier.label)
244
+ if (!outlier?.key || !outlier.scenarioRunId)
221
245
  return [];
222
246
  return [{
223
247
  key: outlier.key,
224
- label: outlier.label,
225
- sampleSize: outlier.sampleSize ?? 0,
226
- agreementRate: typeof outlier.agreementRate === "number"
227
- ? outlier.agreementRate
248
+ scenarioRunId: outlier.scenarioRunId,
249
+ gradeSampleCount: outlier.gradeSampleCount ?? 0,
250
+ completedSampleCount: outlier.completedSampleCount ?? 0,
251
+ executionFailureCount: outlier.executionFailureCount ?? 0,
252
+ gradingFailureCount: outlier.gradingFailureCount ?? 0,
253
+ averageScore: typeof outlier.averageScore === "number"
254
+ ? outlier.averageScore
255
+ : null,
256
+ minScore: typeof outlier.minScore === "number"
257
+ ? outlier.minScore
228
258
  : null,
229
- scoreDelta: typeof outlier.scoreDelta === "number"
230
- ? outlier.scoreDelta
259
+ maxScore: typeof outlier.maxScore === "number"
260
+ ? outlier.maxScore
231
261
  : null,
232
- passFlip: Boolean(outlier.passFlip),
233
- instability: Boolean(outlier.instability),
262
+ failed: Boolean(outlier.failed),
234
263
  minRunId: outlier.minRunId ?? undefined,
235
264
  maxRunId: outlier.maxRunId ?? undefined,
236
- turnIndex: typeof outlier.turnIndex === "number"
237
- ? outlier.turnIndex
238
- : undefined,
239
265
  messageRefId: outlier.messageRefId ?? undefined,
240
266
  }];
241
267
  }),
268
+ failureReasons: (batch.metrics.failureReasons?.edges ?? [])
269
+ .flatMap((reasonEdge) => {
270
+ const reason = reasonEdge?.node;
271
+ if (!reason?.key || !reason.reason)
272
+ return [];
273
+ return [{
274
+ key: reason.key,
275
+ kind: reason.kind === "GRADING" ? "grading" : "execution",
276
+ reason: reason.reason,
277
+ count: reason.count ?? 0,
278
+ }];
279
+ }),
242
280
  }
243
281
  : null,
244
282
  }];
245
283
  }), [data.verification?.batches?.edges]);
246
- const [selectedScenarioRunId, setSelectedScenarioRunId] = useState(null);
284
+ const [selectedScenarioDeckId, setSelectedScenarioDeckId] = useState(null);
247
285
  const [selectedGraderId, setSelectedGraderId] = useState(null);
248
286
  const [selectedBatchId, setSelectedBatchId] = useState(null);
249
- const [batchSize, setBatchSize] = useState(DEFAULT_BATCH_SIZE);
287
+ const [scenarioRuns, setScenarioRuns] = useState(DEFAULT_SCENARIO_RUNS);
288
+ const [graderRepeatsPerScenario, setGraderRepeatsPerScenario] = useState(DEFAULT_GRADER_REPEATS);
250
289
  const [batchConcurrency, setBatchConcurrency] = useState(DEFAULT_BATCH_CONCURRENCY);
251
290
  const [mutationError, setMutationError] = useState(null);
252
291
  useEffect(() => {
@@ -257,22 +296,23 @@ export const SimulatorVerifyPage = iso(`
257
296
  setSelectedGraderId(graders[0]?.id ?? null);
258
297
  }, [graders, selectedGraderId]);
259
298
  useEffect(() => {
260
- if (selectedScenarioRunId &&
261
- scenarioRuns.some((scenarioRun) => scenarioRun.id === selectedScenarioRunId)) {
299
+ if (selectedScenarioDeckId &&
300
+ scenarioDecks.some((deck) => deck.id === selectedScenarioDeckId)) {
262
301
  return;
263
302
  }
264
- setSelectedScenarioRunId(scenarioRuns[0]?.id ?? null);
265
- }, [scenarioRuns, selectedScenarioRunId]);
303
+ setSelectedScenarioDeckId(scenarioDecks[0]?.id ?? null);
304
+ }, [scenarioDecks, selectedScenarioDeckId]);
266
305
  const filteredBatches = useMemo(() => {
267
306
  return batches.filter((batch) => {
268
307
  if (selectedGraderId && batch.graderId !== selectedGraderId)
269
308
  return false;
270
- if (selectedScenarioRunId && batch.scenarioRunId !== selectedScenarioRunId) {
309
+ if (selectedScenarioDeckId &&
310
+ batch.scenarioDeckId !== selectedScenarioDeckId) {
271
311
  return false;
272
312
  }
273
313
  return true;
274
314
  });
275
- }, [batches, selectedGraderId, selectedScenarioRunId]);
315
+ }, [batches, selectedGraderId, selectedScenarioDeckId]);
276
316
  const visibleBatches = filteredBatches.length > 0 ? filteredBatches : batches;
277
317
  useEffect(() => {
278
318
  if (selectedBatchId &&
@@ -290,10 +330,11 @@ export const SimulatorVerifyPage = iso(`
290
330
  return visibleBatches[0] ?? null;
291
331
  }, [selectedBatchId, visibleBatches]);
292
332
  const selectedGrader = useMemo(() => graders.find((grader) => grader.id === selectedGraderId) ?? null, [graders, selectedGraderId]);
293
- const queuedCount = useMemo(() => (selectedBatch?.requests ?? []).filter((request) => request.status === "queued")
294
- .length, [selectedBatch?.requests]);
333
+ const selectedScenarioDeck = useMemo(() => scenarioDecks.find((deck) => deck.id === selectedScenarioDeckId) ?? null, [scenarioDecks, selectedScenarioDeckId]);
334
+ const queuedCount = useMemo(() => (selectedBatch?.requests ?? []).filter((request) => request.status === "queued").length, [selectedBatch?.requests]);
295
335
  const metrics = selectedBatch?.metrics ?? null;
296
- const topOutliers = (metrics?.outliers ?? []).slice(0, 8);
336
+ const topOutlierScenarioRuns = useMemo(() => sortVerifyOutlierScenarioRuns(metrics?.outlierScenarioRuns ?? [])
337
+ .slice(0, 8), [metrics?.outlierScenarioRuns]);
297
338
  const hasRunningBatch = visibleBatches.some((batch) => batch.status === "running");
298
339
  const canRun = Boolean(workspaceId && selectedGraderId && !runBatchMutation.inFlight &&
299
340
  !hasRunningBatch);
@@ -301,16 +342,18 @@ export const SimulatorVerifyPage = iso(`
301
342
  if (!workspaceId || !selectedGraderId)
302
343
  return;
303
344
  setMutationError(null);
304
- const nextBatchSize = clampInt(batchSize, 1, MAX_BATCH_SIZE);
305
- const nextConcurrency = clampInt(batchConcurrency, 1, Math.min(MAX_BATCH_CONCURRENCY, nextBatchSize));
345
+ const nextScenarioRuns = clampInt(scenarioRuns, 1, MAX_SCENARIO_RUNS);
346
+ const nextRepeats = clampInt(graderRepeatsPerScenario, 1, MAX_GRADER_REPEATS);
347
+ const nextConcurrency = clampInt(batchConcurrency, 1, MAX_BATCH_CONCURRENCY);
306
348
  runBatchMutation.commit({
307
349
  input: {
308
350
  workspaceId,
309
- graderId: selectedGraderId,
310
- ...(selectedScenarioRunId
311
- ? { scenarioRunId: selectedScenarioRunId }
351
+ ...(selectedScenarioDeckId
352
+ ? { scenarioDeckId: selectedScenarioDeckId }
312
353
  : {}),
313
- batchSize: nextBatchSize,
354
+ graderId: selectedGraderId,
355
+ scenarioRuns: nextScenarioRuns,
356
+ graderRepeatsPerScenario: nextRepeats,
314
357
  concurrency: nextConcurrency,
315
358
  },
316
359
  }, {
@@ -327,40 +370,28 @@ export const SimulatorVerifyPage = iso(`
327
370
  });
328
371
  }, [
329
372
  batchConcurrency,
330
- batchSize,
373
+ graderRepeatsPerScenario,
331
374
  runBatchMutation,
375
+ scenarioRuns,
332
376
  selectedGraderId,
333
- selectedScenarioRunId,
377
+ selectedScenarioDeckId,
334
378
  workspaceId,
335
379
  ]);
336
380
  const navigateToGradeRun = useCallback((runId) => {
337
381
  const gradePath = buildWorkspacePath("grade", workspaceId, { runId });
338
382
  navigate(toPrefixedPath(gradePath));
339
383
  }, [navigate, toPrefixedPath, workspaceId]);
340
- const scenarioOptions = useMemo(() => [
341
- {
342
- value: NO_SCENARIO_RUN_VALUE,
343
- label: "Current workspace context",
344
- meta: "Run without a prior scenario run binding",
345
- },
346
- ...scenarioRuns.map((run) => ({
347
- value: run.id,
348
- label: scenarioNameFromValue(run.id) ?? run.id,
349
- meta: [
350
- run.status,
351
- run.finishedAt ?? run.startedAt,
352
- ].filter(Boolean).join(" · "),
353
- })),
354
- ], [scenarioRuns]);
355
384
  return (React.createElement(PageShell, { className: "verify-shell" },
356
385
  React.createElement(PageGrid, { as: "main", className: "verify-layout" },
357
386
  React.createElement(Panel, { className: "verify-controls", "data-testid": "verify-tab-scaffold" },
358
387
  React.createElement("div", { className: "verify-controls-header" },
359
- React.createElement("strong", null, "Verify consistency"),
360
- React.createElement("span", { className: "secondary-note" }, "Run repeated grading checks against one grader and scenario.")),
361
- scenarioRuns.length > 0 && (React.createElement(Listbox, { label: "Scenario run", value: selectedScenarioRunId ?? NO_SCENARIO_RUN_VALUE, onChange: (runId) => {
362
- setSelectedScenarioRunId(runId === NO_SCENARIO_RUN_VALUE ? null : runId);
363
- }, options: scenarioOptions, placeholder: "Select scenario run" })),
388
+ React.createElement("strong", null, "Verify repeated evidence"),
389
+ React.createElement("span", { className: "secondary-note" }, "Generate scenario runs, then grade each run repeatedly.")),
390
+ React.createElement(Listbox, { label: "Scenario deck", value: selectedScenarioDeckId ?? "", onChange: (value) => setSelectedScenarioDeckId(value.length ? value : null), options: scenarioDecks.map((deck) => ({
391
+ value: deck.id,
392
+ label: deck.label,
393
+ meta: deck.path,
394
+ })), placeholder: "Select scenario deck", disabled: scenarioDecks.length === 0 }),
364
395
  React.createElement(Listbox, { label: "Grader", value: selectedGraderId ?? "", onChange: (value) => setSelectedGraderId(value.length ? value : null), options: graders.map((grader) => ({
365
396
  value: grader.id,
366
397
  label: grader.label,
@@ -368,19 +399,28 @@ export const SimulatorVerifyPage = iso(`
368
399
  })), placeholder: "Select grader", disabled: graders.length === 0 }),
369
400
  React.createElement("div", { className: "verify-number-grid" },
370
401
  React.createElement("label", { className: "verify-number-field" },
371
- "Batch size",
372
- React.createElement("input", { type: "number", min: 1, max: MAX_BATCH_SIZE, value: batchSize, onChange: (event) => setBatchSize(clampInt(Number(event.target.value), 1, MAX_BATCH_SIZE)) })),
402
+ "Scenario runs",
403
+ React.createElement("input", { type: "number", min: 1, max: MAX_SCENARIO_RUNS, value: scenarioRuns, onChange: (event) => setScenarioRuns(clampInt(Number(event.target.value), 1, MAX_SCENARIO_RUNS)) })),
404
+ React.createElement("label", { className: "verify-number-field" },
405
+ "Grader repeats per scenario",
406
+ React.createElement("input", { type: "number", min: 1, max: MAX_GRADER_REPEATS, value: graderRepeatsPerScenario, onChange: (event) => setGraderRepeatsPerScenario(clampInt(Number(event.target.value), 1, MAX_GRADER_REPEATS)) })),
373
407
  React.createElement("label", { className: "verify-number-field" },
374
408
  "Concurrency",
375
409
  React.createElement("input", { type: "number", min: 1, max: MAX_BATCH_CONCURRENCY, value: batchConcurrency, onChange: (event) => setBatchConcurrency(clampInt(Number(event.target.value), 1, MAX_BATCH_CONCURRENCY)) }))),
376
410
  React.createElement(Button, { "data-testid": "verify-run-batch", variant: "primary", onClick: runBatch, disabled: !canRun }, hasRunningBatch || runBatchMutation.inFlight
377
- ? "Running consistency batch"
378
- : "Run consistency batch"),
411
+ ? "Running verify batch..."
412
+ : "Run verify batch"),
413
+ scenarioDecks.length === 0 && (React.createElement(Callout, null,
414
+ "No scenario decks are available. Add ",
415
+ React.createElement("code", null, "[[testDecks]]"),
416
+ " ",
417
+ "entries to the active root deck.")),
379
418
  graders.length === 0 && (React.createElement(Callout, null,
380
419
  "No graders are available. Add ",
381
420
  React.createElement("code", null, "[[graders]]"),
382
421
  " ",
383
422
  "entries to the active root deck.")),
423
+ selectedScenarioDeck?.description && (React.createElement(Callout, null, selectedScenarioDeck.description)),
384
424
  selectedGrader?.description && (React.createElement(Callout, null, selectedGrader.description)),
385
425
  React.createElement(Callout, { variant: "emphasis", title: "Build assistant stays available" }, "Use the chat drawer toggle in the top-right corner to investigate and iterate while this page remains open.")),
386
426
  React.createElement(Panel, { className: "verify-results", "data-testid": "verify-results" },
@@ -395,93 +435,93 @@ export const SimulatorVerifyPage = iso(`
395
435
  : "",
396
436
  selectedBatch?.finishedAt
397
437
  ? ` · finished ${formatTimestampShort(selectedBatch.finishedAt)}`
398
- : "")),
399
- metrics && metrics.sampleSize > 0 && (React.createElement("span", { className: classNames("verify-verdict-badge", `verify-verdict-badge--${metrics.verdict.toLowerCase()}`) }, metrics.verdict))),
400
- selectedBatch && selectedBatch.requested > 0 && (React.createElement("div", { className: "verify-progress-row" },
401
- React.createElement("span", null,
402
- "Queued: ",
403
- queuedCount),
404
- React.createElement("span", null,
405
- "Running: ",
406
- selectedBatch.active),
407
- React.createElement("span", null,
408
- "Completed: ",
409
- selectedBatch.completed),
410
- React.createElement("span", null,
411
- "Failed: ",
412
- selectedBatch.failed))),
413
- !selectedBatch && (React.createElement(Callout, null, "Run a consistency batch to compute agreement, spread, and instability for the selected grader.")),
414
- metrics && metrics.sampleSize > 0 && (React.createElement(React.Fragment, null,
438
+ : ""))),
439
+ !selectedBatch && (React.createElement(Callout, null, "Run a verify batch to generate repeated grading evidence.")),
440
+ selectedBatch && selectedBatch.requested > 0 && (React.createElement(React.Fragment, null,
441
+ React.createElement("div", { className: "verify-progress-row" },
442
+ React.createElement("span", null,
443
+ "Scenario runs: ",
444
+ selectedBatch.scenarioRunsCompleted,
445
+ "/",
446
+ selectedBatch.scenarioRuns),
447
+ React.createElement("span", null,
448
+ "Scenario failures: ",
449
+ selectedBatch.scenarioRunsFailed),
450
+ React.createElement("span", null,
451
+ "Queued: ",
452
+ queuedCount),
453
+ React.createElement("span", null,
454
+ "Running: ",
455
+ selectedBatch.active),
456
+ React.createElement("span", null,
457
+ "Completed: ",
458
+ selectedBatch.completed),
459
+ React.createElement("span", null,
460
+ "Failed: ",
461
+ selectedBatch.failed)))),
462
+ metrics && (React.createElement(React.Fragment, null,
415
463
  React.createElement("div", { className: "verify-metric-grid" },
416
464
  React.createElement("div", { className: "verify-metric-card" },
417
- React.createElement("div", { className: "verify-metric-label" }, "Sample size"),
418
- React.createElement("div", { className: "verify-metric-value" }, metrics.sampleSize)),
465
+ React.createElement("div", { className: "verify-metric-label" }, "Scenario runs"),
466
+ React.createElement("div", { className: "verify-metric-value" },
467
+ metrics.scenarioRunCountCompleted,
468
+ "/",
469
+ metrics.scenarioRunCountRequested)),
419
470
  React.createElement("div", { className: "verify-metric-card" },
420
- React.createElement("div", { className: "verify-metric-label" }, "Agreement rate"),
421
- React.createElement("div", { className: "verify-metric-value" }, metrics.agreementRate === null
422
- ? "—"
423
- : `${Math.round(metrics.agreementRate * 100)}%`)),
471
+ React.createElement("div", { className: "verify-metric-label" }, "Grade samples"),
472
+ React.createElement("div", { className: "verify-metric-value" },
473
+ metrics.gradeSampleCountCompleted,
474
+ "/",
475
+ metrics.gradeSampleCountRequested)),
424
476
  React.createElement("div", { className: "verify-metric-card" },
425
- React.createElement("div", { className: "verify-metric-label" }, "Score spread (min/median/max)"),
426
- React.createElement("div", { className: "verify-metric-value verify-metric-value--compact" }, metrics.scoreSpreadMin === null
427
- ? "—"
428
- : `${metrics.scoreSpreadMin} / ${metrics.scoreSpreadMedian ?? "—"} / ${metrics.scoreSpreadMax ?? "—"}`)),
477
+ React.createElement("div", { className: "verify-metric-label" }, "Pass rate"),
478
+ React.createElement("div", { className: "verify-metric-value" }, formatPercent(metrics.passRate))),
429
479
  React.createElement("div", { className: "verify-metric-card" },
430
- React.createElement("div", { className: "verify-metric-label" }, "Instability count"),
431
- React.createElement("div", { className: "verify-metric-value" }, metrics.instabilityCount))),
432
- React.createElement(Callout, { variant: metrics.verdict === "FAIL"
433
- ? "danger"
434
- : metrics.verdict === "WARN"
435
- ? "emphasis"
436
- : "muted", title: `Verdict: ${metrics.verdict}` }, metrics.verdictReason))),
437
- React.createElement(Callout, { title: "Thresholds in code" },
438
- "Min sample size: ",
439
- VERIFY_CONSISTENCY_THRESHOLDS.minSampleSize,
440
- " ",
441
- "\u00B7 PASS requires agreement \u2265 ",
442
- Math.round(VERIFY_CONSISTENCY_THRESHOLDS.pass.agreementMin * 100),
443
- "%, spread \u2264",
444
- " ",
445
- VERIFY_CONSISTENCY_THRESHOLDS.pass.maxSpread,
446
- ", instability \u2264",
447
- " ",
448
- VERIFY_CONSISTENCY_THRESHOLDS.pass.maxInstabilityCount,
449
- " ",
450
- "\u00B7 WARN allows agreement \u2265 ",
451
- Math.round(VERIFY_CONSISTENCY_THRESHOLDS.warn.agreementMin * 100),
452
- "%, spread \u2264",
453
- " ",
454
- VERIFY_CONSISTENCY_THRESHOLDS.warn.maxSpread,
455
- ", instability \u2264",
456
- " ",
457
- VERIFY_CONSISTENCY_THRESHOLDS.warn.maxInstabilityCount,
458
- "."),
480
+ React.createElement("div", { className: "verify-metric-label" }, "Score mean"),
481
+ React.createElement("div", { className: "verify-metric-value" }, metrics.scoreMean === null ? "-" : metrics.scoreMean)),
482
+ React.createElement("div", { className: "verify-metric-card" },
483
+ React.createElement("div", { className: "verify-metric-label" }, "Score min/median/max"),
484
+ React.createElement("div", { className: "verify-metric-value verify-metric-value--compact" }, metrics.scoreMin === null
485
+ ? "-"
486
+ : `${metrics.scoreMin} / ${metrics.scoreMedian ?? "-"} / ${metrics.scoreMax ?? "-"}`)),
487
+ React.createElement("div", { className: "verify-metric-card" },
488
+ React.createElement("div", { className: "verify-metric-label" }, "Execution failures"),
489
+ React.createElement("div", { className: "verify-metric-value" }, metrics.executionFailureCount)),
490
+ React.createElement("div", { className: "verify-metric-card" },
491
+ React.createElement("div", { className: "verify-metric-label" }, "Grading failures"),
492
+ React.createElement("div", { className: "verify-metric-value" }, metrics.gradingFailureCount))))),
459
493
  React.createElement("div", { className: "verify-section" },
460
- React.createElement("strong", null, "Most inconsistent examples"),
461
- topOutliers.length === 0
462
- ? (React.createElement(Callout, null, "Inconsistent examples will appear here as soon as at least one completed run is available in this batch."))
463
- : (React.createElement("div", { className: "verify-outlier-list" }, topOutliers.map((outlier) => {
464
- const runLinks = [outlier.maxRunId, outlier.minRunId]
494
+ React.createElement("strong", null, "Outlier scenario runs"),
495
+ topOutlierScenarioRuns.length === 0
496
+ ? (React.createElement(Callout, null, "Outlier scenario runs appear as soon as completed grade samples are available."))
497
+ : (React.createElement("div", { className: "verify-outlier-list" }, topOutlierScenarioRuns.map((outlier) => {
498
+ const runLinks = [outlier.minRunId, outlier.maxRunId]
465
499
  .filter((value) => Boolean(value));
466
500
  const uniqueRunLinks = [...new Set(runLinks)];
467
501
  return (React.createElement("div", { key: outlier.key, className: "verify-outlier-card" },
468
502
  React.createElement("div", { className: "verify-outlier-header" },
469
- React.createElement("strong", null, outlier.label),
470
- React.createElement(Badge, { variant: outlier.instability
471
- ? "error"
472
- : "completed" }, outlier.instability ? "Unstable" : "Stable")),
503
+ React.createElement("strong", null, scenarioNameFromValue(outlier.scenarioRunId) ??
504
+ outlier.scenarioRunId),
505
+ React.createElement(Badge, { variant: outlier.failed ? "error" : "completed" }, outlier.failed ? "Failed" : "Scored")),
473
506
  React.createElement("div", { className: "verify-outlier-meta" },
474
- "agreement ",
475
- outlier.agreementRate === null
476
- ? "—"
477
- : `${Math.round(outlier.agreementRate * 100)}%`,
507
+ "avg ",
508
+ outlier.averageScore ?? "-",
509
+ " \u00B7 min/max",
510
+ " ",
511
+ outlier.minScore ?? "-",
512
+ "/",
513
+ outlier.maxScore ?? "-",
514
+ " ",
515
+ "\u00B7 samples ",
516
+ outlier.completedSampleCount,
517
+ "/",
518
+ outlier.gradeSampleCount,
519
+ " \u00B7 execution failures",
478
520
  " ",
479
- "\u00B7 delta ",
480
- outlier.scoreDelta ?? "—",
481
- " \u00B7 samples",
521
+ outlier.executionFailureCount,
522
+ " \u00B7 grading failures",
482
523
  " ",
483
- outlier.sampleSize,
484
- outlier.passFlip ? " · pass/fail flip" : "",
524
+ outlier.gradingFailureCount,
485
525
  outlier.messageRefId
486
526
  ? ` · ref ${outlier.messageRefId}`
487
527
  : ""),
@@ -494,6 +534,18 @@ export const SimulatorVerifyPage = iso(`
494
534
  "Open grade run ",
495
535
  runId)))))));
496
536
  })))),
537
+ React.createElement("div", { className: "verify-section" },
538
+ React.createElement("strong", null, "Failure reasons"),
539
+ !metrics || metrics.failureReasons.length === 0
540
+ ? React.createElement(Callout, null, "No failure reasons captured yet.")
541
+ : (React.createElement("ul", { className: "verify-request-list" }, metrics.failureReasons.map((reason) => (React.createElement("li", { key: reason.key, className: "verify-request-row" },
542
+ React.createElement(Badge, { variant: reason.kind === "execution"
543
+ ? "error"
544
+ : "running" }, reason.kind),
545
+ React.createElement("span", null, reason.reason),
546
+ React.createElement("span", { className: "secondary-note" },
547
+ "x",
548
+ reason.count))))))),
497
549
  visibleBatches.length > 0 && (React.createElement("div", { className: "verify-section" },
498
550
  React.createElement("strong", null, "Batch history"),
499
551
  React.createElement("ul", { className: "verify-request-list" }, visibleBatches.slice(0, 15).map((batch, index) => (React.createElement("li", { key: batch.id, className: "verify-request-row" },
@@ -503,10 +555,10 @@ export const SimulatorVerifyPage = iso(`
503
555
  React.createElement(Badge, { status: batch.status }, batch.status),
504
556
  React.createElement("button", { type: "button", className: "link-button", onClick: () => setSelectedBatchId(batch.id) }, batch.id),
505
557
  React.createElement("span", { className: "secondary-note" },
506
- batch.completed,
507
- "/",
508
- batch.requested,
509
- " complete"))))))),
558
+ batch.scenarioRuns,
559
+ " runs \u00D7",
560
+ " ",
561
+ batch.graderRepeatsPerScenario))))))),
510
562
  selectedBatch?.requests.length
511
563
  ? (React.createElement("div", { className: "verify-section" },
512
564
  React.createElement("strong", null, "Batch requests"),
@@ -517,6 +569,8 @@ export const SimulatorVerifyPage = iso(`
517
569
  React.createElement(Badge, { status: request.status === "queued"
518
570
  ? "idle"
519
571
  : request.status }, request.status),
572
+ request.scenarioRunId && (React.createElement("span", { className: "secondary-note" }, scenarioNameFromValue(request.scenarioRunId) ??
573
+ request.scenarioRunId)),
520
574
  request.runId
521
575
  ? (React.createElement("a", { href: toPrefixedPath(buildWorkspacePath("grade", workspaceId, {
522
576
  runId: request.runId,