@bolt-foundry/gambit 0.8.6-rc.1 → 0.8.6-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (140)
  1. package/CHANGELOG.md +60 -1
  2. package/esm/gambit/simulator-ui/dist/bundle.js +2033 -964
  3. package/esm/gambit/simulator-ui/dist/bundle.js.map +4 -4
  4. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts +2 -1
  5. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts.map +1 -1
  6. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.d.ts.map +1 -1
  7. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.js +288 -101
  8. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts +84 -45
  9. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts.map +1 -1
  10. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts +3 -2
  11. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts.map +1 -1
  12. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts +1 -1
  13. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts.map +1 -1
  14. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.js +85 -47
  15. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts +86 -48
  16. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts.map +1 -1
  17. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.d.ts.map +1 -1
  18. package/esm/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.js +363 -97
  19. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.d.ts.map +1 -1
  20. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.js +154 -76
  21. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts +1 -1
  22. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts.map +1 -1
  23. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.js +45 -29
  24. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts +46 -30
  25. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts.map +1 -1
  26. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.d.ts.map +1 -1
  27. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.js +154 -76
  28. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts +45 -28
  29. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts.map +1 -1
  30. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts +1 -1
  31. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts.map +1 -1
  32. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.js +45 -29
  33. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts +46 -30
  34. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts.map +1 -1
  35. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.d.ts.map +1 -1
  36. package/esm/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.js +194 -78
  37. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts +45 -28
  38. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts.map +1 -1
  39. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.d.ts.map +1 -1
  40. package/esm/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.js +194 -78
  41. package/esm/simulator-ui/isograph/components/Query/SimulatorVerifyPage.d.ts.map +1 -1
  42. package/esm/simulator-ui/isograph/components/Query/SimulatorVerifyPage.js +257 -203
  43. package/esm/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.d.ts.map +1 -1
  44. package/esm/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.js +8 -3
  45. package/esm/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.d.ts.map +1 -1
  46. package/esm/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.js +45 -28
  47. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts +84 -45
  48. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts.map +1 -1
  49. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.js +85 -46
  50. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts +84 -45
  51. package/esm/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts.map +1 -1
  52. package/esm/simulator-ui/src/AppShell.d.ts.map +1 -1
  53. package/esm/simulator-ui/src/AppShell.js +6 -4
  54. package/esm/simulator-ui/src/verify_unified.d.ts +17 -0
  55. package/esm/simulator-ui/src/verify_unified.d.ts.map +1 -0
  56. package/esm/simulator-ui/src/verify_unified.js +25 -0
  57. package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts +1 -1
  58. package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts.map +1 -1
  59. package/esm/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.js +1 -1
  60. package/esm/src/server.d.ts.map +1 -1
  61. package/esm/src/server.js +227 -45
  62. package/esm/src/server_isograph_environment.d.ts +11 -4
  63. package/esm/src/server_isograph_environment.d.ts.map +1 -1
  64. package/esm/src/server_isograph_environment.js +5 -0
  65. package/esm/src/simulator_graphql.d.ts +11 -4
  66. package/esm/src/simulator_graphql.d.ts.map +1 -1
  67. package/esm/src/simulator_graphql.js +278 -245
  68. package/package.json +2 -2
  69. package/script/gambit/simulator-ui/dist/bundle.js +2033 -964
  70. package/script/gambit/simulator-ui/dist/bundle.js.map +4 -4
  71. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts +2 -1
  72. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceConversationSessionStart/parameters_type.d.ts.map +1 -1
  73. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.d.ts.map +1 -1
  74. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/normalization_ast.js +288 -101
  75. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts +84 -45
  76. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/param_type.d.ts.map +1 -1
  77. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts +3 -2
  78. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/parameters_type.d.ts.map +1 -1
  79. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts +1 -1
  80. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.d.ts.map +1 -1
  81. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/query_text.js +85 -47
  82. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts +86 -48
  83. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/raw_response_type.d.ts.map +1 -1
  84. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.d.ts.map +1 -1
  85. package/script/simulator-ui/__generated__/__isograph/Mutation/GambitWorkspaceVerifyBatchRunCreate/resolver_reader.js +363 -97
  86. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.d.ts.map +1 -1
  87. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/normalization_ast.js +154 -76
  88. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts +1 -1
  89. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.d.ts.map +1 -1
  90. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/query_text.js +45 -29
  91. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts +46 -30
  92. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointSimulatorVerifyPage/raw_response_type.d.ts.map +1 -1
  93. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.d.ts.map +1 -1
  94. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/normalization_ast.js +154 -76
  95. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts +45 -28
  96. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/param_type.d.ts.map +1 -1
  97. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts +1 -1
  98. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.d.ts.map +1 -1
  99. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/query_text.js +45 -29
  100. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts +46 -30
  101. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/raw_response_type.d.ts.map +1 -1
  102. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.d.ts.map +1 -1
  103. package/script/simulator-ui/__generated__/__isograph/Query/EntrypointWorkspaceVerifyLiveWrite/resolver_reader.js +194 -78
  104. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts +45 -28
  105. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/param_type.d.ts.map +1 -1
  106. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.d.ts.map +1 -1
  107. package/script/simulator-ui/__generated__/__isograph/Workspace/VerifyTab/resolver_reader.js +194 -78
  108. package/script/simulator-ui/isograph/components/Query/SimulatorVerifyPage.d.ts.map +1 -1
  109. package/script/simulator-ui/isograph/components/Query/SimulatorVerifyPage.js +256 -202
  110. package/script/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.d.ts.map +1 -1
  111. package/script/simulator-ui/isograph/components/WorkspaceConversationRun/WorkbenchConversationRunChat.js +8 -3
  112. package/script/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.d.ts.map +1 -1
  113. package/script/simulator-ui/isograph/entrypoints/EntrypointWorkspaceVerifyLiveWrite.js +45 -28
  114. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts +84 -45
  115. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.d.ts.map +1 -1
  116. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreate.js +85 -46
  117. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts +84 -45
  118. package/script/simulator-ui/mutations/GambitWorkspaceVerifyBatchRunCreateMutation.d.ts.map +1 -1
  119. package/script/simulator-ui/src/AppShell.d.ts.map +1 -1
  120. package/script/simulator-ui/src/AppShell.js +6 -4
  121. package/script/simulator-ui/src/verify_unified.d.ts +17 -0
  122. package/script/simulator-ui/src/verify_unified.d.ts.map +1 -0
  123. package/script/simulator-ui/src/verify_unified.js +29 -0
  124. package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts +1 -1
  125. package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.d.ts.map +1 -1
  126. package/script/simulator-ui/subscriptions/__generated__/workspaceVerifyLiveSubscriptionQuery.js +1 -1
  127. package/script/src/server.d.ts.map +1 -1
  128. package/script/src/server.js +227 -45
  129. package/script/src/server_isograph_environment.d.ts +11 -4
  130. package/script/src/server_isograph_environment.d.ts.map +1 -1
  131. package/script/src/server_isograph_environment.js +5 -0
  132. package/script/src/simulator_graphql.d.ts +11 -4
  133. package/script/src/simulator_graphql.d.ts.map +1 -1
  134. package/script/src/simulator_graphql.js +278 -245
  135. package/esm/simulator-ui/src/verify_metrics.d.ts +0 -50
  136. package/esm/simulator-ui/src/verify_metrics.d.ts.map +0 -1
  137. package/esm/simulator-ui/src/verify_metrics.js +0 -260
  138. package/script/simulator-ui/src/verify_metrics.d.ts +0 -50
  139. package/script/simulator-ui/src/verify_metrics.d.ts.map +0 -1
  140. package/script/simulator-ui/src/verify_metrics.js +0 -264
@@ -20,12 +20,13 @@ const PageGrid_js_1 = __importDefault(require("../../../src/gds/PageGrid.js"));
20
20
  const PageShell_js_1 = __importDefault(require("../../../src/gds/PageShell.js"));
21
21
  const Panel_js_1 = __importDefault(require("../../../src/gds/Panel.js"));
22
22
  const utils_js_1 = require("../../../src/utils.js");
23
- const verify_metrics_js_1 = require("../../../src/verify_metrics.js");
24
- const MAX_BATCH_SIZE = 24;
25
- const MAX_BATCH_CONCURRENCY = 6;
26
- const DEFAULT_BATCH_SIZE = 8;
27
- const DEFAULT_BATCH_CONCURRENCY = 3;
28
- const NO_SCENARIO_RUN_VALUE = "__workspace_context__";
23
+ const verify_unified_js_1 = require("../../../src/verify_unified.js");
24
+ const MAX_SCENARIO_RUNS = verify_unified_js_1.VERIFY_LIMITS.scenarioRunsMax;
25
+ const MAX_GRADER_REPEATS = verify_unified_js_1.VERIFY_LIMITS.graderRepeatsMax;
26
+ const MAX_BATCH_CONCURRENCY = verify_unified_js_1.VERIFY_LIMITS.concurrencyMax;
27
+ const DEFAULT_SCENARIO_RUNS = verify_unified_js_1.VERIFY_DEFAULTS.scenarioRuns;
28
+ const DEFAULT_GRADER_REPEATS = verify_unified_js_1.VERIFY_DEFAULTS.graderRepeatsPerScenario;
29
+ const DEFAULT_BATCH_CONCURRENCY = verify_unified_js_1.VERIFY_DEFAULTS.concurrency;
29
30
  function getRoutePrefix(path) {
30
31
  return path === "/isograph" || path.startsWith("/isograph/")
31
32
  ? "/isograph"
@@ -55,19 +56,19 @@ function clampInt(value, min, max) {
55
56
  const rounded = Number.isFinite(value) ? Math.round(value) : min;
56
57
  return Math.max(min, Math.min(max, rounded));
57
58
  }
59
+ function formatPercent(value) {
60
+ if (typeof value !== "number" || !Number.isFinite(value))
61
+ return "-";
62
+ return `${Math.round(value * 100)}%`;
63
+ }
58
64
  exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
59
65
  field Workspace.VerifyTab @component {
60
66
  id
61
- scenarioRuns(first: 50) {
62
- edges {
63
- node {
64
- id
65
- status
66
- startedAt
67
- finishedAt
68
- error
69
- }
70
- }
67
+ scenarioDecks {
68
+ id
69
+ label
70
+ description
71
+ path
71
72
  }
72
73
  verification {
73
74
  graderDecks(first: 50) {
@@ -85,8 +86,10 @@ exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
85
86
  node {
86
87
  id
87
88
  workspaceId
89
+ scenarioDeckId
88
90
  graderId
89
- scenarioRunId
91
+ scenarioRuns
92
+ graderRepeatsPerScenario
90
93
  status
91
94
  startedAt
92
95
  finishedAt
@@ -94,10 +97,13 @@ exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
94
97
  active
95
98
  completed
96
99
  failed
97
- requests(first: 50) {
100
+ scenarioRunsCompleted
101
+ scenarioRunsFailed
102
+ requests(first: 200) {
98
103
  edges {
99
104
  node {
100
105
  id
106
+ scenarioRunId
101
107
  status
102
108
  runId
103
109
  error
@@ -105,31 +111,48 @@ exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
105
111
  }
106
112
  }
107
113
  metrics {
108
- sampleSize
109
- agreementRate
110
- scoreSpreadMin
111
- scoreSpreadMedian
112
- scoreSpreadMax
113
- instabilityCount
114
- verdict
115
- verdictReason
116
- outliers(first: 25) {
114
+ scenarioRunCountRequested
115
+ scenarioRunCountCompleted
116
+ scenarioRunCountFailed
117
+ gradeSampleCountRequested
118
+ gradeSampleCountCompleted
119
+ gradeSampleCountFailed
120
+ executionFailureCount
121
+ gradingFailureCount
122
+ passRate
123
+ scoreMin
124
+ scoreMedian
125
+ scoreMax
126
+ scoreMean
127
+ outlierScenarioRuns(first: 25) {
117
128
  edges {
118
129
  node {
119
130
  key
120
- label
121
- sampleSize
122
- agreementRate
123
- scoreDelta
124
- passFlip
125
- instability
131
+ scenarioRunId
132
+ gradeSampleCount
133
+ completedSampleCount
134
+ executionFailureCount
135
+ gradingFailureCount
136
+ averageScore
137
+ minScore
138
+ maxScore
139
+ failed
126
140
  minRunId
127
141
  maxRunId
128
- turnIndex
129
142
  messageRefId
130
143
  }
131
144
  }
132
145
  }
146
+ failureReasons(first: 25) {
147
+ edges {
148
+ node {
149
+ key
150
+ kind
151
+ reason
152
+ count
153
+ }
154
+ }
155
+ }
133
156
  }
134
157
  }
135
158
  }
@@ -145,6 +168,16 @@ exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
145
168
  const toPrefixedPath = (0, react_1.useCallback)((path) => `${routePrefix}${path}`, [routePrefix]);
146
169
  const runBatchMutation = (0, useGambitTypedMutation_js_1.useGambitTypedMutation)(GambitWorkspaceVerifyBatchRunCreateMutation_js_1.default);
147
170
  (0, useGambitTypedSubscription_js_1.useGambitTypedSubscription)(GambitWorkspaceVerifyLiveSubscription_js_1.default, workspaceId ? { workspaceId } : null);
171
+ const scenarioDecks = (0, react_1.useMemo)(() => data.scenarioDecks?.flatMap((deck) => {
172
+ if (!deck?.id || !deck.label)
173
+ return [];
174
+ return [{
175
+ id: deck.id,
176
+ label: deck.label,
177
+ description: deck.description ?? null,
178
+ path: deck.path ?? "",
179
+ }];
180
+ }) ?? [], [data.scenarioDecks]);
148
181
  const graders = (0, react_1.useMemo)(() => (data.verification?.graderDecks?.edges ?? []).flatMap((edge) => {
149
182
  const grader = edge?.node;
150
183
  if (!grader?.id || !grader.label)
@@ -156,30 +189,16 @@ exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
156
189
  path: grader.path ?? "",
157
190
  }];
158
191
  }), [data.verification?.graderDecks?.edges]);
159
- const scenarioRuns = (0, react_1.useMemo)(() => (data.scenarioRuns?.edges ?? []).flatMap((edge) => {
160
- const run = edge?.node;
161
- if (!run?.id)
162
- return [];
163
- return [{
164
- id: run.id,
165
- status: toBatchStatus(run.status),
166
- startedAt: run.startedAt ?? null,
167
- finishedAt: run.finishedAt ?? null,
168
- error: run.error ?? null,
169
- }];
170
- }).sort((left, right) => {
171
- const leftKey = left.finishedAt ?? left.startedAt ?? left.id;
172
- const rightKey = right.finishedAt ?? right.startedAt ?? right.id;
173
- return rightKey.localeCompare(leftKey);
174
- }), [data.scenarioRuns?.edges]);
175
192
  const batches = (0, react_1.useMemo)(() => (data.verification?.batches?.edges ?? []).flatMap((edge) => {
176
193
  const batch = edge?.node;
177
194
  if (!batch?.id || !batch.graderId)
178
195
  return [];
179
196
  return [{
180
197
  id: batch.id,
198
+ scenarioDeckId: batch.scenarioDeckId ?? null,
181
199
  graderId: batch.graderId,
182
- scenarioRunId: batch.scenarioRunId ?? null,
200
+ scenarioRuns: batch.scenarioRuns ?? 0,
201
+ graderRepeatsPerScenario: batch.graderRepeatsPerScenario ?? 0,
183
202
  status: toBatchStatus(batch.status),
184
203
  startedAt: batch.startedAt ?? null,
185
204
  finishedAt: batch.finishedAt ?? null,
@@ -187,12 +206,15 @@ exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
187
206
  active: batch.active ?? 0,
188
207
  completed: batch.completed ?? 0,
189
208
  failed: batch.failed ?? 0,
209
+ scenarioRunsCompleted: batch.scenarioRunsCompleted ?? 0,
210
+ scenarioRunsFailed: batch.scenarioRunsFailed ?? 0,
190
211
  requests: (batch.requests?.edges ?? []).flatMap((requestEdge) => {
191
212
  const request = requestEdge?.node;
192
213
  if (!request?.id)
193
214
  return [];
194
215
  return [{
195
216
  id: request.id,
217
+ scenarioRunId: request.scenarioRunId ?? undefined,
196
218
  status: toBatchRequestStatus(request.status),
197
219
  runId: request.runId ?? undefined,
198
220
  error: request.error ?? undefined,
@@ -200,59 +222,76 @@ exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
200
222
  }),
201
223
  metrics: batch.metrics
202
224
  ? {
203
- sampleSize: batch.metrics.sampleSize ?? 0,
204
- agreementRate: typeof batch.metrics.agreementRate === "number"
205
- ? batch.metrics.agreementRate
225
+ scenarioRunCountRequested: batch.metrics.scenarioRunCountRequested ?? 0,
226
+ scenarioRunCountCompleted: batch.metrics.scenarioRunCountCompleted ?? 0,
227
+ scenarioRunCountFailed: batch.metrics.scenarioRunCountFailed ?? 0,
228
+ gradeSampleCountRequested: batch.metrics.gradeSampleCountRequested ?? 0,
229
+ gradeSampleCountCompleted: batch.metrics.gradeSampleCountCompleted ?? 0,
230
+ gradeSampleCountFailed: batch.metrics.gradeSampleCountFailed ?? 0,
231
+ executionFailureCount: batch.metrics.executionFailureCount ?? 0,
232
+ gradingFailureCount: batch.metrics.gradingFailureCount ?? 0,
233
+ passRate: typeof batch.metrics.passRate === "number"
234
+ ? batch.metrics.passRate
206
235
  : null,
207
- scoreSpreadMin: typeof batch.metrics.scoreSpreadMin === "number"
208
- ? batch.metrics.scoreSpreadMin
236
+ scoreMin: typeof batch.metrics.scoreMin === "number"
237
+ ? batch.metrics.scoreMin
209
238
  : null,
210
- scoreSpreadMedian: typeof batch.metrics.scoreSpreadMedian === "number"
211
- ? batch.metrics.scoreSpreadMedian
239
+ scoreMedian: typeof batch.metrics.scoreMedian === "number"
240
+ ? batch.metrics.scoreMedian
212
241
  : null,
213
- scoreSpreadMax: typeof batch.metrics.scoreSpreadMax === "number"
214
- ? batch.metrics.scoreSpreadMax
242
+ scoreMax: typeof batch.metrics.scoreMax === "number"
243
+ ? batch.metrics.scoreMax
215
244
  : null,
216
- instabilityCount: batch.metrics.instabilityCount ?? 0,
217
- verdict: batch.metrics.verdict === "PASS" ||
218
- batch.metrics.verdict === "WARN" ||
219
- batch.metrics.verdict === "FAIL"
220
- ? batch.metrics.verdict
221
- : "WARN",
222
- verdictReason: batch.metrics.verdictReason ??
223
- "Verify batch completed.",
224
- outliers: (batch.metrics.outliers?.edges ?? []).flatMap((outlierEdge) => {
245
+ scoreMean: typeof batch.metrics.scoreMean === "number"
246
+ ? batch.metrics.scoreMean
247
+ : null,
248
+ outlierScenarioRuns: (batch.metrics.outlierScenarioRuns?.edges ?? []).flatMap((outlierEdge) => {
225
249
  const outlier = outlierEdge?.node;
226
- if (!outlier?.key || !outlier.label)
250
+ if (!outlier?.key || !outlier.scenarioRunId)
227
251
  return [];
228
252
  return [{
229
253
  key: outlier.key,
230
- label: outlier.label,
231
- sampleSize: outlier.sampleSize ?? 0,
232
- agreementRate: typeof outlier.agreementRate === "number"
233
- ? outlier.agreementRate
254
+ scenarioRunId: outlier.scenarioRunId,
255
+ gradeSampleCount: outlier.gradeSampleCount ?? 0,
256
+ completedSampleCount: outlier.completedSampleCount ?? 0,
257
+ executionFailureCount: outlier.executionFailureCount ?? 0,
258
+ gradingFailureCount: outlier.gradingFailureCount ?? 0,
259
+ averageScore: typeof outlier.averageScore === "number"
260
+ ? outlier.averageScore
261
+ : null,
262
+ minScore: typeof outlier.minScore === "number"
263
+ ? outlier.minScore
234
264
  : null,
235
- scoreDelta: typeof outlier.scoreDelta === "number"
236
- ? outlier.scoreDelta
265
+ maxScore: typeof outlier.maxScore === "number"
266
+ ? outlier.maxScore
237
267
  : null,
238
- passFlip: Boolean(outlier.passFlip),
239
- instability: Boolean(outlier.instability),
268
+ failed: Boolean(outlier.failed),
240
269
  minRunId: outlier.minRunId ?? undefined,
241
270
  maxRunId: outlier.maxRunId ?? undefined,
242
- turnIndex: typeof outlier.turnIndex === "number"
243
- ? outlier.turnIndex
244
- : undefined,
245
271
  messageRefId: outlier.messageRefId ?? undefined,
246
272
  }];
247
273
  }),
274
+ failureReasons: (batch.metrics.failureReasons?.edges ?? [])
275
+ .flatMap((reasonEdge) => {
276
+ const reason = reasonEdge?.node;
277
+ if (!reason?.key || !reason.reason)
278
+ return [];
279
+ return [{
280
+ key: reason.key,
281
+ kind: reason.kind === "GRADING" ? "grading" : "execution",
282
+ reason: reason.reason,
283
+ count: reason.count ?? 0,
284
+ }];
285
+ }),
248
286
  }
249
287
  : null,
250
288
  }];
251
289
  }), [data.verification?.batches?.edges]);
252
- const [selectedScenarioRunId, setSelectedScenarioRunId] = (0, react_1.useState)(null);
290
+ const [selectedScenarioDeckId, setSelectedScenarioDeckId] = (0, react_1.useState)(null);
253
291
  const [selectedGraderId, setSelectedGraderId] = (0, react_1.useState)(null);
254
292
  const [selectedBatchId, setSelectedBatchId] = (0, react_1.useState)(null);
255
- const [batchSize, setBatchSize] = (0, react_1.useState)(DEFAULT_BATCH_SIZE);
293
+ const [scenarioRuns, setScenarioRuns] = (0, react_1.useState)(DEFAULT_SCENARIO_RUNS);
294
+ const [graderRepeatsPerScenario, setGraderRepeatsPerScenario] = (0, react_1.useState)(DEFAULT_GRADER_REPEATS);
256
295
  const [batchConcurrency, setBatchConcurrency] = (0, react_1.useState)(DEFAULT_BATCH_CONCURRENCY);
257
296
  const [mutationError, setMutationError] = (0, react_1.useState)(null);
258
297
  (0, react_1.useEffect)(() => {
@@ -263,22 +302,23 @@ exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
263
302
  setSelectedGraderId(graders[0]?.id ?? null);
264
303
  }, [graders, selectedGraderId]);
265
304
  (0, react_1.useEffect)(() => {
266
- if (selectedScenarioRunId &&
267
- scenarioRuns.some((scenarioRun) => scenarioRun.id === selectedScenarioRunId)) {
305
+ if (selectedScenarioDeckId &&
306
+ scenarioDecks.some((deck) => deck.id === selectedScenarioDeckId)) {
268
307
  return;
269
308
  }
270
- setSelectedScenarioRunId(scenarioRuns[0]?.id ?? null);
271
- }, [scenarioRuns, selectedScenarioRunId]);
309
+ setSelectedScenarioDeckId(scenarioDecks[0]?.id ?? null);
310
+ }, [scenarioDecks, selectedScenarioDeckId]);
272
311
  const filteredBatches = (0, react_1.useMemo)(() => {
273
312
  return batches.filter((batch) => {
274
313
  if (selectedGraderId && batch.graderId !== selectedGraderId)
275
314
  return false;
276
- if (selectedScenarioRunId && batch.scenarioRunId !== selectedScenarioRunId) {
315
+ if (selectedScenarioDeckId &&
316
+ batch.scenarioDeckId !== selectedScenarioDeckId) {
277
317
  return false;
278
318
  }
279
319
  return true;
280
320
  });
281
- }, [batches, selectedGraderId, selectedScenarioRunId]);
321
+ }, [batches, selectedGraderId, selectedScenarioDeckId]);
282
322
  const visibleBatches = filteredBatches.length > 0 ? filteredBatches : batches;
283
323
  (0, react_1.useEffect)(() => {
284
324
  if (selectedBatchId &&
@@ -296,10 +336,11 @@ exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
296
336
  return visibleBatches[0] ?? null;
297
337
  }, [selectedBatchId, visibleBatches]);
298
338
  const selectedGrader = (0, react_1.useMemo)(() => graders.find((grader) => grader.id === selectedGraderId) ?? null, [graders, selectedGraderId]);
299
- const queuedCount = (0, react_1.useMemo)(() => (selectedBatch?.requests ?? []).filter((request) => request.status === "queued")
300
- .length, [selectedBatch?.requests]);
339
+ const selectedScenarioDeck = (0, react_1.useMemo)(() => scenarioDecks.find((deck) => deck.id === selectedScenarioDeckId) ?? null, [scenarioDecks, selectedScenarioDeckId]);
340
+ const queuedCount = (0, react_1.useMemo)(() => (selectedBatch?.requests ?? []).filter((request) => request.status === "queued").length, [selectedBatch?.requests]);
301
341
  const metrics = selectedBatch?.metrics ?? null;
302
- const topOutliers = (metrics?.outliers ?? []).slice(0, 8);
342
+ const topOutlierScenarioRuns = (0, react_1.useMemo)(() => (0, verify_unified_js_1.sortVerifyOutlierScenarioRuns)(metrics?.outlierScenarioRuns ?? [])
343
+ .slice(0, 8), [metrics?.outlierScenarioRuns]);
303
344
  const hasRunningBatch = visibleBatches.some((batch) => batch.status === "running");
304
345
  const canRun = Boolean(workspaceId && selectedGraderId && !runBatchMutation.inFlight &&
305
346
  !hasRunningBatch);
@@ -307,16 +348,18 @@ exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
307
348
  if (!workspaceId || !selectedGraderId)
308
349
  return;
309
350
  setMutationError(null);
310
- const nextBatchSize = clampInt(batchSize, 1, MAX_BATCH_SIZE);
311
- const nextConcurrency = clampInt(batchConcurrency, 1, Math.min(MAX_BATCH_CONCURRENCY, nextBatchSize));
351
+ const nextScenarioRuns = clampInt(scenarioRuns, 1, MAX_SCENARIO_RUNS);
352
+ const nextRepeats = clampInt(graderRepeatsPerScenario, 1, MAX_GRADER_REPEATS);
353
+ const nextConcurrency = clampInt(batchConcurrency, 1, MAX_BATCH_CONCURRENCY);
312
354
  runBatchMutation.commit({
313
355
  input: {
314
356
  workspaceId,
315
- graderId: selectedGraderId,
316
- ...(selectedScenarioRunId
317
- ? { scenarioRunId: selectedScenarioRunId }
357
+ ...(selectedScenarioDeckId
358
+ ? { scenarioDeckId: selectedScenarioDeckId }
318
359
  : {}),
319
- batchSize: nextBatchSize,
360
+ graderId: selectedGraderId,
361
+ scenarioRuns: nextScenarioRuns,
362
+ graderRepeatsPerScenario: nextRepeats,
320
363
  concurrency: nextConcurrency,
321
364
  },
322
365
  }, {
@@ -333,40 +376,28 @@ exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
333
376
  });
334
377
  }, [
335
378
  batchConcurrency,
336
- batchSize,
379
+ graderRepeatsPerScenario,
337
380
  runBatchMutation,
381
+ scenarioRuns,
338
382
  selectedGraderId,
339
- selectedScenarioRunId,
383
+ selectedScenarioDeckId,
340
384
  workspaceId,
341
385
  ]);
342
386
  const navigateToGradeRun = (0, react_1.useCallback)((runId) => {
343
387
  const gradePath = (0, workspace_routes_js_1.buildWorkspacePath)("grade", workspaceId, { runId });
344
388
  navigate(toPrefixedPath(gradePath));
345
389
  }, [navigate, toPrefixedPath, workspaceId]);
346
- const scenarioOptions = (0, react_1.useMemo)(() => [
347
- {
348
- value: NO_SCENARIO_RUN_VALUE,
349
- label: "Current workspace context",
350
- meta: "Run without a prior scenario run binding",
351
- },
352
- ...scenarioRuns.map((run) => ({
353
- value: run.id,
354
- label: (0, utils_js_1.scenarioNameFromValue)(run.id) ?? run.id,
355
- meta: [
356
- run.status,
357
- run.finishedAt ?? run.startedAt,
358
- ].filter(Boolean).join(" · "),
359
- })),
360
- ], [scenarioRuns]);
361
390
  return (React.createElement(PageShell_js_1.default, { className: "verify-shell" },
362
391
  React.createElement(PageGrid_js_1.default, { as: "main", className: "verify-layout" },
363
392
  React.createElement(Panel_js_1.default, { className: "verify-controls", "data-testid": "verify-tab-scaffold" },
364
393
  React.createElement("div", { className: "verify-controls-header" },
365
- React.createElement("strong", null, "Verify consistency"),
366
- React.createElement("span", { className: "secondary-note" }, "Run repeated grading checks against one grader and scenario.")),
367
- scenarioRuns.length > 0 && (React.createElement(Listbox_js_1.default, { label: "Scenario run", value: selectedScenarioRunId ?? NO_SCENARIO_RUN_VALUE, onChange: (runId) => {
368
- setSelectedScenarioRunId(runId === NO_SCENARIO_RUN_VALUE ? null : runId);
369
- }, options: scenarioOptions, placeholder: "Select scenario run" })),
394
+ React.createElement("strong", null, "Verify repeated evidence"),
395
+ React.createElement("span", { className: "secondary-note" }, "Generate scenario runs, then grade each run repeatedly.")),
396
+ React.createElement(Listbox_js_1.default, { label: "Scenario deck", value: selectedScenarioDeckId ?? "", onChange: (value) => setSelectedScenarioDeckId(value.length ? value : null), options: scenarioDecks.map((deck) => ({
397
+ value: deck.id,
398
+ label: deck.label,
399
+ meta: deck.path,
400
+ })), placeholder: "Select scenario deck", disabled: scenarioDecks.length === 0 }),
370
401
  React.createElement(Listbox_js_1.default, { label: "Grader", value: selectedGraderId ?? "", onChange: (value) => setSelectedGraderId(value.length ? value : null), options: graders.map((grader) => ({
371
402
  value: grader.id,
372
403
  label: grader.label,
@@ -374,19 +405,28 @@ exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
374
405
  })), placeholder: "Select grader", disabled: graders.length === 0 }),
375
406
  React.createElement("div", { className: "verify-number-grid" },
376
407
  React.createElement("label", { className: "verify-number-field" },
377
- "Batch size",
378
- React.createElement("input", { type: "number", min: 1, max: MAX_BATCH_SIZE, value: batchSize, onChange: (event) => setBatchSize(clampInt(Number(event.target.value), 1, MAX_BATCH_SIZE)) })),
408
+ "Scenario runs",
409
+ React.createElement("input", { type: "number", min: 1, max: MAX_SCENARIO_RUNS, value: scenarioRuns, onChange: (event) => setScenarioRuns(clampInt(Number(event.target.value), 1, MAX_SCENARIO_RUNS)) })),
410
+ React.createElement("label", { className: "verify-number-field" },
411
+ "Grader repeats per scenario",
412
+ React.createElement("input", { type: "number", min: 1, max: MAX_GRADER_REPEATS, value: graderRepeatsPerScenario, onChange: (event) => setGraderRepeatsPerScenario(clampInt(Number(event.target.value), 1, MAX_GRADER_REPEATS)) })),
379
413
  React.createElement("label", { className: "verify-number-field" },
380
414
  "Concurrency",
381
415
  React.createElement("input", { type: "number", min: 1, max: MAX_BATCH_CONCURRENCY, value: batchConcurrency, onChange: (event) => setBatchConcurrency(clampInt(Number(event.target.value), 1, MAX_BATCH_CONCURRENCY)) }))),
382
416
  React.createElement(Button_js_1.default, { "data-testid": "verify-run-batch", variant: "primary", onClick: runBatch, disabled: !canRun }, hasRunningBatch || runBatchMutation.inFlight
383
- ? "Running consistency batch"
384
- : "Run consistency batch"),
417
+ ? "Running verify batch..."
418
+ : "Run verify batch"),
419
+ scenarioDecks.length === 0 && (React.createElement(Callout_js_1.default, null,
420
+ "No scenario decks are available. Add ",
421
+ React.createElement("code", null, "[[testDecks]]"),
422
+ " ",
423
+ "entries to the active root deck.")),
385
424
  graders.length === 0 && (React.createElement(Callout_js_1.default, null,
386
425
  "No graders are available. Add ",
387
426
  React.createElement("code", null, "[[graders]]"),
388
427
  " ",
389
428
  "entries to the active root deck.")),
429
+ selectedScenarioDeck?.description && (React.createElement(Callout_js_1.default, null, selectedScenarioDeck.description)),
390
430
  selectedGrader?.description && (React.createElement(Callout_js_1.default, null, selectedGrader.description)),
391
431
  React.createElement(Callout_js_1.default, { variant: "emphasis", title: "Build assistant stays available" }, "Use the chat drawer toggle in the top-right corner to investigate and iterate while this page remains open.")),
392
432
  React.createElement(Panel_js_1.default, { className: "verify-results", "data-testid": "verify-results" },
@@ -401,93 +441,93 @@ exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
401
441
  : "",
402
442
  selectedBatch?.finishedAt
403
443
  ? ` · finished ${(0, utils_js_1.formatTimestampShort)(selectedBatch.finishedAt)}`
404
- : "")),
405
- metrics && metrics.sampleSize > 0 && (React.createElement("span", { className: (0, utils_js_1.classNames)("verify-verdict-badge", `verify-verdict-badge--${metrics.verdict.toLowerCase()}`) }, metrics.verdict))),
406
- selectedBatch && selectedBatch.requested > 0 && (React.createElement("div", { className: "verify-progress-row" },
407
- React.createElement("span", null,
408
- "Queued: ",
409
- queuedCount),
410
- React.createElement("span", null,
411
- "Running: ",
412
- selectedBatch.active),
413
- React.createElement("span", null,
414
- "Completed: ",
415
- selectedBatch.completed),
416
- React.createElement("span", null,
417
- "Failed: ",
418
- selectedBatch.failed))),
419
- !selectedBatch && (React.createElement(Callout_js_1.default, null, "Run a consistency batch to compute agreement, spread, and instability for the selected grader.")),
420
- metrics && metrics.sampleSize > 0 && (React.createElement(React.Fragment, null,
444
+ : ""))),
445
+ !selectedBatch && (React.createElement(Callout_js_1.default, null, "Run a verify batch to generate repeated grading evidence.")),
446
+ selectedBatch && selectedBatch.requested > 0 && (React.createElement(React.Fragment, null,
447
+ React.createElement("div", { className: "verify-progress-row" },
448
+ React.createElement("span", null,
449
+ "Scenario runs: ",
450
+ selectedBatch.scenarioRunsCompleted,
451
+ "/",
452
+ selectedBatch.scenarioRuns),
453
+ React.createElement("span", null,
454
+ "Scenario failures: ",
455
+ selectedBatch.scenarioRunsFailed),
456
+ React.createElement("span", null,
457
+ "Queued: ",
458
+ queuedCount),
459
+ React.createElement("span", null,
460
+ "Running: ",
461
+ selectedBatch.active),
462
+ React.createElement("span", null,
463
+ "Completed: ",
464
+ selectedBatch.completed),
465
+ React.createElement("span", null,
466
+ "Failed: ",
467
+ selectedBatch.failed)))),
468
+ metrics && (React.createElement(React.Fragment, null,
421
469
  React.createElement("div", { className: "verify-metric-grid" },
422
470
  React.createElement("div", { className: "verify-metric-card" },
423
- React.createElement("div", { className: "verify-metric-label" }, "Sample size"),
424
- React.createElement("div", { className: "verify-metric-value" }, metrics.sampleSize)),
471
+ React.createElement("div", { className: "verify-metric-label" }, "Scenario runs"),
472
+ React.createElement("div", { className: "verify-metric-value" },
473
+ metrics.scenarioRunCountCompleted,
474
+ "/",
475
+ metrics.scenarioRunCountRequested)),
425
476
  React.createElement("div", { className: "verify-metric-card" },
426
- React.createElement("div", { className: "verify-metric-label" }, "Agreement rate"),
427
- React.createElement("div", { className: "verify-metric-value" }, metrics.agreementRate === null
428
- ? "—"
429
- : `${Math.round(metrics.agreementRate * 100)}%`)),
477
+ React.createElement("div", { className: "verify-metric-label" }, "Grade samples"),
478
+ React.createElement("div", { className: "verify-metric-value" },
479
+ metrics.gradeSampleCountCompleted,
480
+ "/",
481
+ metrics.gradeSampleCountRequested)),
430
482
  React.createElement("div", { className: "verify-metric-card" },
431
- React.createElement("div", { className: "verify-metric-label" }, "Score spread (min/median/max)"),
432
- React.createElement("div", { className: "verify-metric-value verify-metric-value--compact" }, metrics.scoreSpreadMin === null
433
- ? "—"
434
- : `${metrics.scoreSpreadMin} / ${metrics.scoreSpreadMedian ?? "—"} / ${metrics.scoreSpreadMax ?? "—"}`)),
483
+ React.createElement("div", { className: "verify-metric-label" }, "Pass rate"),
484
+ React.createElement("div", { className: "verify-metric-value" }, formatPercent(metrics.passRate))),
435
485
  React.createElement("div", { className: "verify-metric-card" },
436
- React.createElement("div", { className: "verify-metric-label" }, "Instability count"),
437
- React.createElement("div", { className: "verify-metric-value" }, metrics.instabilityCount))),
438
- React.createElement(Callout_js_1.default, { variant: metrics.verdict === "FAIL"
439
- ? "danger"
440
- : metrics.verdict === "WARN"
441
- ? "emphasis"
442
- : "muted", title: `Verdict: ${metrics.verdict}` }, metrics.verdictReason))),
443
- React.createElement(Callout_js_1.default, { title: "Thresholds in code" },
444
- "Min sample size: ",
445
- verify_metrics_js_1.VERIFY_CONSISTENCY_THRESHOLDS.minSampleSize,
446
- " ",
447
- "\u00B7 PASS requires agreement \u2265 ",
448
- Math.round(verify_metrics_js_1.VERIFY_CONSISTENCY_THRESHOLDS.pass.agreementMin * 100),
449
- "%, spread \u2264",
450
- " ",
451
- verify_metrics_js_1.VERIFY_CONSISTENCY_THRESHOLDS.pass.maxSpread,
452
- ", instability \u2264",
453
- " ",
454
- verify_metrics_js_1.VERIFY_CONSISTENCY_THRESHOLDS.pass.maxInstabilityCount,
455
- " ",
456
- "\u00B7 WARN allows agreement \u2265 ",
457
- Math.round(verify_metrics_js_1.VERIFY_CONSISTENCY_THRESHOLDS.warn.agreementMin * 100),
458
- "%, spread \u2264",
459
- " ",
460
- verify_metrics_js_1.VERIFY_CONSISTENCY_THRESHOLDS.warn.maxSpread,
461
- ", instability \u2264",
462
- " ",
463
- verify_metrics_js_1.VERIFY_CONSISTENCY_THRESHOLDS.warn.maxInstabilityCount,
464
- "."),
486
+ React.createElement("div", { className: "verify-metric-label" }, "Score mean"),
487
+ React.createElement("div", { className: "verify-metric-value" }, metrics.scoreMean === null ? "-" : metrics.scoreMean)),
488
+ React.createElement("div", { className: "verify-metric-card" },
489
+ React.createElement("div", { className: "verify-metric-label" }, "Score min/median/max"),
490
+ React.createElement("div", { className: "verify-metric-value verify-metric-value--compact" }, metrics.scoreMin === null
491
+ ? "-"
492
+ : `${metrics.scoreMin} / ${metrics.scoreMedian ?? "-"} / ${metrics.scoreMax ?? "-"}`)),
493
+ React.createElement("div", { className: "verify-metric-card" },
494
+ React.createElement("div", { className: "verify-metric-label" }, "Execution failures"),
495
+ React.createElement("div", { className: "verify-metric-value" }, metrics.executionFailureCount)),
496
+ React.createElement("div", { className: "verify-metric-card" },
497
+ React.createElement("div", { className: "verify-metric-label" }, "Grading failures"),
498
+ React.createElement("div", { className: "verify-metric-value" }, metrics.gradingFailureCount))))),
465
499
  React.createElement("div", { className: "verify-section" },
466
- React.createElement("strong", null, "Most inconsistent examples"),
467
- topOutliers.length === 0
468
- ? (React.createElement(Callout_js_1.default, null, "Inconsistent examples will appear here as soon as at least one completed run is available in this batch."))
469
- : (React.createElement("div", { className: "verify-outlier-list" }, topOutliers.map((outlier) => {
470
- const runLinks = [outlier.maxRunId, outlier.minRunId]
500
+ React.createElement("strong", null, "Outlier scenario runs"),
501
+ topOutlierScenarioRuns.length === 0
502
+ ? (React.createElement(Callout_js_1.default, null, "Outlier scenario runs appear as soon as completed grade samples are available."))
503
+ : (React.createElement("div", { className: "verify-outlier-list" }, topOutlierScenarioRuns.map((outlier) => {
504
+ const runLinks = [outlier.minRunId, outlier.maxRunId]
471
505
  .filter((value) => Boolean(value));
472
506
  const uniqueRunLinks = [...new Set(runLinks)];
473
507
  return (React.createElement("div", { key: outlier.key, className: "verify-outlier-card" },
474
508
  React.createElement("div", { className: "verify-outlier-header" },
475
- React.createElement("strong", null, outlier.label),
476
- React.createElement(Badge_js_1.default, { variant: outlier.instability
477
- ? "error"
478
- : "completed" }, outlier.instability ? "Unstable" : "Stable")),
509
+ React.createElement("strong", null, (0, utils_js_1.scenarioNameFromValue)(outlier.scenarioRunId) ??
510
+ outlier.scenarioRunId),
511
+ React.createElement(Badge_js_1.default, { variant: outlier.failed ? "error" : "completed" }, outlier.failed ? "Failed" : "Scored")),
479
512
  React.createElement("div", { className: "verify-outlier-meta" },
480
- "agreement ",
481
- outlier.agreementRate === null
482
- ? "—"
483
- : `${Math.round(outlier.agreementRate * 100)}%`,
513
+ "avg ",
514
+ outlier.averageScore ?? "-",
515
+ " \u00B7 min/max",
516
+ " ",
517
+ outlier.minScore ?? "-",
518
+ "/",
519
+ outlier.maxScore ?? "-",
520
+ " ",
521
+ "\u00B7 samples ",
522
+ outlier.completedSampleCount,
523
+ "/",
524
+ outlier.gradeSampleCount,
525
+ " \u00B7 execution failures",
484
526
  " ",
485
- "\u00B7 delta ",
486
- outlier.scoreDelta ?? "—",
487
- " \u00B7 samples",
527
+ outlier.executionFailureCount,
528
+ " \u00B7 grading failures",
488
529
  " ",
489
- outlier.sampleSize,
490
- outlier.passFlip ? " · pass/fail flip" : "",
530
+ outlier.gradingFailureCount,
491
531
  outlier.messageRefId
492
532
  ? ` · ref ${outlier.messageRefId}`
493
533
  : ""),
@@ -500,6 +540,18 @@ exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
500
540
  "Open grade run ",
501
541
  runId)))))));
502
542
  })))),
543
+ React.createElement("div", { className: "verify-section" },
544
+ React.createElement("strong", null, "Failure reasons"),
545
+ !metrics || metrics.failureReasons.length === 0
546
+ ? React.createElement(Callout_js_1.default, null, "No failure reasons captured yet.")
547
+ : (React.createElement("ul", { className: "verify-request-list" }, metrics.failureReasons.map((reason) => (React.createElement("li", { key: reason.key, className: "verify-request-row" },
548
+ React.createElement(Badge_js_1.default, { variant: reason.kind === "execution"
549
+ ? "error"
550
+ : "running" }, reason.kind),
551
+ React.createElement("span", null, reason.reason),
552
+ React.createElement("span", { className: "secondary-note" },
553
+ "x",
554
+ reason.count))))))),
503
555
  visibleBatches.length > 0 && (React.createElement("div", { className: "verify-section" },
504
556
  React.createElement("strong", null, "Batch history"),
505
557
  React.createElement("ul", { className: "verify-request-list" }, visibleBatches.slice(0, 15).map((batch, index) => (React.createElement("li", { key: batch.id, className: "verify-request-row" },
@@ -509,10 +561,10 @@ exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
509
561
  React.createElement(Badge_js_1.default, { status: batch.status }, batch.status),
510
562
  React.createElement("button", { type: "button", className: "link-button", onClick: () => setSelectedBatchId(batch.id) }, batch.id),
511
563
  React.createElement("span", { className: "secondary-note" },
512
- batch.completed,
513
- "/",
514
- batch.requested,
515
- " complete"))))))),
564
+ batch.scenarioRuns,
565
+ " runs \u00D7",
566
+ " ",
567
+ batch.graderRepeatsPerScenario))))))),
516
568
  selectedBatch?.requests.length
517
569
  ? (React.createElement("div", { className: "verify-section" },
518
570
  React.createElement("strong", null, "Batch requests"),
@@ -523,6 +575,8 @@ exports.SimulatorVerifyPage = (0, iso_js_1.iso)(`
523
575
  React.createElement(Badge_js_1.default, { status: request.status === "queued"
524
576
  ? "idle"
525
577
  : request.status }, request.status),
578
+ request.scenarioRunId && (React.createElement("span", { className: "secondary-note" }, (0, utils_js_1.scenarioNameFromValue)(request.scenarioRunId) ??
579
+ request.scenarioRunId)),
526
580
  request.runId
527
581
  ? (React.createElement("a", { href: toPrefixedPath((0, workspace_routes_js_1.buildWorkspacePath)("grade", workspaceId, {
528
582
  runId: request.runId,