agent-scenario-loop 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63):
  1. package/README.md +9 -9
  2. package/app/profile-session.ts +98 -4
  3. package/dist/core/agent-summary.d.ts +3 -2
  4. package/dist/core/agent-summary.js +44 -2
  5. package/dist/core/artifact-contract.d.ts +22 -4
  6. package/dist/core/artifact-contract.js +512 -11
  7. package/dist/core/comparison.d.ts +57 -3
  8. package/dist/core/comparison.js +113 -1
  9. package/dist/core/planner.d.ts +32 -1
  10. package/dist/core/planner.js +144 -0
  11. package/dist/core/run-index.d.ts +4 -0
  12. package/dist/core/run-index.js +55 -1
  13. package/dist/core/schema-validator.d.ts +1 -0
  14. package/dist/core/schema-validator.js +1 -0
  15. package/dist/runner/compare-latest.d.ts +8 -4
  16. package/dist/runner/compare-latest.js +24 -5
  17. package/dist/runner/example-android-live.d.ts +10 -1
  18. package/dist/runner/example-android-live.js +55 -0
  19. package/dist/runner/example-ios-live.d.ts +10 -1
  20. package/dist/runner/example-ios-live.js +55 -0
  21. package/dist/runner/ios-simctl.d.ts +5 -0
  22. package/dist/runner/ios-simctl.js +6 -0
  23. package/dist/runner/live-comparison.d.ts +2 -2
  24. package/dist/runner/live-comparison.js +2 -1
  25. package/dist/runner/live-proof-summary.d.ts +5 -4
  26. package/dist/runner/live-proof-summary.js +12 -2
  27. package/dist/runner/live-proof.d.ts +3 -2
  28. package/dist/runner/live-proof.js +9 -2
  29. package/dist/runner/profile-android.d.ts +5 -0
  30. package/dist/runner/profile-android.js +148 -24
  31. package/dist/runner/profile-ios.d.ts +11 -1
  32. package/dist/runner/profile-ios.js +128 -9
  33. package/dist/runner/profile-mobile.d.ts +8 -0
  34. package/dist/runner/profile-mobile.js +267 -28
  35. package/docs/adapters.md +4 -0
  36. package/docs/architecture.md +90 -0
  37. package/docs/authoring.md +5 -1
  38. package/docs/concepts.md +3 -24
  39. package/docs/consumer-rehearsal.md +4 -0
  40. package/docs/contracts.md +30 -100
  41. package/docs/external-adapter-protocol.md +219 -0
  42. package/docs/live-proofs.md +83 -2
  43. package/docs/principles.md +9 -15
  44. package/examples/mobile-app/README.md +12 -0
  45. package/examples/mobile-app/runner-manifests/primary-runner.json +1 -0
  46. package/examples/runners/README.md +1 -0
  47. package/examples/runners/adb-android.json +1 -0
  48. package/examples/runners/agent-device-android.json +1 -0
  49. package/examples/runners/agent-device-ios.json +1 -0
  50. package/examples/runners/argent-android.json +1 -0
  51. package/examples/runners/argent-ios.json +1 -0
  52. package/examples/runners/xcodebuildmcp-ios.json +1 -0
  53. package/package.json +2 -1
  54. package/schemas/causal-run.schema.json +85 -2
  55. package/schemas/comparison.schema.json +130 -2
  56. package/schemas/external-adapter-message.schema.json +693 -0
  57. package/schemas/health.schema.json +72 -0
  58. package/schemas/live-proof-set.schema.json +1 -1
  59. package/schemas/live-proof.schema.json +14 -6
  60. package/schemas/manifest.schema.json +442 -1
  61. package/schemas/runner-capabilities.schema.json +20 -0
  62. package/schemas/scenario.schema.json +16 -0
  63. package/templates/primary-runner.json +1 -0
@@ -11,7 +11,7 @@ type MetricComparison = {
11
11
  baseline: number | boolean | null;
12
12
  current: number | boolean | null;
13
13
  delta: number | null;
14
- status: 'better' | 'worse' | 'unchanged' | 'inconclusive';
14
+ status: 'better' | 'worse' | 'unchanged' | 'inconclusive' | 'low_confidence';
15
15
  notes?: string;
16
16
  };
17
17
  type ComparisonStatus = MetricComparison['status'] | 'mixed';
@@ -25,12 +25,18 @@ type ComparisonRunBasis = {
25
25
  type ComparisonSelectionBasis = {
26
26
  artifactRoot?: string;
27
27
  candidatesInspected?: number;
28
+ cohortHash?: string;
29
+ comparisonLane?: string;
28
30
  scenarioId?: string;
31
+ scenarioHash?: string;
29
32
  selectedRunDir?: string;
30
33
  selectedRunId?: string;
31
34
  skippedCurrentRun?: boolean;
35
+ trustedCohortCandidates?: number;
36
+ trustedComparableCandidates?: number;
32
37
  trustedCandidates?: number;
33
38
  trustedPriorCandidates?: number;
39
+ trustedScenarioContractCandidates?: number;
34
40
  };
35
41
  type ComparisonBasis = {
36
42
  baseline: ComparisonRunBasis;
@@ -38,6 +44,42 @@ type ComparisonBasis = {
38
44
  selection?: ComparisonSelectionBasis;
39
45
  strategy: ComparisonBasisStrategy;
40
46
  };
47
/**
 * Shape of the measurement policy block carried by a comparison artifact.
 */
type MeasurementPolicy = {
    /** Baseline selection mode together with its poisoning-protection settings. */
    baselineSelection: {
        mode: 'explicit' | 'latestTrustedPrior';
        poisoningProtection: {
            requirePassedHealth: boolean;
            requirePassedVerdict: boolean;
            requireMatchingScenarioId: boolean;
            comparisonLane?: string;
            scenarioHash?: string;
            cohortHash?: string;
        };
    };
    /** Sample accounting, reported with the same shape for the baseline and the current run. */
    samples: Record<'baseline' | 'current', {
        validSamples: number;
        warmupSamples: number;
        outliersExcluded: number;
    }>;
    /** Tolerances reported for timing comparisons. */
    tolerance: {
        timing: {
            absoluteMs: number;
            relative: number;
        };
    };
    /** Confidence classification, with an optional human-readable reason. */
    confidence: {
        level: 'single_run' | 'multi_sample' | 'insufficient' | 'low_confidence';
        minValidSamples: number;
        reason?: string;
    };
};
41
83
  type BuildComparisonOptions = {
42
84
  baselineHealth: ComparisonRecord;
43
85
  baselineVerdict: ComparisonRecord;
@@ -103,6 +145,18 @@ declare function buildComparisonBasis({ baselineDir, currentDir, baselineHealth,
103
145
  selection?: ComparisonSelectionBasis;
104
146
  strategy: ComparisonBasisStrategy;
105
147
  }): ComparisonBasis;
148
/**
 * Assembles the `measurementPolicy` section embedded in a comparison artifact.
 *
 * @param options - Baseline and current verdict records, the comparison basis
 * (or `undefined` when none is available), and the per-metric comparison results.
 * @returns The measurement policy for the comparison.
 */
declare function buildMeasurementPolicy(options: {
    baselineVerdict: ComparisonRecord;
    comparisonBasis: ComparisonBasis | undefined;
    currentVerdict: ComparisonRecord;
    metricComparisons: MetricComparison[];
}): MeasurementPolicy;
106
160
  /**
107
161
  * Builds a comparison artifact from two validated run artifact sets.
108
162
  *
@@ -129,5 +183,5 @@ declare function summarizeComparison({ comparisonStatus, missingRequired, metric
129
183
  metricComparisons: MetricComparison[];
130
184
  warnings: string[];
131
185
  }): string;
132
- export { buildComparisonBasis, buildComparisonArtifact, compareBudgetCheck, compareRunDirectories, indexBudgetChecks, readRunArtifacts, resolveComparisonStatus, summarizeComparison, };
133
- export type { BuildComparisonOptions, ComparisonBasis, ComparisonBasisStrategy, CompareRunDirectoriesOptions, ComparisonBudgetCheck, ComparisonRecord, ComparisonStatus, MetricComparison, };
186
+ export { buildComparisonBasis, buildComparisonArtifact, compareBudgetCheck, compareRunDirectories, buildMeasurementPolicy, indexBudgetChecks, readRunArtifacts, resolveComparisonStatus, summarizeComparison, };
187
+ export type { BuildComparisonOptions, ComparisonBasis, ComparisonBasisStrategy, CompareRunDirectoriesOptions, ComparisonBudgetCheck, ComparisonRecord, ComparisonStatus, MeasurementPolicy, MetricComparison, };
@@ -4,6 +4,7 @@ exports.buildComparisonBasis = buildComparisonBasis;
4
4
  exports.buildComparisonArtifact = buildComparisonArtifact;
5
5
  exports.compareBudgetCheck = compareBudgetCheck;
6
6
  exports.compareRunDirectories = compareRunDirectories;
7
+ exports.buildMeasurementPolicy = buildMeasurementPolicy;
7
8
  exports.indexBudgetChecks = indexBudgetChecks;
8
9
  exports.readRunArtifacts = readRunArtifacts;
9
10
  exports.resolveComparisonStatus = resolveComparisonStatus;
@@ -113,6 +114,21 @@ function compareBudgetCheck(baseline, current) {
113
114
  : {}),
114
115
  };
115
116
  }
117
/**
 * Decides whether a timing regression should be downgraded to low confidence.
 *
 * That is the case only when the metric came out `worse`, both budget checks
 * are measured in milliseconds, and both checks still passed their budgets.
 *
 * @param {MetricComparison} metric
 * @param {ComparisonBudgetCheck} baseline
 * @param {ComparisonBudgetCheck} current
 * @returns {boolean}
 */
function isLowConfidenceTimingMovement(metric, baseline, current) {
    if (metric.status !== 'worse') {
        return false;
    }
    if (baseline.unit !== 'ms' || current.unit !== 'ms') {
        return false;
    }
    return baseline.pass === true && current.pass === true;
}
116
132
  /**
117
133
  * Collapses metric-level comparison statuses into the run-level comparison status.
118
134
  *
@@ -123,6 +139,7 @@ function compareBudgetCheck(baseline, current) {
123
139
  function resolveComparisonStatus(metricComparisons, { baselineVerdictStatus, currentVerdictStatus, }) {
124
140
  const hasBetterMetric = metricComparisons.some((metric) => metric.status === 'better');
125
141
  const hasWorseMetric = metricComparisons.some((metric) => metric.status === 'worse');
142
+ const hasLowConfidenceMetric = metricComparisons.some((metric) => metric.status === 'low_confidence');
126
143
  if (hasBetterMetric && hasWorseMetric) {
127
144
  return 'mixed';
128
145
  }
@@ -132,6 +149,9 @@ function resolveComparisonStatus(metricComparisons, { baselineVerdictStatus, cur
132
149
  if (hasBetterMetric) {
133
150
  return 'better';
134
151
  }
152
+ if (hasLowConfidenceMetric) {
153
+ return 'low_confidence';
154
+ }
135
155
  if (metricComparisons.length > 0 && metricComparisons.every((metric) => metric.status === 'unchanged')) {
136
156
  return 'unchanged';
137
157
  }
@@ -169,6 +189,79 @@ function buildComparisonBasis({ baselineDir, currentDir, baselineHealth, baselin
169
189
  ...(selection ? { selection } : {}),
170
190
  };
171
191
  }
192
/**
 * Counts the budget checks whose `actual` value is a number or a boolean.
 *
 * Anything that is not an array yields 0; entries that are not objects are ignored.
 *
 * @param {unknown} checks
 * @returns {number}
 */
function countValidBudgetSamples(checks) {
    if (!Array.isArray(checks)) {
        return 0;
    }
    let validCount = 0;
    for (const entry of checks) {
        if (!entry || typeof entry !== 'object') {
            continue;
        }
        const actualKind = typeof entry.actual;
        if (actualKind === 'number' || actualKind === 'boolean') {
            validCount += 1;
        }
    }
    return validCount;
}
207
/**
 * Assembles the measurement policy block of a comparison artifact.
 *
 * The block records how the baseline was selected, how many budget checks in
 * each verdict carry a usable value, the timing tolerances, and a confidence level.
 *
 * @param {{baselineVerdict: Record<string, unknown>, comparisonBasis?: ComparisonBasis, currentVerdict: Record<string, unknown>, metricComparisons: MetricComparison[]}} options
 * @returns {MeasurementPolicy}
 */
function buildMeasurementPolicy({ baselineVerdict, comparisonBasis, currentVerdict, metricComparisons, }) {
    const usesLatestTrustedPrior = comparisonBasis?.strategy === 'latest_trusted_prior';
    const selection = comparisonBasis?.selection;
    // NOTE(review): the confidence level is derived from the number of compared
    // metrics, so one run with several metrics reports `multi_sample` — confirm
    // that this is the intended meaning of the level.
    const comparedMetricCount = metricComparisons.length;
    const hasLowConfidenceMovement = metricComparisons.some((metric) => metric.status === 'low_confidence');
    let confidenceLevel;
    if (hasLowConfidenceMovement) {
        confidenceLevel = 'low_confidence';
    }
    else if (comparedMetricCount === 0) {
        confidenceLevel = 'insufficient';
    }
    else if (comparedMetricCount === 1) {
        confidenceLevel = 'single_run';
    }
    else {
        confidenceLevel = 'multi_sample';
    }
    // Selection identifiers are copied only when the selection carries them as strings.
    const selectionIdentifiers = Object.fromEntries(['comparisonLane', 'scenarioHash', 'cohortHash']
        .filter((key) => typeof selection?.[key] === 'string')
        .map((key) => [key, selection[key]]));
    // Warm-up and outlier counts are always reported as zero here.
    const describeSamples = (verdict) => ({
        validSamples: countValidBudgetSamples(verdict.budgetChecks),
        warmupSamples: 0,
        outliersExcluded: 0,
    });
    const lowConfidenceReason = hasLowConfidenceMovement
        ? { reason: 'Single-run timing movement stayed within passing budgets; repeat or multi-sample proof is required before treating it as a regression.' }
        : {};
    return {
        baselineSelection: {
            mode: usesLatestTrustedPrior ? 'latestTrustedPrior' : 'explicit',
            poisoningProtection: {
                requirePassedHealth: true,
                requirePassedVerdict: usesLatestTrustedPrior,
                requireMatchingScenarioId: true,
                ...selectionIdentifiers,
            },
        },
        samples: {
            baseline: describeSamples(baselineVerdict),
            current: describeSamples(currentVerdict),
        },
        tolerance: {
            timing: {
                absoluteMs: MIN_MS_COMPARISON_TOLERANCE,
                relative: RELATIVE_MS_COMPARISON_TOLERANCE,
            },
        },
        confidence: {
            level: confidenceLevel,
            minValidSamples: 1,
            ...lowConfidenceReason,
        },
    };
}
172
265
  /**
173
266
  * Builds a comparison artifact from two validated run artifact sets.
174
267
  *
@@ -202,7 +295,17 @@ function buildComparisonArtifact({ baselineHealth, baselineVerdict, comparisonBa
202
295
  warnings.push(`No baseline budget check matched ${currentCheck.name}.`);
203
296
  continue;
204
297
  }
205
- metricComparisons.push(compareBudgetCheck(baselineCheck, currentCheck));
298
+ const metricComparison = compareBudgetCheck(baselineCheck, currentCheck);
299
+ if (comparisonBasis?.strategy === 'latest_trusted_prior' &&
300
+ isLowConfidenceTimingMovement(metricComparison, baselineCheck, currentCheck)) {
301
+ metricComparisons.push({
302
+ ...metricComparison,
303
+ status: 'low_confidence',
304
+ notes: 'Single-run timing movement stayed within passing budgets; repeat or multi-sample proof is required before treating it as a regression.',
305
+ });
306
+ continue;
307
+ }
308
+ metricComparisons.push(metricComparison);
206
309
  }
207
310
  if (metricComparisons.length === 0) {
208
311
  warnings.push('No comparable budget checks were available.');
@@ -228,6 +331,12 @@ function buildComparisonArtifact({ baselineHealth, baselineVerdict, comparisonBa
228
331
  healthStatus: canCompare ? 'passed' : 'failed',
229
332
  verdictStatus: typeof currentVerdict.verdictStatus === 'string' ? currentVerdict.verdictStatus : 'inconclusive',
230
333
  ...(comparisonBasis ? { comparisonBasis } : {}),
334
+ measurementPolicy: buildMeasurementPolicy({
335
+ baselineVerdict,
336
+ comparisonBasis,
337
+ currentVerdict,
338
+ metricComparisons,
339
+ }),
231
340
  ...(metricComparisons.length > 0 ? { metricComparisons } : {}),
232
341
  evidence: {
233
342
  missingRequired,
@@ -287,6 +396,9 @@ function summarizeComparison({ comparisonStatus, missingRequired, metricComparis
287
396
  if (comparisonStatus === 'mixed') {
288
397
  return 'Current run has mixed metric movement against the explicit baseline.';
289
398
  }
399
+ if (comparisonStatus === 'low_confidence') {
400
+ return 'Current run has low-confidence timing movement against the baseline; repeat or multi-sample proof is required before treating it as a regression.';
401
+ }
290
402
  if (comparisonStatus === 'unchanged') {
291
403
  return 'Current run matched the explicit baseline.';
292
404
  }
@@ -8,10 +8,18 @@ type CompatibilityResult = {
8
8
  compatible: boolean;
9
9
  errors: PlannerIssue[];
10
10
  warnings: PlannerIssue[];
11
+ downgradePolicy: {
12
+ mode: string;
13
+ allowedSubstitutions: Array<Record<string, unknown>>;
14
+ substitutions: Array<Record<string, unknown>>;
15
+ unsupported: Array<Record<string, unknown>>;
16
+ warnings: Array<Record<string, unknown>>;
17
+ };
11
18
  matched: {
12
19
  platforms: string[];
13
20
  capabilities: string[];
14
21
  driverActions: string[];
22
+ uiContexts: string[];
15
23
  artifacts: string[];
16
24
  evidenceProviders: string[];
17
25
  };
@@ -22,6 +30,7 @@ type ScenarioStep = ManifestRecord & {
22
30
  id?: unknown;
23
31
  required?: unknown;
24
32
  selector?: unknown;
33
+ uiContext?: unknown;
25
34
  };
26
35
  type ScenarioManifest = ManifestRecord & {
27
36
  adapterOptions?: unknown;
@@ -44,6 +53,7 @@ type RunnerManifest = ManifestRecord & {
44
53
  platforms?: unknown[];
45
54
  capabilities?: unknown[];
46
55
  driverActions?: unknown[];
56
+ uiContexts?: unknown[];
47
57
  artifactOutputs?: unknown[];
48
58
  };
49
59
  /**
@@ -72,6 +82,27 @@ declare function collectProvidedDriverActions({ runner, evidenceProviders, effec
72
82
  evidenceProviders: RunnerManifest[];
73
83
  effectivePlatforms: string[];
74
84
  }): string[];
85
/**
 * Lists the UI/system contexts declared by the primary runner together with
 * those declared by the evidence providers that are active for the given platforms.
 *
 * @param options - Primary runner manifest, candidate evidence providers, and
 * the effective platforms used to decide which providers are active.
 * @returns The collected context names.
 */
declare function collectProvidedUiContexts(options: {
    runner: RunnerManifest;
    evidenceProviders: RunnerManifest[];
    effectivePlatforms: string[];
}): string[];
96
/**
 * Gathers the UI/system contexts used by a scenario's steps, split into the
 * contexts of required steps and the contexts of optional steps.
 *
 * @param scenario - Scenario manifest whose steps are inspected.
 * @returns Context names grouped under `required` and `optional`.
 */
declare function collectScenarioUiContexts(scenario: ScenarioManifest): Record<'required' | 'optional', string[]>;
75
106
  /**
76
107
  * Collects driver operations required by scenario steps.
77
108
  *
@@ -128,5 +159,5 @@ declare function buildUnevaluatedVerdict({ scenario, runId, health, }: {
128
159
  runId?: string;
129
160
  health: ManifestRecord;
130
161
  }): ManifestRecord;
131
- export { buildCompatibilityHealth, buildUnevaluatedVerdict, collectProvidedDriverActions, collectScenarioDriverActions, evaluateRunnerCompatibility, intersection, uniqueSorted, validateScenarioAdapterOptions, };
162
+ export { buildCompatibilityHealth, buildUnevaluatedVerdict, collectProvidedDriverActions, collectProvidedUiContexts, collectScenarioDriverActions, collectScenarioUiContexts, evaluateRunnerCompatibility, intersection, uniqueSorted, validateScenarioAdapterOptions, };
132
163
  export type { CompatibilityResult, ManifestRecord, PlannerIssue, RunnerManifest, ScenarioManifest, };
@@ -3,11 +3,21 @@ Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.buildCompatibilityHealth = buildCompatibilityHealth;
4
4
  exports.buildUnevaluatedVerdict = buildUnevaluatedVerdict;
5
5
  exports.collectProvidedDriverActions = collectProvidedDriverActions;
6
+ exports.collectProvidedUiContexts = collectProvidedUiContexts;
6
7
  exports.collectScenarioDriverActions = collectScenarioDriverActions;
8
+ exports.collectScenarioUiContexts = collectScenarioUiContexts;
7
9
  exports.evaluateRunnerCompatibility = evaluateRunnerCompatibility;
8
10
  exports.intersection = intersection;
9
11
  exports.uniqueSorted = uniqueSorted;
10
12
  exports.validateScenarioAdapterOptions = validateScenarioAdapterOptions;
13
/**
 * Driver actions that imply the `app` UI context when a scenario step does not
 * declare a `uiContext` of its own.
 */
const UI_DRIVER_ACTIONS = new Set(['tap', 'scroll', 'assertVisible', 'inspectTree', 'screenshot', 'record']);
11
21
  /**
12
22
  * Returns `value` when it is already an array; otherwise returns an empty array.
13
23
  *
@@ -64,6 +74,67 @@ function createIssue(code, message, metadata = {}) {
64
74
  ...metadata,
65
75
  };
66
76
  }
77
/**
 * Maps a planner issue onto a capability policy entry.
 *
 * The issue is inspected for a string-valued `capability`, `driverAction`,
 * `uiContext`, or `artifact` field, in that order; the first match decides the
 * entry's `kind` and `name`. Issues carrying none of these fields yield `null`.
 *
 * @param {Record<string, unknown>} issue
 * @param {'unsupported' | 'warning'} status
 * @returns {Record<string, unknown> | null}
 */
function issueToCapabilityPolicyEntry(issue, status) {
    // Each entry kind is named after the issue field it is read from.
    const kindsByPriority = ['capability', 'driverAction', 'uiContext', 'artifact'];
    for (const kind of kindsByPriority) {
        const name = issue[kind];
        if (typeof name === 'string') {
            return { kind, name, status, code: issue.code };
        }
    }
    return null;
}
119
/**
 * Derives the no-silent-downgrade policy artifact from planner errors and warnings.
 *
 * Errors become `unsupported` entries and warnings become `warning` entries;
 * issues that do not map to a policy entry are left out. Both substitution
 * lists are always empty.
 *
 * @param {{errors: Record<string, unknown>[], warnings: Record<string, unknown>[]}} options
 * @returns {Record<string, unknown>}
 */
function buildDowngradePolicy({ errors, warnings, }) {
    const toPolicyEntries = (issues, status) => issues.flatMap((issue) => {
        const entry = issueToCapabilityPolicyEntry(issue, status);
        return entry === null ? [] : [entry];
    });
    return {
        mode: 'no-silent-downgrade',
        allowedSubstitutions: [],
        substitutions: [],
        unsupported: toPolicyEntries(errors, 'unsupported'),
        warnings: toPolicyEntries(warnings, 'warning'),
    };
}
67
138
  /**
68
139
  * Returns `value` when it is a plain object; otherwise returns an empty object.
69
140
  *
@@ -253,6 +324,53 @@ function collectProvidedDriverActions({ runner, evidenceProviders, effectivePlat
253
324
  ...activeProviders.flatMap((provider) => asArray(provider?.driverActions)),
254
325
  ]);
255
326
  }
327
/**
 * Gathers the UI/system contexts declared by the primary runner and by every
 * evidence provider that is active for the effective platforms.
 *
 * @param {{runner: Record<string, unknown>, evidenceProviders: Record<string, unknown>[], effectivePlatforms: string[]}} options
 * @returns {string[]}
 */
function collectProvidedUiContexts({ runner, evidenceProviders, effectivePlatforms, }) {
    const isActive = (provider) => isProviderActiveForPlatforms(provider, effectivePlatforms);
    const providerContexts = evidenceProviders
        .filter(isActive)
        .flatMap((provider) => asArray(provider?.uiContexts));
    const runnerContexts = asArray(runner?.uiContexts);
    return uniqueSorted([...runnerContexts, ...providerContexts]);
}
340
/**
 * Gathers the UI/system contexts that a scenario's steps rely on.
 *
 * A step's context is its string `uiContext` when present; otherwise it is
 * `app` when the step's `driverAction` is one of `UI_DRIVER_ACTIONS`. Steps
 * that resolve to no context (including an empty-string `uiContext`) are
 * skipped. Only steps with `required === false` count as optional, so the same
 * context can appear in both lists when required and optional steps share it.
 *
 * @param {Record<string, unknown>} scenario
 * @returns {{required: string[], optional: string[]}}
 */
function collectScenarioUiContexts(scenario) {
    const contexts = { required: [], optional: [] };
    const steps = Array.isArray(scenario.steps) ? scenario.steps : [];
    for (const step of steps) {
        if (!step || typeof step !== 'object') {
            continue;
        }
        let uiContext = null;
        if (typeof step.uiContext === 'string') {
            uiContext = step.uiContext;
        }
        else if (typeof step.driverAction === 'string' && UI_DRIVER_ACTIONS.has(step.driverAction)) {
            uiContext = 'app';
        }
        if (!uiContext) {
            continue;
        }
        const bucket = step.required === false ? contexts.optional : contexts.required;
        bucket.push(uiContext);
    }
    return {
        required: uniqueSorted(contexts.required),
        optional: uniqueSorted(contexts.optional),
    };
}
256
374
  /**
257
375
  * Collects driver operations required by scenario steps.
258
376
  *
@@ -653,10 +771,12 @@ function evaluateRunnerCompatibility({ scenario, runner, evidenceProviders = [],
653
771
  compatible: false,
654
772
  errors,
655
773
  warnings,
774
+ downgradePolicy: buildDowngradePolicy({ errors, warnings }),
656
775
  matched: {
657
776
  platforms: [],
658
777
  capabilities: [],
659
778
  driverActions: [],
779
+ uiContexts: [],
660
780
  artifacts: [],
661
781
  evidenceProviders: [],
662
782
  },
@@ -705,6 +825,26 @@ function evaluateRunnerCompatibility({ scenario, runner, evidenceProviders = [],
705
825
  driverAction,
706
826
  }));
707
827
  }
828
+ const providedUiContexts = collectProvidedUiContexts({
829
+ runner: primaryRunner,
830
+ evidenceProviders,
831
+ effectivePlatforms,
832
+ });
833
+ const scenarioUiContexts = collectScenarioUiContexts(scenario);
834
+ for (const uiContext of includesAll(providedUiContexts, scenarioUiContexts.required)) {
835
+ errors.push(createIssue('missing_required_ui_context', `No active runner or provider declares required UI context \`${uiContext}\`.`, {
836
+ runnerId: getRunnerId(primaryRunner),
837
+ scenarioId: getScenarioId(scenario),
838
+ uiContext,
839
+ }));
840
+ }
841
+ for (const uiContext of includesAll(providedUiContexts, scenarioUiContexts.optional)) {
842
+ warnings.push(createIssue('missing_optional_ui_context', `No active runner or provider declares optional UI context \`${uiContext}\`.`, {
843
+ runnerId: getRunnerId(primaryRunner),
844
+ scenarioId: getScenarioId(scenario),
845
+ uiContext,
846
+ }));
847
+ }
708
848
  const { activeProviders, artifacts } = collectProvidedArtifacts({
709
849
  runner: primaryRunner,
710
850
  evidenceProviders,
@@ -728,10 +868,12 @@ function evaluateRunnerCompatibility({ scenario, runner, evidenceProviders = [],
728
868
  compatible: errors.length === 0,
729
869
  errors,
730
870
  warnings,
871
+ downgradePolicy: buildDowngradePolicy({ errors, warnings }),
731
872
  matched: {
732
873
  platforms: effectivePlatforms,
733
874
  capabilities: providedCapabilities,
734
875
  driverActions: providedDriverActions,
876
+ uiContexts: providedUiContexts,
735
877
  artifacts,
736
878
  evidenceProviders: activeProviders.map((provider) => getRunnerId(provider)),
737
879
  },
@@ -769,10 +911,12 @@ function buildCompatibilityHealth({ scenario, runId, compatibility, }) {
769
911
  healthStatus: failedChecks.length > 0 ? 'failed' : 'passed',
770
912
  checks,
771
913
  ...(warningChecks.length > 0 ? { warnings: warningChecks } : {}),
914
+ downgradePolicy: compatibility.downgradePolicy ?? buildDowngradePolicy({ errors, warnings }),
772
915
  matched: {
773
916
  platforms: uniqueSorted(asArray(compatibility?.matched?.platforms)),
774
917
  capabilities: uniqueSorted(asArray(compatibility?.matched?.capabilities)),
775
918
  driverActions: uniqueSorted(asArray(compatibility?.matched?.driverActions)),
919
+ uiContexts: uniqueSorted(asArray(compatibility?.matched?.uiContexts)),
776
920
  artifacts: uniqueSorted(asArray(compatibility?.matched?.artifacts)),
777
921
  evidenceProviders: uniqueSorted(asArray(compatibility?.matched?.evidenceProviders)),
778
922
  },
@@ -1,10 +1,14 @@
1
1
  type RunIndexEntry = {
2
2
  runDir: string;
3
3
  scenarioId: string;
4
+ attemptId?: string;
5
+ attemptNumber?: number;
4
6
  scenarioHash?: string;
7
+ cohortHash?: string;
5
8
  runId: string;
6
9
  healthStatus: string;
7
10
  trusted: boolean;
11
+ trustReason: string;
8
12
  durationMs?: number;
9
13
  endedAt?: string;
10
14
  flowId?: string;
@@ -17,6 +17,51 @@ const { ARTIFACT_FILENAMES, PROFILE_ARTIFACT_FILENAMES } = require('./artifact-l
17
17
  function readJson(filePath) {
18
18
  return JSON.parse(fs.readFileSync(filePath, 'utf8'));
19
19
  }
20
/**
 * Tells whether a value is a non-null object that is not an array.
 *
 * @param {unknown} value
 * @returns {value is Record<string, unknown>}
 */
function isRecord(value) {
    return value !== null && typeof value === 'object' && !Array.isArray(value);
}
29
/**
 * Produces a stable reason code saying whether a run may seed latest-trusted comparisons.
 *
 * Health and verdict must both be `passed`. A manifest without an `attempt`
 * record yields `trusted_legacy_without_attempt`. Otherwise the attempt is
 * checked against an ordered list of disqualifiers and the first one that
 * applies is returned; when none applies the run is `trusted`.
 *
 * @param {{healthStatus: string, verdictStatus?: string, manifest: Record<string, unknown>}} options
 * @returns {string}
 */
function resolveTrustReason({ healthStatus, manifest, verdictStatus, }) {
    if (healthStatus !== 'passed') {
        return 'health_not_passed';
    }
    if (verdictStatus !== 'passed') {
        return 'verdict_not_passed';
    }
    const attempt = manifest.attempt;
    if (!isRecord(attempt)) {
        return 'trusted_legacy_without_attempt';
    }
    const cleanupStatus = isRecord(attempt.cleanup) ? attempt.cleanup.status : undefined;
    // NOTE(review): only `partialArtifacts.valid === true` blocks trust; a record
    // with `valid: false` does not — confirm that is intended.
    const hasValidPartialArtifacts = isRecord(attempt.partialArtifacts) && attempt.partialArtifacts.valid === true;
    // Order matters: the first applicable disqualifier is the reported reason.
    const disqualifiers = [
        ['attempt_not_passed', attempt.status !== 'passed' || attempt.terminalState !== 'passed'],
        ['retry_attempt_not_baseline_trusted', typeof attempt.attemptNumber === 'number' && attempt.attemptNumber !== 1],
        ['retry_lineage_not_baseline_trusted', typeof attempt.retryOfAttemptId === 'string' || typeof attempt.retryReason === 'string'],
        ['cleanup_not_complete', cleanupStatus === 'failed' || cleanupStatus === 'partial'],
        ['partial_artifacts_not_baseline_trusted', hasValidPartialArtifacts],
    ];
    const firstMatch = disqualifiers.find(([, applies]) => applies);
    return firstMatch ? firstMatch[0] : 'trusted';
}
20
65
  /**
21
66
  * Returns whether a directory contains the minimum run artifact pair.
22
67
  *
@@ -79,13 +124,22 @@ function readRunIndexEntry(runDir) {
79
124
  : path.basename(runDir);
80
125
  const healthStatus = typeof health.healthStatus === 'string' ? health.healthStatus : 'unknown';
81
126
  const verdictStatus = typeof verdict.verdictStatus === 'string' ? verdict.verdictStatus : undefined;
127
+ const provenance = isRecord(manifest.provenance) ? manifest.provenance : {};
128
+ const attempt = isRecord(manifest.attempt) ? manifest.attempt : null;
129
+ const trustReason = resolveTrustReason({ healthStatus, manifest, verdictStatus });
82
130
  return {
83
131
  runDir,
84
132
  scenarioId,
85
133
  runId,
134
+ ...(typeof attempt?.attemptId === 'string' ? { attemptId: attempt.attemptId } : {}),
135
+ ...(typeof attempt?.attemptNumber === 'number' ? { attemptNumber: attempt.attemptNumber } : {}),
86
136
  ...(typeof manifest.scenarioHash === 'string' ? { scenarioHash: manifest.scenarioHash } : {}),
137
+ ...(typeof provenance.cohortHash === 'string'
138
+ ? { cohortHash: provenance.cohortHash }
139
+ : {}),
87
140
  healthStatus,
88
- trusted: healthStatus === 'passed' && verdictStatus === 'passed',
141
+ trusted: trustReason === 'trusted' || trustReason === 'trusted_legacy_without_attempt',
142
+ trustReason,
89
143
  ...(typeof manifest.durationMs === 'number' ? { durationMs: manifest.durationMs } : {}),
90
144
  ...(typeof manifest.endedAt === 'string' ? { endedAt: manifest.endedAt } : {}),
91
145
  ...(typeof health.flowId === 'string' ? { flowId: health.flowId } : {}),
@@ -46,6 +46,7 @@ declare const SCHEMAS: {
46
46
  budgetVerdict: JsonSchema;
47
47
  causalRun: JsonSchema;
48
48
  comparison: JsonSchema;
49
+ externalAdapterMessage: JsonSchema;
49
50
  health: JsonSchema;
50
51
  liveProof: JsonSchema;
51
52
  liveProofSet: JsonSchema;
@@ -49,6 +49,7 @@ const SCHEMAS = {
49
49
  budgetVerdict: loadSchema('budget-verdict.schema.json'),
50
50
  causalRun: loadSchema('causal-run.schema.json'),
51
51
  comparison: loadSchema('comparison.schema.json'),
52
+ externalAdapterMessage: loadSchema('external-adapter-message.schema.json'),
52
53
  health: loadSchema('health.schema.json'),
53
54
  liveProof: loadSchema('live-proof.schema.json'),
54
55
  liveProofSet: loadSchema('live-proof-set.schema.json'),