agent-scenario-loop 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. package/README.md +9 -9
  2. package/app/profile-session.ts +98 -4
  3. package/dist/core/agent-summary.d.ts +3 -2
  4. package/dist/core/agent-summary.js +44 -2
  5. package/dist/core/artifact-contract.d.ts +22 -4
  6. package/dist/core/artifact-contract.js +512 -11
  7. package/dist/core/comparison.d.ts +57 -3
  8. package/dist/core/comparison.js +113 -1
  9. package/dist/core/planner.d.ts +32 -1
  10. package/dist/core/planner.js +144 -0
  11. package/dist/core/run-index.d.ts +4 -0
  12. package/dist/core/run-index.js +55 -1
  13. package/dist/core/schema-validator.d.ts +1 -0
  14. package/dist/core/schema-validator.js +1 -0
  15. package/dist/runner/compare-latest.d.ts +8 -4
  16. package/dist/runner/compare-latest.js +24 -5
  17. package/dist/runner/example-android-live.d.ts +10 -1
  18. package/dist/runner/example-android-live.js +55 -0
  19. package/dist/runner/example-ios-live.d.ts +10 -1
  20. package/dist/runner/example-ios-live.js +55 -0
  21. package/dist/runner/ios-simctl.d.ts +5 -0
  22. package/dist/runner/ios-simctl.js +6 -0
  23. package/dist/runner/live-comparison.d.ts +2 -2
  24. package/dist/runner/live-comparison.js +2 -1
  25. package/dist/runner/live-proof-summary.d.ts +5 -4
  26. package/dist/runner/live-proof-summary.js +12 -2
  27. package/dist/runner/live-proof.d.ts +3 -2
  28. package/dist/runner/live-proof.js +9 -2
  29. package/dist/runner/profile-android.d.ts +5 -0
  30. package/dist/runner/profile-android.js +148 -24
  31. package/dist/runner/profile-ios.d.ts +11 -1
  32. package/dist/runner/profile-ios.js +128 -9
  33. package/dist/runner/profile-mobile.d.ts +8 -0
  34. package/dist/runner/profile-mobile.js +267 -28
  35. package/docs/adapters.md +4 -0
  36. package/docs/architecture.md +90 -0
  37. package/docs/authoring.md +5 -1
  38. package/docs/concepts.md +3 -24
  39. package/docs/consumer-rehearsal.md +4 -0
  40. package/docs/contracts.md +30 -100
  41. package/docs/external-adapter-protocol.md +219 -0
  42. package/docs/live-proofs.md +83 -2
  43. package/docs/principles.md +9 -15
  44. package/examples/mobile-app/README.md +12 -0
  45. package/examples/mobile-app/runner-manifests/primary-runner.json +1 -0
  46. package/examples/runners/README.md +1 -0
  47. package/examples/runners/adb-android.json +1 -0
  48. package/examples/runners/agent-device-android.json +1 -0
  49. package/examples/runners/agent-device-ios.json +1 -0
  50. package/examples/runners/argent-android.json +1 -0
  51. package/examples/runners/argent-ios.json +1 -0
  52. package/examples/runners/xcodebuildmcp-ios.json +1 -0
  53. package/package.json +2 -1
  54. package/schemas/causal-run.schema.json +85 -2
  55. package/schemas/comparison.schema.json +130 -2
  56. package/schemas/external-adapter-message.schema.json +693 -0
  57. package/schemas/health.schema.json +72 -0
  58. package/schemas/live-proof-set.schema.json +1 -1
  59. package/schemas/live-proof.schema.json +14 -6
  60. package/schemas/manifest.schema.json +442 -1
  61. package/schemas/runner-capabilities.schema.json +20 -0
  62. package/schemas/scenario.schema.json +16 -0
  63. package/templates/primary-runner.json +1 -0
@@ -14,6 +14,7 @@ type CompareLatestResult = {
14
14
  type LatestTrustedSelection = {
15
15
  artifactRoot: string;
16
16
  candidatesInspected: number;
17
+ cohortHash?: string;
17
18
  scenarioId: string;
18
19
  selectedRunDir: string;
19
20
  selectedRunId: string;
@@ -21,6 +22,7 @@ type LatestTrustedSelection = {
21
22
  comparisonLane?: string;
22
23
  scenarioHash?: string;
23
24
  trustedCandidates: number;
25
+ trustedCohortCandidates?: number;
24
26
  trustedComparableCandidates?: number;
25
27
  trustedScenarioContractCandidates?: number;
26
28
  trustedPriorCandidates: number;
@@ -58,10 +60,11 @@ declare function isComparableScenarioContract(entry: RunIndexEntry, scenarioHash
58
60
  /**
59
61
  * Finds the newest trusted run for a scenario while excluding the current run directory.
60
62
  *
61
- * @param {{index: RunIndex, scenarioId: string, currentDir: string, comparisonLane?: string, scenarioHash?: string}} options
63
+ * @param {{index: RunIndex, scenarioId: string, currentDir: string, cohortHash?: string, comparisonLane?: string, scenarioHash?: string}} options
62
64
  * @returns {RunIndexEntry | null}
63
65
  */
64
- declare function findLatestTrustedPriorRun({ comparisonLane, index, scenarioHash, scenarioId, currentDir, }: {
66
+ declare function findLatestTrustedPriorRun({ cohortHash, comparisonLane, index, scenarioHash, scenarioId, currentDir, }: {
67
+ cohortHash?: string;
65
68
  comparisonLane?: string;
66
69
  index: RunIndex;
67
70
  scenarioHash?: string;
@@ -71,11 +74,12 @@ declare function findLatestTrustedPriorRun({ comparisonLane, index, scenarioHash
71
74
  /**
72
75
  * Builds stable provenance for the latest-trusted baseline selection.
73
76
  *
74
- * @param {{baseline: RunIndexEntry, comparisonLane?: string, currentDir: string, index: RunIndex, rootDir: string, scenarioHash?: string, scenarioId: string}} options
77
+ * @param {{baseline: RunIndexEntry, cohortHash?: string, comparisonLane?: string, currentDir: string, index: RunIndex, rootDir: string, scenarioHash?: string, scenarioId: string}} options
75
78
  * @returns {LatestTrustedSelection}
76
79
  */
77
- declare function buildLatestTrustedSelection({ baseline, comparisonLane, currentDir, index, rootDir, scenarioHash, scenarioId, }: {
80
+ declare function buildLatestTrustedSelection({ baseline, cohortHash, comparisonLane, currentDir, index, rootDir, scenarioHash, scenarioId, }: {
78
81
  baseline: RunIndexEntry;
82
+ cohortHash?: string;
79
83
  comparisonLane?: string;
80
84
  currentDir: string;
81
85
  index: RunIndex;
@@ -79,31 +79,44 @@ function isComparableLane(entry, comparisonLane) {
79
79
  function isComparableScenarioContract(entry, scenarioHash) {
80
80
  return scenarioHash ? entry.scenarioHash === scenarioHash : true;
81
81
  }
82
+ /**
83
+ * Returns whether a historical run belongs to the requested provenance cohort.
84
+ * Runs without a current cohort hash keep legacy behavior for old artifacts.
85
+ *
86
+ * @param {RunIndexEntry} entry
87
+ * @param {string | undefined} cohortHash
88
+ * @returns {boolean}
89
+ */
90
+ function isComparableCohort(entry, cohortHash) {
91
+ return cohortHash ? entry.cohortHash === cohortHash : true;
92
+ }
82
93
  /**
83
94
  * Finds the newest trusted run for a scenario while excluding the current run directory.
84
95
  *
85
- * @param {{index: RunIndex, scenarioId: string, currentDir: string, comparisonLane?: string, scenarioHash?: string}} options
96
+ * @param {{index: RunIndex, scenarioId: string, currentDir: string, cohortHash?: string, comparisonLane?: string, scenarioHash?: string}} options
86
97
  * @returns {RunIndexEntry | null}
87
98
  */
88
- function findLatestTrustedPriorRun({ comparisonLane, index, scenarioHash, scenarioId, currentDir, }) {
99
+ function findLatestTrustedPriorRun({ cohortHash, comparisonLane, index, scenarioHash, scenarioId, currentDir, }) {
89
100
  const resolvedCurrentDir = path.resolve(currentDir);
90
101
  return index.trusted.find((entry) => (entry.scenarioId === scenarioId &&
91
102
  isComparableLane(entry, comparisonLane) &&
92
103
  isComparableScenarioContract(entry, scenarioHash) &&
104
+ isComparableCohort(entry, cohortHash) &&
93
105
  path.resolve(entry.runDir) !== resolvedCurrentDir)) ?? null;
94
106
  }
95
107
  /**
96
108
  * Builds stable provenance for the latest-trusted baseline selection.
97
109
  *
98
- * @param {{baseline: RunIndexEntry, comparisonLane?: string, currentDir: string, index: RunIndex, rootDir: string, scenarioHash?: string, scenarioId: string}} options
110
+ * @param {{baseline: RunIndexEntry, cohortHash?: string, comparisonLane?: string, currentDir: string, index: RunIndex, rootDir: string, scenarioHash?: string, scenarioId: string}} options
99
111
  * @returns {LatestTrustedSelection}
100
112
  */
101
- function buildLatestTrustedSelection({ baseline, comparisonLane, currentDir, index, rootDir, scenarioHash, scenarioId, }) {
113
+ function buildLatestTrustedSelection({ baseline, cohortHash, comparisonLane, currentDir, index, rootDir, scenarioHash, scenarioId, }) {
102
114
  const resolvedCurrentDir = path.resolve(currentDir);
103
115
  const trustedPriorCandidates = index.trusted.filter((entry) => (entry.scenarioId === scenarioId &&
104
116
  path.resolve(entry.runDir) !== resolvedCurrentDir));
105
117
  const trustedComparableCandidates = trustedPriorCandidates.filter((entry) => (isComparableLane(entry, comparisonLane)));
106
118
  const trustedScenarioContractCandidates = trustedComparableCandidates.filter((entry) => (isComparableScenarioContract(entry, scenarioHash)));
119
+ const trustedCohortCandidates = trustedScenarioContractCandidates.filter((entry) => (isComparableCohort(entry, cohortHash)));
107
120
  return {
108
121
  artifactRoot: rootDir,
109
122
  candidatesInspected: index.entries.length,
@@ -113,9 +126,11 @@ function buildLatestTrustedSelection({ baseline, comparisonLane, currentDir, ind
113
126
  skippedCurrentRun: index.entries.some((entry) => path.resolve(entry.runDir) === resolvedCurrentDir),
114
127
  ...(comparisonLane ? { comparisonLane } : {}),
115
128
  ...(scenarioHash ? { scenarioHash } : {}),
129
+ ...(cohortHash ? { cohortHash } : {}),
116
130
  trustedCandidates: index.trusted.length,
117
131
  trustedComparableCandidates: trustedComparableCandidates.length,
118
132
  ...(scenarioHash ? { trustedScenarioContractCandidates: trustedScenarioContractCandidates.length } : {}),
133
+ ...(cohortHash ? { trustedCohortCandidates: trustedCohortCandidates.length } : {}),
119
134
  trustedPriorCandidates: trustedPriorCandidates.length,
120
135
  };
121
136
  }
@@ -132,8 +147,10 @@ function compareLatestTrustedRun({ comparisonLane, currentDir, rootDir, scenario
132
147
  const currentEntry = readRunIndexEntry(resolvedCurrentDir);
133
148
  const resolvedComparisonLane = comparisonLane ?? currentEntry.comparisonLane;
134
149
  const scenarioHash = currentEntry.scenarioHash;
150
+ const cohortHash = currentEntry.cohortHash;
135
151
  const index = buildRunIndex({ rootDir: resolvedRootDir, scenarioId });
136
152
  const baseline = findLatestTrustedPriorRun({
153
+ ...(cohortHash ? { cohortHash } : {}),
137
154
  ...(resolvedComparisonLane ? { comparisonLane: resolvedComparisonLane } : {}),
138
155
  ...(scenarioHash ? { scenarioHash } : {}),
139
156
  index,
@@ -145,7 +162,8 @@ function compareLatestTrustedRun({ comparisonLane, currentDir, rootDir, scenario
145
162
  ? ` in comparison lane '${resolvedComparisonLane}'`
146
163
  : ' without a comparison lane';
147
164
  const scenarioHashSuffix = scenarioHash ? ` and scenario hash '${scenarioHash}'` : '';
148
- throw new Error(`No trusted prior run found for scenario '${scenarioId}'${laneSuffix}${scenarioHashSuffix} under ${resolvedRootDir}; inspected ${index.entries.length} candidate run(s), ${index.trusted.length} trusted.`);
165
+ const cohortHashSuffix = cohortHash ? ` and cohort hash '${cohortHash}'` : '';
166
+ throw new Error(`No trusted prior run found for scenario '${scenarioId}'${laneSuffix}${scenarioHashSuffix}${cohortHashSuffix} under ${resolvedRootDir}; inspected ${index.entries.length} candidate run(s), ${index.trusted.length} trusted.`);
149
167
  }
150
168
  return {
151
169
  baselineDir: baseline.runDir,
@@ -154,6 +172,7 @@ function compareLatestTrustedRun({ comparisonLane, currentDir, rootDir, scenario
154
172
  currentDir: resolvedCurrentDir,
155
173
  selection: buildLatestTrustedSelection({
156
174
  baseline,
175
+ ...(cohortHash ? { cohortHash } : {}),
157
176
  ...(resolvedComparisonLane ? { comparisonLane: resolvedComparisonLane } : {}),
158
177
  currentDir: resolvedCurrentDir,
159
178
  index,
@@ -33,6 +33,7 @@ type AndroidLiveProofResult = {
33
33
  outputDir: string;
34
34
  preflightDir: string;
35
35
  profiles: AndroidLiveProfile[];
36
+ seededBaselines: AndroidLiveProfile[];
36
37
  skippedInteractionProofs: AndroidSkippedInteractionProof[];
37
38
  };
38
39
  type RegressionGateOptions = {
@@ -77,6 +78,14 @@ declare function buildLiveRunId(baseRunId: string, suffix: string | null): strin
77
78
  * @returns {string}
78
79
  */
79
80
  declare function buildInteractionComparisonLane(runnerIds: string[]): string;
81
+ /**
82
+ * Builds a deterministic run id for a seeded baseline profile.
83
+ *
84
+ * @param {string} baseRunId
85
+ * @param {string | null} suffix
86
+ * @returns {string}
87
+ */
88
+ declare function buildBaselineRunId(baseRunId: string, suffix: string | null): string;
80
89
  /**
81
90
  * Reports whether profile evidence is healthy enough to trust sidecar proofs and comparisons.
82
91
  *
@@ -133,5 +142,5 @@ declare function formatResult(result: AndroidLiveProofResult): string;
133
142
  * @returns {Promise<void>}
134
143
  */
135
144
  declare function main(): Promise<void>;
136
- export { assertAggregatePassed, formatResult, assertNoRegressedComparisons, buildLiveRunId, buildSkippedInteractionProofs, buildInteractionComparisonLane, isTrustedProfileRun, main, normalizeRunSuffix, resolveAndroidSerial, runExampleAndroidLiveProof, usage, };
145
+ export { assertAggregatePassed, formatResult, assertNoRegressedComparisons, buildLiveRunId, buildBaselineRunId, buildSkippedInteractionProofs, buildInteractionComparisonLane, isTrustedProfileRun, main, normalizeRunSuffix, resolveAndroidSerial, runExampleAndroidLiveProof, usage, };
137
146
  export type { AndroidLiveProofOptions, AndroidInteractionProof, AndroidLiveProofResult, AndroidLiveProfile, };
@@ -5,6 +5,7 @@ exports.assertAggregatePassed = assertAggregatePassed;
5
5
  exports.formatResult = formatResult;
6
6
  exports.assertNoRegressedComparisons = assertNoRegressedComparisons;
7
7
  exports.buildLiveRunId = buildLiveRunId;
8
+ exports.buildBaselineRunId = buildBaselineRunId;
8
9
  exports.buildSkippedInteractionProofs = buildSkippedInteractionProofs;
9
10
  exports.buildInteractionComparisonLane = buildInteractionComparisonLane;
10
11
  exports.isTrustedProfileRun = isTrustedProfileRun;
@@ -58,6 +59,7 @@ function usage(output = process.stderr) {
58
59
  'The example app must already be installed and reachable on an online Android emulator or device.',
59
60
  `By default, the runner sets the app React Native debug host to ${DEFAULT_REACT_NATIVE_DEBUG_HOST} for the isolated Metro server.`,
60
61
  'Use --run-suffix to preserve multiple live proof artifact sets without changing deterministic default run ids.',
62
+ 'Use --seed-baseline with --compare-latest to capture a trusted compatible baseline before the measured run.',
61
63
  'Use --compare-latest to compare each passed scenario against the latest trusted prior run under the artifact root.',
62
64
  'Use --fail-on-regression with --compare-latest to exit nonzero after writing evidence when any comparison regressed.',
63
65
  'Use --agent-device-proof to attach the shared startup UI assertion through agent-device; pass --agent-device-session-mode bind when a named session should still receive the configured serial.',
@@ -155,6 +157,16 @@ function buildInteractionComparisonLane(runnerIds) {
155
157
  ? `example-android-live+${runnerIds.join('+')}`
156
158
  : 'example-android-live';
157
159
  }
160
+ /**
161
+ * Builds a deterministic run id for a seeded baseline profile.
162
+ *
163
+ * @param {string} baseRunId
164
+ * @param {string | null} suffix
165
+ * @returns {string}
166
+ */
167
+ function buildBaselineRunId(baseRunId, suffix) {
168
+ return buildLiveRunId(baseRunId, suffix ? `${suffix}-baseline` : 'baseline');
169
+ }
158
170
  /**
159
171
  * Reports whether profile evidence is healthy enough to trust sidecar proofs and comparisons.
160
172
  *
@@ -280,8 +292,49 @@ async function runExampleAndroidLiveProof(args, options = {}) {
280
292
  throw new Error(`Android live proof preflight failed; inspect ${preflight.runDir}/agent-summary.md.`);
281
293
  }
282
294
  const interactionProofs = [];
295
+ const seededBaselines = [];
283
296
  const profiles = [];
284
297
  const failedProfiles = [];
298
+ if (isEnabledFlag(args['seed-baseline'])) {
299
+ for (const profile of EXAMPLE_PROFILES) {
300
+ const baselineRunId = buildBaselineRunId(profile.runId, runSuffix);
301
+ const result = await runProfileAndroid({
302
+ ...(typeof args.adb === 'string' ? { adb: args.adb } : {}),
303
+ 'adb-capture': true,
304
+ 'clear-logcat': true,
305
+ config: configPath,
306
+ 'command-wait-ms': typeof args['command-wait-ms'] === 'string' ? args['command-wait-ms'] : '250',
307
+ launch: true,
308
+ 'launch-wait-ms': typeof args['launch-wait-ms'] === 'string' ? args['launch-wait-ms'] : '1500',
309
+ 'logcat-lines': typeof args['logcat-lines'] === 'string' ? args['logcat-lines'] : '1000',
310
+ out: outputDir,
311
+ ...(packageName ? { package: packageName } : {}),
312
+ 'profile-session': true,
313
+ 'react-native-debug-host': reactNativeDebugHost,
314
+ 'run-id': baselineRunId,
315
+ scenario: path.join(exampleRoot, 'scenarios', 'mobile', profile.scenario),
316
+ serial,
317
+ 'wait-ms': typeof args['wait-ms'] === 'string' ? args['wait-ms'] : '1000',
318
+ }, {
319
+ comparisonLane,
320
+ ...(options.delay ? { delay: options.delay } : {}),
321
+ ...(options.executor ? { executor: options.executor } : {}),
322
+ });
323
+ const baselinePointer = {
324
+ healthStatus: typeof result.health.healthStatus === 'string' ? result.health.healthStatus : 'unknown',
325
+ label: `${profile.label}-baseline`,
326
+ runDir: result.runDir,
327
+ runId: baselineRunId,
328
+ scenario: profile.scenario,
329
+ scenarioId: profile.scenarioId,
330
+ verdictStatus: typeof result.verdict.verdictStatus === 'string' ? result.verdict.verdictStatus : 'unknown',
331
+ };
332
+ seededBaselines.push(baselinePointer);
333
+ if (!isTrustedProfileRun({ health: result.health, verdict: result.verdict })) {
334
+ throw new Error(`Android seeded baseline failed for ${profile.label}. Inspect ${result.runDir}/agent-summary.md.`);
335
+ }
336
+ }
337
+ }
285
338
  for (const profile of EXAMPLE_PROFILES) {
286
339
  const profileRunId = buildLiveRunId(profile.runId, runSuffix);
287
340
  const result = await runProfileAndroid({
@@ -395,6 +448,7 @@ async function runExampleAndroidLiveProof(args, options = {}) {
395
448
  outputDir,
396
449
  preflightDir: preflight.runDir,
397
450
  profiles,
451
+ seededBaselines,
398
452
  skippedInteractionProofs,
399
453
  };
400
454
  if (isEnabledFlag(args['fail-on-regression'])) {
@@ -419,6 +473,7 @@ function formatResult(result) {
419
473
  `Preflight: ${result.preflightDir}/agent-summary.md`,
420
474
  ...result.profiles.map((profile) => (`${profile.label}: ${profile.runDir}/agent-summary.md`)),
421
475
  ...result.interactionProofs.map((proof) => (`${proof.label}: ${proof.runDir}/agent-summary.md`)),
476
+ ...result.seededBaselines.map((profile) => (`${profile.label}: ${profile.runDir}/agent-summary.md`)),
422
477
  ...(result.comparisons.length > 0
423
478
  ? [
424
479
  'Comparisons:',
@@ -33,6 +33,7 @@ type IosLiveProofResult = {
33
33
  outputDir: string;
34
34
  preflightDir: string;
35
35
  profiles: IosLiveProfile[];
36
+ seededBaselines: IosLiveProfile[];
36
37
  skippedInteractionProofs: IosSkippedInteractionProof[];
37
38
  };
38
39
  type RegressionGateOptions = {
@@ -77,6 +78,14 @@ declare function buildLiveRunId(baseRunId: string, suffix: string | null): strin
77
78
  * @returns {string}
78
79
  */
79
80
  declare function buildInteractionComparisonLane(runnerIds: string[]): string;
81
+ /**
82
+ * Builds a deterministic run id for a seeded baseline profile.
83
+ *
84
+ * @param {string} baseRunId
85
+ * @param {string | null} suffix
86
+ * @returns {string}
87
+ */
88
+ declare function buildBaselineRunId(baseRunId: string, suffix: string | null): string;
80
89
  /**
81
90
  * Reports whether profile evidence is healthy enough to trust sidecar proofs and comparisons.
82
91
  *
@@ -133,5 +142,5 @@ declare function formatResult(result: IosLiveProofResult): string;
133
142
  * @returns {Promise<void>}
134
143
  */
135
144
  declare function main(): Promise<void>;
136
- export { assertAggregatePassed, buildLiveRunId, formatResult, assertNoRegressedComparisons, buildSkippedInteractionProofs, buildInteractionComparisonLane, isTrustedProfileRun, main, normalizeRunSuffix, resolveIosDeviceId, runExampleIosLiveProof, usage, };
145
+ export { assertAggregatePassed, buildLiveRunId, formatResult, assertNoRegressedComparisons, buildBaselineRunId, buildSkippedInteractionProofs, buildInteractionComparisonLane, isTrustedProfileRun, main, normalizeRunSuffix, resolveIosDeviceId, runExampleIosLiveProof, usage, };
137
146
  export type { IosLiveProofOptions, IosInteractionProof, IosLiveProofResult, IosLiveProfile, };
@@ -5,6 +5,7 @@ exports.assertAggregatePassed = assertAggregatePassed;
5
5
  exports.buildLiveRunId = buildLiveRunId;
6
6
  exports.formatResult = formatResult;
7
7
  exports.assertNoRegressedComparisons = assertNoRegressedComparisons;
8
+ exports.buildBaselineRunId = buildBaselineRunId;
8
9
  exports.buildSkippedInteractionProofs = buildSkippedInteractionProofs;
9
10
  exports.buildInteractionComparisonLane = buildInteractionComparisonLane;
10
11
  exports.isTrustedProfileRun = isTrustedProfileRun;
@@ -56,6 +57,7 @@ function usage(output = process.stderr) {
56
57
  'Runs the packaged example iOS live proof: simctl preflight, startup, open-close, and scroll-settle.',
57
58
  'The example app must already be installed on a booted iOS simulator and connected to Metro. Set ASL_EXAMPLE_IOS_DEV_CLIENT_URL for Expo dev-client builds that need an explicit Metro URL.',
58
59
  'Use --run-suffix to preserve multiple live proof artifact sets without changing deterministic default run ids.',
60
+ 'Use --seed-baseline with --compare-latest to capture a trusted compatible baseline before the measured run.',
59
61
  'Use --compare-latest to compare each passed scenario against the latest trusted prior run under the artifact root.',
60
62
  'Use --fail-on-regression with --compare-latest to exit nonzero after writing evidence when any comparison regressed.',
61
63
  'Use --agent-device-proof to attach the shared startup UI assertion through agent-device; pass --agent-device-session-mode bind when a named session should still receive the configured UDID.',
@@ -166,6 +168,16 @@ function buildInteractionComparisonLane(runnerIds) {
166
168
  ? `example-ios-live+${runnerIds.join('+')}`
167
169
  : 'example-ios-live';
168
170
  }
171
+ /**
172
+ * Builds a deterministic run id for a seeded baseline profile.
173
+ *
174
+ * @param {string} baseRunId
175
+ * @param {string | null} suffix
176
+ * @returns {string}
177
+ */
178
+ function buildBaselineRunId(baseRunId, suffix) {
179
+ return buildLiveRunId(baseRunId, suffix ? `${suffix}-baseline` : 'baseline');
180
+ }
169
181
  /**
170
182
  * Reports whether profile evidence is healthy enough to trust sidecar proofs and comparisons.
171
183
  *
@@ -295,8 +307,49 @@ async function runExampleIosLiveProof(args, options = {}) {
295
307
  throw new Error(`iOS live proof preflight failed; inspect ${preflight.runDir}/agent-summary.md.`);
296
308
  }
297
309
  const interactionProofs = [];
310
+ const seededBaselines = [];
298
311
  const profiles = [];
299
312
  const failedProfiles = [];
313
+ if (isEnabledFlag(args['seed-baseline'])) {
314
+ for (const profile of EXAMPLE_PROFILES) {
315
+ const baselineRunId = buildBaselineRunId(profile.runId, runSuffix);
316
+ const result = await runProfileIos({
317
+ config: configPath,
318
+ device: deviceId,
319
+ ...(typeof args['log-last'] === 'string' ? { 'log-last': args['log-last'] } : {}),
320
+ launch: true,
321
+ out: outputDir,
322
+ ...(iosDevClientUrl ? { 'ios-dev-client-url': iosDevClientUrl } : {}),
323
+ ...(iosDevClientWaitMs ? { 'ios-dev-client-wait-ms': iosDevClientWaitMs } : {}),
324
+ 'profile-session': true,
325
+ 'profile-session-storage': true,
326
+ 'run-id': baselineRunId,
327
+ scenario: path.join(exampleRoot, 'scenarios', 'mobile', profile.scenario),
328
+ 'simctl-capture': true,
329
+ 'simctl-out': path.join(outputDir, '_ios-simctl-captures', baselineRunId),
330
+ ...(typeof args['wait-ms'] === 'string' ? { 'wait-ms': args['wait-ms'] } : {}),
331
+ ...(bundleId ? { bundle: bundleId } : {}),
332
+ ...(typeof args.xcrun === 'string' ? { xcrun: args.xcrun } : {}),
333
+ }, {
334
+ comparisonLane,
335
+ ...(options.delay ? { delay: options.delay } : {}),
336
+ ...(options.executor ? { executor: options.executor } : {}),
337
+ });
338
+ const baselinePointer = {
339
+ healthStatus: typeof result.health.healthStatus === 'string' ? result.health.healthStatus : 'unknown',
340
+ label: `${profile.label}-baseline`,
341
+ runDir: result.runDir,
342
+ runId: baselineRunId,
343
+ scenario: profile.scenario,
344
+ scenarioId: profile.scenarioId,
345
+ verdictStatus: typeof result.verdict.verdictStatus === 'string' ? result.verdict.verdictStatus : 'unknown',
346
+ };
347
+ seededBaselines.push(baselinePointer);
348
+ if (!isTrustedProfileRun({ health: result.health, verdict: result.verdict })) {
349
+ throw new Error(`iOS seeded baseline failed for ${profile.label}. Inspect ${result.runDir}/agent-summary.md.`);
350
+ }
351
+ }
352
+ }
300
353
  for (const profile of EXAMPLE_PROFILES) {
301
354
  const profileRunId = buildLiveRunId(profile.runId, runSuffix);
302
355
  const result = await runProfileIos({
@@ -412,6 +465,7 @@ async function runExampleIosLiveProof(args, options = {}) {
412
465
  outputDir,
413
466
  preflightDir: preflight.runDir,
414
467
  profiles,
468
+ seededBaselines,
415
469
  skippedInteractionProofs,
416
470
  };
417
471
  if (isEnabledFlag(args['fail-on-regression'])) {
@@ -436,6 +490,7 @@ function formatResult(result) {
436
490
  `Preflight: ${result.preflightDir}/agent-summary.md`,
437
491
  ...result.profiles.map((profile) => (`${profile.label}: ${profile.runDir}/agent-summary.md`)),
438
492
  ...result.interactionProofs.map((proof) => (`${proof.label}: ${proof.runDir}/agent-summary.md`)),
493
+ ...result.seededBaselines.map((profile) => (`${profile.label}: ${profile.runDir}/agent-summary.md`)),
439
494
  ...(result.comparisons.length > 0
440
495
  ? [
441
496
  'Comparisons:',
@@ -43,9 +43,14 @@ type IosSimctlDeepLink = {
43
43
  };
44
44
  type IosProfileSessionStorageCommand = {
45
45
  command: string;
46
+ commandId?: string;
46
47
  id?: string;
47
48
  label?: string;
49
+ queueId?: string;
50
+ sequence?: number;
48
51
  timestamp?: number;
52
+ waitForMilestone?: string;
53
+ waitTimeoutMs?: number;
49
54
  };
50
55
  type IosProfileSessionStorageSeed = {
51
56
  commands?: IosProfileSessionStorageCommand[];
@@ -456,7 +456,13 @@ async function seedProfileSessionStorage({ bundleId, commands = [], dataContaine
456
456
  scenario,
457
457
  runId,
458
458
  command: profileCommand.command,
459
+ ...(typeof profileCommand.commandId === 'string' ? { commandId: profileCommand.commandId } : {}),
460
+ ...(typeof profileCommand.label === 'string' ? { label: profileCommand.label } : {}),
461
+ ...(typeof profileCommand.queueId === 'string' ? { queueId: profileCommand.queueId } : {}),
462
+ ...(typeof profileCommand.sequence === 'number' ? { sequence: profileCommand.sequence } : {}),
459
463
  timestamp: typeof profileCommand.timestamp === 'number' ? profileCommand.timestamp : startedAt + index + 1,
464
+ ...(typeof profileCommand.waitForMilestone === 'string' ? { waitForMilestone: profileCommand.waitForMilestone } : {}),
465
+ ...(typeof profileCommand.waitTimeoutMs === 'number' ? { waitTimeoutMs: profileCommand.waitTimeoutMs } : {}),
460
466
  }));
461
467
  manifest[profileStorageKeys.session] = JSON.stringify(session);
462
468
  if (queuedCommands.length > 0) {
@@ -4,7 +4,7 @@ type LiveProfileForComparison = {
4
4
  runId: string;
5
5
  scenarioId: string;
6
6
  };
7
- type ComparisonMetricStatus = 'better' | 'worse' | 'unchanged' | 'inconclusive';
7
+ type ComparisonMetricStatus = 'better' | 'worse' | 'unchanged' | 'inconclusive' | 'low_confidence';
8
8
  type ComparisonMetricHighlight = {
9
9
  baseline: number | boolean | null;
10
10
  current: number | boolean | null;
@@ -25,7 +25,7 @@ type LiveComparisonResult = {
25
25
  reason: string | null;
26
26
  runId: string;
27
27
  scenarioId: string;
28
- status: 'better' | 'worse' | 'unchanged' | 'mixed' | 'inconclusive' | 'skipped';
28
+ status: 'better' | 'worse' | 'unchanged' | 'mixed' | 'inconclusive' | 'low_confidence' | 'skipped';
29
29
  summaryPath: string | null;
30
30
  };
31
31
  type CompareLiveProfilesOptions = {
@@ -44,6 +44,7 @@ function buildComparisonMetricSummary(comparison) {
44
44
  worse: 0,
45
45
  unchanged: 0,
46
46
  inconclusive: 0,
47
+ low_confidence: 0,
47
48
  };
48
49
  const notableMetrics = [];
49
50
  for (const metric of comparison.metricComparisons) {
@@ -52,7 +53,7 @@ function buildComparisonMetricSummary(comparison) {
52
53
  }
53
54
  const record = metric;
54
55
  const status = record.status;
55
- if (status !== 'better' && status !== 'worse' && status !== 'unchanged' && status !== 'inconclusive') {
56
+ if (status !== 'better' && status !== 'worse' && status !== 'unchanged' && status !== 'inconclusive' && status !== 'low_confidence') {
56
57
  continue;
57
58
  }
58
59
  counts[status] += 1;
@@ -53,10 +53,10 @@ type LiveProofComparisonPointer = {
53
53
  reason: string | null;
54
54
  runId: string;
55
55
  scenarioId: string;
56
- status: 'better' | 'worse' | 'unchanged' | 'mixed' | 'inconclusive' | 'skipped';
56
+ status: 'better' | 'worse' | 'unchanged' | 'mixed' | 'inconclusive' | 'low_confidence' | 'skipped';
57
57
  summaryPath: string | null;
58
58
  };
59
- type LiveProofComparisonMetricStatus = 'better' | 'worse' | 'unchanged' | 'inconclusive';
59
+ type LiveProofComparisonMetricStatus = 'better' | 'worse' | 'unchanged' | 'inconclusive' | 'low_confidence';
60
60
  type LiveProofComparisonMetricSummary = {
61
61
  counts: Record<LiveProofComparisonMetricStatus, number>;
62
62
  notableMetrics: Array<{
@@ -94,17 +94,18 @@ type LiveProofArtifact = {
94
94
  status: 'passed' | 'failed';
95
95
  summary: string;
96
96
  };
97
- type LiveProofComparisonStatus = ('baseline_missing' | 'improved' | 'inconclusive' | 'mixed' | 'not_compared' | 'regressed' | 'unchanged');
97
+ type LiveProofComparisonStatus = ('baseline_missing' | 'improved' | 'inconclusive' | 'low_confidence' | 'mixed' | 'not_compared' | 'regressed' | 'unchanged');
98
98
  type LiveProofComparisonCounts = {
99
99
  better: number;
100
100
  inconclusive: number;
101
+ low_confidence: number;
101
102
  mixed: number;
102
103
  skipped: number;
103
104
  unchanged: number;
104
105
  worse: number;
105
106
  };
106
107
  type LiveProofNextAction = {
107
- code: 'establish_baseline' | 'inspect_failed_run' | 'inspect_inconclusive' | 'inspect_mixed' | 'inspect_regressions' | 'inspect_summary';
108
+ code: 'establish_baseline' | 'inspect_failed_run' | 'inspect_inconclusive' | 'inspect_low_confidence' | 'inspect_mixed' | 'inspect_regressions' | 'inspect_summary';
108
109
  summary: string;
109
110
  };
110
111
  type LiveProofSummaryResult = {
@@ -168,6 +168,9 @@ function buildLiveProofComparisonStatus(comparisons) {
168
168
  if (statuses.includes('inconclusive')) {
169
169
  return 'inconclusive';
170
170
  }
171
+ if (statuses.includes('low_confidence')) {
172
+ return 'low_confidence';
173
+ }
171
174
  if (statuses.every((status) => status === 'skipped')) {
172
175
  return 'baseline_missing';
173
176
  }
@@ -192,6 +195,7 @@ function buildLiveProofComparisonCounts(comparisons) {
192
195
  const counts = {
193
196
  better: 0,
194
197
  inconclusive: 0,
198
+ low_confidence: 0,
195
199
  mixed: 0,
196
200
  skipped: 0,
197
201
  unchanged: 0,
@@ -233,6 +237,12 @@ function buildLiveProofNextAction(comparisonStatus, status = 'passed') {
233
237
  summary: 'Some comparisons are inconclusive or incomplete; inspect scenario health and missing baseline details.',
234
238
  };
235
239
  }
240
+ if (comparisonStatus === 'low_confidence') {
241
+ return {
242
+ code: 'inspect_low_confidence',
243
+ summary: 'Some comparisons show low-confidence timing movement; repeat or multi-sample proof is required before treating it as a regression.',
244
+ };
245
+ }
236
246
  if (comparisonStatus === 'mixed') {
237
247
  return {
238
248
  code: 'inspect_mixed',
@@ -292,7 +302,7 @@ function formatComparisonMetricSummary(comparison) {
292
302
  if (!summary) {
293
303
  return '';
294
304
  }
295
- const counts = `metrics better=${summary.counts.better} worse=${summary.counts.worse} unchanged=${summary.counts.unchanged} inconclusive=${summary.counts.inconclusive}`;
305
+ const counts = `metrics better=${summary.counts.better} worse=${summary.counts.worse} unchanged=${summary.counts.unchanged} inconclusive=${summary.counts.inconclusive} low_confidence=${summary.counts.low_confidence}`;
296
306
  const highlights = summary.notableMetrics.length > 0
297
307
  ? `; notable: ${summary.notableMetrics.map(formatComparisonMetricHighlight).join(', ')}`
298
308
  : '';
@@ -345,7 +355,7 @@ function buildLiveProofMarkdown(artifact) {
345
355
  `Status: ${artifact.status}`,
346
356
  `Run: ${artifact.runId}`,
347
357
  `Comparison status: ${artifact.comparisonStatus}`,
348
- `Comparison counts: better=${artifact.comparisonCounts.better} worse=${artifact.comparisonCounts.worse} unchanged=${artifact.comparisonCounts.unchanged} mixed=${artifact.comparisonCounts.mixed} inconclusive=${artifact.comparisonCounts.inconclusive} skipped=${artifact.comparisonCounts.skipped}`,
358
+ `Comparison counts: better=${artifact.comparisonCounts.better} worse=${artifact.comparisonCounts.worse} unchanged=${artifact.comparisonCounts.unchanged} mixed=${artifact.comparisonCounts.mixed} inconclusive=${artifact.comparisonCounts.inconclusive} low_confidence=${artifact.comparisonCounts.low_confidence} skipped=${artifact.comparisonCounts.skipped}`,
349
359
  `Next action: ${artifact.nextAction.code} - ${artifact.nextAction.summary}`,
350
360
  `Summary: ${artifact.summary}`,
351
361
  '',
@@ -17,6 +17,7 @@ type LiveProofArtifact = {
17
17
  skipped: number;
18
18
  unchanged: number;
19
19
  worse: number;
20
+ low_confidence: number;
20
21
  };
21
22
  comparisonStatus: string;
22
23
  comparisons: LiveProofComparisonPointer[];
@@ -82,7 +83,7 @@ type LiveProofArtifact = {
82
83
  summary: string;
83
84
  };
84
85
  type LiveProofComparisonCounts = LiveProofArtifact['comparisonCounts'];
85
- type LiveProofMetricStatus = 'better' | 'worse' | 'unchanged' | 'inconclusive';
86
+ type LiveProofMetricStatus = 'better' | 'worse' | 'unchanged' | 'inconclusive' | 'low_confidence';
86
87
  type LiveProofPlatform = LiveProofArtifact['platform'];
87
88
  type LiveProofComparisonPointer = {
88
89
  baselineDir?: string | null;
@@ -102,7 +103,7 @@ type LiveProofComparisonPointer = {
102
103
  status?: string;
103
104
  summaryPath?: string | null;
104
105
  };
105
- type LiveProofAggregateStatus = ('baseline_missing' | 'improved' | 'inconclusive' | 'mixed' | 'not_compared' | 'regressed' | 'unchanged');
106
+ type LiveProofAggregateStatus = ('baseline_missing' | 'improved' | 'inconclusive' | 'low_confidence' | 'mixed' | 'not_compared' | 'regressed' | 'unchanged');
106
107
  type LiveProofNextActionCode = LiveProofArtifact['nextAction']['code'];
107
108
  type LiveProofSetArtifact = {
108
109
  failureReasons: string[];
@@ -153,6 +153,7 @@ function countLiveProofComparisons(comparisons) {
153
153
  const counts = {
154
154
  better: 0,
155
155
  inconclusive: 0,
156
+ low_confidence: 0,
156
157
  mixed: 0,
157
158
  skipped: 0,
158
159
  unchanged: 0,
@@ -183,6 +184,9 @@ function deriveLiveProofComparisonStatus(comparisons) {
183
184
  if (statuses.includes('inconclusive')) {
184
185
  return 'inconclusive';
185
186
  }
187
+ if (statuses.includes('low_confidence')) {
188
+ return 'low_confidence';
189
+ }
186
190
  if (statuses.every((status) => status === 'skipped')) {
187
191
  return 'baseline_missing';
188
192
  }
@@ -217,6 +221,9 @@ function expectedLiveProofNextActionCode(comparisonStatus, status = 'passed') {
217
221
  if (comparisonStatus === 'inconclusive') {
218
222
  return 'inspect_inconclusive';
219
223
  }
224
+ if (comparisonStatus === 'low_confidence') {
225
+ return 'inspect_low_confidence';
226
+ }
220
227
  if (comparisonStatus === 'mixed') {
221
228
  return 'inspect_mixed';
222
229
  }
@@ -439,7 +446,7 @@ function formatComparisonPointerMetrics(comparison) {
439
446
  const highlightText = highlights.length > 0
440
447
  ? `; notable: ${highlights.map(formatMetricHighlight).join(', ')}`
441
448
  : '';
442
- return ` (metrics better=${counts.better} worse=${counts.worse} unchanged=${counts.unchanged} inconclusive=${counts.inconclusive}${highlightText})`;
449
+ return ` (metrics better=${counts.better} worse=${counts.worse} unchanged=${counts.unchanged} inconclusive=${counts.inconclusive} low_confidence=${counts.low_confidence}${highlightText})`;
443
450
  }
444
451
  /**
445
452
  * Formats capture counts for one interaction proof pointer.
@@ -815,7 +822,7 @@ function formatLiveProof(proof) {
815
822
  `Skipped interaction proofs: ${proof.skippedInteractionProofs?.length ?? 0}`,
816
823
  ...(proof.skippedInteractionProofs ?? []).map((proofPointer) => (`- ${proofPointer.label} (${proofPointer.runnerId}/${proofPointer.scenarioId}/${proofPointer.runId}): ${proofPointer.reason} next=${proofPointer.nextAction.code}`)),
817
824
  `Comparisons: ${proof.comparisons.length}`,
818
- `Comparison counts: better=${proof.comparisonCounts.better} worse=${proof.comparisonCounts.worse} unchanged=${proof.comparisonCounts.unchanged} mixed=${proof.comparisonCounts.mixed} inconclusive=${proof.comparisonCounts.inconclusive} skipped=${proof.comparisonCounts.skipped}`,
825
+ `Comparison counts: better=${proof.comparisonCounts.better} worse=${proof.comparisonCounts.worse} unchanged=${proof.comparisonCounts.unchanged} mixed=${proof.comparisonCounts.mixed} inconclusive=${proof.comparisonCounts.inconclusive} low_confidence=${proof.comparisonCounts.low_confidence} skipped=${proof.comparisonCounts.skipped}`,
819
826
  ...proof.comparisons.map((comparison) => (`- ${comparison.label ?? 'comparison'} (${comparison.scenarioId ?? 'unknown-scenario'}/${comparison.runId ?? 'unknown-run'}): ${comparison.status ?? 'unknown'}${formatComparisonPointerMetrics(comparison)}`)),
820
827
  `Next action: ${proof.nextAction.code} - ${proof.nextAction.summary}`,
821
828
  `Summary: ${proof.summary}`,
@@ -8,8 +8,13 @@ type AndroidProfileOptions = {
8
8
  };
9
9
  type AndroidAdbProfileCommand = {
10
10
  command: string;
11
+ commandId?: string;
11
12
  label?: string;
13
+ queueId?: string;
14
+ sequence?: number;
15
+ waitForMilestone?: string;
12
16
  waitMs?: number;
17
+ waitTimeoutMs?: number;
13
18
  };
14
19
  type AndroidAdbDriverStep = import('./android-adb').AndroidAdbDriverStep;
15
20
  /**