npm - agent-scenario-loop - Versions diffs - 0.1.2 → 0.1.3 - Mend

agent-scenario-loop 0.1.2 → 0.1.3

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (63)

package/README.md +9 -9
package/app/profile-session.ts +98 -4
package/dist/core/agent-summary.d.ts +3 -2
package/dist/core/agent-summary.js +44 -2
package/dist/core/artifact-contract.d.ts +22 -4
package/dist/core/artifact-contract.js +512 -11
package/dist/core/comparison.d.ts +57 -3
package/dist/core/comparison.js +113 -1
package/dist/core/planner.d.ts +32 -1
package/dist/core/planner.js +144 -0
package/dist/core/run-index.d.ts +4 -0
package/dist/core/run-index.js +55 -1
package/dist/core/schema-validator.d.ts +1 -0
package/dist/core/schema-validator.js +1 -0
package/dist/runner/compare-latest.d.ts +8 -4
package/dist/runner/compare-latest.js +24 -5
package/dist/runner/example-android-live.d.ts +10 -1
package/dist/runner/example-android-live.js +55 -0
package/dist/runner/example-ios-live.d.ts +10 -1
package/dist/runner/example-ios-live.js +55 -0
package/dist/runner/ios-simctl.d.ts +5 -0
package/dist/runner/ios-simctl.js +6 -0
package/dist/runner/live-comparison.d.ts +2 -2
package/dist/runner/live-comparison.js +2 -1
package/dist/runner/live-proof-summary.d.ts +5 -4
package/dist/runner/live-proof-summary.js +12 -2
package/dist/runner/live-proof.d.ts +3 -2
package/dist/runner/live-proof.js +9 -2
package/dist/runner/profile-android.d.ts +5 -0
package/dist/runner/profile-android.js +148 -24
package/dist/runner/profile-ios.d.ts +11 -1
package/dist/runner/profile-ios.js +128 -9
package/dist/runner/profile-mobile.d.ts +8 -0
package/dist/runner/profile-mobile.js +267 -28
package/docs/adapters.md +4 -0
package/docs/architecture.md +90 -0
package/docs/authoring.md +5 -1
package/docs/concepts.md +3 -24
package/docs/consumer-rehearsal.md +4 -0
package/docs/contracts.md +30 -100
package/docs/external-adapter-protocol.md +219 -0
package/docs/live-proofs.md +83 -2
package/docs/principles.md +9 -15
package/examples/mobile-app/README.md +12 -0
package/examples/mobile-app/runner-manifests/primary-runner.json +1 -0
package/examples/runners/README.md +1 -0
package/examples/runners/adb-android.json +1 -0
package/examples/runners/agent-device-android.json +1 -0
package/examples/runners/agent-device-ios.json +1 -0
package/examples/runners/argent-android.json +1 -0
package/examples/runners/argent-ios.json +1 -0
package/examples/runners/xcodebuildmcp-ios.json +1 -0
package/package.json +2 -1
package/schemas/causal-run.schema.json +85 -2
package/schemas/comparison.schema.json +130 -2
package/schemas/external-adapter-message.schema.json +693 -0
package/schemas/health.schema.json +72 -0
package/schemas/live-proof-set.schema.json +1 -1
package/schemas/live-proof.schema.json +14 -6
package/schemas/manifest.schema.json +442 -1
package/schemas/runner-capabilities.schema.json +20 -0
package/schemas/scenario.schema.json +16 -0
package/templates/primary-runner.json +1 -0

package/dist/runner/compare-latest.d.ts CHANGED Viewed

@@ -14,6 +14,7 @@ type CompareLatestResult = {
 type LatestTrustedSelection = {
     artifactRoot: string;
     candidatesInspected: number;
+    cohortHash?: string;
     scenarioId: string;
     selectedRunDir: string;
     selectedRunId: string;
@@ -21,6 +22,7 @@ type LatestTrustedSelection = {
     comparisonLane?: string;
     scenarioHash?: string;
     trustedCandidates: number;
+    trustedCohortCandidates?: number;
     trustedComparableCandidates?: number;
     trustedScenarioContractCandidates?: number;
     trustedPriorCandidates: number;
@@ -58,10 +60,11 @@ declare function isComparableScenarioContract(entry: RunIndexEntry, scenarioHash
 /**
  * Finds the newest trusted run for a scenario while excluding the current run directory.
  *
- * @param {{index: RunIndex, scenarioId: string, currentDir: string, comparisonLane?: string, scenarioHash?: string}} options
+ * @param {{index: RunIndex, scenarioId: string, currentDir: string, cohortHash?: string, comparisonLane?: string, scenarioHash?: string}} options
  * @returns {RunIndexEntry | null}
  */
-declare function findLatestTrustedPriorRun({ comparisonLane, index, scenarioHash, scenarioId, currentDir, }: {
+declare function findLatestTrustedPriorRun({ cohortHash, comparisonLane, index, scenarioHash, scenarioId, currentDir, }: {
+    cohortHash?: string;
     comparisonLane?: string;
     index: RunIndex;
     scenarioHash?: string;
@@ -71,11 +74,12 @@ declare function findLatestTrustedPriorRun({ comparisonLane, index, scenarioHash
 /**
  * Builds stable provenance for the latest-trusted baseline selection.
  *
- * @param {{baseline: RunIndexEntry, comparisonLane?: string, currentDir: string, index: RunIndex, rootDir: string, scenarioHash?: string, scenarioId: string}} options
+ * @param {{baseline: RunIndexEntry, cohortHash?: string, comparisonLane?: string, currentDir: string, index: RunIndex, rootDir: string, scenarioHash?: string, scenarioId: string}} options
  * @returns {LatestTrustedSelection}
  */
-declare function buildLatestTrustedSelection({ baseline, comparisonLane, currentDir, index, rootDir, scenarioHash, scenarioId, }: {
+declare function buildLatestTrustedSelection({ baseline, cohortHash, comparisonLane, currentDir, index, rootDir, scenarioHash, scenarioId, }: {
     baseline: RunIndexEntry;
+    cohortHash?: string;
     comparisonLane?: string;
     currentDir: string;
     index: RunIndex;

package/dist/runner/compare-latest.js CHANGED Viewed

@@ -79,31 +79,44 @@ function isComparableLane(entry, comparisonLane) {
 function isComparableScenarioContract(entry, scenarioHash) {
     return scenarioHash ? entry.scenarioHash === scenarioHash : true;
 }
+/**
+ * Returns whether a historical run belongs to the requested provenance cohort.
+ * Runs without a current cohort hash keep legacy behavior for old artifacts.
+ *
+ * @param {RunIndexEntry} entry
+ * @param {string | undefined} cohortHash
+ * @returns {boolean}
+ */
+function isComparableCohort(entry, cohortHash) {
+    return cohortHash ? entry.cohortHash === cohortHash : true;
+}
 /**
  * Finds the newest trusted run for a scenario while excluding the current run directory.
  *
- * @param {{index: RunIndex, scenarioId: string, currentDir: string, comparisonLane?: string, scenarioHash?: string}} options
+ * @param {{index: RunIndex, scenarioId: string, currentDir: string, cohortHash?: string, comparisonLane?: string, scenarioHash?: string}} options
  * @returns {RunIndexEntry | null}
  */
-function findLatestTrustedPriorRun({ comparisonLane, index, scenarioHash, scenarioId, currentDir, }) {
+function findLatestTrustedPriorRun({ cohortHash, comparisonLane, index, scenarioHash, scenarioId, currentDir, }) {
     const resolvedCurrentDir = path.resolve(currentDir);
     return index.trusted.find((entry) => (entry.scenarioId === scenarioId &&
         isComparableLane(entry, comparisonLane) &&
         isComparableScenarioContract(entry, scenarioHash) &&
+        isComparableCohort(entry, cohortHash) &&
         path.resolve(entry.runDir) !== resolvedCurrentDir)) ?? null;
 }
 /**
  * Builds stable provenance for the latest-trusted baseline selection.
  *
- * @param {{baseline: RunIndexEntry, comparisonLane?: string, currentDir: string, index: RunIndex, rootDir: string, scenarioHash?: string, scenarioId: string}} options
+ * @param {{baseline: RunIndexEntry, cohortHash?: string, comparisonLane?: string, currentDir: string, index: RunIndex, rootDir: string, scenarioHash?: string, scenarioId: string}} options
  * @returns {LatestTrustedSelection}
  */
-function buildLatestTrustedSelection({ baseline, comparisonLane, currentDir, index, rootDir, scenarioHash, scenarioId, }) {
+function buildLatestTrustedSelection({ baseline, cohortHash, comparisonLane, currentDir, index, rootDir, scenarioHash, scenarioId, }) {
     const resolvedCurrentDir = path.resolve(currentDir);
     const trustedPriorCandidates = index.trusted.filter((entry) => (entry.scenarioId === scenarioId &&
         path.resolve(entry.runDir) !== resolvedCurrentDir));
     const trustedComparableCandidates = trustedPriorCandidates.filter((entry) => (isComparableLane(entry, comparisonLane)));
     const trustedScenarioContractCandidates = trustedComparableCandidates.filter((entry) => (isComparableScenarioContract(entry, scenarioHash)));
+    const trustedCohortCandidates = trustedScenarioContractCandidates.filter((entry) => (isComparableCohort(entry, cohortHash)));
     return {
         artifactRoot: rootDir,
         candidatesInspected: index.entries.length,
@@ -113,9 +126,11 @@ function buildLatestTrustedSelection({ baseline, comparisonLane, currentDir, ind
         skippedCurrentRun: index.entries.some((entry) => path.resolve(entry.runDir) === resolvedCurrentDir),
         ...(comparisonLane ? { comparisonLane } : {}),
         ...(scenarioHash ? { scenarioHash } : {}),
+        ...(cohortHash ? { cohortHash } : {}),
         trustedCandidates: index.trusted.length,
         trustedComparableCandidates: trustedComparableCandidates.length,
         ...(scenarioHash ? { trustedScenarioContractCandidates: trustedScenarioContractCandidates.length } : {}),
+        ...(cohortHash ? { trustedCohortCandidates: trustedCohortCandidates.length } : {}),
         trustedPriorCandidates: trustedPriorCandidates.length,
     };
 }
@@ -132,8 +147,10 @@ function compareLatestTrustedRun({ comparisonLane, currentDir, rootDir, scenario
     const currentEntry = readRunIndexEntry(resolvedCurrentDir);
     const resolvedComparisonLane = comparisonLane ?? currentEntry.comparisonLane;
     const scenarioHash = currentEntry.scenarioHash;
+    const cohortHash = currentEntry.cohortHash;
     const index = buildRunIndex({ rootDir: resolvedRootDir, scenarioId });
     const baseline = findLatestTrustedPriorRun({
+        ...(cohortHash ? { cohortHash } : {}),
         ...(resolvedComparisonLane ? { comparisonLane: resolvedComparisonLane } : {}),
         ...(scenarioHash ? { scenarioHash } : {}),
         index,
@@ -145,7 +162,8 @@ function compareLatestTrustedRun({ comparisonLane, currentDir, rootDir, scenario
             ? ` in comparison lane '${resolvedComparisonLane}'`
             : ' without a comparison lane';
         const scenarioHashSuffix = scenarioHash ? ` and scenario hash '${scenarioHash}'` : '';
-        throw new Error(`No trusted prior run found for scenario '${scenarioId}'${laneSuffix}${scenarioHashSuffix} under ${resolvedRootDir}; inspected ${index.entries.length} candidate run(s), ${index.trusted.length} trusted.`);
+        const cohortHashSuffix = cohortHash ? ` and cohort hash '${cohortHash}'` : '';
+        throw new Error(`No trusted prior run found for scenario '${scenarioId}'${laneSuffix}${scenarioHashSuffix}${cohortHashSuffix} under ${resolvedRootDir}; inspected ${index.entries.length} candidate run(s), ${index.trusted.length} trusted.`);
     }
     return {
         baselineDir: baseline.runDir,
@@ -154,6 +172,7 @@ function compareLatestTrustedRun({ comparisonLane, currentDir, rootDir, scenario
             currentDir: resolvedCurrentDir,
             selection: buildLatestTrustedSelection({
                 baseline,
+                ...(cohortHash ? { cohortHash } : {}),
                 ...(resolvedComparisonLane ? { comparisonLane: resolvedComparisonLane } : {}),
                 currentDir: resolvedCurrentDir,
                 index,

package/dist/runner/example-android-live.d.ts CHANGED Viewed

@@ -33,6 +33,7 @@ type AndroidLiveProofResult = {
     outputDir: string;
     preflightDir: string;
     profiles: AndroidLiveProfile[];
+    seededBaselines: AndroidLiveProfile[];
     skippedInteractionProofs: AndroidSkippedInteractionProof[];
 };
 type RegressionGateOptions = {
@@ -77,6 +78,14 @@ declare function buildLiveRunId(baseRunId: string, suffix: string | null): strin
  * @returns {string}
  */
 declare function buildInteractionComparisonLane(runnerIds: string[]): string;
+/**
+ * Builds a deterministic run id for a seeded baseline profile.
+ *
+ * @param {string} baseRunId
+ * @param {string | null} suffix
+ * @returns {string}
+ */
+declare function buildBaselineRunId(baseRunId: string, suffix: string | null): string;
 /**
  * Reports whether profile evidence is healthy enough to trust sidecar proofs and comparisons.
  *
@@ -133,5 +142,5 @@ declare function formatResult(result: AndroidLiveProofResult): string;
  * @returns {Promise<void>}
  */
 declare function main(): Promise<void>;
-export { assertAggregatePassed, formatResult, assertNoRegressedComparisons, buildLiveRunId, buildSkippedInteractionProofs, buildInteractionComparisonLane, isTrustedProfileRun, main, normalizeRunSuffix, resolveAndroidSerial, runExampleAndroidLiveProof, usage, };
+export { assertAggregatePassed, formatResult, assertNoRegressedComparisons, buildLiveRunId, buildBaselineRunId, buildSkippedInteractionProofs, buildInteractionComparisonLane, isTrustedProfileRun, main, normalizeRunSuffix, resolveAndroidSerial, runExampleAndroidLiveProof, usage, };
 export type { AndroidLiveProofOptions, AndroidInteractionProof, AndroidLiveProofResult, AndroidLiveProfile, };

package/dist/runner/example-android-live.js CHANGED Viewed

@@ -5,6 +5,7 @@ exports.assertAggregatePassed = assertAggregatePassed;
 exports.formatResult = formatResult;
 exports.assertNoRegressedComparisons = assertNoRegressedComparisons;
 exports.buildLiveRunId = buildLiveRunId;
+exports.buildBaselineRunId = buildBaselineRunId;
 exports.buildSkippedInteractionProofs = buildSkippedInteractionProofs;
 exports.buildInteractionComparisonLane = buildInteractionComparisonLane;
 exports.isTrustedProfileRun = isTrustedProfileRun;
@@ -58,6 +59,7 @@ function usage(output = process.stderr) {
         'The example app must already be installed and reachable on an online Android emulator or device.',
         `By default, the runner sets the app React Native debug host to ${DEFAULT_REACT_NATIVE_DEBUG_HOST} for the isolated Metro server.`,
         'Use --run-suffix to preserve multiple live proof artifact sets without changing deterministic default run ids.',
+        'Use --seed-baseline with --compare-latest to capture a trusted compatible baseline before the measured run.',
         'Use --compare-latest to compare each passed scenario against the latest trusted prior run under the artifact root.',
         'Use --fail-on-regression with --compare-latest to exit nonzero after writing evidence when any comparison regressed.',
         'Use --agent-device-proof to attach the shared startup UI assertion through agent-device; pass --agent-device-session-mode bind when a named session should still receive the configured serial.',
@@ -155,6 +157,16 @@ function buildInteractionComparisonLane(runnerIds) {
         ? `example-android-live+${runnerIds.join('+')}`
         : 'example-android-live';
 }
+/**
+ * Builds a deterministic run id for a seeded baseline profile.
+ *
+ * @param {string} baseRunId
+ * @param {string | null} suffix
+ * @returns {string}
+ */
+function buildBaselineRunId(baseRunId, suffix) {
+    return buildLiveRunId(baseRunId, suffix ? `${suffix}-baseline` : 'baseline');
+}
 /**
  * Reports whether profile evidence is healthy enough to trust sidecar proofs and comparisons.
  *
@@ -280,8 +292,49 @@ async function runExampleAndroidLiveProof(args, options = {}) {
         throw new Error(`Android live proof preflight failed; inspect ${preflight.runDir}/agent-summary.md.`);
     }
     const interactionProofs = [];
+    const seededBaselines = [];
     const profiles = [];
     const failedProfiles = [];
+    if (isEnabledFlag(args['seed-baseline'])) {
+        for (const profile of EXAMPLE_PROFILES) {
+            const baselineRunId = buildBaselineRunId(profile.runId, runSuffix);
+            const result = await runProfileAndroid({
+                ...(typeof args.adb === 'string' ? { adb: args.adb } : {}),
+                'adb-capture': true,
+                'clear-logcat': true,
+                config: configPath,
+                'command-wait-ms': typeof args['command-wait-ms'] === 'string' ? args['command-wait-ms'] : '250',
+                launch: true,
+                'launch-wait-ms': typeof args['launch-wait-ms'] === 'string' ? args['launch-wait-ms'] : '1500',
+                'logcat-lines': typeof args['logcat-lines'] === 'string' ? args['logcat-lines'] : '1000',
+                out: outputDir,
+                ...(packageName ? { package: packageName } : {}),
+                'profile-session': true,
+                'react-native-debug-host': reactNativeDebugHost,
+                'run-id': baselineRunId,
+                scenario: path.join(exampleRoot, 'scenarios', 'mobile', profile.scenario),
+                serial,
+                'wait-ms': typeof args['wait-ms'] === 'string' ? args['wait-ms'] : '1000',
+            }, {
+                comparisonLane,
+                ...(options.delay ? { delay: options.delay } : {}),
+                ...(options.executor ? { executor: options.executor } : {}),
+            });
+            const baselinePointer = {
+                healthStatus: typeof result.health.healthStatus === 'string' ? result.health.healthStatus : 'unknown',
+                label: `${profile.label}-baseline`,
+                runDir: result.runDir,
+                runId: baselineRunId,
+                scenario: profile.scenario,
+                scenarioId: profile.scenarioId,
+                verdictStatus: typeof result.verdict.verdictStatus === 'string' ? result.verdict.verdictStatus : 'unknown',
+            };
+            seededBaselines.push(baselinePointer);
+            if (!isTrustedProfileRun({ health: result.health, verdict: result.verdict })) {
+                throw new Error(`Android seeded baseline failed for ${profile.label}. Inspect ${result.runDir}/agent-summary.md.`);
+            }
+        }
+    }
     for (const profile of EXAMPLE_PROFILES) {
         const profileRunId = buildLiveRunId(profile.runId, runSuffix);
         const result = await runProfileAndroid({
@@ -395,6 +448,7 @@ async function runExampleAndroidLiveProof(args, options = {}) {
         outputDir,
         preflightDir: preflight.runDir,
         profiles,
+        seededBaselines,
         skippedInteractionProofs,
     };
     if (isEnabledFlag(args['fail-on-regression'])) {
@@ -419,6 +473,7 @@ function formatResult(result) {
         `Preflight: ${result.preflightDir}/agent-summary.md`,
         ...result.profiles.map((profile) => (`${profile.label}: ${profile.runDir}/agent-summary.md`)),
         ...result.interactionProofs.map((proof) => (`${proof.label}: ${proof.runDir}/agent-summary.md`)),
+        ...result.seededBaselines.map((profile) => (`${profile.label}: ${profile.runDir}/agent-summary.md`)),
         ...(result.comparisons.length > 0
             ? [
                 'Comparisons:',

package/dist/runner/example-ios-live.d.ts CHANGED Viewed

@@ -33,6 +33,7 @@ type IosLiveProofResult = {
     outputDir: string;
     preflightDir: string;
     profiles: IosLiveProfile[];
+    seededBaselines: IosLiveProfile[];
     skippedInteractionProofs: IosSkippedInteractionProof[];
 };
 type RegressionGateOptions = {
@@ -77,6 +78,14 @@ declare function buildLiveRunId(baseRunId: string, suffix: string | null): strin
  * @returns {string}
  */
 declare function buildInteractionComparisonLane(runnerIds: string[]): string;
+/**
+ * Builds a deterministic run id for a seeded baseline profile.
+ *
+ * @param {string} baseRunId
+ * @param {string | null} suffix
+ * @returns {string}
+ */
+declare function buildBaselineRunId(baseRunId: string, suffix: string | null): string;
 /**
  * Reports whether profile evidence is healthy enough to trust sidecar proofs and comparisons.
  *
@@ -133,5 +142,5 @@ declare function formatResult(result: IosLiveProofResult): string;
  * @returns {Promise<void>}
  */
 declare function main(): Promise<void>;
-export { assertAggregatePassed, buildLiveRunId, formatResult, assertNoRegressedComparisons, buildSkippedInteractionProofs, buildInteractionComparisonLane, isTrustedProfileRun, main, normalizeRunSuffix, resolveIosDeviceId, runExampleIosLiveProof, usage, };
+export { assertAggregatePassed, buildLiveRunId, formatResult, assertNoRegressedComparisons, buildBaselineRunId, buildSkippedInteractionProofs, buildInteractionComparisonLane, isTrustedProfileRun, main, normalizeRunSuffix, resolveIosDeviceId, runExampleIosLiveProof, usage, };
 export type { IosLiveProofOptions, IosInteractionProof, IosLiveProofResult, IosLiveProfile, };

package/dist/runner/example-ios-live.js CHANGED Viewed

@@ -5,6 +5,7 @@ exports.assertAggregatePassed = assertAggregatePassed;
 exports.buildLiveRunId = buildLiveRunId;
 exports.formatResult = formatResult;
 exports.assertNoRegressedComparisons = assertNoRegressedComparisons;
+exports.buildBaselineRunId = buildBaselineRunId;
 exports.buildSkippedInteractionProofs = buildSkippedInteractionProofs;
 exports.buildInteractionComparisonLane = buildInteractionComparisonLane;
 exports.isTrustedProfileRun = isTrustedProfileRun;
@@ -56,6 +57,7 @@ function usage(output = process.stderr) {
         'Runs the packaged example iOS live proof: simctl preflight, startup, open-close, and scroll-settle.',
         'The example app must already be installed on a booted iOS simulator and connected to Metro. Set ASL_EXAMPLE_IOS_DEV_CLIENT_URL for Expo dev-client builds that need an explicit Metro URL.',
         'Use --run-suffix to preserve multiple live proof artifact sets without changing deterministic default run ids.',
+        'Use --seed-baseline with --compare-latest to capture a trusted compatible baseline before the measured run.',
         'Use --compare-latest to compare each passed scenario against the latest trusted prior run under the artifact root.',
         'Use --fail-on-regression with --compare-latest to exit nonzero after writing evidence when any comparison regressed.',
         'Use --agent-device-proof to attach the shared startup UI assertion through agent-device; pass --agent-device-session-mode bind when a named session should still receive the configured UDID.',
@@ -166,6 +168,16 @@ function buildInteractionComparisonLane(runnerIds) {
         ? `example-ios-live+${runnerIds.join('+')}`
         : 'example-ios-live';
 }
+/**
+ * Builds a deterministic run id for a seeded baseline profile.
+ *
+ * @param {string} baseRunId
+ * @param {string | null} suffix
+ * @returns {string}
+ */
+function buildBaselineRunId(baseRunId, suffix) {
+    return buildLiveRunId(baseRunId, suffix ? `${suffix}-baseline` : 'baseline');
+}
 /**
  * Reports whether profile evidence is healthy enough to trust sidecar proofs and comparisons.
  *
@@ -295,8 +307,49 @@ async function runExampleIosLiveProof(args, options = {}) {
         throw new Error(`iOS live proof preflight failed; inspect ${preflight.runDir}/agent-summary.md.`);
     }
     const interactionProofs = [];
+    const seededBaselines = [];
     const profiles = [];
     const failedProfiles = [];
+    if (isEnabledFlag(args['seed-baseline'])) {
+        for (const profile of EXAMPLE_PROFILES) {
+            const baselineRunId = buildBaselineRunId(profile.runId, runSuffix);
+            const result = await runProfileIos({
+                config: configPath,
+                device: deviceId,
+                ...(typeof args['log-last'] === 'string' ? { 'log-last': args['log-last'] } : {}),
+                launch: true,
+                out: outputDir,
+                ...(iosDevClientUrl ? { 'ios-dev-client-url': iosDevClientUrl } : {}),
+                ...(iosDevClientWaitMs ? { 'ios-dev-client-wait-ms': iosDevClientWaitMs } : {}),
+                'profile-session': true,
+                'profile-session-storage': true,
+                'run-id': baselineRunId,
+                scenario: path.join(exampleRoot, 'scenarios', 'mobile', profile.scenario),
+                'simctl-capture': true,
+                'simctl-out': path.join(outputDir, '_ios-simctl-captures', baselineRunId),
+                ...(typeof args['wait-ms'] === 'string' ? { 'wait-ms': args['wait-ms'] } : {}),
+                ...(bundleId ? { bundle: bundleId } : {}),
+                ...(typeof args.xcrun === 'string' ? { xcrun: args.xcrun } : {}),
+            }, {
+                comparisonLane,
+                ...(options.delay ? { delay: options.delay } : {}),
+                ...(options.executor ? { executor: options.executor } : {}),
+            });
+            const baselinePointer = {
+                healthStatus: typeof result.health.healthStatus === 'string' ? result.health.healthStatus : 'unknown',
+                label: `${profile.label}-baseline`,
+                runDir: result.runDir,
+                runId: baselineRunId,
+                scenario: profile.scenario,
+                scenarioId: profile.scenarioId,
+                verdictStatus: typeof result.verdict.verdictStatus === 'string' ? result.verdict.verdictStatus : 'unknown',
+            };
+            seededBaselines.push(baselinePointer);
+            if (!isTrustedProfileRun({ health: result.health, verdict: result.verdict })) {
+                throw new Error(`iOS seeded baseline failed for ${profile.label}. Inspect ${result.runDir}/agent-summary.md.`);
+            }
+        }
+    }
     for (const profile of EXAMPLE_PROFILES) {
         const profileRunId = buildLiveRunId(profile.runId, runSuffix);
         const result = await runProfileIos({
@@ -412,6 +465,7 @@ async function runExampleIosLiveProof(args, options = {}) {
         outputDir,
         preflightDir: preflight.runDir,
         profiles,
+        seededBaselines,
         skippedInteractionProofs,
     };
     if (isEnabledFlag(args['fail-on-regression'])) {
@@ -436,6 +490,7 @@ function formatResult(result) {
         `Preflight: ${result.preflightDir}/agent-summary.md`,
         ...result.profiles.map((profile) => (`${profile.label}: ${profile.runDir}/agent-summary.md`)),
         ...result.interactionProofs.map((proof) => (`${proof.label}: ${proof.runDir}/agent-summary.md`)),
+        ...result.seededBaselines.map((profile) => (`${profile.label}: ${profile.runDir}/agent-summary.md`)),
         ...(result.comparisons.length > 0
             ? [
                 'Comparisons:',

package/dist/runner/ios-simctl.d.ts CHANGED Viewed

@@ -43,9 +43,14 @@ type IosSimctlDeepLink = {
 };
 type IosProfileSessionStorageCommand = {
     command: string;
+    commandId?: string;
     id?: string;
     label?: string;
+    queueId?: string;
+    sequence?: number;
     timestamp?: number;
+    waitForMilestone?: string;
+    waitTimeoutMs?: number;
 };
 type IosProfileSessionStorageSeed = {
     commands?: IosProfileSessionStorageCommand[];

package/dist/runner/ios-simctl.js CHANGED Viewed

@@ -456,7 +456,13 @@ async function seedProfileSessionStorage({ bundleId, commands = [], dataContaine
         scenario,
         runId,
         command: profileCommand.command,
+        ...(typeof profileCommand.commandId === 'string' ? { commandId: profileCommand.commandId } : {}),
+        ...(typeof profileCommand.label === 'string' ? { label: profileCommand.label } : {}),
+        ...(typeof profileCommand.queueId === 'string' ? { queueId: profileCommand.queueId } : {}),
+        ...(typeof profileCommand.sequence === 'number' ? { sequence: profileCommand.sequence } : {}),
         timestamp: typeof profileCommand.timestamp === 'number' ? profileCommand.timestamp : startedAt + index + 1,
+        ...(typeof profileCommand.waitForMilestone === 'string' ? { waitForMilestone: profileCommand.waitForMilestone } : {}),
+        ...(typeof profileCommand.waitTimeoutMs === 'number' ? { waitTimeoutMs: profileCommand.waitTimeoutMs } : {}),
     }));
     manifest[profileStorageKeys.session] = JSON.stringify(session);
     if (queuedCommands.length > 0) {

package/dist/runner/live-comparison.d.ts CHANGED Viewed

@@ -4,7 +4,7 @@ type LiveProfileForComparison = {
     runId: string;
     scenarioId: string;
 };
-type ComparisonMetricStatus = 'better' | 'worse' | 'unchanged' | 'inconclusive';
+type ComparisonMetricStatus = 'better' | 'worse' | 'unchanged' | 'inconclusive' | 'low_confidence';
 type ComparisonMetricHighlight = {
     baseline: number | boolean | null;
     current: number | boolean | null;
@@ -25,7 +25,7 @@ type LiveComparisonResult = {
     reason: string | null;
     runId: string;
     scenarioId: string;
-    status: 'better' | 'worse' | 'unchanged' | 'mixed' | 'inconclusive' | 'skipped';
+    status: 'better' | 'worse' | 'unchanged' | 'mixed' | 'inconclusive' | 'low_confidence' | 'skipped';
     summaryPath: string | null;
 };
 type CompareLiveProfilesOptions = {

package/dist/runner/live-comparison.js CHANGED Viewed

@@ -44,6 +44,7 @@ function buildComparisonMetricSummary(comparison) {
         worse: 0,
         unchanged: 0,
         inconclusive: 0,
+        low_confidence: 0,
     };
     const notableMetrics = [];
     for (const metric of comparison.metricComparisons) {
@@ -52,7 +53,7 @@ function buildComparisonMetricSummary(comparison) {
         }
         const record = metric;
         const status = record.status;
-        if (status !== 'better' && status !== 'worse' && status !== 'unchanged' && status !== 'inconclusive') {
+        if (status !== 'better' && status !== 'worse' && status !== 'unchanged' && status !== 'inconclusive' && status !== 'low_confidence') {
             continue;
         }
         counts[status] += 1;

package/dist/runner/live-proof-summary.d.ts CHANGED Viewed

@@ -53,10 +53,10 @@ type LiveProofComparisonPointer = {
     reason: string | null;
     runId: string;
     scenarioId: string;
-    status: 'better' | 'worse' | 'unchanged' | 'mixed' | 'inconclusive' | 'skipped';
+    status: 'better' | 'worse' | 'unchanged' | 'mixed' | 'inconclusive' | 'low_confidence' | 'skipped';
     summaryPath: string | null;
 };
-type LiveProofComparisonMetricStatus = 'better' | 'worse' | 'unchanged' | 'inconclusive';
+type LiveProofComparisonMetricStatus = 'better' | 'worse' | 'unchanged' | 'inconclusive' | 'low_confidence';
 type LiveProofComparisonMetricSummary = {
     counts: Record<LiveProofComparisonMetricStatus, number>;
     notableMetrics: Array<{
@@ -94,17 +94,18 @@ type LiveProofArtifact = {
     status: 'passed' | 'failed';
     summary: string;
 };
-type LiveProofComparisonStatus = ('baseline_missing' | 'improved' | 'inconclusive' | 'mixed' | 'not_compared' | 'regressed' | 'unchanged');
+type LiveProofComparisonStatus = ('baseline_missing' | 'improved' | 'inconclusive' | 'low_confidence' | 'mixed' | 'not_compared' | 'regressed' | 'unchanged');
 type LiveProofComparisonCounts = {
     better: number;
     inconclusive: number;
+    low_confidence: number;
     mixed: number;
     skipped: number;
     unchanged: number;
     worse: number;
 };
 type LiveProofNextAction = {
-    code: 'establish_baseline' | 'inspect_failed_run' | 'inspect_inconclusive' | 'inspect_mixed' | 'inspect_regressions' | 'inspect_summary';
+    code: 'establish_baseline' | 'inspect_failed_run' | 'inspect_inconclusive' | 'inspect_low_confidence' | 'inspect_mixed' | 'inspect_regressions' | 'inspect_summary';
     summary: string;
 };
 type LiveProofSummaryResult = {

package/dist/runner/live-proof-summary.js CHANGED Viewed

@@ -168,6 +168,9 @@ function buildLiveProofComparisonStatus(comparisons) {
     if (statuses.includes('inconclusive')) {
         return 'inconclusive';
     }
+    if (statuses.includes('low_confidence')) {
+        return 'low_confidence';
+    }
     if (statuses.every((status) => status === 'skipped')) {
         return 'baseline_missing';
     }
@@ -192,6 +195,7 @@ function buildLiveProofComparisonCounts(comparisons) {
     const counts = {
         better: 0,
         inconclusive: 0,
+        low_confidence: 0,
         mixed: 0,
         skipped: 0,
         unchanged: 0,
@@ -233,6 +237,12 @@ function buildLiveProofNextAction(comparisonStatus, status = 'passed') {
             summary: 'Some comparisons are inconclusive or incomplete; inspect scenario health and missing baseline details.',
         };
     }
+    if (comparisonStatus === 'low_confidence') {
+        return {
+            code: 'inspect_low_confidence',
+            summary: 'Some comparisons show low-confidence timing movement; repeat or multi-sample proof is required before treating it as a regression.',
+        };
+    }
     if (comparisonStatus === 'mixed') {
         return {
             code: 'inspect_mixed',
@@ -292,7 +302,7 @@ function formatComparisonMetricSummary(comparison) {
     if (!summary) {
         return '';
     }
-    const counts = `metrics better=${summary.counts.better} worse=${summary.counts.worse} unchanged=${summary.counts.unchanged} inconclusive=${summary.counts.inconclusive}`;
+    const counts = `metrics better=${summary.counts.better} worse=${summary.counts.worse} unchanged=${summary.counts.unchanged} inconclusive=${summary.counts.inconclusive} low_confidence=${summary.counts.low_confidence}`;
     const highlights = summary.notableMetrics.length > 0
         ? `; notable: ${summary.notableMetrics.map(formatComparisonMetricHighlight).join(', ')}`
         : '';
@@ -345,7 +355,7 @@ function buildLiveProofMarkdown(artifact) {
         `Status: ${artifact.status}`,
         `Run: ${artifact.runId}`,
         `Comparison status: ${artifact.comparisonStatus}`,
-        `Comparison counts: better=${artifact.comparisonCounts.better} worse=${artifact.comparisonCounts.worse} unchanged=${artifact.comparisonCounts.unchanged} mixed=${artifact.comparisonCounts.mixed} inconclusive=${artifact.comparisonCounts.inconclusive} skipped=${artifact.comparisonCounts.skipped}`,
+        `Comparison counts: better=${artifact.comparisonCounts.better} worse=${artifact.comparisonCounts.worse} unchanged=${artifact.comparisonCounts.unchanged} mixed=${artifact.comparisonCounts.mixed} inconclusive=${artifact.comparisonCounts.inconclusive} low_confidence=${artifact.comparisonCounts.low_confidence} skipped=${artifact.comparisonCounts.skipped}`,
         `Next action: ${artifact.nextAction.code} - ${artifact.nextAction.summary}`,
         `Summary: ${artifact.summary}`,
         '',

package/dist/runner/live-proof.d.ts CHANGED Viewed

@@ -17,6 +17,7 @@ type LiveProofArtifact = {
         skipped: number;
         unchanged: number;
         worse: number;
+        low_confidence: number;
     };
     comparisonStatus: string;
     comparisons: LiveProofComparisonPointer[];
@@ -82,7 +83,7 @@ type LiveProofArtifact = {
     summary: string;
 };
 type LiveProofComparisonCounts = LiveProofArtifact['comparisonCounts'];
-type LiveProofMetricStatus = 'better' | 'worse' | 'unchanged' | 'inconclusive';
+type LiveProofMetricStatus = 'better' | 'worse' | 'unchanged' | 'inconclusive' | 'low_confidence';
 type LiveProofPlatform = LiveProofArtifact['platform'];
 type LiveProofComparisonPointer = {
     baselineDir?: string | null;
@@ -102,7 +103,7 @@ type LiveProofComparisonPointer = {
     status?: string;
     summaryPath?: string | null;
 };
-type LiveProofAggregateStatus = ('baseline_missing' | 'improved' | 'inconclusive' | 'mixed' | 'not_compared' | 'regressed' | 'unchanged');
+type LiveProofAggregateStatus = ('baseline_missing' | 'improved' | 'inconclusive' | 'low_confidence' | 'mixed' | 'not_compared' | 'regressed' | 'unchanged');
 type LiveProofNextActionCode = LiveProofArtifact['nextAction']['code'];
 type LiveProofSetArtifact = {
     failureReasons: string[];

package/dist/runner/live-proof.js CHANGED Viewed

@@ -153,6 +153,7 @@ function countLiveProofComparisons(comparisons) {
     const counts = {
         better: 0,
         inconclusive: 0,
+        low_confidence: 0,
         mixed: 0,
         skipped: 0,
         unchanged: 0,
@@ -183,6 +184,9 @@ function deriveLiveProofComparisonStatus(comparisons) {
     if (statuses.includes('inconclusive')) {
         return 'inconclusive';
     }
+    if (statuses.includes('low_confidence')) {
+        return 'low_confidence';
+    }
     if (statuses.every((status) => status === 'skipped')) {
         return 'baseline_missing';
     }
@@ -217,6 +221,9 @@ function expectedLiveProofNextActionCode(comparisonStatus, status = 'passed') {
     if (comparisonStatus === 'inconclusive') {
         return 'inspect_inconclusive';
     }
+    if (comparisonStatus === 'low_confidence') {
+        return 'inspect_low_confidence';
+    }
     if (comparisonStatus === 'mixed') {
         return 'inspect_mixed';
     }
@@ -439,7 +446,7 @@ function formatComparisonPointerMetrics(comparison) {
     const highlightText = highlights.length > 0
         ? `; notable: ${highlights.map(formatMetricHighlight).join(', ')}`
         : '';
-    return ` (metrics better=${counts.better} worse=${counts.worse} unchanged=${counts.unchanged} inconclusive=${counts.inconclusive}${highlightText})`;
+    return ` (metrics better=${counts.better} worse=${counts.worse} unchanged=${counts.unchanged} inconclusive=${counts.inconclusive} low_confidence=${counts.low_confidence}${highlightText})`;
 }
 /**
  * Formats capture counts for one interaction proof pointer.
@@ -815,7 +822,7 @@ function formatLiveProof(proof) {
         `Skipped interaction proofs: ${proof.skippedInteractionProofs?.length ?? 0}`,
         ...(proof.skippedInteractionProofs ?? []).map((proofPointer) => (`- ${proofPointer.label} (${proofPointer.runnerId}/${proofPointer.scenarioId}/${proofPointer.runId}): ${proofPointer.reason} next=${proofPointer.nextAction.code}`)),
         `Comparisons: ${proof.comparisons.length}`,
-        `Comparison counts: better=${proof.comparisonCounts.better} worse=${proof.comparisonCounts.worse} unchanged=${proof.comparisonCounts.unchanged} mixed=${proof.comparisonCounts.mixed} inconclusive=${proof.comparisonCounts.inconclusive} skipped=${proof.comparisonCounts.skipped}`,
+        `Comparison counts: better=${proof.comparisonCounts.better} worse=${proof.comparisonCounts.worse} unchanged=${proof.comparisonCounts.unchanged} mixed=${proof.comparisonCounts.mixed} inconclusive=${proof.comparisonCounts.inconclusive} low_confidence=${proof.comparisonCounts.low_confidence} skipped=${proof.comparisonCounts.skipped}`,
         ...proof.comparisons.map((comparison) => (`- ${comparison.label ?? 'comparison'} (${comparison.scenarioId ?? 'unknown-scenario'}/${comparison.runId ?? 'unknown-run'}): ${comparison.status ?? 'unknown'}${formatComparisonPointerMetrics(comparison)}`)),
         `Next action: ${proof.nextAction.code} - ${proof.nextAction.summary}`,
         `Summary: ${proof.summary}`,

package/dist/runner/profile-android.d.ts CHANGED Viewed

@@ -8,8 +8,13 @@ type AndroidProfileOptions = {
 };
 type AndroidAdbProfileCommand = {
     command: string;
+    commandId?: string;
     label?: string;
+    queueId?: string;
+    sequence?: number;
+    waitForMilestone?: string;
     waitMs?: number;
+    waitTimeoutMs?: number;
 };
 type AndroidAdbDriverStep = import('./android-adb').AndroidAdbDriverStep;
 /**