npm - @elaraai/e3-core - Versions diffs - 0.0.2-beta.36 → 0.0.2-beta.38 - Mend

@elaraai/e3-core 0.0.2-beta.36 → 0.0.2-beta.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (81)

package/dist/src/dataflow/api-compat.d.ts.map +1 -1
package/dist/src/dataflow/api-compat.js +6 -1
package/dist/src/dataflow/api-compat.js.map +1 -1
package/dist/src/dataflow/orchestrator/LocalOrchestrator.d.ts +22 -4
package/dist/src/dataflow/orchestrator/LocalOrchestrator.d.ts.map +1 -1
package/dist/src/dataflow/orchestrator/LocalOrchestrator.js +353 -79
package/dist/src/dataflow/orchestrator/LocalOrchestrator.js.map +1 -1
package/dist/src/dataflow/orchestrator/interfaces.d.ts +6 -0
package/dist/src/dataflow/orchestrator/interfaces.d.ts.map +1 -1
package/dist/src/dataflow/orchestrator/interfaces.js +1 -0
package/dist/src/dataflow/orchestrator/interfaces.js.map +1 -1
package/dist/src/dataflow/state-store/InMemoryStateStore.d.ts.map +1 -1
package/dist/src/dataflow/state-store/InMemoryStateStore.js +8 -0
package/dist/src/dataflow/state-store/InMemoryStateStore.js.map +1 -1
package/dist/src/dataflow/steps.d.ts +74 -28
package/dist/src/dataflow/steps.d.ts.map +1 -1
package/dist/src/dataflow/steps.js +221 -42
package/dist/src/dataflow/steps.js.map +1 -1
package/dist/src/dataflow/types.d.ts +13 -2
package/dist/src/dataflow/types.d.ts.map +1 -1
package/dist/src/dataflow.d.ts +37 -95
package/dist/src/dataflow.d.ts.map +1 -1
package/dist/src/dataflow.js +121 -631
package/dist/src/dataflow.js.map +1 -1
package/dist/src/dataset-refs.d.ts +124 -0
package/dist/src/dataset-refs.d.ts.map +1 -0
package/dist/src/dataset-refs.js +319 -0
package/dist/src/dataset-refs.js.map +1 -0
package/dist/src/execution/MockTaskRunner.d.ts +1 -1
package/dist/src/execution/MockTaskRunner.d.ts.map +1 -1
package/dist/src/execution/MockTaskRunner.js +1 -2
package/dist/src/execution/MockTaskRunner.js.map +1 -1
package/dist/src/index.d.ts +5 -4
package/dist/src/index.d.ts.map +1 -1
package/dist/src/index.js +6 -4
package/dist/src/index.js.map +1 -1
package/dist/src/packages.d.ts.map +1 -1
package/dist/src/packages.js +20 -7
package/dist/src/packages.js.map +1 -1
package/dist/src/storage/in-memory/InMemoryStorage.d.ts +26 -4
package/dist/src/storage/in-memory/InMemoryStorage.d.ts.map +1 -1
package/dist/src/storage/in-memory/InMemoryStorage.js +104 -21
package/dist/src/storage/in-memory/InMemoryStorage.js.map +1 -1
package/dist/src/storage/index.d.ts +2 -2
package/dist/src/storage/index.d.ts.map +1 -1
package/dist/src/storage/index.js +1 -1
package/dist/src/storage/index.js.map +1 -1
package/dist/src/storage/interfaces.d.ts +52 -1
package/dist/src/storage/interfaces.d.ts.map +1 -1
package/dist/src/storage/local/LocalBackend.d.ts +3 -1
package/dist/src/storage/local/LocalBackend.d.ts.map +1 -1
package/dist/src/storage/local/LocalBackend.js +5 -1
package/dist/src/storage/local/LocalBackend.js.map +1 -1
package/dist/src/storage/local/LocalDatasetRefStore.d.ts +22 -0
package/dist/src/storage/local/LocalDatasetRefStore.d.ts.map +1 -0
package/dist/src/storage/local/LocalDatasetRefStore.js +118 -0
package/dist/src/storage/local/LocalDatasetRefStore.js.map +1 -0
package/dist/src/storage/local/LocalLockService.d.ts +6 -0
package/dist/src/storage/local/LocalLockService.d.ts.map +1 -1
package/dist/src/storage/local/LocalLockService.js +17 -4
package/dist/src/storage/local/LocalLockService.js.map +1 -1
package/dist/src/storage/local/LocalRepoStore.d.ts +4 -2
package/dist/src/storage/local/LocalRepoStore.d.ts.map +1 -1
package/dist/src/storage/local/LocalRepoStore.js +14 -2
package/dist/src/storage/local/LocalRepoStore.js.map +1 -1
package/dist/src/storage/local/gc.d.ts.map +1 -1
package/dist/src/storage/local/gc.js +8 -1
package/dist/src/storage/local/gc.js.map +1 -1
package/dist/src/storage/local/index.d.ts +1 -0
package/dist/src/storage/local/index.d.ts.map +1 -1
package/dist/src/storage/local/index.js +1 -0
package/dist/src/storage/local/index.js.map +1 -1
package/dist/src/trees.d.ts +35 -43
package/dist/src/trees.d.ts.map +1 -1
package/dist/src/trees.js +228 -449
package/dist/src/trees.js.map +1 -1
package/dist/src/workspaces.d.ts +6 -27
package/dist/src/workspaces.d.ts.map +1 -1
package/dist/src/workspaces.js +42 -55
package/dist/src/workspaces.js.map +1 -1
package/package.json +1 -1

package/dist/src/dataflow.js CHANGED

@@ -5,29 +5,19 @@
 /**
  * Dataflow execution for e3 workspaces.
  *
- * Executes tasks in a workspace based on their dependency graph. Tasks are
- * executed in parallel where possible, respecting a concurrency limit.
+ * Provides the high-level `dataflowExecute` entry point (which delegates
+ * to `LocalOrchestrator`) and shared graph-building utilities used by
+ * both local and cloud execution paths.
  *
- * The execution model is event-driven with a work queue:
- * 1. Build dependency graph from tasks (input paths -> task -> output path)
- * 2. Compute reverse dependencies (which tasks depend on each output)
- * 3. Initialize ready queue with tasks whose inputs are all assigned
- * 4. Execute tasks from ready queue, respecting concurrency limit
- * 5. On task completion, queue workspace update then check dependents for readiness
- * 6. On failure, stop launching new tasks but wait for running ones
- *
- * IMPORTANT: Workspace state updates are serialized through an async queue to
- * prevent race conditions when multiple tasks complete concurrently. Each task's
- * output is written to the workspace and dependents are notified only after the
- * write completes, ensuring downstream tasks see consistent state.
+ * The reactive execution logic (input change detection, task invalidation,
+ * version vector consistency) lives in `dataflow/steps.ts` and is orchestrated
+ * by `dataflow/orchestrator/LocalOrchestrator.ts`.
  */
-import { decodeBeast2For, encodeBeast2For, variant } from '@elaraai/east';
+import { decodeBeast2For, variant } from '@elaraai/east';
 import { PackageObjectType, TaskObjectType, WorkspaceStateType, pathToString, } from '@elaraai/e3-types';
 import { executionGetOutput, inputsHash, } from './executions.js';
-import { uuidv7 } from './uuid.js';
-import { taskExecute } from './execution/LocalTaskRunner.js';
-import { workspaceGetDatasetHash, workspaceSetDatasetByHash, } from './trees.js';
-import { E3Error, WorkspaceNotFoundError, WorkspaceNotDeployedError, WorkspaceLockError, TaskNotFoundError, DataflowError, DataflowAbortedError, } from './errors.js';
+import { workspaceGetDatasetHash, } from './trees.js';
+import { E3Error, WorkspaceNotFoundError, WorkspaceNotDeployedError, DataflowError, } from './errors.js';
 // =============================================================================
 // Path Parsing Helper
 // =============================================================================
@@ -76,53 +66,6 @@ export function parsePathString(pathStr) {
     return segments;
 }
 // =============================================================================
-// Async Mutex for Workspace Updates
-// =============================================================================
-/**
- * Simple async mutex to serialize workspace state updates.
- *
- * When multiple tasks complete concurrently, their workspace writes must be
- * serialized to prevent race conditions (read-modify-write on the workspace
- * root hash). This mutex ensures only one update runs at a time.
- */
-class AsyncMutex {
-    queue = [];
-    locked = false;
-    /**
-     * Acquire the mutex, execute the callback, then release.
-     * If the mutex is already held, waits until it's available.
-     */
-    async runExclusive(fn) {
-        await this.acquire();
-        try {
-            return await fn();
-        }
-        finally {
-            this.release();
-        }
-    }
-    acquire() {
-        return new Promise((resolve) => {
-            if (!this.locked) {
-                this.locked = true;
-                resolve();
-            }
-            else {
-                this.queue.push(resolve);
-            }
-        });
-    }
-    release() {
-        const next = this.queue.shift();
-        if (next) {
-            next();
-        }
-        else {
-            this.locked = false;
-        }
-    }
-}
-// =============================================================================
 // Workspace State Reader
 // =============================================================================
 /**
@@ -146,22 +89,14 @@ async function readWorkspaceState(storage, repo, ws) {
 // =============================================================================
 /**
  * Build the dependency graph for a workspace.
- *
- * Returns:
- * - taskNodes: Map of task name -> TaskNode
- * - outputToTask: Map of output path string -> task name
- * - taskDependents: Map of task name -> set of dependent task names
  */
 async function buildDependencyGraph(storage, repo, ws) {
-    // Read workspace state to get package hash
     const state = await readWorkspaceState(storage, repo, ws);
-    // Read package object to get tasks map
     const pkgData = await storage.objects.read(repo, state.packageHash);
     const pkgDecoder = decodeBeast2For(PackageObjectType);
     const pkgObject = pkgDecoder(Buffer.from(pkgData));
     const taskNodes = new Map();
-    const outputToTask = new Map(); // output path -> task name
-    // First pass: load all tasks and build output->task map
+    const outputToTask = new Map();
     const taskDecoder = decodeBeast2For(TaskObjectType);
     for (const [taskName, taskHash] of pkgObject.tasks) {
         const taskData = await storage.objects.read(repo, taskHash);
@@ -174,32 +109,24 @@ async function buildDependencyGraph(storage, repo, ws) {
             task,
             inputPaths: task.inputs,
             outputPath: task.output,
-            unresolvedCount: 0, // Will be computed below
+            unresolvedCount: 0,
         });
     }
-    // Build reverse dependency map: task -> tasks that depend on it
     const taskDependents = new Map();
     for (const taskName of taskNodes.keys()) {
         taskDependents.set(taskName, new Set());
     }
-    // Second pass: compute dependencies and unresolved counts
     for (const [taskName, node] of taskNodes) {
         for (const inputPath of node.inputPaths) {
             const inputPathStr = pathToString(inputPath);
             const producerTask = outputToTask.get(inputPathStr);
             if (producerTask) {
-                // This input comes from another task's output.
-                // The task cannot run until the producer task completes,
-                // regardless of whether the output is currently assigned
-                // (it might be stale from a previous run).
                 taskDependents.get(producerTask).add(taskName);
                 node.unresolvedCount++;
             }
-            // If not produced by a task, it's an external input - check if assigned
             else {
                 const { refType } = await workspaceGetDatasetHash(storage, repo, ws, inputPath);
                 if (refType === 'unassigned') {
-                    // External input that is unassigned - this task can never run
                     node.unresolvedCount++;
                 }
             }
@@ -213,19 +140,16 @@ async function buildDependencyGraph(storage, repo, ws) {
 /**
  * Execute all tasks in a workspace according to the dependency graph.
  *
- * Tasks are executed in parallel where dependencies allow, respecting
- * the concurrency limit. On failure, no new tasks are launched but
- * running tasks are allowed to complete.
- *
- * Acquires an exclusive lock on the workspace for the duration of execution
- * to prevent concurrent modifications. If options.lock is provided, uses that
- * lock instead (caller is responsible for releasing it).
+ * Delegates to `LocalOrchestrator` which implements reactive fixpoint
+ * execution using step functions. After each task completes, input changes
+ * are detected and affected tasks are invalidated and re-executed.
  *
  * @param storage - Storage backend
- * @param repo - Repository identifier (for local storage, the path to e3 repository directory)
+ * @param repo - Repository identifier
  * @param ws - Workspace name
  * @param options - Execution options
  * @returns Result of the dataflow execution
+ *
  * @throws {WorkspaceLockError} If workspace is locked by another process
  * @throws {WorkspaceNotFoundError} If workspace doesn't exist
  * @throws {WorkspaceNotDeployedError} If workspace has no package deployed
@@ -233,494 +157,83 @@ async function buildDependencyGraph(storage, repo, ws) {
  * @throws {DataflowError} If execution fails for other reasons
  */
 export async function dataflowExecute(storage, repo, ws, options = {}) {
-    // Acquire lock if not provided externally
-    const externalLock = options.lock;
-    const lock = externalLock ?? await storage.locks.acquire(repo, ws, variant('dataflow', null));
-    if (!lock) {
-        // Lock couldn't be acquired - the LockService returns null instead of throwing
-        throw new WorkspaceLockError(ws);
-    }
-    try {
-        return await dataflowExecuteWithLock(storage, repo, ws, options);
-    }
-    finally {
-        // Only release the lock if we acquired it internally
-        if (!externalLock) {
-            await lock.release();
-        }
-    }
+    const { LocalOrchestrator } = await import('./dataflow/orchestrator/LocalOrchestrator.js');
+    const orchestrator = new LocalOrchestrator();
+    const taskResults = [];
+    const handle = await orchestrator.start(storage, repo, ws, {
+        concurrency: options.concurrency,
+        force: options.force,
+        filter: options.filter,
+        signal: options.signal,
+        lock: options.lock,
+        runner: options.runner,
+        onTaskStart: options.onTaskStart,
+        onTaskComplete: (result) => {
+            taskResults.push({
+                name: result.name,
+                cached: result.cached,
+                state: result.state,
+                error: result.error,
+                exitCode: result.exitCode,
+                duration: result.duration,
+            });
+            options.onTaskComplete?.({
+                name: result.name,
+                cached: result.cached,
+                state: result.state,
+                error: result.error,
+                exitCode: result.exitCode,
+                duration: result.duration,
+            });
+        },
+        onStdout: options.onStdout,
+        onStderr: options.onStderr,
+        onInputChanged: options.onInputChanged,
+        onTaskInvalidated: options.onTaskInvalidated,
+        onTaskDeferred: options.onTaskDeferred,
+    });
+    const result = await orchestrator.wait(handle);
+    return {
+        success: result.success,
+        runId: result.runId,
+        executed: result.executed,
+        cached: result.cached,
+        failed: result.failed,
+        skipped: result.skipped,
+        reexecuted: result.reexecuted,
+        tasks: taskResults,
+        duration: result.duration,
+    };
 }
 /**
- * Start dataflow execution in the background (non-blocking).
- *
- * Returns a promise immediately without awaiting execution. The lock is
- * released automatically when execution completes.
+ * Execute dataflow with an externally-held lock.
+ * The lock is released automatically when execution completes or fails.
  *
  * @param storage - Storage backend
- * @param repo - Repository identifier (for local storage, the path to e3 repository directory)
+ * @param repo - Repository identifier
  * @param ws - Workspace name
  * @param options - Execution options (lock must be provided)
  * @returns Promise that resolves when execution completes
- * @throws {WorkspaceNotFoundError} If workspace doesn't exist
- * @throws {WorkspaceNotDeployedError} If workspace has no package deployed
- * @throws {TaskNotFoundError} If filter specifies a task that doesn't exist
- * @throws {DataflowError} If execution fails for other reasons
  */
-export function dataflowStart(storage, repo, ws, options) {
-    return dataflowExecuteWithLock(storage, repo, ws, options)
-        .finally(() => options.lock.release());
-}
-/**
- * Internal: Execute dataflow with lock already held.
- */
-async function dataflowExecuteWithLock(storage, repo, ws, options) {
-    const startTime = Date.now();
-    const startedAt = new Date();
-    const concurrency = options.concurrency ?? 4;
-    // Generate run ID for this execution
-    const runId = uuidv7();
-    let taskNodes;
-    let taskDependents;
-    let outputToTask;
-    let wsState;
+export async function dataflowStart(storage, repo, ws, options) {
     try {
-        // Read workspace state for run tracking
-        wsState = await readWorkspaceState(storage, repo, ws);
-        // Build dependency graph
-        const graphResult = await buildDependencyGraph(storage, repo, ws);
-        taskNodes = graphResult.taskNodes;
-        taskDependents = graphResult.taskDependents;
-        outputToTask = graphResult.outputToTask;
-    }
-    catch (err) {
-        // Re-throw E3Errors as-is
-        if (err instanceof E3Error)
-            throw err;
-        // Wrap unexpected errors
-        throw new DataflowError(`Failed to build dependency graph: ${err instanceof Error ? err.message : err}`);
+        return await dataflowExecute(storage, repo, ws, options);
     }
-    // Clean up all previous runs (we hold the lock, so no concurrent runs)
-    const allRunIds = await storage.refs.dataflowRunList(repo, ws);
-    for (const oldRunId of allRunIds) {
-        await storage.refs.dataflowRunDelete(repo, ws, oldRunId);
-    }
-    // Initialize task execution records map
-    const taskExecutions = new Map();
-    // Create initial DataflowRun record
-    const initialRun = {
-        runId,
-        workspaceName: ws,
-        packageRef: `${wsState.packageName}@${wsState.packageVersion}`,
-        startedAt,
-        completedAt: variant('none', null),
-        status: variant('running', {}),
-        inputSnapshot: wsState.rootHash,
-        outputSnapshot: variant('none', null),
-        taskExecutions: taskExecutions,
-        summary: {
-            total: BigInt(taskNodes.size),
-            completed: 0n,
-            cached: 0n,
-            failed: 0n,
-            skipped: 0n,
-        },
-    };
-    // Write initial run record
-    await storage.refs.dataflowRunWrite(repo, ws, initialRun);
-    // Build DataflowGraph for use with decomposed building blocks
-    const dataflowGraph = {
-        tasks: Array.from(taskNodes.entries()).map(([taskName, node]) => {
-            const dependsOn = [];
-            for (const inputPath of node.inputPaths) {
-                const inputPathStr = pathToString(inputPath);
-                const producerTask = outputToTask.get(inputPathStr);
-                if (producerTask) {
-                    dependsOn.push(producerTask);
-                }
-            }
-            return {
-                name: taskName,
-                hash: node.hash,
-                inputs: node.inputPaths.map(pathToString),
-                output: pathToString(node.outputPath),
-                dependsOn,
-            };
-        }),
-    };
-    // Apply filter if specified
-    const filteredTaskNames = options.filter
-        ? new Set([options.filter])
-        : null;
-    // Validate filter
-    if (filteredTaskNames && options.filter && !taskNodes.has(options.filter)) {
-        throw new TaskNotFoundError(options.filter);
-    }
-    // Track execution state
-    const results = [];
-    let executed = 0;
-    let cached = 0;
-    let failed = 0;
-    let skipped = 0;
-    let hasFailure = false;
-    let aborted = false;
-    // Check for abort signal
-    const checkAborted = () => {
-        if (options.signal?.aborted && !aborted) {
-            aborted = true;
-        }
-        return aborted;
-    };
-    // Mutex to serialize workspace state updates.
-    // When multiple tasks complete concurrently, their writes to the workspace
-    // must be serialized to prevent lost updates (read-modify-write race).
-    const workspaceUpdateMutex = new AsyncMutex();
-    // Ready queue: tasks with all dependencies resolved
-    const readyQueue = [];
-    const completed = new Set();
-    const inProgress = new Set();
-    const skippedTasks = new Set(); // Track skipped tasks separately for dataflowGetDependentsToSkip
-    // Initialize ready queue with tasks that have no unresolved dependencies
-    // and pass the filter (if any)
-    for (const [taskName, node] of taskNodes) {
-        if (node.unresolvedCount === 0) {
-            if (!filteredTaskNames || filteredTaskNames.has(taskName)) {
-                readyQueue.push(taskName);
-            }
-        }
-    }
-    // Check if the task has a valid cached execution for current inputs
-    // Returns the output hash and executionId if cached, null if re-execution is needed
-    async function getCachedOutput(taskName) {
-        const node = taskNodes.get(taskName);
-        // Gather current input hashes
-        const currentInputHashes = [];
-        for (const inputPath of node.inputPaths) {
-            const { refType, hash } = await workspaceGetDatasetHash(storage, repo, ws, inputPath);
-            if (refType !== 'value' || hash === null) {
-                // Input not assigned, can't be cached
-                return null;
-            }
-            currentInputHashes.push(hash);
-        }
-        // Check if there's a cached execution for these inputs
-        const inHash = inputsHash(currentInputHashes);
-        const cachedOutputHash = await executionGetOutput(storage, repo, node.hash, inHash);
-        if (cachedOutputHash === null) {
-            // No cached execution for current inputs
-            return null;
-        }
-        // Get the latest execution status to retrieve the executionId
-        const latestStatus = await storage.refs.executionGetLatest(repo, node.hash, inHash);
-        if (!latestStatus || latestStatus.type !== 'success') {
-            // Latest execution wasn't a success
-            return null;
-        }
-        // Also verify the workspace output matches the cached output
-        // (in case the workspace was modified outside of execution)
-        const { refType, hash: wsOutputHash } = await workspaceGetDatasetHash(storage, repo, ws, node.outputPath);
-        if (refType !== 'value' || wsOutputHash !== cachedOutputHash) {
-            // Workspace output doesn't match cached output, need to re-execute
-            // (or update workspace with cached value)
-            return null;
-        }
-        return { outputHash: cachedOutputHash, executionId: latestStatus.value.executionId };
-    }
-    // Execute a single task (does NOT write to workspace - caller must do that)
-    async function executeTask(taskName) {
-        const node = taskNodes.get(taskName);
-        const taskStartTime = Date.now();
-        options.onTaskStart?.(taskName);
-        // Gather input hashes
-        const inputHashes = [];
-        for (const inputPath of node.inputPaths) {
-            const { refType, hash } = await workspaceGetDatasetHash(storage, repo, ws, inputPath);
-            if (refType !== 'value' || hash === null) {
-                // Input not available - should not happen if dependency tracking is correct
-                return {
-                    name: taskName,
-                    cached: false,
-                    state: 'error',
-                    error: `Input at ${pathToString(inputPath)} is not assigned (refType: ${refType})`,
-                    duration: Date.now() - taskStartTime,
-                };
-            }
-            inputHashes.push(hash);
-        }
-        // Execute the task using either the provided runner or direct taskExecute()
-        const execOptions = {
-            force: options.force,
-            signal: options.signal,
-            onStdout: options.onStdout ? (data) => options.onStdout(taskName, data) : undefined,
-            onStderr: options.onStderr ? (data) => options.onStderr(taskName, data) : undefined,
-        };
-        // Use provided runner if available, otherwise call taskExecute directly
-        const result = options.runner
-            ? await options.runner.execute(storage, node.hash, inputHashes, execOptions)
-            : await taskExecute(storage, repo, node.hash, inputHashes, execOptions);
-        // Build task result (NOTE: workspace update happens later, in mutex-protected section)
-        const taskResult = {
-            name: taskName,
-            cached: result.cached,
-            executionId: result.executionId,
-            state: result.state,
-            duration: Date.now() - taskStartTime,
-        };
-        if (result.state === 'error') {
-            taskResult.error = result.error ?? undefined;
-        }
-        else if (result.state === 'failed') {
-            taskResult.exitCode = result.exitCode ?? undefined;
-        }
-        // Pass output hash to caller for workspace update (if successful)
-        if (result.state === 'success' && result.outputHash) {
-            taskResult.outputHash = result.outputHash;
-        }
-        return taskResult;
-    }
-    // Process dependents when a task completes
-    function notifyDependents(taskName) {
-        const dependents = taskDependents.get(taskName) ?? new Set();
-        for (const depName of dependents) {
-            if (completed.has(depName) || inProgress.has(depName))
-                continue;
-            // Skip dependents not in the filter
-            if (filteredTaskNames && !filteredTaskNames.has(depName))
-                continue;
-            const depNode = taskNodes.get(depName);
-            depNode.unresolvedCount--;
-            if (depNode.unresolvedCount === 0 && !readyQueue.includes(depName)) {
-                readyQueue.push(depName);
-            }
-        }
-    }
-    // Mark dependents as skipped when a task fails.
-    // Uses dataflowGetDependentsToSkip to find all transitive dependents at once
-    // (shared with distributed execution in e3-aws).
-    function skipDependents(taskName) {
-        // Get all tasks to skip (excludes already completed, already skipped, and in-progress)
-        const toSkip = dataflowGetDependentsToSkip(dataflowGraph, taskName, completed, skippedTasks)
-            .filter(name => !inProgress.has(name)) // Also exclude in-progress tasks
-            .filter(name => !filteredTaskNames || filteredTaskNames.has(name)); // Apply filter
-        for (const depName of toSkip) {
-            completed.add(depName);
-            skippedTasks.add(depName);
-            skipped++;
-            results.push({
-                name: depName,
-                cached: false,
-                state: 'skipped',
-                duration: 0,
-            });
-            options.onTaskComplete?.({
-                name: depName,
-                cached: false,
-                state: 'skipped',
-                duration: 0,
-            });
-        }
-    }
-    // Main execution loop using a work-stealing approach
-    const runningPromises = new Map();
-    async function processQueue() {
-        while (true) {
-            // Check if we're done
-            if (readyQueue.length === 0 && runningPromises.size === 0) {
-                break;
-            }
-            // Launch tasks up to concurrency limit if no failure and not aborted
-            while (!hasFailure && !checkAborted() && readyQueue.length > 0 && runningPromises.size < concurrency) {
-                const taskName = readyQueue.shift();
-                if (completed.has(taskName) || inProgress.has(taskName))
-                    continue;
-                // Check if there's a valid cached execution for current inputs
-                const cachedResult = await getCachedOutput(taskName);
-                if (cachedResult !== null && !options.force) {
-                    // Valid cached execution exists for current inputs.
-                    // No workspace write needed (output already matches), but we still
-                    // need mutex protection for state updates to prevent races with
-                    // concurrent task completions.
-                    await workspaceUpdateMutex.runExclusive(() => {
-                        completed.add(taskName);
-                        cached++;
-                        const result = {
-                            name: taskName,
-                            cached: true,
-                            executionId: cachedResult.executionId,
-                            state: 'success',
-                            duration: 0,
-                        };
-                        results.push(result);
-                        options.onTaskComplete?.(result);
-                        notifyDependents(taskName);
-                        // Track in taskExecutions map
-                        taskExecutions.set(taskName, {
-                            executionId: cachedResult.executionId,
-                            cached: true,
-                        });
-                    });
-                    continue;
-                }
-                inProgress.add(taskName);
-                const promise = (async () => {
-                    try {
-                        const result = await executeTask(taskName);
-                        // Use mutex to serialize workspace updates and dependent notifications.
-                        // This prevents race conditions where two tasks complete simultaneously,
-                        // both read the same workspace state, and one overwrites the other's changes.
-                        await workspaceUpdateMutex.runExclusive(async () => {
-                            // Write output to workspace BEFORE notifying dependents
-                            if (result.state === 'success' && result.outputHash) {
-                                const node = taskNodes.get(taskName);
-                                await workspaceSetDatasetByHash(storage, repo, ws, node.outputPath, result.outputHash);
-                            }
-                            // Now safe to update execution state and notify dependents
-                            inProgress.delete(taskName);
-                            completed.add(taskName);
-                            results.push(result);
-                            options.onTaskComplete?.(result);
-                            if (result.state === 'success') {
-                                if (result.cached) {
-                                    cached++;
-                                }
-                                else {
-                                    executed++;
-                                }
-                                notifyDependents(taskName);
-                                // Track in taskExecutions map
-                                if (result.executionId) {
-                                    taskExecutions.set(taskName, {
-                                        executionId: result.executionId,
-                                        cached: result.cached,
-                                    });
-                                }
-                            }
-                            else {
-                                failed++;
-                                hasFailure = true;
-                                skipDependents(taskName);
-                                // Track failed execution too
-                                if (result.executionId) {
-                                    taskExecutions.set(taskName, {
-                                        executionId: result.executionId,
-                                        cached: false,
-                                    });
-                                }
-                            }
-                        });
-                    }
-                    finally {
-                        runningPromises.delete(taskName);
-                    }
-                })();
-                runningPromises.set(taskName, promise);
-            }
-            // Wait for at least one task to complete if we can't launch more
-            if (runningPromises.size > 0) {
-                await Promise.race(runningPromises.values());
-            }
-            else if (readyQueue.length === 0 || aborted) {
-                // No running tasks and either:
-                // - no ready tasks (unresolvable dependencies)
-                // - aborted (stop processing)
-                break;
-            }
-        }
-    }
-    await processQueue();
-    // Wait for any remaining tasks
-    if (runningPromises.size > 0) {
-        await Promise.all(runningPromises.values());
-    }
-    // Check for abort one final time
-    checkAborted();
-    // If aborted, throw with partial results (also update run record)
-    if (aborted) {
-        const finalWsState = await readWorkspaceState(storage, repo, ws);
-        const cancelledRun = {
-            runId,
-            workspaceName: ws,
-            packageRef: `${wsState.packageName}@${wsState.packageVersion}`,
-            startedAt,
-            completedAt: variant('some', new Date()),
-            status: variant('cancelled', {}),
-            inputSnapshot: wsState.rootHash,
-            outputSnapshot: variant('some', finalWsState.rootHash),
-            taskExecutions,
-            summary: {
-                total: BigInt(taskNodes.size),
-                completed: BigInt(executed + cached),
-                cached: BigInt(cached),
-                failed: BigInt(failed),
-                skipped: BigInt(skipped),
-            },
-        };
-        await storage.refs.dataflowRunWrite(repo, ws, cancelledRun);
-        throw new DataflowAbortedError(results);
-    }
-    // Read final workspace state for output snapshot
-    const finalWsState = await readWorkspaceState(storage, repo, ws);
-    // Determine final status
-    let finalStatus;
-    if (hasFailure) {
-        // Find the failed task
-        const failedTask = results.find(r => r.state === 'failed' || r.state === 'error');
-        finalStatus = variant('failed', {
-            failedTask: failedTask?.name ?? 'unknown',
-            error: failedTask?.error ?? failedTask?.exitCode?.toString() ?? 'Task failed',
-        });
-    }
-    else {
-        finalStatus = variant('completed', {});
-    }
-    // Write final DataflowRun record
-    const finalRun = {
-        runId,
-        workspaceName: ws,
-        packageRef: `${wsState.packageName}@${wsState.packageVersion}`,
-        startedAt,
-        completedAt: variant('some', new Date()),
-        status: finalStatus,
-        inputSnapshot: wsState.rootHash,
-        outputSnapshot: variant('some', finalWsState.rootHash),
-        taskExecutions,
-        summary: {
-            total: BigInt(taskNodes.size),
-            completed: BigInt(executed + cached),
-            cached: BigInt(cached),
-            failed: BigInt(failed),
-            skipped: BigInt(skipped),
-        },
-    };
-    await storage.refs.dataflowRunWrite(repo, ws, finalRun);
-    // Update workspace state with currentRunId on success
-    if (!hasFailure) {
-        // Read, update, write workspace state
-        const currentState = await readWorkspaceState(storage, repo, ws);
-        const updatedState = {
-            ...currentState,
-            currentRunId: variant('some', runId),
-        };
-        const encoder = encodeBeast2For(WorkspaceStateType);
-        await storage.refs.workspaceWrite(repo, ws, encoder(updatedState));
+    finally {
+        await options.lock.release();
     }
-    return {
-        success: !hasFailure,
-        runId,
-        executed,
-        cached,
-        failed,
-        skipped,
-        tasks: results,
-        duration: Date.now() - startTime,
-    };
 }
+// =============================================================================
+// Graph Queries (shared between local and cloud execution)
+// =============================================================================
 /**
  * Get the dependency graph for a workspace (for visualization/debugging).
  *
  * @param storage - Storage backend
- * @param repo - Repository identifier (for local storage, the path to e3 repository directory)
+ * @param repo - Repository identifier
  * @param ws - Workspace name
  * @returns Graph information
+ *
  * @throws {WorkspaceNotFoundError} If workspace doesn't exist
  * @throws {WorkspaceNotDeployedError} If workspace has no package deployed
  * @throws {DataflowError} If graph building fails for other reasons
@@ -758,33 +271,60 @@ export async function dataflowGetGraph(storage, repo, ws) {
     }
     return { tasks };
 }
+/**
+ * Find all tasks affected by input changes (transitive dependents).
+ * An affected task is one whose output could change due to the input change.
+ *
+ * @param graph - The dependency graph
+ * @param changes - Array of changed input paths
+ * @returns Array of affected task names
+ */
+export function findAffectedTasks(graph, changes) {
+    const changedPaths = new Set(changes.map(c => c.path));
+    const affected = new Set();
+    const queue = [];
+    // Build forward dep map: task name → tasks that depend on its output
+    const taskToDependents = new Map();
+    for (const task of graph.tasks) {
+        for (const dep of task.dependsOn) {
+            if (!taskToDependents.has(dep))
+                taskToDependents.set(dep, []);
+            taskToDependents.get(dep).push(task.name);
+        }
+    }
+    // Seed: tasks that directly read a changed input
+    for (const task of graph.tasks) {
+        if (task.inputs.some(inp => changedPaths.has(inp))) {
+            queue.push(task.name);
+        }
+    }
+    // BFS through dependency graph
+    while (queue.length > 0) {
+        const name = queue.shift();
+        if (affected.has(name))
+            continue;
+        affected.add(name);
+        for (const dep of taskToDependents.get(name) ?? []) {
+            queue.push(dep);
+        }
+    }
+    return Array.from(affected);
+}
 /**
  * Get tasks that are ready to execute given the set of completed tasks.
  *
  * A task is ready when all tasks it depends on have completed.
- * This is useful for distributed execution (e.g., AWS Step Functions)
- * where a coordinator needs to determine which tasks can run next.
  *
  * @param graph - The dependency graph from dataflowGetGraph
  * @param completedTasks - Set of task names that have completed
  * @returns Array of task names that are ready to execute
- *
- * @example
- * ```typescript
- * const graph = await dataflowGetGraph(storage, repo, 'production');
- * const ready = dataflowGetReadyTasks(graph, new Set()); // Initial ready tasks
- * // Execute ready[0]...
- * const nextReady = dataflowGetReadyTasks(graph, new Set([ready[0]]));
- * ```
  */
 export function dataflowGetReadyTasks(graph, completedTasks) {
     const ready = [];
     for (const task of graph.tasks) {
-        // Skip already completed tasks
         if (completedTasks.has(task.name)) {
             continue;
         }
-        // Check if all dependencies are satisfied
         const allDepsCompleted = task.dependsOn.every(dep => completedTasks.has(dep));
         if (allDepsCompleted) {
             ready.push(task.name);
@@ -795,24 +335,11 @@ export function dataflowGetReadyTasks(graph, completedTasks) {
 /**
  * Check if a task execution is cached for the given inputs.
  *
- * This is useful for distributed execution where a Lambda handler needs
- * to check if a task can be skipped before spawning execution.
- *
  * @param storage - Storage backend
  * @param repo - Repository path
  * @param taskHash - Hash of the TaskObject
  * @param inputHashes - Array of input dataset hashes (in order)
  * @returns Output hash if cached, null if execution needed
- *
- * @example
- * ```typescript
- * const outputHash = await dataflowCheckCache(storage, repo, taskHash, inputHashes);
- * if (outputHash) {
- *   // Task is cached, use outputHash directly
- * } else {
- *   // Need to execute task
- * }
- * ```
  */
 export async function dataflowCheckCache(storage, repo, taskHash, inputHashes) {
     const inHash = inputsHash(inputHashes);
@@ -821,29 +348,16 @@ export async function dataflowCheckCache(storage, repo, taskHash, inputHashes) {
 /**
  * Find tasks that should be skipped when a task fails.
  *
- * Returns all tasks that transitively depend on the failed task
- * (directly or through other tasks), excluding already completed
- * or already skipped tasks.
- *
- * This is useful for distributed execution where the coordinator
- * needs to mark downstream tasks as skipped after a failure.
+ * Returns all tasks that transitively depend on the failed task,
+ * excluding already completed or already skipped tasks.
  *
  * @param graph - The dependency graph from dataflowGetGraph
  * @param failedTask - Name of the task that failed
- * @param completedTasks - Set of task names already completed (won't be skipped)
- * @param skippedTasks - Set of task names already skipped (won't be returned again)
+ * @param completedTasks - Set of task names already completed
+ * @param skippedTasks - Set of task names already skipped
  * @returns Array of task names that should be skipped
- *
- * @example
- * ```typescript
- * const graph = await dataflowGetGraph(storage, repo, 'production');
- * // Task 'etl' failed...
- * const toSkip = dataflowGetDependentsToSkip(graph, 'etl', completed, skipped);
- * // toSkip might be ['transform', 'aggregate', 'report'] - all downstream tasks
- * ```
  */
 export function dataflowGetDependentsToSkip(graph, failedTask, completedTasks, skippedTasks) {
-    // Build reverse dependency map: task -> tasks that depend on it
     const dependents = new Map();
     for (const task of graph.tasks) {
         dependents.set(task.name, []);
@@ -853,7 +367,6 @@ export function dataflowGetDependentsToSkip(graph, failedTask, completedTasks, s
             dependents.get(dep)?.push(task.name);
         }
     }
-    // BFS to find all transitive dependents
     const toSkip = [];
     const visited = new Set();
     const queue = [failedTask];
@@ -861,21 +374,15 @@ export function dataflowGetDependentsToSkip(graph, failedTask, completedTasks, s
         const current = queue.shift();
         const deps = dependents.get(current) ?? [];
         for (const dep of deps) {
-            // Skip if already processed
-            if (visited.has(dep)) {
+            if (visited.has(dep))
                 continue;
-            }
             visited.add(dep);
-            // Skip if already completed (no need to explore further - completed tasks break the chain)
-            if (completedTasks.has(dep)) {
+            if (completedTasks.has(dep))
                 continue;
-            }
-            // If already skipped, still explore dependents but don't add to result again
             if (skippedTasks.has(dep)) {
                 queue.push(dep);
                 continue;
             }
-            // New task to skip
             toSkip.push(dep);
             queue.push(dep);
         }
@@ -885,32 +392,15 @@ export function dataflowGetDependentsToSkip(graph, failedTask, completedTasks, s
 /**
  * Resolve input hashes for a task from current workspace state.
  *
- * Returns an array of hashes in the same order as the task's inputs.
- * If any input is unassigned, returns null for that position.
- *
- * This is useful for distributed execution where the input hashes
- * need to be resolved before checking cache or executing.
- *
  * @param storage - Storage backend
  * @param repo - Repository path
  * @param ws - Workspace name
- * @param task - Task info from the graph (needs inputs array)
+ * @param task - Task info from the graph
  * @returns Array of hashes (null if input is unassigned)
- *
- * @example
- * ```typescript
- * const graph = await dataflowGetGraph(storage, repo, 'production');
- * const task = graph.tasks.find(t => t.name === 'etl')!;
- * const inputHashes = await dataflowResolveInputHashes(storage, repo, 'production', task);
- * if (!inputHashes.includes(null)) {
- *   const cached = await dataflowCheckCache(storage, repo, task.hash, inputHashes);
- * }
- * ```
  */
 export async function dataflowResolveInputHashes(storage, repo, ws, task) {
     const hashes = [];
     for (const inputPathStr of task.inputs) {
-        // Parse the keypath string back to TreePath
         const inputPath = parsePathString(inputPathStr);
         const { refType, hash } = await workspaceGetDatasetHash(storage, repo, ws, inputPath);
         if (refType === 'value' && hash !== null) {