smartify_ai-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. smartify/__init__.py +3 -0
  2. smartify/agents/__init__.py +0 -0
  3. smartify/agents/adapters/__init__.py +13 -0
  4. smartify/agents/adapters/anthropic.py +253 -0
  5. smartify/agents/adapters/openai.py +289 -0
  6. smartify/api/__init__.py +26 -0
  7. smartify/api/auth.py +352 -0
  8. smartify/api/errors.py +380 -0
  9. smartify/api/events.py +345 -0
  10. smartify/api/server.py +992 -0
  11. smartify/cli/__init__.py +1 -0
  12. smartify/cli/main.py +430 -0
  13. smartify/engine/__init__.py +64 -0
  14. smartify/engine/approval.py +479 -0
  15. smartify/engine/orchestrator.py +1365 -0
  16. smartify/engine/scheduler.py +380 -0
  17. smartify/engine/spark.py +294 -0
  18. smartify/guardrails/__init__.py +22 -0
  19. smartify/guardrails/breakers.py +409 -0
  20. smartify/models/__init__.py +61 -0
  21. smartify/models/grid.py +625 -0
  22. smartify/notifications/__init__.py +22 -0
  23. smartify/notifications/webhook.py +556 -0
  24. smartify/state/__init__.py +46 -0
  25. smartify/state/checkpoint.py +558 -0
  26. smartify/state/resume.py +301 -0
  27. smartify/state/store.py +370 -0
  28. smartify/tools/__init__.py +17 -0
  29. smartify/tools/base.py +196 -0
  30. smartify/tools/builtin/__init__.py +79 -0
  31. smartify/tools/builtin/file.py +464 -0
  32. smartify/tools/builtin/http.py +195 -0
  33. smartify/tools/builtin/shell.py +137 -0
  34. smartify/tools/mcp/__init__.py +33 -0
  35. smartify/tools/mcp/adapter.py +157 -0
  36. smartify/tools/mcp/client.py +334 -0
  37. smartify/tools/mcp/registry.py +130 -0
  38. smartify/validator/__init__.py +0 -0
  39. smartify/validator/validate.py +271 -0
  40. smartify/workspace/__init__.py +5 -0
  41. smartify/workspace/manager.py +248 -0
  42. smartify_ai-0.1.0.dist-info/METADATA +201 -0
  43. smartify_ai-0.1.0.dist-info/RECORD +46 -0
  44. smartify_ai-0.1.0.dist-info/WHEEL +4 -0
  45. smartify_ai-0.1.0.dist-info/entry_points.txt +2 -0
  46. smartify_ai-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1365 @@
+"""Grid orchestrator - core execution engine.
+
+The orchestrator manages grid lifecycle and coordinates node execution:
+1. Load and validate grid specifications
+2. Manage grid state transitions (draft → ready → energized → running → completed)
+3. Execute nodes in topological order via DAG scheduler
+4. Handle context/state passing between nodes
+5. Coordinate with LLM and tool adapters
+"""
+
+import asyncio
+import logging
+import time
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Protocol, Union
+import yaml
+
+from smartify.models.grid import (
+    GridSpec,
+    GridState,
+    NodeSpec,
+    NodeKind,
+    DynamicSpawningSpec,
+)
+from smartify.engine.scheduler import DAGScheduler, NodeState
+from smartify.engine.spark import SparkManager, SparkRequest, SparkNode
+from smartify.engine.approval import (
+    ApprovalManager,
+    ApprovalRequest,
+    ApprovalStatus,
+    get_approval_manager,
+)
+from smartify.notifications.webhook import (
+    WebhookNotifier,
+    WebhookConfig,
+    EventType,
+)
+from smartify.state.checkpoint import CheckpointStore, get_checkpoint_store
+from smartify.validator.validate import validate_grid
+from smartify.guardrails.breakers import (
+    BreakerManager,
+    BreakerError,
+    BreakerSpec,
+)
+from smartify.models.grid import TripAction
+
+
+logger = logging.getLogger(__name__)
+
+
+class LLMAdapter(Protocol):
+    """Protocol for LLM adapters."""
+
+    async def complete(
+        self,
+        messages: List[Dict[str, str]],
+        system: Optional[str] = None,
+        temperature: float = 0.7,
+        max_tokens: Optional[int] = None,
+        tools: Optional[List[Dict]] = None,
+    ) -> Dict[str, Any]:
+        """Generate a completion from the LLM."""
+        ...
+
+
+class ToolAdapter(Protocol):
+    """Protocol for tool execution adapters."""
+
+    async def execute(
+        self,
+        tool_name: str,
+        arguments: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        """Execute a tool and return its result."""
+        ...
+
+
+class ExecutionError(Exception):
+    """Raised when grid execution fails."""
+    pass
+
+
+class GridLifecycleError(Exception):
+    """Raised for invalid grid state transitions."""
+    pass
+
+
+@dataclass
+class NodeResult:
+    """Result of executing a single node."""
+    node_id: str
+    success: bool
+    output: Optional[Dict[str, Any]] = None
+    error: Optional[str] = None
+    started_at: Optional[datetime] = None
+    completed_at: Optional[datetime] = None
+    tokens_used: int = 0
+
+
+@dataclass
+class ExecutionContext:
+    """Shared context during grid execution.
+
+    Provides access to:
+    - Grid inputs
+    - Node outputs (from completed nodes)
+    - Environment variables
+    - Execution metadata
+    """
+    grid_id: str
+    inputs: Dict[str, Any] = field(default_factory=dict)
+    outputs: Dict[str, Dict[str, Any]] = field(default_factory=dict)
+    env: Dict[str, str] = field(default_factory=dict)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    # Token tracking
+    total_tokens: int = 0
+    token_limit: Optional[int] = None
+
+    # Cost tracking
+    total_cost: float = 0.0
+    cost_limit: Optional[float] = None
+
+    def get_node_output(self, node_id: str) -> Optional[Dict[str, Any]]:
+        """Get output from a completed node."""
+        return self.outputs.get(node_id)
+
+    def set_node_output(self, node_id: str, output: Dict[str, Any]) -> None:
+        """Store output from a completed node."""
+        self.outputs[node_id] = output
+
+    def resolve_reference(self, ref: str) -> Any:
+        """Resolve a reference like '$controller.output.result'.
+
+        Supported prefixes:
+        - $<node_id>. - Reference to node output
+        - $inputs. - Reference to grid inputs
+        - $env. - Reference to environment variables
+        """
+        if not ref.startswith('$'):
+            return ref
+
+        parts = ref[1:].split('.')
+        if not parts:
+            return None
+
+        root = parts[0]
+        path = parts[1:] if len(parts) > 1 else []
+
+        # Get the root object
+        if root == 'inputs':
+            obj = self.inputs
+        elif root == 'env':
+            obj = self.env
+        elif root in self.outputs:
+            obj = self.outputs[root]
+        else:
+            return None
+
+        # Navigate path
+        for key in path:
+            if isinstance(obj, dict) and key in obj:
+                obj = obj[key]
+            else:
+                return None
+
+        return obj
+
+    def check_breakers(self) -> Optional[str]:
+        """Check if any breakers are tripped. Returns trip reason or None."""
+        if self.token_limit and self.total_tokens >= self.token_limit:
+            return f"Token limit exceeded: {self.total_tokens}/{self.token_limit}"
+        if self.cost_limit and self.total_cost >= self.cost_limit:
+            return f"Cost limit exceeded: {self.total_cost}/{self.cost_limit}"
+        return None
+
+
+@dataclass
+class GridRun:
+    """Represents a single execution run of a grid."""
+    grid: GridSpec
+    scheduler: DAGScheduler
+    context: ExecutionContext
+    state: GridState = GridState.DRAFT
+    results: Dict[str, NodeResult] = field(default_factory=dict)
+    started_at: Optional[datetime] = None
+    completed_at: Optional[datetime] = None
+    error: Optional[str] = None
+    spark_manager: Optional[SparkManager] = None
+    webhook_notifier: Optional[WebhookNotifier] = None
+    checkpoint_store: Optional[CheckpointStore] = None
+    run_id: Optional[str] = None  # Unique ID for this run (for checkpointing)
+    breaker_manager: Optional[BreakerManager] = None  # Request/concurrency rate limiting
+
+
+class Orchestrator:
+    """Core grid execution engine.
+
+    Usage:
+        orchestrator = Orchestrator()
+        orchestrator.register_llm_adapter("anthropic", anthropic_adapter)
+
+        run = await orchestrator.load_grid("path/to/grid.yaml")
+        await orchestrator.energize(run)
+        result = await orchestrator.execute(run)
+    """
+
+    def __init__(
+        self,
+        tool_registry: Optional["ToolRegistry"] = None,
+        checkpoint_store: Optional[CheckpointStore] = None,
+        enable_checkpoints: bool = True,
+    ):
+        from smartify.tools import ToolRegistry
+        from smartify.tools.builtin import create_builtin_registry
+
+        self.llm_adapters: Dict[str, LLMAdapter] = {}
+        self.tool_adapters: Dict[str, ToolAdapter] = {}  # Legacy, kept for compat
+        self.tool_registry: ToolRegistry = tool_registry or create_builtin_registry()
+        self.runs: Dict[str, GridRun] = {}
+        self.checkpoint_store = checkpoint_store
+        self.enable_checkpoints = enable_checkpoints
+
+        # Node executors by kind
+        self._executors: Dict[NodeKind, Callable] = {
+            NodeKind.CONTROLLER: self._execute_controller,
+            NodeKind.RELAY: self._execute_relay,
+            NodeKind.SUBSTATION: self._execute_substation,
+            NodeKind.SPARK: self._execute_spark,
+            NodeKind.FOREACH: self._execute_foreach,
+            NodeKind.EXPR: self._execute_expr,
+            NodeKind.AGGREGATE: self._execute_aggregate,
+            NodeKind.APPROVAL: self._execute_approval,
+        }
+
+    def register_llm_adapter(self, name: str, adapter: LLMAdapter) -> None:
+        """Register an LLM adapter."""
+        self.llm_adapters[name] = adapter
+
+    def register_tool_adapter(self, name: str, adapter: ToolAdapter) -> None:
+        """Register a tool adapter."""
+        self.tool_adapters[name] = adapter
+
+    async def load_grid(
+        self,
+        source: Union[str, Path, Dict],
+        inputs: Optional[Dict[str, Any]] = None,
+        env: Optional[Dict[str, str]] = None,
+    ) -> GridRun:
+        """Load and validate a grid specification.
+
+        Args:
+            source: Path to YAML file, YAML string, or dict
+            inputs: Initial input values for the grid
+            env: Environment variables
+
+        Returns:
+            GridRun instance in DRAFT state
+        """
+        # Parse source
+        if isinstance(source, (str, Path)):
+            path = Path(source)
+            if path.exists():
+                with open(path) as f:
+                    spec_dict = yaml.safe_load(f)
+            else:
+                # Assume it's YAML content
+                spec_dict = yaml.safe_load(str(source))
+        else:
+            spec_dict = source
+
+        # Validate and parse
+        errors = validate_grid(spec_dict)
+        if errors:
+            raise ExecutionError(f"Grid validation failed: {errors}")
+
+        grid = GridSpec.model_validate(spec_dict)
+
+        # Create scheduler
+        scheduler = DAGScheduler(grid)
+        scheduler.build_graph()
+
+        # Create context
+        context = ExecutionContext(
+            grid_id=grid.id,
+            inputs=inputs or {},
+            env=env or {},
+        )
+
+        # Set breaker limits from grid spec (via guardrails)
+        breaker_manager = None
+        if grid.guardrails and grid.guardrails.breakers:
+            breakers = grid.guardrails.breakers
+            if breakers.tokens:
+                context.token_limit = breakers.tokens.maxTotalTokensPerRun
+            if breakers.cost:
+                context.cost_limit = breakers.cost.maxCostPerRun
+            # Create BreakerManager for request/concurrency limits
+            if breakers.requests:
+                request_spec = BreakerSpec(requests=breakers.requests)
+                breaker_manager = BreakerManager(
+                    spec=request_spec,
+                    actions=grid.guardrails.breakerActions,
+                )
+                breaker_manager.start()
+                logger.debug(
+                    f"Created BreakerManager with maxConcurrentAgents="
+                    f"{breakers.requests.maxConcurrentAgents}, "
+                    f"maxRequestsPerMinute={breakers.requests.maxRequestsPerMinute}"
+                )
+
+        # Create spark manager for dynamic spawning
+        spark_manager = SparkManager(
+            config=grid.topology.dynamicSpawning,
+            scheduler=scheduler,
+        )
+
+        # Create webhook notifier if notifications configured
+        webhook_notifier = None
+        if grid.notifications and grid.notifications.webhooks:
+            webhook_notifier = WebhookNotifier()
+            for wh_config in grid.notifications.webhooks:
+                if not wh_config.enabled:
+                    continue
+
+                # Convert event strings to EventType enums
+                events = []
+                for event_name in wh_config.events:
+                    try:
+                        events.append(EventType(event_name))
+                    except ValueError:
+                        logger.warning(f"Unknown webhook event type: {event_name}")
+
+                webhook_notifier.add_webhook(WebhookConfig(
+                    url=wh_config.url,
+                    events=events or list(EventType),
+                    secret=wh_config.secret,
+                    headers=wh_config.headers or {},
+                    max_retries=wh_config.maxRetries,
+                    timeout_seconds=wh_config.timeout,
+                ))
+
+            logger.info(f"Configured {len(grid.notifications.webhooks)} webhook(s) for grid '{grid.id}'")
+
+        # Generate unique run ID
+        from uuid import uuid4
+        run_id = f"run-{uuid4().hex[:12]}"
+
+        # Create run
+        run = GridRun(
+            grid=grid,
+            scheduler=scheduler,
+            context=context,
+            state=GridState.DRAFT,
+            spark_manager=spark_manager,
+            webhook_notifier=webhook_notifier,
+            checkpoint_store=self.checkpoint_store,
+            run_id=run_id,
+            breaker_manager=breaker_manager,
+        )
+
+        # Create checkpoint if enabled
+        if self.enable_checkpoints and self.checkpoint_store:
+            # Serialize grid back to YAML for checkpoint
+            grid_yaml = yaml.dump(spec_dict, default_flow_style=False)
+            self.checkpoint_store.create_checkpoint(
+                run_id=run_id,
+                grid_id=grid.id,
+                grid_yaml=grid_yaml,
+                inputs=inputs or {},
+            )
+            logger.debug(f"Created checkpoint for run {run_id}")
+
+        self.runs[grid.id] = run
+        logger.info(f"Loaded grid '{grid.id}' ({len(grid.topology.nodes)} nodes) [run_id={run_id}]")
+
+        return run
+
+    async def energize(self, run: GridRun) -> None:
+        """Transition grid from DRAFT to READY to ENERGIZED.
+
+        This validates the grid is ready for execution.
+        """
+        if run.state not in (GridState.DRAFT, GridState.READY):
+            raise GridLifecycleError(
+                f"Cannot energize grid in state {run.state}"
+            )
+
+        # Validate required inputs and apply defaults
+        if run.grid.inputs:
+            for input_def in run.grid.inputs:
+                if input_def.name not in run.context.inputs:
+                    if input_def.default is not None:
+                        # Apply default value
+                        run.context.inputs[input_def.name] = input_def.default
+                    elif input_def.required:
+                        # Required input with no default - error
+                        raise ExecutionError(
+                            f"Missing required input: {input_def.name}"
+                        )
+
+        # Validate LLM adapter is available
+        has_llm_nodes = any(
+            node.kind in (NodeKind.CONTROLLER, NodeKind.RELAY, NodeKind.SUBSTATION)
+            for node in run.grid.topology.nodes
+        )
+        if has_llm_nodes and not self.llm_adapters:
+            raise ExecutionError("No LLM adapter registered")
+
+        # Register MCP servers if configured
+        await self._register_mcp_servers(run)
+
+        run.state = GridState.ENERGIZED
+        logger.info(f"Grid '{run.grid.id}' energized and ready for execution")
+
+    async def _register_mcp_servers(self, run: GridRun) -> None:
+        """Register MCP server tools from the grid spec.
+
+        Connects to each MCP server defined in grid.tools.mcpServers and
+        registers their tools in the tool registry.
+        """
+        if not run.grid.tools or not run.grid.tools.mcpServers:
+            return
+
+        mcp_servers = run.grid.tools.mcpServers
+        if not mcp_servers:
+            return
+
+        # Try importing MCP integration
+        try:
+            from smartify.tools.mcp import McpServerConfig, McpTransport, register_mcp_server
+        except ImportError:
+            raise ExecutionError(
+                f"Grid '{run.grid.id}' uses MCP servers but MCP is not installed. "
+                "Install with: pip install smartify[mcp]"
+            )
+
+        logger.info(f"Registering {len(mcp_servers)} MCP server(s) for grid '{run.grid.id}'")
+
+        for server_spec in mcp_servers:
+            try:
+                # Convert grid spec to McpServerConfig
+                transport_map = {
+                    "stdio": McpTransport.STDIO,
+                    "sse": McpTransport.SSE,
+                    "streamable_http": McpTransport.STREAMABLE_HTTP,
+                }
+                transport = transport_map.get(server_spec.transport, McpTransport.STDIO)
+
+                config = McpServerConfig(
+                    id=server_spec.id,
+                    transport=transport,
+                    command=server_spec.command,
+                    args=server_spec.args,
+                    env=server_spec.env,
+                    cwd=server_spec.cwd,
+                    url=server_spec.url,
+                    headers=server_spec.headers,
+                    prefix=server_spec.prefix,
+                    tools=server_spec.tools,
+                )
+
+                # Register the MCP server's tools
+                tool_names = await register_mcp_server(self.tool_registry, config)
+                logger.info(
+                    f"Registered {len(tool_names)} tools from MCP server '{server_spec.id}'"
+                )
+
+            except Exception as e:
+                logger.error(f"Failed to register MCP server '{server_spec.id}': {e}")
+                raise ExecutionError(
+                    f"Failed to connect to MCP server '{server_spec.id}': {e}"
+                )
+
+    async def execute(self, run: GridRun) -> Dict[str, Any]:
+        """Execute the grid to completion.
+
+        Returns:
+            Final outputs from the grid execution
+        """
+        if run.state != GridState.ENERGIZED:
+            raise GridLifecycleError(
+                f"Cannot execute grid in state {run.state}"
+            )
+
+        run.state = GridState.RUNNING
+        run.started_at = datetime.now()
+        logger.info(f"Starting execution of grid '{run.grid.id}'")
+
+        # Send run_started notification
+        if run.webhook_notifier:
+            asyncio.create_task(run.webhook_notifier.notify_run_started(
+                grid_id=run.grid.id,
+                grid_name=run.grid.name,
+                inputs=run.context.inputs,
+            ))
+
+        try:
+            while not run.scheduler.is_complete():
+                # Check breakers
+                trip_reason = run.context.check_breakers()
+                if trip_reason:
+                    logger.warning(f"Breaker tripped: {trip_reason}")
+                    run.error = trip_reason
+                    run.state = GridState.FAILED
+
+                    # Send breaker_tripped notification
+                    if run.webhook_notifier:
+                        # Parse breaker type from reason
+                        breaker_type = "unknown"
+                        current_value = 0.0
+                        limit = 0.0
+                        if "Token limit" in trip_reason:
+                            breaker_type = "tokens"
+                            current_value = run.context.total_tokens
+                            limit = run.context.token_limit or 0
+                        elif "Cost limit" in trip_reason:
+                            breaker_type = "cost"
+                            current_value = run.context.total_cost
+                            limit = run.context.cost_limit or 0
+
+                        asyncio.create_task(run.webhook_notifier.notify_breaker_tripped(
+                            grid_id=run.grid.id,
+                            breaker_type=breaker_type,
+                            current_value=current_value,
+                            limit=limit,
+                            action="stop",
+                        ))
+                    break
+
+                # Check request/concurrency breakers via BreakerManager
+                if run.breaker_manager:
+                    try:
+                        trip = await run.breaker_manager.check_and_enforce()
+                        if trip and trip.action in (TripAction.STOP, TripAction.BLOCK):
+                            logger.warning(f"Request breaker tripped: {trip.reason}")
+                            run.error = trip.reason
+                            run.state = GridState.FAILED
+
+                            # Send breaker_tripped notification
+                            if run.webhook_notifier:
+                                asyncio.create_task(run.webhook_notifier.notify_breaker_tripped(
+                                    grid_id=run.grid.id,
+                                    breaker_type="requests",
+                                    current_value=trip.current_value,
+                                    limit=trip.limit_value,
+                                    action=trip.action.value,
+                                ))
+                            break
+                    except BreakerError as e:
+                        logger.warning(f"Request breaker error: {e}")
+                        run.error = str(e)
+                        run.state = GridState.FAILED
+                        break
+
+                # Get ready nodes
+                ready_nodes = run.scheduler.get_ready_nodes()
+
+                # Cap concurrency based on maxConcurrentAgents
+                if (
+                    run.breaker_manager
+                    and run.breaker_manager.spec.requests
+                    and run.breaker_manager.spec.requests.maxConcurrentAgents
+                ):
+                    max_concurrent = run.breaker_manager.spec.requests.maxConcurrentAgents
+                    current_concurrent = run.breaker_manager.state.rate_limit.concurrent_count
+                    available_slots = max(0, max_concurrent - current_concurrent)
+                    if available_slots < len(ready_nodes):
+                        logger.debug(
+                            f"Capping ready nodes from {len(ready_nodes)} to {available_slots} "
+                            f"(maxConcurrentAgents={max_concurrent}, current={current_concurrent})"
+                        )
+                        ready_nodes = ready_nodes[:available_slots]
+
+                if not ready_nodes:
+                    # Check for deadlock
+                    running = run.scheduler.get_running_nodes()
+                    if not running:
+                        logger.error("Execution stalled - no ready or running nodes")
+                        run.error = "Execution deadlock"
+                        run.state = GridState.FAILED
+                        break
+
+                    # Wait for running nodes
+                    await asyncio.sleep(0.1)
+                    continue
+
+                # Execute ready nodes (in parallel)
+                tasks = []
+                for node_id in ready_nodes:
+                    run.scheduler.mark_running(node_id)
+                    # Record request start for rate limiting
+                    if run.breaker_manager:
+                        run.breaker_manager.record_request_start()
+                    # Checkpoint node start
+                    if run.checkpoint_store and run.run_id:
+                        run.checkpoint_store.checkpoint_node_started(run.run_id, node_id)
+                    task = asyncio.create_task(
+                        self._execute_node(run, node_id)
+                    )
+                    tasks.append(task)
+
+                # Wait for all parallel nodes
+                results = await asyncio.gather(*tasks, return_exceptions=True)
+
+                # Process results
+                for node_id, result in zip(ready_nodes, results):
+                    # Record request complete for rate limiting (both success and failure)
+                    if run.breaker_manager:
+                        run.breaker_manager.record_request_complete()
+
+                    if isinstance(result, Exception):
+                        logger.error(f"Node {node_id} failed: {result}")
+                        run.scheduler.mark_failed(node_id, str(result))
+
+                        # Checkpoint node failure
+                        if run.checkpoint_store and run.run_id:
+                            run.checkpoint_store.checkpoint_node_failed(
+                                run.run_id, node_id, str(result)
+                            )
+
+                        # Check retry
+                        if run.scheduler.can_retry(node_id):
+                            logger.info(f"Retrying node {node_id}")
+                            run.scheduler.reset_for_retry(node_id)
+                        else:
+                            run.error = f"Node {node_id} failed: {result}"
+                            run.state = GridState.FAILED
+                    else:
+                        run.results[node_id] = result
+                        run.scheduler.mark_completed(node_id, result.output)
+                        run.context.set_node_output(node_id, result.output or {})
+
+                        # Checkpoint node completion
+                        if run.checkpoint_store and run.run_id:
+                            run.checkpoint_store.checkpoint_node_complete(
+                                run.run_id,
+                                node_id,
+                                result.output or {},
+                                tokens_used=result.tokens_used,
+                            )
+
+            # Determine final state
+            run.completed_at = datetime.now()
+            duration_seconds = (run.completed_at - run.started_at).total_seconds() if run.started_at else 0
+
+            if run.state != GridState.FAILED:
+                if run.scheduler.is_successful():
+                    run.state = GridState.COMPLETED
+                    logger.info(f"Grid '{run.grid.id}' completed successfully")
+
+                    # Mark checkpoint completed
+                    if run.checkpoint_store and run.run_id:
+                        run.checkpoint_store.mark_completed(run.run_id)
+
+                    # Send run_completed notification
+                    if run.webhook_notifier:
+                        asyncio.create_task(run.webhook_notifier.notify_run_completed(
+                            grid_id=run.grid.id,
+                            outputs=run.context.outputs,
+                            duration_seconds=duration_seconds,
+                            total_tokens=run.context.total_tokens,
+                            total_cost=run.context.total_cost,
+                        ))
+                else:
+                    run.state = GridState.FAILED
+                    logger.error(f"Grid '{run.grid.id}' failed")
+
+                    # Mark checkpoint failed
+                    if run.checkpoint_store and run.run_id:
+                        run.checkpoint_store.mark_failed(run.run_id, run.error or "Unknown error")
+
+                    # Send run_failed notification
+                    if run.webhook_notifier:
+                        asyncio.create_task(run.webhook_notifier.notify_run_failed(
+                            grid_id=run.grid.id,
+                            error=run.error or "Unknown error",
+                            duration_seconds=duration_seconds,
+                        ))
+            else:
+                # Already failed (breaker or node failure)
+                # Mark checkpoint failed
+                if run.checkpoint_store and run.run_id:
+                    run.checkpoint_store.mark_failed(run.run_id, run.error or "Unknown error")
+
+                if run.webhook_notifier:
+                    asyncio.create_task(run.webhook_notifier.notify_run_failed(
+                        grid_id=run.grid.id,
+                        error=run.error or "Unknown error",
+                        duration_seconds=duration_seconds,
+                    ))
+
+            return self._collect_outputs(run)
+
+        except Exception as e:
+            logger.exception(f"Grid execution error: {e}")
+            run.state = GridState.FAILED
+            run.error = str(e)
+            run.completed_at = datetime.now()
+
+            # Mark checkpoint failed
+            if run.checkpoint_store and run.run_id:
+                run.checkpoint_store.mark_failed(run.run_id, str(e))
+
+            # Send run_failed notification
+            if run.webhook_notifier:
+                duration_seconds = (run.completed_at - run.started_at).total_seconds() if run.started_at else 0
+                asyncio.create_task(run.webhook_notifier.notify_run_failed(
+                    grid_id=run.grid.id,
+                    error=str(e),
+                    duration_seconds=duration_seconds,
+                ))
+
+            raise ExecutionError(str(e)) from e
+
+    async def _execute_node(self, run: GridRun, node_id: str) -> NodeResult:
+        """Execute a single node."""
+        node = run.scheduler.nodes[node_id].node
+        result = NodeResult(
+            node_id=node_id,
+            success=False,
+            started_at=datetime.now(),
+        )
+
+        logger.debug(f"Executing node {node_id} ({node.kind})")
+
+        try:
+            # Get executor for node kind
+            executor = self._executors.get(node.kind)
+            if not executor:
+                raise ExecutionError(f"No executor for node kind: {node.kind}")
+
+            # Execute
+            output = await executor(run, node)
+
+            result.success = True
+            result.output = output
+            result.completed_at = datetime.now()
+
+            logger.debug(f"Node {node_id} completed successfully")
+
+        except Exception as e:
+            result.success = False
+            result.error = str(e)
+            result.completed_at = datetime.now()
+            logger.error(f"Node {node_id} failed: {e}")
+            raise
+
+        return result
+
+    async def _execute_controller(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+    ) -> Dict[str, Any]:
+        """Execute a controller node.
+
+        Controllers orchestrate the overall workflow with LLM guidance.
+        """
+        # Build prompt from node config (falls back to description)
+        prompt = self._build_prompt(run, node)
+
+        # Get LLM adapter
+        adapter = self.llm_adapters.get("default") or next(
+            iter(self.llm_adapters.values()), None
+        )
+
+        if not adapter:
+            raise ExecutionError("No LLM adapter registered")
+
+        # Build messages
+        messages = [{"role": "user", "content": prompt}]
+
+        # Add system prompt
+        system = node.prompt.system if node.prompt else None
+
+        # Call LLM
+        response = await adapter.complete(
+            messages=messages,
+            system=system,
+            temperature=0.7,
+            tools=self._get_node_tools(node),
+        )
+
+        # Track tokens
+        run.context.total_tokens += response.get('tokens_in', 0) + response.get('tokens_out', 0)
+        run.context.total_cost += response.get('cost', 0.0)
+
+        return {
+            "response": response.get('content', ''),
+            "raw": response,
+        }
+
+    async def _execute_relay(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+    ) -> Dict[str, Any]:
+        """Execute a relay node.
+
+        Relays coordinate between controller and substations.
+        """
+        # Similar to controller but with coordination focus
+        return await self._execute_controller(run, node)
+
+    async def _execute_substation(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+    ) -> Dict[str, Any]:
+        """Execute a substation node.
+
+        Substations perform specific tasks with tools.
+        """
+        # Build prompt
+        prompt = self._build_prompt(run, node)
+
+        # Get LLM adapter
+        adapter = self.llm_adapters.get("default") or next(
+            iter(self.llm_adapters.values()), None
+        )
+
+        if not adapter:
+            raise ExecutionError("No LLM adapter registered")
+
+        # Build messages with context from parent
+        messages = []
+
+        # Add parent context if available
+        if node.parent:
+            parent_output = run.context.get_node_output(node.parent)
+            if parent_output:
+                messages.append({
+                    "role": "assistant",
+                    "content": str(parent_output.get('response', ''))
+                })
+
+        messages.append({"role": "user", "content": prompt})
+
+        # System prompt
+        system = node.prompt.system if node.prompt else None
+
+        # Get tools
+        tools = self._get_node_tools(node)
+
+        # Call LLM (potentially with tool use loop)
+        response = await adapter.complete(
+            messages=messages,
+            system=system,
+            temperature=0.7,
+            tools=tools,
+        )
+
+        # Track tokens
+        run.context.total_tokens += response.get('tokens_in', 0) + response.get('tokens_out', 0)
+        run.context.total_cost += response.get('cost', 0.0)
+
+        # Handle tool calls
+        if response.get('tool_calls'):
+            tool_results = await self._handle_tool_calls(
+                run, node, response['tool_calls']
+            )
+            response['tool_results'] = tool_results
+
+        return {
+            "response": response.get('content', ''),
+            "raw": response,
+        }
+
+    async def _execute_foreach(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+    ) -> Dict[str, Any]:
+        """Execute a foreach node (fan-out iteration)."""
+        if not node.foreach:
+            raise ExecutionError(f"foreach node {node.id} missing foreach config")
+
+        # Get items to iterate over (field is 'over' in ForeachSpec)
+        items_ref = node.foreach.over
+        items = run.context.resolve_reference(items_ref)
+
+        if not isinstance(items, list):
+            raise ExecutionError(
+                f"foreach items must be a list, got: {type(items)}"
+            )
+
+        # Execute iteration (results collected by child nodes)
+        # Item variable field is 'as_' in ForeachSpec (aliased from 'as')
+        return {
+            "items": items,
+            "count": len(items),
+            "item_var": node.foreach.as_,
+        }
+
+    async def _execute_expr(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+    ) -> Dict[str, Any]:
+        """Execute an expression node."""
+        if not node.expr:
+            raise ExecutionError(f"expr node {node.id} missing expression")
+
+        # Simple expression evaluation
+        # TODO: Implement proper expression parser with safety checks
+        expr = node.expr
+
+        # Replace references
+        for ref_match in self._find_references(expr):
+            value = run.context.resolve_reference(ref_match)
+            expr = expr.replace(ref_match, repr(value))
+
+        # Evaluate (UNSAFE - needs sandboxing in production)
+        try:
+            result = eval(expr, {"__builtins__": {}}, {})
+        except Exception as e:
+            raise ExecutionError(f"Expression evaluation failed: {e}")
+
+        return {"result": result}
+
+    async def _execute_aggregate(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+    ) -> Dict[str, Any]:
+        """Execute an aggregate node (fan-in merge)."""
+        if not node.aggregate:
+            raise ExecutionError(f"aggregate node {node.id} missing config")
+
+        # Collect outputs from sources (from_ is aliased as 'from' in YAML)
+        collected = []
+        source_ids = node.aggregate.from_
+        for source_id in source_ids:
+            output = run.context.get_node_output(source_id)
+            if output:
+                collected.append(output)
+
+        # Apply merge strategy
+        strategy = node.aggregate.strategy.value if node.aggregate.strategy else "concat_arrays"
+
+        if strategy == "concat_arrays":
+            result = collected
+        elif strategy == "merge_objects":
+            result = {}
+            for item in collected:
+                if isinstance(item, dict):
+                    result.update(item)
+        elif strategy == "sum":
+            # Sum numeric values
+            result = sum(item.get('value', 0) if isinstance(item, dict) else 0 for item in collected)
+        elif strategy == "first":
+            result = collected[0] if collected else None
+        elif strategy == "last":
+            result = collected[-1] if collected else None
+        else:
+            result = collected
+
+        return {"result": result, "count": len(collected)}
+
+    async def _execute_approval(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+    ) -> Dict[str, Any]:
+        """Execute an approval node (human-in-the-loop).
+
+        This pauses execution, sends notifications, and waits for
+        human approval before continuing.
+        """
+        if not node.approval:
+            raise ExecutionError(f"approval node {node.id} missing config")
+
+        approval_config = node.approval
+
+        # Check for auto-approve condition
+        if approval_config.autoApprove:
+            when_expr = approval_config.autoApprove.get("when")
+            if when_expr and self._evaluate_auto_approve(run, when_expr):
+                logger.info(f"Auto-approving {node.id} based on condition: {when_expr}")
+                return {
+                    "approved": True,
+                    "approver": "auto",
+                    "reason": f"Auto-approved: {when_expr}",
+                    "timestamp": datetime.now().isoformat(),
+                }
+
+        # Collect outputs from specified nodes to show in approval
+        show_outputs = {}
+        for source_id in approval_config.showOutputsFrom:
+            output = run.context.get_node_output(source_id)
+            if output:
+                show_outputs[source_id] = output
+
+        # Get approval manager
+        approval_manager = get_approval_manager()
+
+        # Create approval request
+        request = await approval_manager.create_request(
+            grid_id=run.grid.id,
+            node_id=node.id,
+            prompt=approval_config.prompt,
+            context=run.context.inputs,
+            timeout_seconds=approval_config.timeout,
+            required_approvers=approval_config.requiredApprovers,
+            allowed_approvers=approval_config.allowedApprovers,
+            show_outputs=show_outputs,
+        )
+
+        logger.info(
+            f"Approval node {node.id} waiting for approval (request: {request.id})"
+        )
+
+        # Wait for approval (this blocks until resolved or timeout)
+        try:
+            resolved_request = await approval_manager.wait_for_approval(
+                request.id,
+                timeout=approval_config.timeout,
+            )
+
+            if resolved_request.status == ApprovalStatus.APPROVED:
+                logger.info(f"Approval {request.id} approved by: {resolved_request.approvers}")
+                return {
+                    "approved": True,
+                    "approvers": resolved_request.approvers,
+                    "timestamp": resolved_request.resolved_at.isoformat() if resolved_request.resolved_at else None,
+                }
+            elif resolved_request.status == ApprovalStatus.REJECTED:
+                logger.warning(f"Approval {request.id} rejected: {resolved_request.rejection_reason}")
+                raise ExecutionError(
+                    f"Approval rejected: {resolved_request.rejection_reason or 'No reason given'}"
+                )
+            else:
+                raise ExecutionError(f"Approval in unexpected state: {resolved_request.status}")
+
+        except TimeoutError:
+            logger.error(f"Approval {request.id} timed out after {approval_config.timeout}s")
+            raise ExecutionError(f"Approval timed out after {approval_config.timeout} seconds")
+
+    def _evaluate_auto_approve(self, run: GridRun, expression: str) -> bool:
+        """Evaluate an auto-approve expression.
+
+        Supports simple expressions like:
+        - "$inputs.env == 'dev'"
+        - "$controller.output.risk_level == 'low'"
+        """
+        try:
+            # Very basic expression evaluation
+            # TODO: Use a proper expression parser with safety
+
+            # Replace references
+            expr = expression
+            for ref in self._find_references(expression):
+                value = run.context.resolve_reference(ref)
+                if isinstance(value, str):
+                    expr = expr.replace(ref, f"'{value}'")
+                else:
+                    expr = expr.replace(ref, repr(value))
+
+            # Evaluate (limited builtins for safety)
+            result = eval(expr, {"__builtins__": {"True": True, "False": False, "None": None}}, {})
+            return bool(result)
+
+        except Exception as e:
+            logger.warning(f"Auto-approve expression failed: {e}")
+            return False
+
+    async def _execute_spark(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+    ) -> Dict[str, Any]:
+        """Execute a dynamically spawned Spark node.
+
+        Sparks are lightweight helpers that execute a specific subtask
+        in parallel with their parent substation.
+        """
+        # Get the spark from the manager
+        if not run.spark_manager:
+            raise ExecutionError("Spark manager not initialized")
+
+        spark = run.spark_manager.sparks.get(node.id)
+        if not spark:
+            raise ExecutionError(f"Spark {node.id} not found in manager")
+
+        run.spark_manager.mark_running(node.id)
+
+        # Build prompt from spark task
+        prompt = f"""You are a helper agent (Spark) spawned to assist with a specific subtask.
+
+TASK: {spark.task}
+
+CONTEXT:
+{self._format_context(spark.context)}
+
+Execute this task and provide a clear, focused result. Keep your response concise."""
+
+        # Get LLM adapter
+        adapter = self.llm_adapters.get("default") or next(
+            iter(self.llm_adapters.values()), None
+        )
+
+        if not adapter:
+            raise ExecutionError("No LLM adapter registered")
+
+        # Get default spark agent config if available
+        default_agent = run.grid.topology.dynamicSpawning.defaults.spark
+        system = None
+        if default_agent and run.grid.agents and default_agent in run.grid.agents:
+            agent_spec = run.grid.agents[default_agent]
+            system = agent_spec.systemPrompt
+
+        # Call LLM
+        response = await adapter.complete(
+            messages=[{"role": "user", "content": prompt}],
+            system=system,
+            temperature=0.7,
+            max_tokens=2000,  # Sparks should be concise
+        )
+
+        # Track tokens
+        tokens_used = response.get('tokens_in', 0) + response.get('tokens_out', 0)
+        run.context.total_tokens += tokens_used
+        run.context.total_cost += response.get('cost', 0.0)
+
+        output = {
+            "response": response.get('content', ''),
+            "task": spark.task,
+            "parent_id": spark.parent_id,
+        }
+
+        run.spark_manager.mark_completed(node.id, output, tokens_used)
+
+        return output
+
+    def _format_context(self, context: Dict[str, Any]) -> str:
+        """Format context dict as readable string."""
+        lines = []
+        for key, value in context.items():
+            if isinstance(value, (dict, list)):
+                import json
+                lines.append(f"{key}: {json.dumps(value, indent=2)}")
+            else:
+                lines.append(f"{key}: {value}")
+        return "\n".join(lines)
+
+    async def spawn_sparks_for_node(
+        self,
+        run: GridRun,
+        parent_id: str,
+        requests: List[SparkRequest],
+    ) -> List[SparkNode]:
+        """Spawn sparks for a node and add them to the scheduler.
+
+        Called by substations that want to parallelize work.
+        """
+        if not run.spark_manager:
+            return []
+
+        # Set parent_id on all requests
+        for req in requests:
+            req.parent_id = parent_id
+
+        # Spawn sparks
+        sparks = await run.spark_manager.spawn_batch(requests)
+
+        # Add sparks to scheduler as dynamic nodes
+        for spark in sparks:
+            from smartify.engine.spark import create_spark_node_spec
+            from smartify.engine.scheduler import NodeExecution, NodeState
+
+            node_spec = create_spark_node_spec(
+                spark,
+                default_agent=run.grid.topology.dynamicSpawning.defaults.spark,
+            )
+
+            # Add to scheduler's node tracking
+            run.scheduler.nodes[spark.id] = NodeExecution(
+                node=node_spec,
+                state=NodeState.READY,  # Sparks start ready immediately
+                dependencies={parent_id},  # Depend on parent completing setup
+            )
+
+            # Add as dependent of parent
+            if parent_id in run.scheduler.nodes:
+                run.scheduler.nodes[parent_id].dependents.add(spark.id)
+
+            logger.info(f"Added spark {spark.id} to scheduler")
+
+        return sparks
+
+    async def execute_pending_sparks(self, run: GridRun) -> List[NodeResult]:
+        """Execute all pending sparks in parallel.
+
+        Can be called from the main execution loop to process sparks
+        alongside regular nodes.
+        """
+        if not run.spark_manager:
+            return []
+
+        pending = run.spark_manager.get_pending_sparks()
+        if not pending:
+            return []
+
+        results = []
+        tasks = []
+
+        for spark in pending:
+            # Create a minimal NodeSpec for the spark
+            from smartify.engine.spark import create_spark_node_spec
+            node_spec = create_spark_node_spec(spark)
+
+            task = asyncio.create_task(
+                self._execute_spark(run, node_spec)
+            )
+            tasks.append((spark.id, task))
+
+        # Execute all in parallel
+        for spark_id, task in tasks:
+            try:
+                output = await task
+                results.append(NodeResult(
+                    node_id=spark_id,
+                    success=True,
+                    output=output,
+                    completed_at=datetime.now(),
+                ))
+            except Exception as e:
+                logger.error(f"Spark {spark_id} failed: {e}")
+                run.spark_manager.mark_failed(spark_id, str(e))
+                results.append(NodeResult(
+                    node_id=spark_id,
+                    success=False,
+                    error=str(e),
+                    completed_at=datetime.now(),
+                ))
+
+        return results
+
+    def _build_prompt(self, run: GridRun, node: NodeSpec) -> str:
+        """Build the prompt for an LLM node."""
+        # Use explicit prompt template, or fall back to node description
+        if node.prompt and node.prompt.template:
+            prompt = node.prompt.template
+        elif node.description:
+            prompt = node.description
+        else:
+            prompt = f"Execute the task for node '{node.name}'"
+
+        # Resolve references in prompt
+        for ref in self._find_references(prompt):
+            value = run.context.resolve_reference(ref)
+            prompt = prompt.replace(ref, str(value) if value else "")
+
+        return prompt
+
+    def _find_references(self, text: str) -> List[str]:
+        """Find all $-references in text."""
+        import re
+        return re.findall(r'\$[\w.]+', text)
+
+    def _get_node_tools(self, node: NodeSpec) -> Optional[List[Dict]]:
+        """Get tool definitions for a node."""
+        if not node.tools:
+            # If no specific tools, provide all builtins
+            return self.tool_registry.to_anthropic_format()
+
+        # Filter to only requested tools
+        tool_names = [t.name if hasattr(t, 'name') else t for t in node.tools]
+        return self.tool_registry.to_anthropic_format(names=tool_names)
+
+    async def _handle_tool_calls(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+        tool_calls: List[Dict],
+    ) -> List[Dict]:
+        """Handle tool calls from LLM response."""
+        results = []
+
+        for call in tool_calls:
+            tool_name = call.get('name')
+            arguments = call.get('arguments', {})
+
+            logger.debug(f"Executing tool: {tool_name} with args: {arguments}")
+
+            # Execute via tool registry
+            result = await self.tool_registry.execute(tool_name, **arguments)
+
+            if result.success:
+                results.append({
+                    "tool_call_id": call.get('id'),
+                    "result": result.output,
+                })
+            else:
+                results.append({
+                    "tool_call_id": call.get('id'),
+                    "error": result.error,
+                })
+
+        return results
+
+    def _collect_outputs(self, run: GridRun) -> Dict[str, Any]:
+        """Collect final outputs from the grid execution."""
+        outputs = {}
+
+        # Get outputs from all completed nodes
+        for node_id, result in run.results.items():
+            if result.success and result.output:
+                outputs[node_id] = result.output
+
+        # Build summary
+        return {
+            "grid_id": run.grid.id,
+            "state": run.state.value,
+            "started_at": run.started_at.isoformat() if run.started_at else None,
+            "completed_at": run.completed_at.isoformat() if run.completed_at else None,
+            "node_outputs": outputs,
+            "total_tokens": run.context.total_tokens,
+            "total_cost": run.context.total_cost,
+            "error": run.error,
+        }
+
+    # State management
+
+    async def pause(self, run: GridRun) -> None:
+        """Pause grid execution."""
+        if run.state != GridState.RUNNING:
+            raise GridLifecycleError(f"Cannot pause grid in state {run.state}")
+        run.state = GridState.PAUSED
+        logger.info(f"Grid '{run.grid.id}' paused")
+
+    async def resume(self, run: GridRun) -> None:
+        """Resume paused grid execution."""
+        if run.state != GridState.PAUSED:
+            raise GridLifecycleError(f"Cannot resume grid in state {run.state}")
+        run.state = GridState.RUNNING
+        logger.info(f"Grid '{run.grid.id}' resumed")
+
+    async def stop(self, run: GridRun) -> None:
+        """Stop grid execution."""
+        if run.state not in (GridState.RUNNING, GridState.PAUSED):
+            raise GridLifecycleError(f"Cannot stop grid in state {run.state}")
+        run.state = GridState.STOPPED
+        run.completed_at = datetime.now()
+        logger.info(f"Grid '{run.grid.id}' stopped")
+
+    def get_run(self, grid_id: str) -> Optional[GridRun]:
+        """Get a grid run by ID."""
+        return self.runs.get(grid_id)
+
+    def get_status(self, run: GridRun) -> Dict[str, Any]:
+        """Get current status of a grid run."""
+        return {
+            "grid_id": run.grid.id,
+            "state": run.state.value,
+            "scheduler": run.scheduler.get_state_summary(),
+            "started_at": run.started_at.isoformat() if run.started_at else None,
+            "tokens_used": run.context.total_tokens,
+            "cost": run.context.total_cost,
+            "error": run.error,
+        }
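
For orientation, a minimal driver sketch pieced together from the Orchestrator usage docstring and the method signatures in the diff above. It is not part of the published package: the AnthropicAdapter class name and constructor, the grid.yaml path, and the "env" input are assumptions.

    import asyncio

    from smartify.engine.orchestrator import Orchestrator
    # Hypothetical adapter import; the actual class in smartify/agents/adapters/anthropic.py may differ.
    from smartify.agents.adapters.anthropic import AnthropicAdapter

    async def main() -> None:
        orchestrator = Orchestrator()
        # _execute_controller looks up the "default" adapter first, so register under that name.
        orchestrator.register_llm_adapter("default", AnthropicAdapter())

        # Load and validate the grid, apply inputs, then move it through DRAFT -> ENERGIZED -> RUNNING.
        run = await orchestrator.load_grid("grid.yaml", inputs={"env": "dev"})
        await orchestrator.energize(run)
        result = await orchestrator.execute(run)

        # execute() returns the summary built by _collect_outputs().
        print(result["state"], result["total_tokens"], result["total_cost"])
        print(orchestrator.get_status(run))

    asyncio.run(main())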