smartify_ai-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. smartify/__init__.py +3 -0
  2. smartify/agents/__init__.py +0 -0
  3. smartify/agents/adapters/__init__.py +13 -0
  4. smartify/agents/adapters/anthropic.py +253 -0
  5. smartify/agents/adapters/openai.py +289 -0
  6. smartify/api/__init__.py +26 -0
  7. smartify/api/auth.py +352 -0
  8. smartify/api/errors.py +380 -0
  9. smartify/api/events.py +345 -0
  10. smartify/api/server.py +992 -0
  11. smartify/cli/__init__.py +1 -0
  12. smartify/cli/main.py +430 -0
  13. smartify/engine/__init__.py +64 -0
  14. smartify/engine/approval.py +479 -0
  15. smartify/engine/orchestrator.py +1365 -0
  16. smartify/engine/scheduler.py +380 -0
  17. smartify/engine/spark.py +294 -0
  18. smartify/guardrails/__init__.py +22 -0
  19. smartify/guardrails/breakers.py +409 -0
  20. smartify/models/__init__.py +61 -0
  21. smartify/models/grid.py +625 -0
  22. smartify/notifications/__init__.py +22 -0
  23. smartify/notifications/webhook.py +556 -0
  24. smartify/state/__init__.py +46 -0
  25. smartify/state/checkpoint.py +558 -0
  26. smartify/state/resume.py +301 -0
  27. smartify/state/store.py +370 -0
  28. smartify/tools/__init__.py +17 -0
  29. smartify/tools/base.py +196 -0
  30. smartify/tools/builtin/__init__.py +79 -0
  31. smartify/tools/builtin/file.py +464 -0
  32. smartify/tools/builtin/http.py +195 -0
  33. smartify/tools/builtin/shell.py +137 -0
  34. smartify/tools/mcp/__init__.py +33 -0
  35. smartify/tools/mcp/adapter.py +157 -0
  36. smartify/tools/mcp/client.py +334 -0
  37. smartify/tools/mcp/registry.py +130 -0
  38. smartify/validator/__init__.py +0 -0
  39. smartify/validator/validate.py +271 -0
  40. smartify/workspace/__init__.py +5 -0
  41. smartify/workspace/manager.py +248 -0
  42. smartify_ai-0.1.0.dist-info/METADATA +201 -0
  43. smartify_ai-0.1.0.dist-info/RECORD +46 -0
  44. smartify_ai-0.1.0.dist-info/WHEEL +4 -0
  45. smartify_ai-0.1.0.dist-info/entry_points.txt +2 -0
  46. smartify_ai-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,1365 @@
+"""Grid orchestrator - core execution engine.
+
+The orchestrator manages grid lifecycle and coordinates node execution:
+1. Load and validate grid specifications
+2. Manage grid state transitions (draft → ready → energized → running → completed)
+3. Execute nodes in topological order via DAG scheduler
+4. Handle context/state passing between nodes
+5. Coordinate with LLM and tool adapters
+"""
+
+import asyncio
+import logging
+import time
+from dataclasses import dataclass, field
+from datetime import datetime
+from enum import Enum
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, Protocol, Union
+import yaml
+
+from smartify.models.grid import (
+    GridSpec,
+    GridState,
+    NodeSpec,
+    NodeKind,
+    DynamicSpawningSpec,
+)
+from smartify.engine.scheduler import DAGScheduler, NodeState
+from smartify.engine.spark import SparkManager, SparkRequest, SparkNode
+from smartify.engine.approval import (
+    ApprovalManager,
+    ApprovalRequest,
+    ApprovalStatus,
+    get_approval_manager,
+)
+from smartify.notifications.webhook import (
+    WebhookNotifier,
+    WebhookConfig,
+    EventType,
+)
+from smartify.state.checkpoint import CheckpointStore, get_checkpoint_store
+from smartify.validator.validate import validate_grid
+from smartify.guardrails.breakers import (
+    BreakerManager,
+    BreakerError,
+    BreakerSpec,
+)
+from smartify.models.grid import TripAction
+
+
+logger = logging.getLogger(__name__)
+
+
+class LLMAdapter(Protocol):
+    """Protocol for LLM adapters."""
+
+    async def complete(
+        self,
+        messages: List[Dict[str, str]],
+        system: Optional[str] = None,
+        temperature: float = 0.7,
+        max_tokens: Optional[int] = None,
+        tools: Optional[List[Dict]] = None,
+    ) -> Dict[str, Any]:
+        """Generate a completion from the LLM."""
+        ...
+
+
+class ToolAdapter(Protocol):
+    """Protocol for tool execution adapters."""
+
+    async def execute(
+        self,
+        tool_name: str,
+        arguments: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        """Execute a tool and return its result."""
+        ...
+
+
+class ExecutionError(Exception):
+    """Raised when grid execution fails."""
+    pass
+
+
+class GridLifecycleError(Exception):
+    """Raised for invalid grid state transitions."""
+    pass
+
+
+@dataclass
+class NodeResult:
+    """Result of executing a single node."""
+    node_id: str
+    success: bool
+    output: Optional[Dict[str, Any]] = None
+    error: Optional[str] = None
+    started_at: Optional[datetime] = None
+    completed_at: Optional[datetime] = None
+    tokens_used: int = 0
+
+
+@dataclass
+class ExecutionContext:
+    """Shared context during grid execution.
+
+    Provides access to:
+    - Grid inputs
+    - Node outputs (from completed nodes)
+    - Environment variables
+    - Execution metadata
+    """
+    grid_id: str
+    inputs: Dict[str, Any] = field(default_factory=dict)
+    outputs: Dict[str, Dict[str, Any]] = field(default_factory=dict)
+    env: Dict[str, str] = field(default_factory=dict)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    # Token tracking
+    total_tokens: int = 0
+    token_limit: Optional[int] = None
+
+    # Cost tracking
+    total_cost: float = 0.0
+    cost_limit: Optional[float] = None
+
+    def get_node_output(self, node_id: str) -> Optional[Dict[str, Any]]:
+        """Get output from a completed node."""
+        return self.outputs.get(node_id)
+
+    def set_node_output(self, node_id: str, output: Dict[str, Any]) -> None:
+        """Store output from a completed node."""
+        self.outputs[node_id] = output
+
+    def resolve_reference(self, ref: str) -> Any:
+        """Resolve a reference like '$controller.output.result'.
+
+        Supported prefixes:
+        - $<node_id>. - Reference to node output
+        - $inputs. - Reference to grid inputs
+        - $env. - Reference to environment variables
+        """
+        if not ref.startswith('$'):
+            return ref
+
+        parts = ref[1:].split('.')
+        if not parts:
+            return None
+
+        root = parts[0]
+        path = parts[1:] if len(parts) > 1 else []
+
+        # Get the root object
+        if root == 'inputs':
+            obj = self.inputs
+        elif root == 'env':
+            obj = self.env
+        elif root in self.outputs:
+            obj = self.outputs[root]
+        else:
+            return None
+
+        # Navigate path
+        for key in path:
+            if isinstance(obj, dict) and key in obj:
+                obj = obj[key]
+            else:
+                return None
+
+        return obj
+
+    def check_breakers(self) -> Optional[str]:
+        """Check if any breakers are tripped. Returns trip reason or None."""
+        if self.token_limit and self.total_tokens >= self.token_limit:
+            return f"Token limit exceeded: {self.total_tokens}/{self.token_limit}"
+        if self.cost_limit and self.total_cost >= self.cost_limit:
+            return f"Cost limit exceeded: {self.total_cost}/{self.cost_limit}"
+        return None
+
+
+@dataclass
+class GridRun:
+    """Represents a single execution run of a grid."""
+    grid: GridSpec
+    scheduler: DAGScheduler
+    context: ExecutionContext
+    state: GridState = GridState.DRAFT
+    results: Dict[str, NodeResult] = field(default_factory=dict)
+    started_at: Optional[datetime] = None
+    completed_at: Optional[datetime] = None
+    error: Optional[str] = None
+    spark_manager: Optional[SparkManager] = None
+    webhook_notifier: Optional[WebhookNotifier] = None
+    checkpoint_store: Optional[CheckpointStore] = None
+    run_id: Optional[str] = None  # Unique ID for this run (for checkpointing)
+    breaker_manager: Optional[BreakerManager] = None  # Request/concurrency rate limiting
+
+
+class Orchestrator:
+    """Core grid execution engine.
+
+    Usage:
+        orchestrator = Orchestrator()
+        orchestrator.register_llm_adapter("anthropic", anthropic_adapter)
+
+        run = await orchestrator.load_grid("path/to/grid.yaml")
+        await orchestrator.energize(run)
+        result = await orchestrator.execute(run)
+    """
+
+    def __init__(
+        self,
+        tool_registry: Optional["ToolRegistry"] = None,
+        checkpoint_store: Optional[CheckpointStore] = None,
+        enable_checkpoints: bool = True,
+    ):
+        from smartify.tools import ToolRegistry
+        from smartify.tools.builtin import create_builtin_registry
+
+        self.llm_adapters: Dict[str, LLMAdapter] = {}
+        self.tool_adapters: Dict[str, ToolAdapter] = {}  # Legacy, kept for compat
+        self.tool_registry: ToolRegistry = tool_registry or create_builtin_registry()
+        self.runs: Dict[str, GridRun] = {}
+        self.checkpoint_store = checkpoint_store
+        self.enable_checkpoints = enable_checkpoints
+
+        # Node executors by kind
+        self._executors: Dict[NodeKind, Callable] = {
+            NodeKind.CONTROLLER: self._execute_controller,
+            NodeKind.RELAY: self._execute_relay,
+            NodeKind.SUBSTATION: self._execute_substation,
+            NodeKind.SPARK: self._execute_spark,
+            NodeKind.FOREACH: self._execute_foreach,
+            NodeKind.EXPR: self._execute_expr,
+            NodeKind.AGGREGATE: self._execute_aggregate,
+            NodeKind.APPROVAL: self._execute_approval,
+        }
+
+    def register_llm_adapter(self, name: str, adapter: LLMAdapter) -> None:
+        """Register an LLM adapter."""
+        self.llm_adapters[name] = adapter
+
+    def register_tool_adapter(self, name: str, adapter: ToolAdapter) -> None:
+        """Register a tool adapter."""
+        self.tool_adapters[name] = adapter
+
+    async def load_grid(
+        self,
+        source: Union[str, Path, Dict],
+        inputs: Optional[Dict[str, Any]] = None,
+        env: Optional[Dict[str, str]] = None,
+    ) -> GridRun:
+        """Load and validate a grid specification.
+
+        Args:
+            source: Path to YAML file, YAML string, or dict
+            inputs: Initial input values for the grid
+            env: Environment variables
+
+        Returns:
+            GridRun instance in DRAFT state
+        """
+        # Parse source
+        if isinstance(source, (str, Path)):
+            path = Path(source)
+            if path.exists():
+                with open(path) as f:
+                    spec_dict = yaml.safe_load(f)
+            else:
+                # Assume it's YAML content
+                spec_dict = yaml.safe_load(str(source))
+        else:
+            spec_dict = source
+
+        # Validate and parse
+        errors = validate_grid(spec_dict)
+        if errors:
+            raise ExecutionError(f"Grid validation failed: {errors}")
+
+        grid = GridSpec.model_validate(spec_dict)
+
+        # Create scheduler
+        scheduler = DAGScheduler(grid)
+        scheduler.build_graph()
+
+        # Create context
+        context = ExecutionContext(
+            grid_id=grid.id,
+            inputs=inputs or {},
+            env=env or {},
+        )
+
+        # Set breaker limits from grid spec (via guardrails)
+        breaker_manager = None
+        if grid.guardrails and grid.guardrails.breakers:
+            breakers = grid.guardrails.breakers
+            if breakers.tokens:
+                context.token_limit = breakers.tokens.maxTotalTokensPerRun
+            if breakers.cost:
+                context.cost_limit = breakers.cost.maxCostPerRun
+            # Create BreakerManager for request/concurrency limits
+            if breakers.requests:
+                request_spec = BreakerSpec(requests=breakers.requests)
+                breaker_manager = BreakerManager(
+                    spec=request_spec,
+                    actions=grid.guardrails.breakerActions,
+                )
+                breaker_manager.start()
+                logger.debug(
+                    f"Created BreakerManager with maxConcurrentAgents="
+                    f"{breakers.requests.maxConcurrentAgents}, "
+                    f"maxRequestsPerMinute={breakers.requests.maxRequestsPerMinute}"
+                )
+
+        # Create spark manager for dynamic spawning
+        spark_manager = SparkManager(
+            config=grid.topology.dynamicSpawning,
+            scheduler=scheduler,
+        )
+
+        # Create webhook notifier if notifications configured
+        webhook_notifier = None
+        if grid.notifications and grid.notifications.webhooks:
+            webhook_notifier = WebhookNotifier()
+            for wh_config in grid.notifications.webhooks:
+                if not wh_config.enabled:
+                    continue
+
+                # Convert event strings to EventType enums
+                events = []
+                for event_name in wh_config.events:
+                    try:
+                        events.append(EventType(event_name))
+                    except ValueError:
+                        logger.warning(f"Unknown webhook event type: {event_name}")
+
+                webhook_notifier.add_webhook(WebhookConfig(
+                    url=wh_config.url,
+                    events=events or list(EventType),
+                    secret=wh_config.secret,
+                    headers=wh_config.headers or {},
+                    max_retries=wh_config.maxRetries,
+                    timeout_seconds=wh_config.timeout,
+                ))
+
+            logger.info(f"Configured {len(grid.notifications.webhooks)} webhook(s) for grid '{grid.id}'")
+
+        # Generate unique run ID
+        from uuid import uuid4
+        run_id = f"run-{uuid4().hex[:12]}"
+
+        # Create run
+        run = GridRun(
+            grid=grid,
+            scheduler=scheduler,
+            context=context,
+            state=GridState.DRAFT,
+            spark_manager=spark_manager,
+            webhook_notifier=webhook_notifier,
+            checkpoint_store=self.checkpoint_store,
+            run_id=run_id,
+            breaker_manager=breaker_manager,
+        )
+
+        # Create checkpoint if enabled
+        if self.enable_checkpoints and self.checkpoint_store:
+            # Serialize grid back to YAML for checkpoint
+            grid_yaml = yaml.dump(spec_dict, default_flow_style=False)
+            self.checkpoint_store.create_checkpoint(
+                run_id=run_id,
+                grid_id=grid.id,
+                grid_yaml=grid_yaml,
+                inputs=inputs or {},
+            )
+            logger.debug(f"Created checkpoint for run {run_id}")
+
+        self.runs[grid.id] = run
+        logger.info(f"Loaded grid '{grid.id}' ({len(grid.topology.nodes)} nodes) [run_id={run_id}]")
+
+        return run
+
+    async def energize(self, run: GridRun) -> None:
+        """Transition grid from DRAFT to READY to ENERGIZED.
+
+        This validates the grid is ready for execution.
+        """
+        if run.state not in (GridState.DRAFT, GridState.READY):
+            raise GridLifecycleError(
+                f"Cannot energize grid in state {run.state}"
+            )
+
+        # Validate required inputs and apply defaults
+        if run.grid.inputs:
+            for input_def in run.grid.inputs:
+                if input_def.name not in run.context.inputs:
+                    if input_def.default is not None:
+                        # Apply default value
+                        run.context.inputs[input_def.name] = input_def.default
+                    elif input_def.required:
+                        # Required input with no default - error
+                        raise ExecutionError(
+                            f"Missing required input: {input_def.name}"
+                        )
+
+        # Validate LLM adapter is available
+        has_llm_nodes = any(
+            node.kind in (NodeKind.CONTROLLER, NodeKind.RELAY, NodeKind.SUBSTATION)
+            for node in run.grid.topology.nodes
+        )
+        if has_llm_nodes and not self.llm_adapters:
+            raise ExecutionError("No LLM adapter registered")
+
+        # Register MCP servers if configured
+        await self._register_mcp_servers(run)
+
+        run.state = GridState.ENERGIZED
+        logger.info(f"Grid '{run.grid.id}' energized and ready for execution")
+
+    async def _register_mcp_servers(self, run: GridRun) -> None:
+        """Register MCP server tools from the grid spec.
+
+        Connects to each MCP server defined in grid.tools.mcpServers and
+        registers their tools in the tool registry.
+        """
+        if not run.grid.tools or not run.grid.tools.mcpServers:
+            return
+
+        mcp_servers = run.grid.tools.mcpServers
+        if not mcp_servers:
+            return
+
+        # Try importing MCP integration
+        try:
+            from smartify.tools.mcp import McpServerConfig, McpTransport, register_mcp_server
+        except ImportError:
+            raise ExecutionError(
+                f"Grid '{run.grid.id}' uses MCP servers but MCP is not installed. "
+                "Install with: pip install smartify[mcp]"
+            )
+
+        logger.info(f"Registering {len(mcp_servers)} MCP server(s) for grid '{run.grid.id}'")
+
+        for server_spec in mcp_servers:
+            try:
+                # Convert grid spec to McpServerConfig
+                transport_map = {
+                    "stdio": McpTransport.STDIO,
+                    "sse": McpTransport.SSE,
+                    "streamable_http": McpTransport.STREAMABLE_HTTP,
+                }
+                transport = transport_map.get(server_spec.transport, McpTransport.STDIO)
+
+                config = McpServerConfig(
+                    id=server_spec.id,
+                    transport=transport,
+                    command=server_spec.command,
+                    args=server_spec.args,
+                    env=server_spec.env,
+                    cwd=server_spec.cwd,
+                    url=server_spec.url,
+                    headers=server_spec.headers,
+                    prefix=server_spec.prefix,
+                    tools=server_spec.tools,
+                )
+
+                # Register the MCP server's tools
+                tool_names = await register_mcp_server(self.tool_registry, config)
+                logger.info(
+                    f"Registered {len(tool_names)} tools from MCP server '{server_spec.id}'"
+                )
+
+            except Exception as e:
+                logger.error(f"Failed to register MCP server '{server_spec.id}': {e}")
+                raise ExecutionError(
+                    f"Failed to connect to MCP server '{server_spec.id}': {e}"
+                )
+
+    async def execute(self, run: GridRun) -> Dict[str, Any]:
+        """Execute the grid to completion.
+
+        Returns:
+            Final outputs from the grid execution
+        """
+        if run.state != GridState.ENERGIZED:
+            raise GridLifecycleError(
+                f"Cannot execute grid in state {run.state}"
+            )
+
+        run.state = GridState.RUNNING
+        run.started_at = datetime.now()
+        logger.info(f"Starting execution of grid '{run.grid.id}'")
+
+        # Send run_started notification
+        if run.webhook_notifier:
+            asyncio.create_task(run.webhook_notifier.notify_run_started(
+                grid_id=run.grid.id,
+                grid_name=run.grid.name,
+                inputs=run.context.inputs,
+            ))
+
+        try:
+            while not run.scheduler.is_complete():
+                # Check breakers
+                trip_reason = run.context.check_breakers()
+                if trip_reason:
+                    logger.warning(f"Breaker tripped: {trip_reason}")
+                    run.error = trip_reason
+                    run.state = GridState.FAILED
+
+                    # Send breaker_tripped notification
+                    if run.webhook_notifier:
+                        # Parse breaker type from reason
+                        breaker_type = "unknown"
+                        current_value = 0.0
+                        limit = 0.0
+                        if "Token limit" in trip_reason:
+                            breaker_type = "tokens"
+                            current_value = run.context.total_tokens
+                            limit = run.context.token_limit or 0
+                        elif "Cost limit" in trip_reason:
+                            breaker_type = "cost"
+                            current_value = run.context.total_cost
+                            limit = run.context.cost_limit or 0
+
+                        asyncio.create_task(run.webhook_notifier.notify_breaker_tripped(
+                            grid_id=run.grid.id,
+                            breaker_type=breaker_type,
+                            current_value=current_value,
+                            limit=limit,
+                            action="stop",
+                        ))
+                    break
+
+                # Check request/concurrency breakers via BreakerManager
+                if run.breaker_manager:
+                    try:
+                        trip = await run.breaker_manager.check_and_enforce()
+                        if trip and trip.action in (TripAction.STOP, TripAction.BLOCK):
+                            logger.warning(f"Request breaker tripped: {trip.reason}")
+                            run.error = trip.reason
+                            run.state = GridState.FAILED
+
+                            # Send breaker_tripped notification
+                            if run.webhook_notifier:
+                                asyncio.create_task(run.webhook_notifier.notify_breaker_tripped(
+                                    grid_id=run.grid.id,
+                                    breaker_type="requests",
+                                    current_value=trip.current_value,
+                                    limit=trip.limit_value,
+                                    action=trip.action.value,
+                                ))
+                            break
+                    except BreakerError as e:
+                        logger.warning(f"Request breaker error: {e}")
+                        run.error = str(e)
+                        run.state = GridState.FAILED
+                        break
+
+                # Get ready nodes
+                ready_nodes = run.scheduler.get_ready_nodes()
+
+                # Cap concurrency based on maxConcurrentAgents
+                if (
+                    run.breaker_manager
+                    and run.breaker_manager.spec.requests
+                    and run.breaker_manager.spec.requests.maxConcurrentAgents
+                ):
+                    max_concurrent = run.breaker_manager.spec.requests.maxConcurrentAgents
+                    current_concurrent = run.breaker_manager.state.rate_limit.concurrent_count
+                    available_slots = max(0, max_concurrent - current_concurrent)
+                    if available_slots < len(ready_nodes):
+                        logger.debug(
+                            f"Capping ready nodes from {len(ready_nodes)} to {available_slots} "
+                            f"(maxConcurrentAgents={max_concurrent}, current={current_concurrent})"
+                        )
+                        ready_nodes = ready_nodes[:available_slots]
+
+                if not ready_nodes:
+                    # Check for deadlock
+                    running = run.scheduler.get_running_nodes()
+                    if not running:
+                        logger.error("Execution stalled - no ready or running nodes")
+                        run.error = "Execution deadlock"
+                        run.state = GridState.FAILED
+                        break
+
+                    # Wait for running nodes
+                    await asyncio.sleep(0.1)
+                    continue
+
+                # Execute ready nodes (in parallel)
+                tasks = []
+                for node_id in ready_nodes:
+                    run.scheduler.mark_running(node_id)
+                    # Record request start for rate limiting
+                    if run.breaker_manager:
+                        run.breaker_manager.record_request_start()
+                    # Checkpoint node start
+                    if run.checkpoint_store and run.run_id:
+                        run.checkpoint_store.checkpoint_node_started(run.run_id, node_id)
+                    task = asyncio.create_task(
+                        self._execute_node(run, node_id)
+                    )
+                    tasks.append(task)
+
+                # Wait for all parallel nodes
+                results = await asyncio.gather(*tasks, return_exceptions=True)
+
+                # Process results
+                for node_id, result in zip(ready_nodes, results):
+                    # Record request complete for rate limiting (both success and failure)
+                    if run.breaker_manager:
+                        run.breaker_manager.record_request_complete()
+
+                    if isinstance(result, Exception):
+                        logger.error(f"Node {node_id} failed: {result}")
+                        run.scheduler.mark_failed(node_id, str(result))
+
+                        # Checkpoint node failure
+                        if run.checkpoint_store and run.run_id:
+                            run.checkpoint_store.checkpoint_node_failed(
+                                run.run_id, node_id, str(result)
+                            )
+
+                        # Check retry
+                        if run.scheduler.can_retry(node_id):
+                            logger.info(f"Retrying node {node_id}")
+                            run.scheduler.reset_for_retry(node_id)
+                        else:
+                            run.error = f"Node {node_id} failed: {result}"
+                            run.state = GridState.FAILED
+                    else:
+                        run.results[node_id] = result
+                        run.scheduler.mark_completed(node_id, result.output)
+                        run.context.set_node_output(node_id, result.output or {})
+
+                        # Checkpoint node completion
+                        if run.checkpoint_store and run.run_id:
+                            run.checkpoint_store.checkpoint_node_complete(
+                                run.run_id,
+                                node_id,
+                                result.output or {},
+                                tokens_used=result.tokens_used,
+                            )
+
+            # Determine final state
+            run.completed_at = datetime.now()
+            duration_seconds = (run.completed_at - run.started_at).total_seconds() if run.started_at else 0
+
+            if run.state != GridState.FAILED:
+                if run.scheduler.is_successful():
+                    run.state = GridState.COMPLETED
+                    logger.info(f"Grid '{run.grid.id}' completed successfully")
+
+                    # Mark checkpoint completed
+                    if run.checkpoint_store and run.run_id:
+                        run.checkpoint_store.mark_completed(run.run_id)
+
+                    # Send run_completed notification
+                    if run.webhook_notifier:
+                        asyncio.create_task(run.webhook_notifier.notify_run_completed(
+                            grid_id=run.grid.id,
+                            outputs=run.context.outputs,
+                            duration_seconds=duration_seconds,
+                            total_tokens=run.context.total_tokens,
+                            total_cost=run.context.total_cost,
+                        ))
+                else:
+                    run.state = GridState.FAILED
+                    logger.error(f"Grid '{run.grid.id}' failed")
+
+                    # Mark checkpoint failed
+                    if run.checkpoint_store and run.run_id:
+                        run.checkpoint_store.mark_failed(run.run_id, run.error or "Unknown error")
+
+                    # Send run_failed notification
+                    if run.webhook_notifier:
+                        asyncio.create_task(run.webhook_notifier.notify_run_failed(
+                            grid_id=run.grid.id,
+                            error=run.error or "Unknown error",
+                            duration_seconds=duration_seconds,
+                        ))
+            else:
+                # Already failed (breaker or node failure)
+                # Mark checkpoint failed
+                if run.checkpoint_store and run.run_id:
+                    run.checkpoint_store.mark_failed(run.run_id, run.error or "Unknown error")
+
+                if run.webhook_notifier:
+                    asyncio.create_task(run.webhook_notifier.notify_run_failed(
+                        grid_id=run.grid.id,
+                        error=run.error or "Unknown error",
+                        duration_seconds=duration_seconds,
+                    ))
+
+            return self._collect_outputs(run)
+
+        except Exception as e:
+            logger.exception(f"Grid execution error: {e}")
+            run.state = GridState.FAILED
+            run.error = str(e)
+            run.completed_at = datetime.now()
+
+            # Mark checkpoint failed
+            if run.checkpoint_store and run.run_id:
+                run.checkpoint_store.mark_failed(run.run_id, str(e))
+
+            # Send run_failed notification
+            if run.webhook_notifier:
+                duration_seconds = (run.completed_at - run.started_at).total_seconds() if run.started_at else 0
+                asyncio.create_task(run.webhook_notifier.notify_run_failed(
+                    grid_id=run.grid.id,
+                    error=str(e),
+                    duration_seconds=duration_seconds,
+                ))
+
+            raise ExecutionError(str(e)) from e
+
+    async def _execute_node(self, run: GridRun, node_id: str) -> NodeResult:
+        """Execute a single node."""
+        node = run.scheduler.nodes[node_id].node
+        result = NodeResult(
+            node_id=node_id,
+            success=False,
+            started_at=datetime.now(),
+        )
+
+        logger.debug(f"Executing node {node_id} ({node.kind})")
+
+        try:
+            # Get executor for node kind
+            executor = self._executors.get(node.kind)
+            if not executor:
+                raise ExecutionError(f"No executor for node kind: {node.kind}")
+
+            # Execute
+            output = await executor(run, node)
+
+            result.success = True
+            result.output = output
+            result.completed_at = datetime.now()
+
+            logger.debug(f"Node {node_id} completed successfully")
+
+        except Exception as e:
+            result.success = False
+            result.error = str(e)
+            result.completed_at = datetime.now()
+            logger.error(f"Node {node_id} failed: {e}")
+            raise
+
+        return result
+
+    async def _execute_controller(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+    ) -> Dict[str, Any]:
+        """Execute a controller node.
+
+        Controllers orchestrate the overall workflow with LLM guidance.
+        """
+        # Build prompt from node config (falls back to description)
+        prompt = self._build_prompt(run, node)
+
+        # Get LLM adapter
+        adapter = self.llm_adapters.get("default") or next(
+            iter(self.llm_adapters.values()), None
+        )
+
+        if not adapter:
+            raise ExecutionError("No LLM adapter registered")
+
+        # Build messages
+        messages = [{"role": "user", "content": prompt}]
+
+        # Add system prompt
+        system = node.prompt.system if node.prompt else None
+
+        # Call LLM
+        response = await adapter.complete(
+            messages=messages,
+            system=system,
+            temperature=0.7,
+            tools=self._get_node_tools(node),
+        )
+
+        # Track tokens
+        run.context.total_tokens += response.get('tokens_in', 0) + response.get('tokens_out', 0)
+        run.context.total_cost += response.get('cost', 0.0)
+
+        return {
+            "response": response.get('content', ''),
+            "raw": response,
+        }
+
+    async def _execute_relay(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+    ) -> Dict[str, Any]:
+        """Execute a relay node.
+
+        Relays coordinate between controller and substations.
+        """
+        # Similar to controller but with coordination focus
+        return await self._execute_controller(run, node)
+
+    async def _execute_substation(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+    ) -> Dict[str, Any]:
+        """Execute a substation node.
+
+        Substations perform specific tasks with tools.
+        """
+        # Build prompt
+        prompt = self._build_prompt(run, node)
+
+        # Get LLM adapter
+        adapter = self.llm_adapters.get("default") or next(
+            iter(self.llm_adapters.values()), None
+        )
+
+        if not adapter:
+            raise ExecutionError("No LLM adapter registered")
+
+        # Build messages with context from parent
+        messages = []
+
+        # Add parent context if available
+        if node.parent:
+            parent_output = run.context.get_node_output(node.parent)
+            if parent_output:
+                messages.append({
+                    "role": "assistant",
+                    "content": str(parent_output.get('response', ''))
+                })
+
+        messages.append({"role": "user", "content": prompt})
+
+        # System prompt
+        system = node.prompt.system if node.prompt else None
+
+        # Get tools
+        tools = self._get_node_tools(node)
+
+        # Call LLM (potentially with tool use loop)
+        response = await adapter.complete(
+            messages=messages,
+            system=system,
+            temperature=0.7,
+            tools=tools,
+        )
+
+        # Track tokens
+        run.context.total_tokens += response.get('tokens_in', 0) + response.get('tokens_out', 0)
+        run.context.total_cost += response.get('cost', 0.0)
+
+        # Handle tool calls
+        if response.get('tool_calls'):
+            tool_results = await self._handle_tool_calls(
+                run, node, response['tool_calls']
+            )
+            response['tool_results'] = tool_results
+
+        return {
+            "response": response.get('content', ''),
+            "raw": response,
+        }
+
+    async def _execute_foreach(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+    ) -> Dict[str, Any]:
+        """Execute a foreach node (fan-out iteration)."""
+        if not node.foreach:
+            raise ExecutionError(f"foreach node {node.id} missing foreach config")
+
+        # Get items to iterate over (field is 'over' in ForeachSpec)
+        items_ref = node.foreach.over
+        items = run.context.resolve_reference(items_ref)
+
+        if not isinstance(items, list):
+            raise ExecutionError(
+                f"foreach items must be a list, got: {type(items)}"
+            )
+
+        # Execute iteration (results collected by child nodes)
+        # Item variable field is 'as_' in ForeachSpec (aliased from 'as')
+        return {
+            "items": items,
+            "count": len(items),
+            "item_var": node.foreach.as_,
+        }
+
+    async def _execute_expr(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+    ) -> Dict[str, Any]:
+        """Execute an expression node."""
+        if not node.expr:
+            raise ExecutionError(f"expr node {node.id} missing expression")
+
+        # Simple expression evaluation
+        # TODO: Implement proper expression parser with safety checks
+        expr = node.expr
+
+        # Replace references
+        for ref_match in self._find_references(expr):
+            value = run.context.resolve_reference(ref_match)
+            expr = expr.replace(ref_match, repr(value))
+
+        # Evaluate (UNSAFE - needs sandboxing in production)
+        try:
+            result = eval(expr, {"__builtins__": {}}, {})
+        except Exception as e:
+            raise ExecutionError(f"Expression evaluation failed: {e}")
+
+        return {"result": result}
+
+    async def _execute_aggregate(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+    ) -> Dict[str, Any]:
+        """Execute an aggregate node (fan-in merge)."""
+        if not node.aggregate:
+            raise ExecutionError(f"aggregate node {node.id} missing config")
+
+        # Collect outputs from sources (from_ is aliased as 'from' in YAML)
+        collected = []
+        source_ids = node.aggregate.from_
+        for source_id in source_ids:
+            output = run.context.get_node_output(source_id)
+            if output:
+                collected.append(output)
+
+        # Apply merge strategy
+        strategy = node.aggregate.strategy.value if node.aggregate.strategy else "concat_arrays"
+
+        if strategy == "concat_arrays":
+            result = collected
+        elif strategy == "merge_objects":
+            result = {}
+            for item in collected:
+                if isinstance(item, dict):
+                    result.update(item)
+        elif strategy == "sum":
+            # Sum numeric values
+            result = sum(item.get('value', 0) if isinstance(item, dict) else 0 for item in collected)
+        elif strategy == "first":
+            result = collected[0] if collected else None
+        elif strategy == "last":
+            result = collected[-1] if collected else None
+        else:
+            result = collected
+
+        return {"result": result, "count": len(collected)}
+
+    async def _execute_approval(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+    ) -> Dict[str, Any]:
+        """Execute an approval node (human-in-the-loop).
+
+        This pauses execution, sends notifications, and waits for
+        human approval before continuing.
+        """
+        if not node.approval:
+            raise ExecutionError(f"approval node {node.id} missing config")
+
+        approval_config = node.approval
+
+        # Check for auto-approve condition
+        if approval_config.autoApprove:
+            when_expr = approval_config.autoApprove.get("when")
+            if when_expr and self._evaluate_auto_approve(run, when_expr):
+                logger.info(f"Auto-approving {node.id} based on condition: {when_expr}")
+                return {
+                    "approved": True,
+                    "approver": "auto",
+                    "reason": f"Auto-approved: {when_expr}",
+                    "timestamp": datetime.now().isoformat(),
+                }
+
+        # Collect outputs from specified nodes to show in approval
+        show_outputs = {}
+        for source_id in approval_config.showOutputsFrom:
+            output = run.context.get_node_output(source_id)
+            if output:
+                show_outputs[source_id] = output
+
+        # Get approval manager
+        approval_manager = get_approval_manager()
+
+        # Create approval request
+        request = await approval_manager.create_request(
+            grid_id=run.grid.id,
+            node_id=node.id,
+            prompt=approval_config.prompt,
+            context=run.context.inputs,
+            timeout_seconds=approval_config.timeout,
+            required_approvers=approval_config.requiredApprovers,
+            allowed_approvers=approval_config.allowedApprovers,
+            show_outputs=show_outputs,
+        )
+
+        logger.info(
+            f"Approval node {node.id} waiting for approval (request: {request.id})"
+        )
+
+        # Wait for approval (this blocks until resolved or timeout)
+        try:
+            resolved_request = await approval_manager.wait_for_approval(
+                request.id,
+                timeout=approval_config.timeout,
+            )
+
+            if resolved_request.status == ApprovalStatus.APPROVED:
+                logger.info(f"Approval {request.id} approved by: {resolved_request.approvers}")
+                return {
+                    "approved": True,
+                    "approvers": resolved_request.approvers,
+                    "timestamp": resolved_request.resolved_at.isoformat() if resolved_request.resolved_at else None,
+                }
+            elif resolved_request.status == ApprovalStatus.REJECTED:
+                logger.warning(f"Approval {request.id} rejected: {resolved_request.rejection_reason}")
+                raise ExecutionError(
+                    f"Approval rejected: {resolved_request.rejection_reason or 'No reason given'}"
+                )
+            else:
+                raise ExecutionError(f"Approval in unexpected state: {resolved_request.status}")
+
+        except TimeoutError:
+            logger.error(f"Approval {request.id} timed out after {approval_config.timeout}s")
+            raise ExecutionError(f"Approval timed out after {approval_config.timeout} seconds")
+
+    def _evaluate_auto_approve(self, run: GridRun, expression: str) -> bool:
+        """Evaluate an auto-approve expression.
+
+        Supports simple expressions like:
+        - "$inputs.env == 'dev'"
+        - "$controller.output.risk_level == 'low'"
+        """
+        try:
+            # Very basic expression evaluation
+            # TODO: Use a proper expression parser with safety
+
+            # Replace references
+            expr = expression
+            for ref in self._find_references(expression):
+                value = run.context.resolve_reference(ref)
+                if isinstance(value, str):
+                    expr = expr.replace(ref, f"'{value}'")
+                else:
+                    expr = expr.replace(ref, repr(value))
+
+            # Evaluate (limited builtins for safety)
+            result = eval(expr, {"__builtins__": {"True": True, "False": False, "None": None}}, {})
+            return bool(result)
+
+        except Exception as e:
+            logger.warning(f"Auto-approve expression failed: {e}")
+            return False
+
+    async def _execute_spark(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+    ) -> Dict[str, Any]:
+        """Execute a dynamically spawned Spark node.
+
+        Sparks are lightweight helpers that execute a specific subtask
+        in parallel with their parent substation.
+        """
+        # Get the spark from the manager
+        if not run.spark_manager:
+            raise ExecutionError("Spark manager not initialized")
+
+        spark = run.spark_manager.sparks.get(node.id)
+        if not spark:
+            raise ExecutionError(f"Spark {node.id} not found in manager")
+
+        run.spark_manager.mark_running(node.id)
+
+        # Build prompt from spark task
+        prompt = f"""You are a helper agent (Spark) spawned to assist with a specific subtask.
+
+TASK: {spark.task}
+
+CONTEXT:
+{self._format_context(spark.context)}
+
+Execute this task and provide a clear, focused result. Keep your response concise."""
+
+        # Get LLM adapter
+        adapter = self.llm_adapters.get("default") or next(
+            iter(self.llm_adapters.values()), None
+        )
+
+        if not adapter:
+            raise ExecutionError("No LLM adapter registered")
+
+        # Get default spark agent config if available
+        default_agent = run.grid.topology.dynamicSpawning.defaults.spark
+        system = None
+        if default_agent and run.grid.agents and default_agent in run.grid.agents:
+            agent_spec = run.grid.agents[default_agent]
+            system = agent_spec.systemPrompt
+
+        # Call LLM
+        response = await adapter.complete(
+            messages=[{"role": "user", "content": prompt}],
+            system=system,
+            temperature=0.7,
+            max_tokens=2000,  # Sparks should be concise
+        )
+
+        # Track tokens
+        tokens_used = response.get('tokens_in', 0) + response.get('tokens_out', 0)
+        run.context.total_tokens += tokens_used
+        run.context.total_cost += response.get('cost', 0.0)
+
+        output = {
+            "response": response.get('content', ''),
+            "task": spark.task,
+            "parent_id": spark.parent_id,
+        }
+
+        run.spark_manager.mark_completed(node.id, output, tokens_used)
+
+        return output
+
+    def _format_context(self, context: Dict[str, Any]) -> str:
+        """Format context dict as readable string."""
+        lines = []
+        for key, value in context.items():
+            if isinstance(value, (dict, list)):
+                import json
+                lines.append(f"{key}: {json.dumps(value, indent=2)}")
+            else:
+                lines.append(f"{key}: {value}")
+        return "\n".join(lines)
+
+    async def spawn_sparks_for_node(
+        self,
+        run: GridRun,
+        parent_id: str,
+        requests: List[SparkRequest],
+    ) -> List[SparkNode]:
+        """Spawn sparks for a node and add them to the scheduler.
+
+        Called by substations that want to parallelize work.
+        """
+        if not run.spark_manager:
+            return []
+
+        # Set parent_id on all requests
+        for req in requests:
+            req.parent_id = parent_id
+
+        # Spawn sparks
+        sparks = await run.spark_manager.spawn_batch(requests)
+
+        # Add sparks to scheduler as dynamic nodes
+        for spark in sparks:
+            from smartify.engine.spark import create_spark_node_spec
+            from smartify.engine.scheduler import NodeExecution, NodeState
+
+            node_spec = create_spark_node_spec(
+                spark,
+                default_agent=run.grid.topology.dynamicSpawning.defaults.spark,
+            )
+
+            # Add to scheduler's node tracking
+            run.scheduler.nodes[spark.id] = NodeExecution(
+                node=node_spec,
+                state=NodeState.READY,  # Sparks start ready immediately
+                dependencies={parent_id},  # Depend on parent completing setup
+            )
+
+            # Add as dependent of parent
+            if parent_id in run.scheduler.nodes:
+                run.scheduler.nodes[parent_id].dependents.add(spark.id)
+
+            logger.info(f"Added spark {spark.id} to scheduler")
+
+        return sparks
+
+    async def execute_pending_sparks(self, run: GridRun) -> List[NodeResult]:
+        """Execute all pending sparks in parallel.
+
+        Can be called from the main execution loop to process sparks
+        alongside regular nodes.
+        """
+        if not run.spark_manager:
+            return []
+
+        pending = run.spark_manager.get_pending_sparks()
+        if not pending:
+            return []
+
+        results = []
+        tasks = []
+
+        for spark in pending:
+            # Create a minimal NodeSpec for the spark
+            from smartify.engine.spark import create_spark_node_spec
+            node_spec = create_spark_node_spec(spark)
+
+            task = asyncio.create_task(
+                self._execute_spark(run, node_spec)
+            )
+            tasks.append((spark.id, task))
+
+        # Execute all in parallel
+        for spark_id, task in tasks:
+            try:
+                output = await task
+                results.append(NodeResult(
+                    node_id=spark_id,
+                    success=True,
+                    output=output,
+                    completed_at=datetime.now(),
+                ))
+            except Exception as e:
+                logger.error(f"Spark {spark_id} failed: {e}")
+                run.spark_manager.mark_failed(spark_id, str(e))
+                results.append(NodeResult(
+                    node_id=spark_id,
+                    success=False,
+                    error=str(e),
+                    completed_at=datetime.now(),
+                ))
+
+        return results
+
+    def _build_prompt(self, run: GridRun, node: NodeSpec) -> str:
+        """Build the prompt for an LLM node."""
+        # Use explicit prompt template, or fall back to node description
+        if node.prompt and node.prompt.template:
+            prompt = node.prompt.template
+        elif node.description:
+            prompt = node.description
+        else:
+            prompt = f"Execute the task for node '{node.name}'"
+
+        # Resolve references in prompt
+        for ref in self._find_references(prompt):
+            value = run.context.resolve_reference(ref)
+            prompt = prompt.replace(ref, str(value) if value else "")
+
+        return prompt
+
+    def _find_references(self, text: str) -> List[str]:
+        """Find all $-references in text."""
+        import re
+        return re.findall(r'\$[\w.]+', text)
+
+    def _get_node_tools(self, node: NodeSpec) -> Optional[List[Dict]]:
+        """Get tool definitions for a node."""
+        if not node.tools:
+            # If no specific tools, provide all builtins
+            return self.tool_registry.to_anthropic_format()
+
+        # Filter to only requested tools
+        tool_names = [t.name if hasattr(t, 'name') else t for t in node.tools]
+        return self.tool_registry.to_anthropic_format(names=tool_names)
+
+    async def _handle_tool_calls(
+        self,
+        run: GridRun,
+        node: NodeSpec,
+        tool_calls: List[Dict],
+    ) -> List[Dict]:
+        """Handle tool calls from LLM response."""
+        results = []
+
+        for call in tool_calls:
+            tool_name = call.get('name')
+            arguments = call.get('arguments', {})
+
+            logger.debug(f"Executing tool: {tool_name} with args: {arguments}")
+
+            # Execute via tool registry
+            result = await self.tool_registry.execute(tool_name, **arguments)
+
+            if result.success:
+                results.append({
+                    "tool_call_id": call.get('id'),
+                    "result": result.output,
+                })
+            else:
+                results.append({
+                    "tool_call_id": call.get('id'),
+                    "error": result.error,
+                })
+
+        return results
+
+    def _collect_outputs(self, run: GridRun) -> Dict[str, Any]:
+        """Collect final outputs from the grid execution."""
+        outputs = {}
+
+        # Get outputs from all completed nodes
+        for node_id, result in run.results.items():
+            if result.success and result.output:
+                outputs[node_id] = result.output
+
+        # Build summary
+        return {
+            "grid_id": run.grid.id,
+            "state": run.state.value,
+            "started_at": run.started_at.isoformat() if run.started_at else None,
+            "completed_at": run.completed_at.isoformat() if run.completed_at else None,
+            "node_outputs": outputs,
+            "total_tokens": run.context.total_tokens,
+            "total_cost": run.context.total_cost,
+            "error": run.error,
+        }
+
+    # State management
+
+    async def pause(self, run: GridRun) -> None:
+        """Pause grid execution."""
+        if run.state != GridState.RUNNING:
+            raise GridLifecycleError(f"Cannot pause grid in state {run.state}")
+        run.state = GridState.PAUSED
+        logger.info(f"Grid '{run.grid.id}' paused")
+
+    async def resume(self, run: GridRun) -> None:
+        """Resume paused grid execution."""
+        if run.state != GridState.PAUSED:
+            raise GridLifecycleError(f"Cannot resume grid in state {run.state}")
+        run.state = GridState.RUNNING
+        logger.info(f"Grid '{run.grid.id}' resumed")
+
+    async def stop(self, run: GridRun) -> None:
+        """Stop grid execution."""
+        if run.state not in (GridState.RUNNING, GridState.PAUSED):
+            raise GridLifecycleError(f"Cannot stop grid in state {run.state}")
+        run.state = GridState.STOPPED
+        run.completed_at = datetime.now()
+        logger.info(f"Grid '{run.grid.id}' stopped")
+
+    def get_run(self, grid_id: str) -> Optional[GridRun]:
+        """Get a grid run by ID."""
+        return self.runs.get(grid_id)
+
+    def get_status(self, run: GridRun) -> Dict[str, Any]:
+        """Get current status of a grid run."""
+        return {
+            "grid_id": run.grid.id,
+            "state": run.state.value,
+            "scheduler": run.scheduler.get_state_summary(),
+            "started_at": run.started_at.isoformat() if run.started_at else None,
+            "tokens_used": run.context.total_tokens,
+            "cost": run.context.total_cost,
+            "error": run.error,
+        }
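
For orientation, a minimal driver sketch pieced together from the Orchestrator usage docstring and the method signatures in the diff above. It is not part of the published package: the AnthropicAdapter class name and constructor, the grid.yaml path, and the "env" input are assumptions.

    import asyncio

    from smartify.engine.orchestrator import Orchestrator
    # Hypothetical adapter import; the actual class in smartify/agents/adapters/anthropic.py may differ.
    from smartify.agents.adapters.anthropic import AnthropicAdapter

    async def main() -> None:
        orchestrator = Orchestrator()
        # _execute_controller looks up the "default" adapter first, so register under that name.
        orchestrator.register_llm_adapter("default", AnthropicAdapter())

        # Load and validate the grid, apply inputs, then move it through DRAFT -> ENERGIZED -> RUNNING.
        run = await orchestrator.load_grid("grid.yaml", inputs={"env": "dev"})
        await orchestrator.energize(run)
        result = await orchestrator.execute(run)

        # execute() returns the summary built by _collect_outputs().
        print(result["state"], result["total_tokens"], result["total_cost"])
        print(orchestrator.get_status(run))

    asyncio.run(main())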