agnt5 0.3.0a8__cp310-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of agnt5 might be problematic.

agnt5/worker.py ADDED
@@ -0,0 +1,1982 @@
1
+ """Worker implementation for AGNT5 SDK."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import contextvars
7
+ import logging
8
+ import time
9
+ import uuid
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from .function import FunctionRegistry
13
+ from .workflow import WorkflowRegistry
14
+ from ._telemetry import setup_module_logger
15
+ from . import _sentry
16
+
17
+ logger = setup_module_logger(__name__)
18
+
19
+
20
+ import dataclasses
21
+ import json as _json
22
+
23
+
24
+ class _ResultEncoder(_json.JSONEncoder):
25
+ """Custom JSON encoder for serializing component results.
26
+
27
+ Handles Pydantic models, dataclasses, bytes, and sets that are commonly
28
+ returned from functions, workflows, entities, and agents.
29
+ """
30
+ def default(self, obj):
31
+ # Handle Pydantic models (v2 API)
32
+ if hasattr(obj, 'model_dump'):
33
+ return obj.model_dump()
34
+ # Handle Pydantic models (v1 API)
35
+ if hasattr(obj, 'dict') and hasattr(obj, '__fields__'):
36
+ return obj.dict()
37
+ # Handle dataclasses
38
+ if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
39
+ return dataclasses.asdict(obj)
40
+ # Handle bytes
41
+ if isinstance(obj, bytes):
42
+ return obj.decode('utf-8', errors='replace')
43
+ # Handle sets
44
+ if isinstance(obj, set):
45
+ return list(obj)
46
+ # Fallback to default behavior
47
+ return super().default(obj)
48
+
49
+
50
+ def _serialize_result(result) -> bytes:
51
+ """Serialize a component result to JSON bytes.
52
+
53
+ Uses _ResultEncoder to handle Pydantic models, dataclasses, and other
54
+ complex types that may be returned from functions, workflows, entities,
55
+ tools, and agents.
56
+ """
57
+ return _json.dumps(result, cls=_ResultEncoder).encode("utf-8")
58
+
59
+
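For illustration, a minimal sketch of what `_serialize_result` produces for the types `_ResultEncoder` covers, assuming the helpers above are in scope; the `Point` dataclass and the payload are hypothetical, and a single-element set is used because set ordering is not guaranteed.

```python
import dataclasses

@dataclasses.dataclass
class Point:
    x: int
    y: int

# Dataclasses become dicts, sets become lists, bytes are decoded to str.
payload = {"point": Point(1, 2), "tags": {"beta"}, "raw": b"hello"}
print(_serialize_result(payload))
# b'{"point": {"x": 1, "y": 2}, "tags": ["beta"], "raw": "hello"}'
```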
60
+ def _normalize_metadata(metadata: Dict[str, Any]) -> Dict[str, str]:
61
+ """
62
+ Convert metadata dictionary to Dict[str, str] for Rust FFI compatibility.
63
+
64
+ PyO3 requires HashMap<String, String>, but Python code may include booleans,
65
+ integers, or other types. This helper ensures all values are strings.
66
+
67
+ Args:
68
+ metadata: Dictionary with potentially mixed types
69
+
70
+ Returns:
71
+ Dictionary with all string values
72
+
73
+ Example:
74
+ >>> _normalize_metadata({"error": True, "count": 42, "msg": "hello"})
75
+ {'error': 'true', 'count': '42', 'msg': 'hello'}
76
+ """
77
+ normalized = {}
78
+ for key, value in metadata.items():
79
+ if isinstance(value, str):
80
+ normalized[key] = value
81
+ elif isinstance(value, bool):
82
+ # Convert bool to lowercase string for JSON compatibility
83
+ normalized[key] = str(value).lower()
84
+ elif value is None:
85
+ normalized[key] = ""
86
+ else:
87
+ # Convert any other type to string representation
88
+ normalized[key] = str(value)
89
+ return normalized
90
+
91
+ # Context variable to store trace metadata for propagation to LM calls
92
+ # This allows Rust LM layer to access traceparent without explicit parameter passing
93
+ _trace_metadata: contextvars.ContextVar[Dict[str, str]] = contextvars.ContextVar(
94
+ '_trace_metadata', default={}
95
+ )
96
+
97
+
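The pattern is roughly the following: the request handler sets the contextvar, and any code running later in the same context (such as the LM bridge) reads it back without the value being threaded through function arguments. The values below are illustrative only.

```python
# Hypothetical illustration of the contextvar pattern described above.
_trace_metadata.set({"traceparent": "00-abc123-def456-01"})

def downstream_lm_call() -> str:
    # Reads the per-request trace metadata without it being passed in.
    return _trace_metadata.get().get("traceparent", "")

assert downstream_lm_call() == "00-abc123-def456-01"
```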
98
+ class Worker:
99
+ """AGNT5 Worker for registering and running functions/workflows with the coordinator.
100
+
101
+ The Worker class manages the lifecycle of your service, including:
102
+ - Registration with the AGNT5 coordinator
103
+ - Automatic discovery of @function and @workflow decorated handlers
104
+ - Message handling and execution
105
+ - Health monitoring
106
+
107
+ Example:
108
+ ```python
109
+ from agnt5 import Worker, function
110
+
111
+ @function
112
+ async def process_data(ctx: Context, data: str) -> dict:
113
+ return {"result": data.upper()}
114
+
115
+ async def main():
116
+ worker = Worker(
117
+ service_name="data-processor",
118
+ service_version="1.0.0",
119
+ coordinator_endpoint="http://localhost:34186"
120
+ )
121
+ await worker.run()
122
+
123
+ if __name__ == "__main__":
124
+ asyncio.run(main())
125
+ ```
126
+ """
127
+
128
+ def __init__(
129
+ self,
130
+ service_name: str,
131
+ service_version: str = "1.0.0",
132
+ coordinator_endpoint: Optional[str] = None,
133
+ runtime: str = "standalone",
134
+ metadata: Optional[Dict[str, str]] = None,
135
+ functions: Optional[List] = None,
136
+ workflows: Optional[List] = None,
137
+ entities: Optional[List] = None,
138
+ agents: Optional[List] = None,
139
+ tools: Optional[List] = None,
140
+ auto_register: bool = False,
141
+ auto_register_paths: Optional[List[str]] = None,
142
+ pyproject_path: Optional[str] = None,
143
+ ):
144
+ """Initialize a new Worker with explicit or automatic component registration.
145
+
146
+ The Worker supports two registration modes:
147
+
148
+ **Explicit Mode (default, production):**
149
+ - Register workflows/agents explicitly; their dependencies are auto-included
150
+ - Optionally register standalone functions/tools for direct API invocation
151
+
152
+ **Auto-Registration Mode (development):**
153
+ - Automatically discovers all decorated components in source paths
154
+ - Reads source paths from pyproject.toml or uses explicit paths
155
+ - No need to maintain import lists
156
+
157
+ Args:
158
+ service_name: Unique name for this service
159
+ service_version: Version string (semantic versioning recommended)
160
+ coordinator_endpoint: Coordinator endpoint URL (default: from env AGNT5_COORDINATOR_ENDPOINT)
161
+ runtime: Runtime type - "standalone", "docker", "kubernetes", etc.
162
+ metadata: Optional service-level metadata
163
+ functions: List of @function decorated handlers (explicit mode)
164
+ workflows: List of @workflow decorated handlers (explicit mode)
165
+ entities: List of Entity classes (explicit mode)
166
+ agents: List of Agent instances (explicit mode)
167
+ tools: List of Tool instances (explicit mode)
168
+ auto_register: Enable automatic component discovery (default: False)
169
+ auto_register_paths: Explicit source paths to scan (overrides pyproject.toml discovery)
170
+ pyproject_path: Path to pyproject.toml (default: current directory)
171
+
172
+ Example (explicit mode - production):
173
+ ```python
174
+ from agnt5 import Worker
175
+ from my_service import greet_user, order_fulfillment, ShoppingCart, analyst_agent
176
+
177
+ worker = Worker(
178
+ service_name="my-service",
179
+ workflows=[order_fulfillment],
180
+ entities=[ShoppingCart],
181
+ agents=[analyst_agent],
182
+ functions=[greet_user],
183
+ )
184
+ await worker.run()
185
+ ```
186
+
187
+ Example (auto-register mode - development):
188
+ ```python
189
+ from agnt5 import Worker
190
+
191
+ worker = Worker(
192
+ service_name="my-service",
193
+ auto_register=True, # Discovers from pyproject.toml
194
+ )
195
+ await worker.run()
196
+ ```
197
+ """
198
+ self.service_name = service_name
199
+ self.service_version = service_version
200
+ self.coordinator_endpoint = coordinator_endpoint
201
+ self.runtime = runtime
202
+ self.metadata = metadata or {}
203
+
204
+ # Get tenant_id from environment (required for entity state management)
205
+ import os
206
+ self._tenant_id = os.getenv("AGNT5_TENANT_ID", "default-tenant")
207
+
208
+ # Import Rust worker
209
+ try:
210
+ from ._core import PyWorker, PyWorkerConfig, PyComponentInfo
211
+ self._PyWorker = PyWorker
212
+ self._PyWorkerConfig = PyWorkerConfig
213
+ self._PyComponentInfo = PyComponentInfo
214
+ except ImportError as e:
215
+ # Capture SDK-level import failure in Sentry
216
+ _sentry.capture_exception(
217
+ e,
218
+ context={
219
+ "service_name": service_name,
220
+ "service_version": service_version,
221
+ "error_location": "Worker.__init__",
222
+ "error_phase": "rust_core_import",
223
+ },
224
+ tags={
225
+ "sdk_error": "true",
226
+ "error_type": "import_error",
227
+ "component": "rust_core",
228
+ },
229
+ level="error",
230
+ )
231
+ raise ImportError(
232
+ f"Failed to import Rust core worker: {e}. "
233
+ "Make sure agnt5 is properly installed with: pip install agnt5"
234
+ )
235
+
236
+ # Create Rust worker config
237
+ self._rust_config = self._PyWorkerConfig(
238
+ service_name=service_name,
239
+ service_version=service_version,
240
+ service_type=runtime,
241
+ )
242
+
243
+ # Create Rust worker instance
244
+ self._rust_worker = self._PyWorker(self._rust_config)
245
+
246
+ # Create worker-scoped entity state adapter with Rust core
247
+ from .entity import EntityStateAdapter
248
+ from ._core import EntityStateManager as RustEntityStateManager
249
+
250
+ # Create Rust core for entity state management
251
+ rust_core = RustEntityStateManager(tenant_id=self._tenant_id)
252
+
253
+ # Create Python adapter (thin wrapper around Rust core)
254
+ self._entity_state_adapter = EntityStateAdapter(rust_core=rust_core)
255
+
256
+ logger.info("Created EntityStateAdapter with Rust core for state management")
257
+
258
+ # Create CheckpointClient for step-level memoization (Phase 3)
259
+ # This client is shared across all workflow executions and connects lazily on first use
260
+ try:
261
+ from .checkpoint import CheckpointClient
262
+ self._checkpoint_client = CheckpointClient()
263
+ logger.info("Created CheckpointClient for step-level memoization")
264
+ except Exception as e:
265
+ logger.warning(f"Failed to create CheckpointClient (memoization disabled): {e}")
266
+ self._checkpoint_client = None
267
+
268
+ # Initialize Sentry for SDK-level error tracking
269
+ # Telemetry behavior:
270
+ # - Alpha/Beta releases: ENABLED by default (opt-out with AGNT5_DISABLE_SDK_TELEMETRY=true)
271
+ # - Stable releases: DISABLED by default (opt-in with AGNT5_ENABLE_SDK_TELEMETRY=true)
272
+ # This captures SDK bugs, initialization failures, and Python-specific issues
273
+ # NOT user code execution errors (those should be handled by users)
274
+ from .version import _get_version
275
+ sdk_version = _get_version()
276
+
277
+ sentry_enabled = _sentry.initialize_sentry(
278
+ service_name=service_name,
279
+ service_version=service_version,
280
+ sdk_version=sdk_version,
281
+ )
282
+ if sentry_enabled:
283
+ # Set service-level context (anonymized)
284
+ _sentry.set_context("service", {
285
+ "name": service_name, # User's service name (they control this)
286
+ "version": service_version,
287
+ "runtime": runtime,
288
+ })
289
+ else:
290
+ logger.debug("SDK telemetry not enabled")
291
+
292
+ # Component registration: auto-discover or explicit
293
+ if auto_register:
294
+ # Warn if explicit components are passed with auto_register=True
295
+ if any([functions, workflows, entities, agents, tools]):
296
+ logger.warning(
297
+ "auto_register=True ignores explicit functions/workflows/entities/agents/tools parameters. "
298
+ "Remove explicit params or set auto_register=False to use explicit registration."
299
+ )
300
+
301
+ # Auto-registration mode: discover from source paths
302
+ if auto_register_paths:
303
+ source_paths = auto_register_paths
304
+ logger.info(f"Auto-registration with explicit paths: {source_paths}")
305
+ else:
306
+ source_paths = self._discover_source_paths(pyproject_path)
307
+ logger.info(f"Auto-registration with discovered paths: {source_paths}")
308
+
309
+ # Auto-discover components (will populate _explicit_components)
310
+ self._auto_discover_components(source_paths)
311
+ else:
312
+ # Explicit registration from constructor kwargs
313
+ self._explicit_components = {
314
+ 'functions': list(functions or []),
315
+ 'workflows': list(workflows or []),
316
+ 'entities': list(entities or []),
317
+ 'agents': list(agents or []),
318
+ 'tools': list(tools or []),
319
+ }
320
+
321
+ # Count explicitly registered components
322
+ total_explicit = sum(len(v) for v in self._explicit_components.values())
323
+ logger.info(
324
+ f"Worker initialized: {service_name} v{service_version} (runtime: {runtime}), "
325
+ f"{total_explicit} components explicitly registered"
326
+ )
327
+
328
+ def register_components(
329
+ self,
330
+ functions=None,
331
+ workflows=None,
332
+ entities=None,
333
+ agents=None,
334
+ tools=None,
335
+ ):
336
+ """Register additional components after Worker initialization.
337
+
338
+ This method allows incremental registration of components after the Worker
339
+ has been created. Useful for conditional or dynamic component registration.
340
+
341
+ Args:
342
+ functions: List of functions decorated with @function
343
+ workflows: List of workflows decorated with @workflow
344
+ entities: List of entity classes
345
+ agents: List of agent instances
346
+ tools: List of tool instances
347
+
348
+ Example:
349
+ ```python
350
+ worker = Worker(service_name="my-service")
351
+
352
+ # Register conditionally
353
+ if feature_enabled:
354
+ worker.register_components(workflows=[advanced_workflow])
355
+ ```
356
+ """
357
+ if functions:
358
+ self._explicit_components['functions'].extend(functions)
359
+ logger.debug(f"Incrementally registered {len(functions)} functions")
360
+
361
+ if workflows:
362
+ self._explicit_components['workflows'].extend(workflows)
363
+ logger.debug(f"Incrementally registered {len(workflows)} workflows")
364
+
365
+ if entities:
366
+ self._explicit_components['entities'].extend(entities)
367
+ logger.debug(f"Incrementally registered {len(entities)} entities")
368
+
369
+ if agents:
370
+ self._explicit_components['agents'].extend(agents)
371
+ logger.debug(f"Incrementally registered {len(agents)} agents")
372
+
373
+ if tools:
374
+ self._explicit_components['tools'].extend(tools)
375
+ logger.debug(f"Incrementally registered {len(tools)} tools")
376
+
377
+ total = sum(len(v) for v in self._explicit_components.values())
378
+ logger.info(f"Total components now registered: {total}")
379
+
380
+ def _discover_source_paths(self, pyproject_path: Optional[str] = None) -> List[str]:
381
+ """Discover source paths from pyproject.toml.
382
+
383
+ Reads pyproject.toml to find package source directories using:
384
+ - Hatch: [tool.hatch.build.targets.wheel] packages
385
+ - Maturin: [tool.maturin] python-source
386
+ - Fallback: ["src"] if not found
387
+
388
+ Args:
389
+ pyproject_path: Path to pyproject.toml (default: current directory)
390
+
391
+ Returns:
392
+ List of directory paths to scan (e.g., ["src/agnt5_benchmark"])
393
+ """
394
+ from pathlib import Path
395
+
396
+ # Python 3.11+ has tomllib in stdlib
397
+ try:
398
+ import tomllib
399
+ except ImportError:
400
+ logger.error("tomllib not available (Python 3.11+ required for auto-registration)")
401
+ return ["src"]
402
+
403
+ # Determine pyproject.toml location
404
+ if pyproject_path:
405
+ pyproject_file = Path(pyproject_path)
406
+ else:
407
+ # Look in current directory
408
+ pyproject_file = Path.cwd() / "pyproject.toml"
409
+
410
+ if not pyproject_file.exists():
411
+ logger.warning(
412
+ f"pyproject.toml not found at {pyproject_file}, "
413
+ f"defaulting to 'src/' directory"
414
+ )
415
+ return ["src"]
416
+
417
+ # Parse pyproject.toml
418
+ try:
419
+ with open(pyproject_file, "rb") as f:
420
+ config = tomllib.load(f)
421
+ except Exception as e:
422
+ logger.error(f"Failed to parse pyproject.toml: {e}")
423
+ return ["src"]
424
+
425
+ # Extract source paths based on build system
426
+ source_paths = []
427
+
428
+ # Try Hatch configuration
429
+ if "tool" in config and "hatch" in config["tool"]:
430
+ hatch_config = config["tool"]["hatch"]
431
+ if "build" in hatch_config and "targets" in hatch_config["build"]:
432
+ wheel_config = hatch_config["build"]["targets"].get("wheel", {})
433
+ packages = wheel_config.get("packages", [])
434
+ source_paths.extend(packages)
435
+
436
+ # Try Maturin configuration
437
+ if not source_paths and "tool" in config and "maturin" in config["tool"]:
438
+ maturin_config = config["tool"]["maturin"]
439
+ python_source = maturin_config.get("python-source")
440
+ if python_source:
441
+ source_paths.append(python_source)
442
+
443
+ # Fallback to src/
444
+ if not source_paths:
445
+ logger.info("No source paths in pyproject.toml, defaulting to 'src/'")
446
+ source_paths = ["src"]
447
+
448
+ logger.info(f"Discovered source paths from pyproject.toml: {source_paths}")
449
+ return source_paths
450
+
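The two pyproject.toml shapes this method understands look roughly like the snippets below (package names are hypothetical); Hatch wheel packages take precedence, then Maturin's python-source, then the "src" fallback.

```python
import tomllib  # Python 3.11+

# Hypothetical Hatch layout: wheel packages listed explicitly.
hatch_toml = """
[tool.hatch.build.targets.wheel]
packages = ["src/my_service"]
"""

# Hypothetical Maturin layout: a single python-source directory.
maturin_toml = """
[tool.maturin]
python-source = "python"
"""

cfg = tomllib.loads(hatch_toml)
print(cfg["tool"]["hatch"]["build"]["targets"]["wheel"]["packages"])  # ['src/my_service']

cfg = tomllib.loads(maturin_toml)
print(cfg["tool"]["maturin"]["python-source"])  # python
```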
451
+ def _auto_discover_components(self, source_paths: List[str]) -> None:
452
+ """Auto-discover components by importing all Python files in source paths.
453
+
454
+ Args:
455
+ source_paths: List of directory paths to scan
456
+ """
457
+ import importlib.util
458
+ import sys
459
+ from pathlib import Path
460
+
461
+ logger.info(f"Auto-discovering components in paths: {source_paths}")
462
+
463
+ total_modules = 0
464
+
465
+ for source_path in source_paths:
466
+ path = Path(source_path)
467
+
468
+ if not path.exists():
469
+ logger.warning(f"Source path does not exist: {source_path}")
470
+ continue
471
+
472
+ # Recursively find all .py files
473
+ for py_file in path.rglob("*.py"):
474
+ # Skip __pycache__ and test files
475
+ if "__pycache__" in str(py_file) or py_file.name.startswith("test_"):
476
+ continue
477
+
478
+ # Convert path to module name
479
+ # e.g., src/agnt5_benchmark/functions.py -> agnt5_benchmark.functions
480
+ relative_path = py_file.relative_to(path.parent)
481
+ module_parts = list(relative_path.parts[:-1]) # Remove .py extension part
482
+ module_parts.append(relative_path.stem) # Add filename without .py
483
+ module_name = ".".join(module_parts)
484
+
485
+ # Import module (triggers decorators)
486
+ try:
487
+ if module_name in sys.modules:
488
+ logger.debug(f"Module already imported: {module_name}")
489
+ else:
490
+ spec = importlib.util.spec_from_file_location(module_name, py_file)
491
+ if spec and spec.loader:
492
+ module = importlib.util.module_from_spec(spec)
493
+ sys.modules[module_name] = module
494
+ spec.loader.exec_module(module)
495
+ logger.debug(f"Auto-imported: {module_name}")
496
+ total_modules += 1
497
+ except Exception as e:
498
+ logger.warning(f"Failed to import {module_name}: {e}")
499
+ # Capture SDK auto-registration failures
500
+ _sentry.capture_exception(
501
+ e,
502
+ context={
503
+ "service_name": self.service_name,
504
+ "module_name": module_name,
505
+ "source_path": str(py_file),
506
+ "error_location": "_auto_discover_components",
507
+ },
508
+ tags={
509
+ "sdk_error": "true",
510
+ "error_type": "auto_registration_failure",
511
+ },
512
+ level="warning",
513
+ )
514
+
515
+ logger.info(f"Auto-imported {total_modules} modules")
516
+
517
+ # Collect components from registries
518
+ from .agent import AgentRegistry
519
+ from .entity import EntityRegistry
520
+ from .tool import ToolRegistry
521
+
522
+ # Extract actual objects from registries
523
+ functions = [cfg.handler for cfg in FunctionRegistry.all().values()]
524
+ workflows = [cfg.handler for cfg in WorkflowRegistry.all().values()]
525
+ entities = [et.entity_class for et in EntityRegistry.all().values()]
526
+ agents = list(AgentRegistry.all().values())
527
+ tools = list(ToolRegistry.all().values())
528
+
529
+ self._explicit_components = {
530
+ 'functions': functions,
531
+ 'workflows': workflows,
532
+ 'entities': entities,
533
+ 'agents': agents,
534
+ 'tools': tools,
535
+ }
536
+
537
+ logger.info(
538
+ f"Auto-discovered components: "
539
+ f"{len(functions)} functions, "
540
+ f"{len(workflows)} workflows, "
541
+ f"{len(entities)} entities, "
542
+ f"{len(agents)} agents, "
543
+ f"{len(tools)} tools"
544
+ )
545
+
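A small sketch of the path-to-module-name conversion the scan relies on, matching the `src/agnt5_benchmark/functions.py` example in the comment above:

```python
from pathlib import Path

source_path = Path("src/agnt5_benchmark")             # a discovered source path
py_file = Path("src/agnt5_benchmark/functions.py")    # a file found by rglob("*.py")

relative = py_file.relative_to(source_path.parent)    # agnt5_benchmark/functions.py
module_name = ".".join(list(relative.parts[:-1]) + [relative.stem])
print(module_name)                                     # agnt5_benchmark.functions
```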
546
+ def _discover_components(self):
547
+ """Discover explicit components and auto-include their dependencies.
548
+
549
+ Hybrid approach:
550
+ - Explicitly registered workflows/agents are processed
551
+ - Functions called by workflows are auto-included (TODO: implement)
552
+ - Tools used by agents are auto-included
553
+ - Standalone functions/tools can be explicitly registered
554
+
555
+ Returns:
556
+ List of PyComponentInfo instances for all components
557
+ """
558
+ components = []
559
+ import json
560
+
561
+ # Import registries and types
562
+ from .entity import EntityRegistry
563
+ from .tool import ToolRegistry, Tool
564
+
565
+ # Track all components (explicit + auto-included)
566
+ all_functions = set(self._explicit_components['functions'])
567
+ all_tools = set(self._explicit_components['tools'])
568
+
569
+ # Auto-include agent tool dependencies
570
+ for agent in self._explicit_components['agents']:
571
+ if hasattr(agent, 'tools') and agent.tools:
572
+ # Agent.tools is a dict of {tool_name: tool_instance}
573
+ all_tools.update(agent.tools.values())
574
+ logger.debug(
575
+ f"Auto-included {len(agent.tools)} tools from agent '{agent.name}'"
576
+ )
577
+
578
+ # Log registration summary
579
+ explicit_func_count = len(self._explicit_components['functions'])
580
+ explicit_tool_count = len(self._explicit_components['tools'])
581
+ auto_func_count = len(all_functions) - explicit_func_count
582
+ auto_tool_count = len(all_tools) - explicit_tool_count
583
+
584
+ logger.info(
585
+ f"Component registration summary: "
586
+ f"{len(all_functions)} functions ({explicit_func_count} explicit, {auto_func_count} auto-included), "
587
+ f"{len(self._explicit_components['workflows'])} workflows, "
588
+ f"{len(self._explicit_components['entities'])} entities, "
589
+ f"{len(self._explicit_components['agents'])} agents, "
590
+ f"{len(all_tools)} tools ({explicit_tool_count} explicit, {auto_tool_count} auto-included)"
591
+ )
592
+
593
+ # Process functions (explicit + auto-included)
594
+ for func in all_functions:
595
+ config = FunctionRegistry.get(func.__name__)
596
+ if not config:
597
+ logger.warning(f"Function '{func.__name__}' not found in FunctionRegistry")
598
+ continue
599
+
600
+ input_schema_str = json.dumps(config.input_schema) if config.input_schema else None
601
+ output_schema_str = json.dumps(config.output_schema) if config.output_schema else None
602
+ metadata = config.metadata if config.metadata else {}
603
+
604
+ # Serialize retry and backoff policies
605
+ config_dict = {}
606
+ if config.retries:
607
+ config_dict["max_attempts"] = str(config.retries.max_attempts)
608
+ config_dict["initial_interval_ms"] = str(config.retries.initial_interval_ms)
609
+ config_dict["max_interval_ms"] = str(config.retries.max_interval_ms)
610
+
611
+ if config.backoff:
612
+ config_dict["backoff_type"] = config.backoff.type.value
613
+ config_dict["backoff_multiplier"] = str(config.backoff.multiplier)
614
+
615
+ component_info = self._PyComponentInfo(
616
+ name=config.name,
617
+ component_type="function",
618
+ metadata=metadata,
619
+ config=config_dict,
620
+ input_schema=input_schema_str,
621
+ output_schema=output_schema_str,
622
+ definition=None,
623
+ )
624
+ components.append(component_info)
625
+
626
+ # Process workflows
627
+ for workflow in self._explicit_components['workflows']:
628
+ config = WorkflowRegistry.get(workflow.__name__)
629
+ if not config:
630
+ logger.warning(f"Workflow '{workflow.__name__}' not found in WorkflowRegistry")
631
+ continue
632
+
633
+ input_schema_str = json.dumps(config.input_schema) if config.input_schema else None
634
+ output_schema_str = json.dumps(config.output_schema) if config.output_schema else None
635
+ metadata = config.metadata if config.metadata else {}
636
+
637
+ component_info = self._PyComponentInfo(
638
+ name=config.name,
639
+ component_type="workflow",
640
+ metadata=metadata,
641
+ config={},
642
+ input_schema=input_schema_str,
643
+ output_schema=output_schema_str,
644
+ definition=None,
645
+ )
646
+ components.append(component_info)
647
+
648
+ # Process entities
649
+ for entity_class in self._explicit_components['entities']:
650
+ entity_type = EntityRegistry.get(entity_class.__name__)
651
+ if not entity_type:
652
+ logger.warning(f"Entity '{entity_class.__name__}' not found in EntityRegistry")
653
+ continue
654
+
655
+ # Build complete entity definition with state schema and method schemas
656
+ entity_definition = entity_type.build_entity_definition()
657
+ definition_str = json.dumps(entity_definition)
658
+
659
+ # Keep minimal metadata for backward compatibility
660
+ metadata_dict = {
661
+ "methods": json.dumps(list(entity_type._method_schemas.keys())),
662
+ }
663
+
664
+ component_info = self._PyComponentInfo(
665
+ name=entity_type.name,
666
+ component_type="entity",
667
+ metadata=metadata_dict,
668
+ config={},
669
+ input_schema=None, # Entities don't have single input/output schemas
670
+ output_schema=None,
671
+ definition=definition_str, # Complete entity definition with state and methods
672
+ )
673
+ components.append(component_info)
674
+ logger.debug(f"Registered entity '{entity_type.name}' with definition")
675
+
676
+ # Process agents
677
+ from .agent import AgentRegistry
678
+
679
+ for agent in self._explicit_components['agents']:
680
+ # Register agent in AgentRegistry for execution lookup
681
+ AgentRegistry.register(agent)
682
+ logger.debug(f"Registered agent '{agent.name}' in AgentRegistry for execution")
683
+
684
+ input_schema_str = json.dumps(agent.input_schema) if hasattr(agent, 'input_schema') and agent.input_schema else None
685
+ output_schema_str = json.dumps(agent.output_schema) if hasattr(agent, 'output_schema') and agent.output_schema else None
686
+
687
+ metadata_dict = agent.metadata if hasattr(agent, 'metadata') else {}
688
+ if hasattr(agent, 'tools'):
689
+ metadata_dict["tools"] = json.dumps(list(agent.tools.keys()))
690
+
691
+ component_info = self._PyComponentInfo(
692
+ name=agent.name,
693
+ component_type="agent",
694
+ metadata=metadata_dict,
695
+ config={},
696
+ input_schema=input_schema_str,
697
+ output_schema=output_schema_str,
698
+ definition=None,
699
+ )
700
+ components.append(component_info)
701
+
702
+ # Process tools (explicit + auto-included)
703
+ for tool in all_tools:
704
+ # Validate that item is a Tool instance
705
+ if not isinstance(tool, Tool):
706
+ logger.warning(
707
+ f"Skipping non-Tool item in tools collection: {type(tool).__name__}. "
708
+ f"Use @tool decorator or pass Tool instances."
709
+ )
710
+ continue
711
+
712
+ input_schema_str = json.dumps(tool.input_schema) if hasattr(tool, 'input_schema') and tool.input_schema else None
713
+ output_schema_str = json.dumps(tool.output_schema) if hasattr(tool, 'output_schema') and tool.output_schema else None
714
+
715
+ component_info = self._PyComponentInfo(
716
+ name=tool.name,
717
+ component_type="tool",
718
+ metadata={},
719
+ config={},
720
+ input_schema=input_schema_str,
721
+ output_schema=output_schema_str,
722
+ definition=None,
723
+ )
724
+ components.append(component_info)
725
+
726
+ logger.info(f"Discovered {len(components)} total components")
727
+ return components
728
+
729
+ def _create_message_handler(self):
730
+ """Create the message handler that will be called by Rust worker."""
731
+
732
+ def handle_message(request):
733
+ """Handle incoming execution requests - returns coroutine for Rust to await."""
734
+ # Extract request details
735
+ component_name = request.component_name
736
+ component_type = request.component_type
737
+ input_data = request.input_data
738
+
739
+ logger.debug(
740
+ f"Handling {component_type} request: {component_name}, input size: {len(input_data)} bytes"
741
+ )
742
+
743
+ # Import all registries
744
+ from .tool import ToolRegistry
745
+ from .entity import EntityRegistry
746
+ from .agent import AgentRegistry
747
+
748
+ # Route based on component type and return coroutines
749
+ if component_type == "tool":
750
+ tool = ToolRegistry.get(component_name)
751
+ if tool:
752
+ logger.debug(f"Found tool: {component_name}")
753
+ # Return coroutine, don't await it
754
+ return self._execute_tool(tool, input_data, request)
755
+
756
+ elif component_type == "entity":
757
+ entity_type = EntityRegistry.get(component_name)
758
+ if entity_type:
759
+ logger.debug(f"Found entity: {component_name}")
760
+ # Return coroutine, don't await it
761
+ return self._execute_entity(entity_type, input_data, request)
762
+
763
+ elif component_type == "agent":
764
+ agent = AgentRegistry.get(component_name)
765
+ if agent:
766
+ logger.debug(f"Found agent: {component_name}")
767
+ # Return coroutine, don't await it
768
+ return self._execute_agent(agent, input_data, request)
769
+
770
+ elif component_type == "workflow":
771
+ workflow_config = WorkflowRegistry.get(component_name)
772
+ if workflow_config:
773
+ logger.debug(f"Found workflow: {component_name}")
774
+ # Return coroutine, don't await it
775
+ return self._execute_workflow(workflow_config, input_data, request)
776
+
777
+ elif component_type == "function":
778
+ function_config = FunctionRegistry.get(component_name)
779
+ if function_config:
780
+ # Return coroutine, don't await it
781
+ return self._execute_function(function_config, input_data, request)
782
+
783
+ # Not found - need to return an async error response
784
+ error_msg = f"Component '{component_name}' of type '{component_type}' not found"
785
+ logger.error(error_msg)
786
+
787
+ # Create async wrapper for error response
788
+ async def error_response():
789
+ return self._create_error_response(request, error_msg)
790
+
791
+ return error_response()
792
+
793
+ return handle_message
794
+
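The key contract here is that `handle_message` stays synchronous and returns an un-awaited coroutine, leaving the Rust worker to drive it. A stripped-down sketch of that shape (the registry lookup and request fields are simplified assumptions):

```python
import asyncio

async def _run(name: str) -> str:
    return f"executed {name}"

def handle(component_type: str, component_name: str):
    # Synchronous dispatch: return a coroutine without awaiting it,
    # so the caller decides when (and on which loop) it runs.
    if component_type == "function":
        return _run(component_name)

    async def error_response() -> str:
        return f"component '{component_name}' not found"

    return error_response()

print(asyncio.run(handle("function", "greet_user")))   # executed greet_user
print(asyncio.run(handle("entity", "ShoppingCart")))   # component 'ShoppingCart' not found
```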
795
+ def _extract_critical_metadata(self, request) -> Dict[str, str]:
796
+ """
797
+ Extract critical metadata from request that MUST be propagated to response.
798
+
799
+ This ensures journal events are written to the correct tenant partition
800
+ and can be properly replayed. Missing tenant_id causes catastrophic
801
+ event sourcing corruption where events are split across partitions.
802
+
803
+ Returns:
804
+ Dict[str, str]: Metadata with all values normalized to strings for Rust FFI
805
+ """
806
+ metadata = {}
807
+ if hasattr(request, 'metadata') and request.metadata:
808
+ # CRITICAL: Propagate tenant_id to prevent journal corruption
809
+ # Convert to string immediately to ensure Rust FFI compatibility
810
+ if "tenant_id" in request.metadata:
811
+ metadata["tenant_id"] = str(request.metadata["tenant_id"])
812
+ if "deployment_id" in request.metadata:
813
+ metadata["deployment_id"] = str(request.metadata["deployment_id"])
814
+
815
+ # CRITICAL: Normalize all metadata values to strings for Rust FFI (PyO3)
816
+ # PyO3 expects HashMap<String, String> and will fail with bool/int values
817
+ return _normalize_metadata(metadata)
818
+
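Combined with `_normalize_metadata`, the effect is that only `tenant_id` and `deployment_id` survive into the response, always as strings. A quick sketch with a stand-in request object, assuming `_normalize_metadata` above is in scope (the values are made up):

```python
from types import SimpleNamespace

# Hypothetical request whose metadata mixes types, as the platform may send.
request = SimpleNamespace(metadata={"tenant_id": 42, "deployment_id": "dep-1", "retry": True})

extracted = {k: str(request.metadata[k])
             for k in ("tenant_id", "deployment_id")
             if k in request.metadata}
print(_normalize_metadata(extracted))   # {'tenant_id': '42', 'deployment_id': 'dep-1'}
```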
819
+ async def _execute_function(self, config, input_data: bytes, request):
820
+ """Execute a function handler (supports both regular and streaming functions)."""
821
+ import json
822
+ import inspect
823
+ import time
824
+ from .context import Context
825
+ from ._core import PyExecuteComponentResponse
826
+
827
+ exec_start = time.time()
+ token = None  # context token; set after FunctionContext creation, reset in finally
828
+
829
+ try:
830
+ # Parse input data
831
+ input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
832
+
833
+ # Store trace metadata in contextvar for LM calls to access
834
+ # The Rust worker injects traceparent into request.metadata for trace propagation
835
+ if hasattr(request, 'metadata') and request.metadata:
836
+ _trace_metadata.set(dict(request.metadata))
837
+ logger.debug(f"Trace metadata stored: traceparent={request.metadata.get('traceparent', 'N/A')}")
838
+
839
+ # Extract attempt number from platform request (if provided)
840
+ platform_attempt = getattr(request, 'attempt', 0)
841
+
842
+ # Extract streaming context for real-time SSE log delivery
843
+ is_streaming = getattr(request, 'is_streaming', False)
844
+ tenant_id = request.metadata.get('tenant_id') if hasattr(request, 'metadata') else None
845
+
846
+ # Create FunctionContext with attempt number for retry tracking
847
+ # - If platform_attempt > 0: Platform is orchestrating retries
848
+ # - If platform_attempt == 0: First attempt (or no retry config)
849
+ from .function import FunctionContext
850
+ ctx = FunctionContext(
851
+ run_id=f"{self.service_name}:{config.name}",
852
+ attempt=platform_attempt,
853
+ runtime_context=request.runtime_context,
854
+ retry_policy=config.retries,
855
+ is_streaming=is_streaming,
856
+ tenant_id=tenant_id,
857
+ )
858
+
859
+ # Set context in contextvar so get_current_context() and error handlers can access it
860
+ from .context import set_current_context, _current_context
861
+ token = set_current_context(ctx)
862
+
863
+ # Execute function directly - Rust bridge handles tracing
864
+ # Note: Removed Python-level span creation to avoid duplicate spans.
865
+ # The Rust worker bridge (sdk-python/rust-src/worker.rs:413-659) already
866
+ # creates a comprehensive OpenTelemetry span with all necessary attributes.
867
+ # See DUPLICATE_SPANS_FIX.md for details.
868
+ #
869
+ # Note on retry handling:
870
+ # - If platform_attempt > 0: Platform is orchestrating retries, execute once
871
+ # - If platform_attempt == 0: Local retry loop in decorator wrapper handles retries
872
+ if input_dict:
873
+ result = config.handler(ctx, **input_dict)
874
+ else:
875
+ result = config.handler(ctx)
876
+
877
+ # Note: Removed flush_telemetry_py() call here - it was causing 2-second blocking delay!
878
+ # The batch span processor handles flushing automatically with 5s timeout
879
+ # We only need to flush on worker shutdown, not after each function execution
880
+
881
+ # Check if result is an async generator (streaming function)
882
+ if inspect.isasyncgen(result):
883
+ # Streaming function - queue deltas immediately via Rust for real-time delivery
884
+ # Instead of collecting into a list and returning, we send each chunk
885
+ # as it's yielded via the delta queue with 10ms flush interval
886
+ from .events import Event
887
+
888
+ sequence = 0
889
+ has_typed_events = False # Track if user yields Event objects
890
+ first_chunk = True
891
+
892
+ # Extract metadata for delta queue (must be Dict[str, str] for Rust FFI)
893
+ metadata = _normalize_metadata(self._extract_critical_metadata(request))
894
+
895
+ async for chunk in result:
896
+ # Check if chunk is a typed Event
897
+ if isinstance(chunk, Event):
898
+ has_typed_events = True
899
+ # Use the event's fields directly
900
+ event_data = chunk.to_response_fields()
901
+ output_data = event_data.get("output_data", b"")
902
+ # Convert bytes to string for queue_delta
903
+ output_str = output_data.decode("utf-8") if isinstance(output_data, bytes) else str(output_data or "{}")
904
+ self._rust_worker.queue_delta(
905
+ invocation_id=request.invocation_id,
906
+ event_type=event_data.get("event_type", ""),
907
+ output_data=output_str,
908
+ content_index=event_data.get("content_index", 0),
909
+ sequence=sequence,
910
+ metadata=metadata,
911
+ )
912
+ else:
913
+ # Regular chunk - wrap with output events
914
+ if first_chunk:
915
+ # Emit output.start event before first chunk
916
+ self._rust_worker.queue_delta(
917
+ invocation_id=request.invocation_id,
918
+ event_type="output.start",
919
+ output_data="{}",
920
+ content_index=0,
921
+ sequence=sequence,
922
+ metadata=metadata,
923
+ )
924
+ sequence += 1
925
+ first_chunk = False
926
+
927
+ # Serialize chunk (using _serialize_result to handle Pydantic models, etc.)
928
+ chunk_data = _serialize_result(chunk)
929
+ # Convert bytes to string for queue_delta
930
+ chunk_str = chunk_data.decode("utf-8") if isinstance(chunk_data, bytes) else str(chunk_data)
931
+
932
+ # Emit output.delta event
933
+ self._rust_worker.queue_delta(
934
+ invocation_id=request.invocation_id,
935
+ event_type="output.delta",
936
+ output_data=chunk_str,
937
+ content_index=0,
938
+ sequence=sequence,
939
+ metadata=metadata,
940
+ )
941
+ sequence += 1
942
+
943
+ # Emit closing events if we had regular chunks
944
+ if not has_typed_events and not first_chunk:
945
+ # Emit output.stop event
946
+ self._rust_worker.queue_delta(
947
+ invocation_id=request.invocation_id,
948
+ event_type="output.stop",
949
+ output_data="{}",
950
+ content_index=0,
951
+ sequence=sequence,
952
+ metadata=metadata,
953
+ )
954
+ sequence += 1
955
+
956
+ # Always emit run.completed event
957
+ self._rust_worker.queue_delta(
958
+ invocation_id=request.invocation_id,
959
+ event_type="run.completed",
960
+ output_data="{}",
961
+ content_index=0,
962
+ sequence=sequence,
963
+ metadata=metadata,
964
+ )
965
+
966
+ logger.debug(f"Streaming function queued {sequence + 1} deltas for real-time delivery")
967
+ # Return None to signal that streaming was handled via delta queue
968
+ return None
969
+ else:
970
+ # Regular function - await and return single response
971
+ if inspect.iscoroutine(result):
972
+ result = await result
973
+
974
+ # Serialize result
975
+ output_data = _serialize_result(result)
976
+
977
+ # Extract critical metadata for journal event correlation
978
+ response_metadata = self._extract_critical_metadata(request)
979
+
980
+ # Emit run.completed event with output
981
+ return PyExecuteComponentResponse(
982
+ invocation_id=request.invocation_id,
983
+ success=True,
984
+ output_data=output_data,
985
+ state_update=None,
986
+ error_message=None,
987
+ metadata=response_metadata if response_metadata else None,
988
+ event_type="run.completed",
989
+ content_index=0,
990
+ sequence=0,
991
+ attempt=platform_attempt,
992
+ )
993
+
994
+ except Exception as e:
995
+ # Include exception type for better error messages
996
+ error_msg = f"{type(e).__name__}: {str(e)}"
997
+
998
+ # Capture full stack trace for telemetry
999
+ import traceback
1000
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
1001
+
1002
+ # Log with full traceback using ctx.logger to ensure run_id correlation
1003
+ from .context import get_current_context
1004
+ current_ctx = get_current_context()
1005
+ error_logger = current_ctx.logger if current_ctx else logger
1006
+ error_logger.error(f"Function execution failed: {error_msg}", exc_info=True)
1007
+
1008
+ # Store stack trace in metadata for observability
1009
+ metadata = {
1010
+ "error_type": type(e).__name__,
1011
+ "stack_trace": stack_trace,
1012
+ "error": True, # Boolean flag for error detection
1013
+ }
1014
+
1015
+ # CRITICAL: Extract critical metadata (including tenant_id) for journal event correlation
1016
+ # This ensures run.failed events are properly emitted by Worker Coordinator
1017
+ critical_metadata = self._extract_critical_metadata(request)
1018
+ metadata.update(critical_metadata)
1019
+
1020
+ # CRITICAL: Normalize metadata to ensure all values are strings (Rust FFI requirement)
1021
+ # PyO3 expects HashMap<String, String>, but we may have booleans or other types
1022
+ normalized_metadata = _normalize_metadata(metadata)
1023
+
1024
+ # Emit run.failed event
1025
+ return PyExecuteComponentResponse(
1026
+ invocation_id=request.invocation_id,
1027
+ success=False,
1028
+ output_data=b"",
1029
+ state_update=None,
1030
+ error_message=error_msg,
1031
+ metadata=normalized_metadata,
1032
+ event_type="run.failed",
1033
+ content_index=0,
1034
+ sequence=0,
1035
+ attempt=getattr(request, 'attempt', 0),
1036
+ )
1037
+
1038
+ finally:
1039
+ # Always reset context to prevent leakage between executions
1040
+ if token is not None:
+ _current_context.reset(token)
1041
+
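For a streaming function that yields two plain chunks (no typed `Event` objects), the deltas queued above would follow this order; the chunk values are illustrative.

```python
# (event_type, output_data, sequence) as queued via queue_delta
expected = [
    ("output.start",  "{}",          0),
    ("output.delta",  '"chunk-1"',   1),   # _serialize_result("chunk-1"), decoded to str
    ("output.delta",  '"chunk-2"',   2),
    ("output.stop",   "{}",          3),
    ("run.completed", "{}",          4),
]
for event_type, _, sequence in expected:
    print(sequence, event_type)
```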
1042
+ async def _execute_workflow(self, config, input_data: bytes, request):
1043
+ """Execute a workflow handler with automatic replay support."""
1044
+ import json
1045
+ from .workflow import WorkflowEntity, WorkflowContext
1046
+ from .entity import _get_state_adapter, _entity_state_adapter_ctx
1047
+ from .exceptions import WaitingForUserInputException
1048
+ from ._core import PyExecuteComponentResponse
1049
+
1050
+ # Set entity state adapter in context so workflows can use Entities
1051
+ _entity_state_adapter_ctx.set(self._entity_state_adapter)
1052
+
1053
+ try:
1054
+ # Parse input data
1055
+ input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
1056
+
1057
+ # Extract or generate session_id for multi-turn conversation support (for chat workflows)
1058
+ # If session_id is provided, the workflow can maintain conversation context
1059
+ session_id = input_dict.get("session_id")
1060
+
1061
+ if not session_id:
1062
+ session_id = str(uuid.uuid4())
1063
+ logger.info(f"Created new workflow session: {session_id}")
1064
+ else:
1065
+ logger.info(f"Using existing workflow session: {session_id}")
1066
+
1067
+ # Parse replay data from request metadata for crash recovery
1068
+ completed_steps = {}
1069
+ initial_state = {}
1070
+ user_response = None
1071
+
1072
+ if hasattr(request, 'metadata') and request.metadata:
1073
+ # Parse completed steps for replay
1074
+ if "completed_steps" in request.metadata:
1075
+ completed_steps_json = request.metadata["completed_steps"]
1076
+ if completed_steps_json:
1077
+ try:
1078
+ completed_steps = json.loads(completed_steps_json)
1079
+ logger.info(f"🔄 Replaying workflow with {len(completed_steps)} cached steps")
1080
+ except json.JSONDecodeError:
1081
+ logger.warning("Failed to parse completed_steps from metadata")
1082
+
1083
+ # Parse initial workflow state for replay
1084
+ if "workflow_state" in request.metadata:
1085
+ workflow_state_json = request.metadata["workflow_state"]
1086
+ if workflow_state_json:
1087
+ try:
1088
+ initial_state = json.loads(workflow_state_json)
1089
+ logger.info(f"🔄 Loaded workflow state: {len(initial_state)} keys")
1090
+ except json.JSONDecodeError:
1091
+ logger.warning("Failed to parse workflow_state from metadata")
1092
+
1093
+ # Check for user response (workflow resume after pause)
1094
+ if "user_response" in request.metadata:
1095
+ user_response = request.metadata["user_response"]
1096
+ logger.info(f"▶️ Resuming workflow with user response: {user_response}")
1097
+
1098
+ # NEW: Check for agent resume (agent-level HITL)
1099
+ agent_context = None
1100
+ if hasattr(request, 'metadata') and request.metadata:
1101
+ if "agent_context" in request.metadata:
1102
+ agent_context_json = request.metadata["agent_context"]
1103
+ try:
1104
+ agent_context = json.loads(agent_context_json)
1105
+ agent_name = agent_context.get("agent_name", "unknown")
1106
+ iteration = agent_context.get("iteration", 0)
1107
+ logger.info(
1108
+ f"▶️ Resuming agent '{agent_name}' from iteration {iteration} "
1109
+ f"with user response: {user_response}"
1110
+ )
1111
+ except json.JSONDecodeError:
1112
+ logger.warning("Failed to parse agent_context from metadata")
1113
+ agent_context = None
1114
+
1115
+ # Extract session_id and user_id from request for memory scoping
1116
+ # Do this FIRST so we can pass to WorkflowEntity constructor
1117
+ session_id = request.session_id if hasattr(request, 'session_id') and request.session_id else request.invocation_id
1118
+ user_id = request.user_id if hasattr(request, 'user_id') and request.user_id else None
1119
+
1120
+ # Extract streaming context for real-time SSE log delivery
1121
+ is_streaming = getattr(request, 'is_streaming', False)
1122
+ tenant_id = request.metadata.get('tenant_id') if hasattr(request, 'metadata') else None
1123
+
1124
+ # Create WorkflowEntity for state management with memory scoping
1125
+ # Entity key will be scoped based on priority: user_id > session_id > run_id
1126
+ # For session scope, include component_name to enable listing sessions by workflow
1127
+ component_name = getattr(request, 'component_name', None)
1128
+ workflow_entity = WorkflowEntity(
1129
+ run_id=request.invocation_id,
1130
+ session_id=session_id,
1131
+ user_id=user_id,
1132
+ component_name=component_name,
1133
+ )
1134
+
1135
+ # Load replay data into entity if provided
1136
+ if completed_steps:
1137
+ workflow_entity._completed_steps = completed_steps
1138
+ logger.debug(f"Loaded {len(completed_steps)} completed steps into workflow entity")
1139
+
1140
+ # Inject user response if resuming from pause
1141
+ if user_response:
1142
+ workflow_entity.inject_user_response(user_response)
1143
+ logger.debug(f"Injected user response into workflow entity")
1144
+
1145
+ if initial_state:
1146
+ # Load initial state into entity's state adapter
1147
+ state_adapter = _get_state_adapter()
1148
+ if hasattr(state_adapter, '_standalone_states'):
1149
+ # Standalone mode - set state directly
1150
+ state_adapter._standalone_states[workflow_entity._state_key] = initial_state
1151
+ logger.debug(f"Loaded initial state with {len(initial_state)} keys into workflow entity (standalone)")
1152
+ else:
1153
+ # Production mode - state is managed by Rust core
1154
+ logger.debug(f"Initial state will be loaded from platform (production mode)")
1155
+
1156
+ # Create checkpoint callback for real-time streaming
1157
+ def checkpoint_callback(checkpoint: dict) -> None:
1158
+ """Send checkpoint to Rust worker queue."""
1159
+ try:
1160
+ # Extract critical metadata for checkpoint routing
1161
+ metadata = self._extract_critical_metadata(request)
1162
+
1163
+ # DEBUG: Log metadata types for troubleshooting PyO3 conversion errors
1164
+ logger.debug(f"Checkpoint metadata types: {[(k, type(v).__name__) for k, v in metadata.items()]}")
1165
+
1166
+ # Get source timestamp (use from checkpoint if provided, otherwise generate now)
1167
+ source_timestamp_ns = checkpoint.get("source_timestamp_ns", time.time_ns())
1168
+
1169
+ # Queue checkpoint via Rust FFI
1170
+ self._rust_worker.queue_workflow_checkpoint(
1171
+ invocation_id=request.invocation_id,
1172
+ checkpoint_type=checkpoint["checkpoint_type"],
1173
+ checkpoint_data=_json.dumps(checkpoint["checkpoint_data"], cls=_ResultEncoder),
1174
+ sequence_number=checkpoint["sequence_number"],
1175
+ metadata=metadata,
1176
+ source_timestamp_ns=source_timestamp_ns,
1177
+ )
1178
+ logger.debug(
1179
+ f"Queued checkpoint: type={checkpoint['checkpoint_type']} "
1180
+ f"seq={checkpoint['sequence_number']}"
1181
+ )
1182
+ except Exception as e:
1183
+ # Checkpoints are critical for durability - failing to persist them
1184
+ # means we cannot guarantee replay/recovery. Re-raise to fail the workflow.
1185
+ logger.error(f"Failed to queue checkpoint: {e}", exc_info=True)
1186
+ logger.error(f"Checkpoint metadata: {metadata}")
1187
+ logger.error(f"Checkpoint type: {checkpoint.get('checkpoint_type')}")
1188
+ raise RuntimeError(
1189
+ f"Failed to queue checkpoint '{checkpoint.get('checkpoint_type')}': {e}. "
1190
+ f"Workflow cannot continue without durable checkpoints."
1191
+ ) from e
1192
+
1193
+ # Create delta callback for forwarding streaming events from nested agents/functions
1194
+ # This is used by WorkflowContext._consume_streaming_result to forward events
1195
+ delta_metadata = _normalize_metadata(self._extract_critical_metadata(request))
1196
+
1197
+ def delta_callback(event_type: str, output_data: str, content_index: int, sequence: int) -> None:
1198
+ """Forward streaming delta event from nested component."""
1199
+ try:
1200
+ self._rust_worker.queue_delta(
1201
+ invocation_id=request.invocation_id,
1202
+ event_type=event_type,
1203
+ output_data=output_data,
1204
+ content_index=content_index,
1205
+ sequence=sequence,
1206
+ metadata=delta_metadata,
1207
+ )
1208
+ logger.debug(f"Forwarded delta: type={event_type} seq={sequence}")
1209
+ except Exception as e:
1210
+ # Delta forwarding is best-effort - log but don't fail the workflow
1211
+ logger.warning(f"Failed to forward delta event: {e}")
1212
+
1213
+ # Create WorkflowContext with entity, runtime_context, checkpoint callback, and checkpoint client
1214
+ ctx = WorkflowContext(
1215
+ workflow_entity=workflow_entity,
1216
+ run_id=request.invocation_id, # Use unique invocation_id for this execution
1217
+ session_id=session_id, # Session for multi-turn conversations
1218
+ user_id=user_id, # User for long-term memory
1219
+ runtime_context=request.runtime_context,
1220
+ checkpoint_callback=checkpoint_callback,
1221
+ checkpoint_client=self._checkpoint_client, # Phase 3: platform-side memoization
1222
+ is_streaming=is_streaming, # For real-time SSE log delivery
1223
+ tenant_id=tenant_id, # For multi-tenant deployments
1224
+ delta_callback=delta_callback, # For forwarding streaming events from nested components
1225
+ )
1226
+
1227
+ # NEW: Populate agent resume info if this is an agent HITL resume
1228
+ if agent_context and user_response:
1229
+ ctx._agent_resume_info = {
1230
+ "agent_name": agent_context["agent_name"],
1231
+ "agent_context": agent_context,
1232
+ "user_response": user_response,
1233
+ }
1234
+ logger.debug(
1235
+ f"Set agent resume info for '{agent_context['agent_name']}' "
1236
+ f"in workflow context"
1237
+ )
1238
+
1239
+ # Execute workflow directly - Rust bridge handles tracing
1240
+ # Note: Removed Python-level span creation to avoid duplicate spans.
1241
+ # The Rust worker bridge creates comprehensive OpenTelemetry spans.
1242
+ # See DUPLICATE_SPANS_FIX.md for details.
1243
+
1244
+ # CRITICAL: Set context in contextvar so LM/Agent/Tool calls can access it
1245
+ from .context import set_current_context
1246
+ import time as _time
1247
+ token = set_current_context(ctx)
1248
+ workflow_start_time = _time.time()
1249
+ try:
1250
+ # Emit workflow.started checkpoint
1251
+ ctx._send_checkpoint("workflow.started", {
1252
+ "workflow.name": config.name,
1253
+ "run_id": request.invocation_id,
1254
+ "session_id": session_id,
1255
+ "is_replay": bool(completed_steps),
1256
+ })
1257
+
1258
+ if input_dict:
1259
+ result = await config.handler(ctx, **input_dict)
1260
+ else:
1261
+ result = await config.handler(ctx)
1262
+
1263
+ # Serialize result BEFORE emitting workflow.completed
1264
+ # This ensures serialization errors trigger workflow.failed, not run.failed
1265
+ output_data = _serialize_result(result)
1266
+
1267
+ # Emit workflow.completed checkpoint
1268
+ workflow_duration_ms = int((_time.time() - workflow_start_time) * 1000)
1269
+ ctx._send_checkpoint("workflow.completed", {
1270
+ "workflow.name": config.name,
1271
+ "run_id": request.invocation_id,
1272
+ "duration_ms": workflow_duration_ms,
1273
+ "steps_count": len(ctx._workflow_entity._step_events),
1274
+ })
1275
+
1276
+ # Note: Workflow entity persistence is handled by the @workflow decorator wrapper
1277
+ # which persists before returning. No need to persist here.
1278
+ except Exception as workflow_error:
1279
+ # Emit workflow.failed checkpoint
1280
+ workflow_duration_ms = int((_time.time() - workflow_start_time) * 1000)
1281
+ ctx._send_checkpoint("workflow.failed", {
1282
+ "workflow.name": config.name,
1283
+ "run_id": request.invocation_id,
1284
+ "duration_ms": workflow_duration_ms,
1285
+ "error": str(workflow_error),
1286
+ "error_type": type(workflow_error).__name__,
1287
+ })
1288
+ raise
1289
+ finally:
1290
+ # Always reset context to prevent leakage
1291
+ from .context import _current_context
1292
+ _current_context.reset(token)
1293
+
1294
+ # Note: Removed flush_telemetry_py() call here - it was causing 2-second blocking delay!
1295
+ # The batch span processor handles flushing automatically with 5s timeout
1296
+
1297
+ # Collect workflow execution metadata for durability
1298
+ metadata = {}
1299
+
1300
+ # CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
1301
+ # Missing tenant_id causes events to be written to wrong partition
1302
+ critical_metadata = self._extract_critical_metadata(request)
1303
+ metadata.update(critical_metadata)
1304
+
1305
+ # Add step events to metadata (for workflow durability)
1306
+ # Access _step_events from the workflow entity, not the context
1307
+ step_events = ctx._workflow_entity._step_events
1308
+ if step_events:
1309
+ metadata["step_events"] = json.dumps(step_events)
1310
+ logger.debug(f"Workflow has {len(step_events)} recorded steps")
1311
+
1312
+ # Add final state snapshot to metadata (if state was used)
1313
+ # Check if _state was initialized without triggering property getter
1314
+ if hasattr(ctx, '_workflow_entity') and ctx._workflow_entity._state is not None:
1315
+ if ctx._workflow_entity._state.has_changes():
1316
+ state_snapshot = ctx._workflow_entity._state.get_state_snapshot()
1317
+ metadata["workflow_state"] = json.dumps(state_snapshot)
1318
+ logger.debug(f"Workflow state snapshot: {state_snapshot}")
1319
+
1320
+ # AUDIT TRAIL: Serialize complete state change history for replay and debugging
1321
+ # This captures all intermediate state mutations, not just final snapshot
1322
+ state_changes = ctx._workflow_entity._state_changes
1323
+ logger.info(f"🔍 DEBUG: _state_changes list has {len(state_changes)} entries")
1324
+ if state_changes:
1325
+ metadata["state_changes"] = json.dumps(state_changes)
1326
+ logger.info(f"✅ Serialized {len(state_changes)} state changes to metadata")
1327
+ else:
1328
+ logger.warning("⚠️ _state_changes list is empty - no state change history captured")
1329
+
1330
+ # CRITICAL: Persist workflow entity state to platform
1331
+ # This stores the WorkflowEntity as a first-class entity with proper versioning
1332
+ try:
1333
+ logger.info(f"🔍 DEBUG: About to call _persist_state() for run {request.invocation_id}")
1334
+ await ctx._workflow_entity._persist_state()
1335
+ logger.info(f"✅ Successfully persisted WorkflowEntity state for run {request.invocation_id}")
1336
+ except Exception as persist_error:
1337
+ logger.error(f"❌ Failed to persist WorkflowEntity state (non-fatal): {persist_error}", exc_info=True)
1338
+ # Continue anyway - persistence failure shouldn't fail the workflow
1339
+
1340
+ logger.info(f"Workflow completed successfully with {len(step_events)} steps")
1341
+
1342
+ # Add session_id to metadata for multi-turn conversation support
1343
+ metadata["session_id"] = session_id
1344
+
1345
+ # CRITICAL: Flush all buffered checkpoints before returning response
1346
+ # This ensures checkpoints arrive at platform BEFORE run.completed event
1347
+ try:
1348
+ flushed_count = self._rust_worker.flush_workflow_checkpoints()
1349
+ if flushed_count > 0:
1350
+ logger.info(f"✅ Flushed {flushed_count} checkpoints before completion")
1351
+ except Exception as flush_error:
1352
+ logger.error(f"Failed to flush checkpoints: {flush_error}", exc_info=True)
1353
+ # Continue anyway - checkpoint flushing is best-effort
1354
+
1355
+ return PyExecuteComponentResponse(
1356
+ invocation_id=request.invocation_id,
1357
+ success=True,
1358
+ output_data=output_data,
1359
+ state_update=None, # Not used for workflows (use metadata instead)
1360
+ error_message=None,
1361
+ metadata=metadata if metadata else None, # Include step events + state + session_id
1362
+ event_type="run.completed",
1363
+ content_index=0,
1364
+ sequence=0,
1365
+ attempt=getattr(request, 'attempt', 0),
1366
+ )
1367
+
1368
+ except WaitingForUserInputException as e:
1369
+ # Workflow or agent paused for user input
1370
+ pause_type = "agent" if e.agent_context else "workflow"
1371
+ logger.info(f"⏸️ {pause_type.capitalize()} paused waiting for user input: {e.question}")
1372
+
1373
+ # Collect metadata for pause state
1374
+ # Note: All metadata values must be strings for Rust FFI
1375
+ pause_metadata = {
1376
+ "status": "awaiting_user_input",
1377
+ "question": e.question,
1378
+ "input_type": e.input_type,
1379
+ "pause_type": pause_type, # NEW: Indicates workflow vs agent pause
1380
+ }
1381
+
1382
+ # CRITICAL: Propagate tenant_id even when pausing
1383
+ critical_metadata = self._extract_critical_metadata(request)
1384
+ pause_metadata.update(critical_metadata)
1385
+
1386
+ # Add optional fields only if they exist
1387
+ if e.options:
1388
+ pause_metadata["options"] = json.dumps(e.options)
1389
+ if e.checkpoint_state:
1390
+ pause_metadata["checkpoint_state"] = json.dumps(e.checkpoint_state)
1391
+ if session_id:
1392
+ pause_metadata["session_id"] = session_id
1393
+
1394
+ # NEW: Store agent execution state if present
1395
+ if e.agent_context:
1396
+ pause_metadata["agent_context"] = json.dumps(e.agent_context)
1397
+ logger.debug(
1398
+ f"Agent '{e.agent_context['agent_name']}' paused at "
1399
+ f"iteration {e.agent_context['iteration']}"
1400
+ )
1401
+
1402
+ # Add step events to pause metadata for durability
1403
+ step_events = ctx._workflow_entity._step_events
1404
+ if step_events:
1405
+ pause_metadata["step_events"] = json.dumps(step_events)
1406
+ logger.debug(f"Paused workflow has {len(step_events)} recorded steps")
1407
+
1408
+ # Add current workflow state to pause metadata
1409
+ if hasattr(ctx, '_workflow_entity') and ctx._workflow_entity._state is not None:
1410
+ if ctx._workflow_entity._state.has_changes():
1411
+ state_snapshot = ctx._workflow_entity._state.get_state_snapshot()
1412
+ pause_metadata["workflow_state"] = json.dumps(state_snapshot)
1413
+ logger.debug(f"Paused workflow state snapshot: {state_snapshot}")
1414
+
1415
+ # AUDIT TRAIL: Also include state change history for paused workflows
1416
+ state_changes = ctx._workflow_entity._state_changes
1417
+ if state_changes:
1418
+ pause_metadata["state_changes"] = json.dumps(state_changes)
1419
+ logger.debug(f"Paused workflow has {len(state_changes)} state changes in history")
1420
+
1421
+ # Return "success" with awaiting_user_input metadata
1422
+ # The output contains the question details for the client
1423
+ output = {
1424
+ "question": e.question,
1425
+ "input_type": e.input_type,
1426
+ "options": e.options,
1427
+ }
1428
+ output_data = _serialize_result(output)
1429
+
1430
+ # Emit run.paused event for HITL (human-in-the-loop)
1431
+ return PyExecuteComponentResponse(
1432
+ invocation_id=request.invocation_id,
1433
+ success=True, # This is a valid pause state, not an error
1434
+ output_data=output_data,
1435
+ state_update=None,
1436
+ error_message=None,
1437
+ metadata=pause_metadata,
1438
+ event_type="run.paused",
1439
+ content_index=0,
1440
+ sequence=0,
1441
+ attempt=getattr(request, 'attempt', 0),
1442
+ )
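+ # For illustration, a client consuming this run.paused event would receive
+ # output and metadata roughly shaped like this (values hypothetical):
+ # output: {"question": "Approve the refund?", "input_type": "choice",
+ # "options": ["approve", "reject"]}
+ # metadata: {"status": "awaiting_user_input", "pause_type": "workflow",
+ # "session_id": "<uuid>", ...}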
1443
+
1444
+ except Exception as e:
1445
+ # Include exception type for better error messages
1446
+ error_msg = f"{type(e).__name__}: {str(e)}"
1447
+
1448
+ # Capture full stack trace for telemetry
1449
+ import traceback
1450
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
1451
+
1452
+ # Log with full traceback
1453
+ logger.error(f"Workflow execution failed: {error_msg}", exc_info=True)
1454
+
1455
+ # CRITICAL: Flush all buffered checkpoints before returning error response
1456
+ # This ensures workflow.failed checkpoint arrives at platform BEFORE run.failed event
1457
+ # Without this, SSE clients may not receive workflow.failed events
1458
+ try:
1459
+ flushed_count = self._rust_worker.flush_workflow_checkpoints()
1460
+ if flushed_count > 0:
1461
+ logger.info(f"✅ Flushed {flushed_count} checkpoints before error response")
1462
+ except Exception as flush_error:
1463
+ logger.error(f"Failed to flush checkpoints in error path: {flush_error}", exc_info=True)
1464
+ # Continue anyway - checkpoint flushing is best-effort
1465
+
1466
+ # Store error metadata for observability
1467
+ metadata = {
1468
+ "error_type": type(e).__name__,
1469
+ "stack_trace": stack_trace,
1470
+ "error": True,
1471
+ }
1472
+
1473
+ # Extract critical metadata for journal correlation (if available)
1474
+ critical_metadata = self._extract_critical_metadata(request)
1475
+ metadata.update(critical_metadata)
1476
+
1477
+ # Normalize metadata for Rust FFI compatibility
1478
+ normalized_metadata = _normalize_metadata(metadata)
1479
+
1480
+ # Emit run.failed event
1481
+ return PyExecuteComponentResponse(
1482
+ invocation_id=request.invocation_id,
1483
+ success=False,
1484
+ output_data=b"",
1485
+ state_update=None,
1486
+ error_message=error_msg,
1487
+ metadata=normalized_metadata,
1488
+ event_type="run.failed",
1489
+ content_index=0,
1490
+ sequence=0,
1491
+ attempt=getattr(request, 'attempt', 0),
1492
+ )
1493
+
1494
+ async def _execute_tool(self, tool, input_data: bytes, request):
1495
+ """Execute a tool handler."""
1496
+ import json
1497
+ from .context import Context
1498
+ from ._core import PyExecuteComponentResponse
1499
+
1500
+ token = None # Ensure the finally block can reset context safely even if setup fails
+ try:
1501
+ # Parse input data
1502
+ input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
1503
+
1504
+ # Create context with runtime_context for trace correlation
1505
+ ctx = Context(
1506
+ run_id=f"{self.service_name}:{tool.name}",
1507
+ runtime_context=request.runtime_context,
1508
+ )
1509
+
1510
+ # Set context in contextvar so get_current_context() and error handlers can access it
1511
+ from .context import set_current_context, _current_context
1512
+ token = set_current_context(ctx)
1513
+
1514
+ # Execute tool
1515
+ result = await tool.invoke(ctx, **input_dict)
1516
+
1517
+ # Serialize result
1518
+ output_data = _serialize_result(result)
1519
+
1520
+ return PyExecuteComponentResponse(
1521
+ invocation_id=request.invocation_id,
1522
+ success=True,
1523
+ output_data=output_data,
1524
+ state_update=None,
1525
+ error_message=None,
1526
+ metadata=None,
1527
+ event_type="run.completed",
1528
+ content_index=0,
1529
+ sequence=0,
1530
+ attempt=getattr(request, 'attempt', 0),
1531
+ )
1532
+
1533
+ except Exception as e:
1534
+ # Include exception type for better error messages
1535
+ error_msg = f"{type(e).__name__}: {str(e)}"
1536
+
1537
+ # Capture full stack trace for telemetry
1538
+ import traceback
1539
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
1540
+
1541
+ # Log with full traceback using ctx.logger to ensure run_id correlation
1542
+ from .context import get_current_context
1543
+ current_ctx = get_current_context()
1544
+ error_logger = current_ctx.logger if current_ctx else logger
1545
+ error_logger.error(f"Tool execution failed: {error_msg}", exc_info=True)
1546
+
1547
+ # Store error metadata for observability
1548
+ metadata = {
1549
+ "error_type": type(e).__name__,
1550
+ "stack_trace": stack_trace,
1551
+ "error": True,
1552
+ }
1553
+
1554
+ # CRITICAL: Extract critical metadata (including tenant_id) for journal event correlation
1555
+ critical_metadata = self._extract_critical_metadata(request)
1556
+ metadata.update(critical_metadata)
1557
+
1558
+ # Normalize metadata for Rust FFI compatibility
1559
+ normalized_metadata = _normalize_metadata(metadata)
1560
+
1561
+ # Emit run.failed event
1562
+ return PyExecuteComponentResponse(
1563
+ invocation_id=request.invocation_id,
1564
+ success=False,
1565
+ output_data=b"",
1566
+ state_update=None,
1567
+ error_message=error_msg,
1568
+ metadata=normalized_metadata,
1569
+ event_type="run.failed",
1570
+ content_index=0,
1571
+ sequence=0,
1572
+ attempt=getattr(request, 'attempt', 0),
1573
+ )
1574
+
1575
+ finally:
1576
+ # Reset the context (when it was set) to prevent leakage between executions
1577
+ if token is not None: _current_context.reset(token)
1578
+
1579
+ async def _execute_entity(self, entity_type, input_data: bytes, request):
1580
+ """Execute an entity method."""
1581
+ import json
1582
+ from .context import Context
1583
+ from .entity import EntityType, Entity, _entity_state_adapter_ctx
1584
+ from ._core import PyExecuteComponentResponse
1585
+
1586
+ # Set entity state adapter in context for Entity instances to access
1587
+ _entity_state_adapter_ctx.set(self._entity_state_adapter)
1588
+
1589
+ token = None # Ensure the finally block can reset context safely even if setup fails
+ try:
1590
+ # Parse input data
1591
+ input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
1592
+
1593
+ # Extract entity key and method name from input
1594
+ entity_key = input_dict.pop("key", None)
1595
+ method_name = input_dict.pop("method", None)
1596
+
1597
+ if not entity_key:
1598
+ raise ValueError("Entity invocation requires 'key' parameter")
1599
+ if not method_name:
1600
+ raise ValueError("Entity invocation requires 'method' parameter")
1601
+
1602
+ # Create context for logging and tracing
1603
+ ctx = Context(
1604
+ run_id=f"{self.service_name}:{entity_type.name}:{entity_key}",
1605
+ runtime_context=request.runtime_context,
1606
+ )
1607
+
1608
+ # Set context in contextvar so get_current_context() and error handlers can access it
1609
+ from .context import set_current_context, _current_context
1610
+ token = set_current_context(ctx)
1611
+
1612
+ # Note: State loading is now handled automatically by the entity method wrapper
1613
+ # via EntityStateAdapter which uses the Rust core for cache + platform persistence
1614
+
1615
+ # Create entity instance using the stored class reference
1616
+ entity_instance = entity_type.entity_class(key=entity_key)
1617
+
1618
+ # Get method
1619
+ if not hasattr(entity_instance, method_name):
1620
+ raise ValueError(f"Entity '{entity_type.name}' has no method '{method_name}'")
1621
+
1622
+ method = getattr(entity_instance, method_name)
1623
+
1624
+ # Execute method (entity method wrapper handles state load/save automatically)
1625
+ result = await method(**input_dict)
1626
+
1627
+ # Serialize result
1628
+ output_data = _serialize_result(result)
1629
+
1630
+ # Note: State persistence is now handled automatically by the entity method wrapper
1631
+ # via EntityStateAdapter which uses Rust core for optimistic locking + version tracking
1632
+
1633
+ # CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
1634
+ metadata = self._extract_critical_metadata(request)
1635
+
1636
+ return PyExecuteComponentResponse(
1637
+ invocation_id=request.invocation_id,
1638
+ success=True,
1639
+ output_data=output_data,
1640
+ state_update=None, # TODO: Use structured StateUpdate object
1641
+ error_message=None,
1642
+ metadata=metadata if metadata else None, # Propagate critical metadata (tenant/deployment) for journal correlation
1643
+ event_type="run.completed",
1644
+ content_index=0,
1645
+ sequence=0,
1646
+ attempt=getattr(request, 'attempt', 0),
1647
+ )
1648
+
1649
+ except Exception as e:
1650
+ # Include exception type for better error messages
1651
+ error_msg = f"{type(e).__name__}: {str(e)}"
1652
+
1653
+ # Capture full stack trace for telemetry
1654
+ import traceback
1655
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
1656
+
1657
+ # Log with full traceback using ctx.logger to ensure run_id correlation
1658
+ from .context import get_current_context
1659
+ current_ctx = get_current_context()
1660
+ error_logger = current_ctx.logger if current_ctx else logger
1661
+ error_logger.error(f"Entity execution failed: {error_msg}", exc_info=True)
1662
+
1663
+ # Store error metadata for observability
1664
+ metadata = {
1665
+ "error_type": type(e).__name__,
1666
+ "stack_trace": stack_trace,
1667
+ "error": True,
1668
+ }
1669
+
1670
+ # Extract critical metadata for journal correlation (if available)
1671
+ critical_metadata = self._extract_critical_metadata(request)
1672
+ metadata.update(critical_metadata)
1673
+
1674
+ # Normalize metadata for Rust FFI compatibility
1675
+ normalized_metadata = _normalize_metadata(metadata)
1676
+
1677
+ # Emit run.failed event
1678
+ return PyExecuteComponentResponse(
1679
+ invocation_id=request.invocation_id,
1680
+ success=False,
1681
+ output_data=b"",
1682
+ state_update=None,
1683
+ error_message=error_msg,
1684
+ metadata=normalized_metadata,
1685
+ event_type="run.failed",
1686
+ content_index=0,
1687
+ sequence=0,
1688
+ attempt=getattr(request, 'attempt', 0),
1689
+ )
1690
+
1691
+ finally:
1692
+ # Reset the context (when it was set) to prevent leakage between executions
1693
+ if token is not None: _current_context.reset(token)
1694
+
1695
+ async def _execute_agent(self, agent, input_data: bytes, request):
1696
+ """Execute an agent with session support for multi-turn conversations."""
1697
+ import json
1698
+ import uuid
1699
+ from .agent import AgentContext
1700
+ from .entity import _entity_state_adapter_ctx
1701
+ from ._core import PyExecuteComponentResponse
1702
+
1703
+ # Set entity state adapter in context so AgentContext can access it
1704
+ _entity_state_adapter_ctx.set(self._entity_state_adapter)
1705
+
1706
+ token = None # Ensure the finally block can reset context safely even if setup fails
+ try:
1707
+ # Parse input data
1708
+ input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
1709
+
1710
+ # Extract user message
1711
+ user_message = input_dict.get("message", "")
1712
+ if not user_message:
1713
+ raise ValueError("Agent invocation requires 'message' parameter")
1714
+
1715
+ # Extract or generate session_id for multi-turn conversation support
1716
+ # If session_id is provided, the agent will load previous conversation history
1717
+ # If not provided, a new session is created with auto-generated ID
1718
+ session_id = input_dict.get("session_id")
1719
+
1720
+ if not session_id:
1721
+ session_id = str(uuid.uuid4())
1722
+ logger.info(f"Created new agent session: {session_id}")
1723
+ else:
1724
+ logger.info(f"Using existing agent session: {session_id}")
1725
+
1726
+ # Extract streaming context for real-time SSE log delivery
1727
+ is_streaming = getattr(request, 'is_streaming', False)
1728
+ tenant_id = request.metadata.get('tenant_id') if hasattr(request, 'metadata') else None
1729
+
1730
+ # Create AgentContext with session support for conversation persistence
1731
+ # AgentContext automatically loads/saves conversation history based on session_id
1732
+ ctx = AgentContext(
1733
+ run_id=request.invocation_id,
1734
+ agent_name=agent.name,
1735
+ session_id=session_id,
1736
+ runtime_context=request.runtime_context,
1737
+ is_streaming=is_streaming,
1738
+ tenant_id=tenant_id,
1739
+ )
1740
+
1741
+ # Set context in contextvar so get_current_context() and error handlers can access it
1742
+ from .context import set_current_context, _current_context
1743
+ token = set_current_context(ctx)
1744
+
1745
+ # Execute agent - now returns an async generator for streaming
1746
+ result = agent.run(user_message, context=ctx)
1747
+
1748
+ # Agent.run() always returns an async generator
1749
+ # Queue each event via delta queue for real-time delivery
1750
+ import inspect
1751
+ if inspect.isasyncgen(result):
1752
+ from .events import Event, EventType
1753
+
1754
+ sequence = 0
1755
+ final_output = None
1756
+ final_tool_calls = []
1757
+ handoff_to = None
1758
+
1759
+ # Extract metadata for delta queue (must be Dict[str, str] for Rust FFI)
1760
+ metadata = _normalize_metadata(self._extract_critical_metadata(request))
1761
+ metadata["session_id"] = session_id # Include session for UI
1762
+
1763
+ async for event in result:
1764
+ if isinstance(event, Event):
1765
+ # Queue the event via delta queue
1766
+ event_data = event.to_response_fields()
1767
+ output_data = event_data.get("output_data", b"")
1768
+ output_str = output_data.decode("utf-8") if isinstance(output_data, bytes) else str(output_data or "{}")
1769
+
1770
+ self._rust_worker.queue_delta(
1771
+ invocation_id=request.invocation_id,
1772
+ event_type=event_data.get("event_type", ""),
1773
+ output_data=output_str,
1774
+ content_index=event_data.get("content_index", 0),
1775
+ sequence=sequence,
1776
+ metadata=metadata,
1777
+ )
1778
+ sequence += 1
1779
+
1780
+ # Capture final result from agent.completed event
1781
+ if event.event_type == EventType.AGENT_COMPLETED:
1782
+ final_output = event.data.get("output", "")
1783
+ final_tool_calls = event.data.get("tool_calls", [])
1784
+ handoff_to = event.data.get("handoff_to")
1785
+
1786
+ # Emit run.completed event with the final agent result
1787
+ final_result = {
1788
+ "output": final_output,
1789
+ "tool_calls": final_tool_calls,
1790
+ }
1791
+ if handoff_to:
1792
+ final_result["handoff_to"] = handoff_to
1793
+
1794
+ self._rust_worker.queue_delta(
1795
+ invocation_id=request.invocation_id,
1796
+ event_type="run.completed",
1797
+ output_data=json.dumps(final_result),
1798
+ content_index=0,
1799
+ sequence=sequence,
1800
+ metadata=metadata,
1801
+ )
1802
+
1803
+ logger.debug(f"Agent streaming queued {sequence + 1} deltas for real-time delivery")
1804
+ # Return None to signal that streaming was handled via delta queue
1805
+ return None
1806
+ else:
1807
+ # Fallback for non-generator (shouldn't happen but handle gracefully)
1808
+ if inspect.iscoroutine(result):
1809
+ agent_result = await result
1810
+ else:
1811
+ agent_result = result
1812
+
1813
+ # Build response with agent output and tool calls
1814
+ result = {
1815
+ "output": agent_result.output,
1816
+ "tool_calls": agent_result.tool_calls,
1817
+ }
1818
+
1819
+ # Serialize result
1820
+ output_data = _serialize_result(result)
1821
+
1822
+ # CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
1823
+ metadata = self._extract_critical_metadata(request)
1824
+ # Also include session_id for UI to persist conversation
1825
+ metadata["session_id"] = session_id
1826
+
1827
+ return PyExecuteComponentResponse(
1828
+ invocation_id=request.invocation_id,
1829
+ success=True,
1830
+ output_data=output_data,
1831
+ state_update=None,
1832
+ error_message=None,
1833
+ metadata=metadata if metadata else None,
1834
+ event_type="run.completed",
1835
+ content_index=0,
1836
+ sequence=0,
1837
+ attempt=getattr(request, 'attempt', 0),
1838
+ )
1839
+
1840
+ except Exception as e:
1841
+ # Include exception type for better error messages
1842
+ error_msg = f"{type(e).__name__}: {str(e)}"
1843
+
1844
+ # Capture full stack trace for telemetry
1845
+ import traceback
1846
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
1847
+
1848
+ # Log with full traceback using ctx.logger to ensure run_id correlation
1849
+ from .context import get_current_context
1850
+ current_ctx = get_current_context()
1851
+ error_logger = current_ctx.logger if current_ctx else logger
1852
+ error_logger.error(f"Agent execution failed: {error_msg}", exc_info=True)
1853
+
1854
+ # Store error metadata for observability
1855
+ metadata = {
1856
+ "error_type": type(e).__name__,
1857
+ "stack_trace": stack_trace,
1858
+ "error": True,
1859
+ }
1860
+
1861
+ # Extract critical metadata for journal correlation (if available)
1862
+ critical_metadata = self._extract_critical_metadata(request)
1863
+ metadata.update(critical_metadata)
1864
+
1865
+ # Normalize metadata for Rust FFI compatibility
1866
+ normalized_metadata = _normalize_metadata(metadata)
1867
+
1868
+ # Emit run.failed event
1869
+ return PyExecuteComponentResponse(
1870
+ invocation_id=request.invocation_id,
1871
+ success=False,
1872
+ output_data=b"",
1873
+ state_update=None,
1874
+ error_message=error_msg,
1875
+ metadata=normalized_metadata,
1876
+ event_type="run.failed",
1877
+ content_index=0,
1878
+ sequence=0,
1879
+ attempt=getattr(request, 'attempt', 0),
1880
+ )
1881
+
1882
+ finally:
1883
+ # Reset the context (when it was set) to prevent leakage between executions
1884
+ if token is not None: _current_context.reset(token)
1885
+
1886
+ def _create_error_response(self, request, error_message: str):
1887
+ """Create an error response."""
1888
+ from ._core import PyExecuteComponentResponse
1889
+
1890
+ # Emit run.failed event
1891
+ return PyExecuteComponentResponse(
1892
+ invocation_id=request.invocation_id,
1893
+ success=False,
1894
+ output_data=b"",
1895
+ state_update=None,
1896
+ error_message=error_message,
1897
+ metadata=None,
1898
+ event_type="run.failed",
1899
+ content_index=0,
1900
+ sequence=0,
1901
+ attempt=getattr(request, 'attempt', 0),
1902
+ )
1903
+
1904
+ async def run(self):
1905
+ """Run the worker (register and start message loop).
1906
+
1907
+ This method will:
1908
+ 1. Discover all registered @function and @workflow handlers
1909
+ 2. Register with the coordinator
1910
+ 3. Create a shared Python event loop for all function executions
1911
+ 4. Enter the message processing loop
1912
+ 5. Block until shutdown
1913
+
1914
+ This is the main entry point for your worker service.
1915
+ """
1916
+ try:
1917
+ logger.info(f"Starting worker: {self.service_name}")
1918
+
1919
+ # Discover components
1920
+ components = self._discover_components()
1921
+
1922
+ # Set components on Rust worker
1923
+ self._rust_worker.set_components(components)
1924
+
1925
+ # Set metadata
1926
+ if self.metadata:
1927
+ self._rust_worker.set_service_metadata(self.metadata)
1928
+
1929
+ # Configure entity state manager on Rust worker for database persistence
1930
+ logger.info("Configuring Rust EntityStateManager for database persistence")
1931
+ # Access the Rust core from the adapter
1932
+ if hasattr(self._entity_state_adapter, '_rust_core') and self._entity_state_adapter._rust_core:
1933
+ self._rust_worker.set_entity_state_manager(self._entity_state_adapter._rust_core)
1934
+ logger.info("Successfully configured Rust EntityStateManager")
1935
+
1936
+ # Get the current event loop to pass to Rust for concurrent Python async execution
1937
+ # This allows Rust to execute Python async functions on the same event loop
1938
+ # without spawn_blocking overhead, enabling true concurrency
1939
+ loop = asyncio.get_running_loop()
1940
+ logger.info("Passing Python event loop to Rust worker for concurrent execution")
1941
+
1942
+ # Set event loop on Rust worker
1943
+ self._rust_worker.set_event_loop(loop)
1944
+
1945
+ # Set message handler
1946
+ handler = self._create_message_handler()
1947
+ self._rust_worker.set_message_handler(handler)
1948
+
1949
+ # Initialize worker
1950
+ self._rust_worker.initialize()
1951
+
1952
+ logger.info("Worker registered successfully, entering message loop...")
1953
+
1954
+ # Run worker (this will block until shutdown)
1955
+ await self._rust_worker.run()
1956
+
1957
+ except Exception as e:
1958
+ # Capture SDK-level startup/runtime failures
1959
+ logger.error(f"Worker failed to start or encountered critical error: {e}", exc_info=True)
1960
+ _sentry.capture_exception(
1961
+ e,
1962
+ context={
1963
+ "service_name": self.service_name,
1964
+ "service_version": self.service_version,
1965
+ "error_location": "Worker.run",
1966
+ "error_phase": "worker_lifecycle",
1967
+ },
1968
+ tags={
1969
+ "sdk_error": "true",
1970
+ "error_type": "worker_failure",
1971
+ "severity": "critical",
1972
+ },
1973
+ level="error",
1974
+ )
1975
+ raise
1976
+
1977
+ finally:
1978
+ # Flush Sentry events before shutdown
1979
+ logger.info("Flushing Sentry events before shutdown...")
1980
+ _sentry.flush(timeout=5.0)
1981
+
1982
+ logger.info("Worker shutdown complete")