agnt5 0.3.2a1__cp310-abi3-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of agnt5 has been flagged as potentially problematic; consult the registry's advisory page for details before upgrading.

agnt5/worker.py ADDED
@@ -0,0 +1,2094 @@
1
+ """Worker implementation for AGNT5 SDK."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import contextvars
7
+ import logging
8
+ import time
9
+ import uuid
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ from .function import FunctionRegistry
13
+ from .workflow import WorkflowRegistry
14
+ from ._telemetry import setup_module_logger
15
+ from . import _sentry
16
+
17
+ logger = setup_module_logger(__name__)
18
+
19
+
20
+ import dataclasses
21
+ import json as _json
22
+
23
+
24
class _ResultEncoder(_json.JSONEncoder):
    """Custom JSON encoder for serializing component results.

    Handles Pydantic models (v1 and v2 APIs), dataclass instances, bytes,
    and set/frozenset values that are commonly returned from functions,
    workflows, entities, and agents.
    """

    def default(self, obj):
        """Return a JSON-serializable representation of *obj*.

        Falls back to ``JSONEncoder.default`` (which raises ``TypeError``)
        for unsupported types.
        """
        # Handle Pydantic models (v2 API)
        if hasattr(obj, 'model_dump'):
            return obj.model_dump()
        # Handle Pydantic models (v1 API); __fields__ distinguishes a model
        # from arbitrary objects that merely expose a .dict() method
        if hasattr(obj, 'dict') and hasattr(obj, '__fields__'):
            return obj.dict()
        # Handle dataclass *instances* (exclude dataclass types themselves)
        if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
            return dataclasses.asdict(obj)
        # Handle bytes: decode as UTF-8, replacing invalid sequences
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='replace')
        # Handle sets. Fix: frozenset is NOT a subclass of set, so the
        # original `isinstance(obj, set)` check silently missed it.
        if isinstance(obj, (set, frozenset)):
            return list(obj)
        # Fallback to default behavior (raises TypeError)
        return super().default(obj)
48
+
49
+
50
def _serialize_result(result) -> bytes:
    """Encode a component result as UTF-8 JSON bytes.

    Delegates to ``_ResultEncoder`` so that Pydantic models, dataclasses,
    bytes, and sets returned by functions, workflows, entities, tools,
    and agents serialize cleanly.
    """
    encoded = _json.dumps(result, cls=_ResultEncoder)
    return encoded.encode("utf-8")
58
+
59
+
60
def _normalize_metadata(metadata: Dict[str, Any]) -> Dict[str, str]:
    """
    Convert metadata dictionary to Dict[str, str] for Rust FFI compatibility.

    PyO3 requires HashMap<String, String>, but Python code may include booleans,
    integers, or other types. This helper ensures all values are strings.

    Args:
        metadata: Dictionary with potentially mixed types

    Returns:
        Dictionary with all string values

    Example:
        >>> _normalize_metadata({"error": True, "count": 42, "msg": "hello"})
        {"error": "true", "count": "42", "msg": "hello"}
    """
    def _stringify(value: Any) -> str:
        # Strings pass through untouched.
        if isinstance(value, str):
            return value
        # bool is checked before the generic str() fallback so that
        # True/False become JSON-style lowercase "true"/"false".
        if isinstance(value, bool):
            return str(value).lower()
        # None maps to the empty string rather than "None".
        if value is None:
            return ""
        return str(value)

    return {key: _stringify(value) for key, value in metadata.items()}
90
+
91
# Context variable to store trace metadata for propagation to LM calls.
# This allows the Rust LM layer to access the W3C traceparent without
# explicit parameter passing through every call site.
# NOTE(review): the shared `{}` default is safe only as long as callers
# always replace it via .set() with a fresh dict and never mutate it in
# place — that is the convention in this module; confirm for new callers.
_trace_metadata: contextvars.ContextVar[Dict[str, str]] = contextvars.ContextVar(
    '_trace_metadata', default={}
)
96
+
97
+
98
+ class Worker:
99
+ """AGNT5 Worker for registering and running functions/workflows with the coordinator.
100
+
101
+ The Worker class manages the lifecycle of your service, including:
102
+ - Registration with the AGNT5 coordinator
103
+ - Automatic discovery of @function and @workflow decorated handlers
104
+ - Message handling and execution
105
+ - Health monitoring
106
+
107
+ Example:
108
+ ```python
109
+ from agnt5 import Worker, function
110
+
111
+ @function
112
+ async def process_data(ctx: Context, data: str) -> dict:
113
+ return {"result": data.upper()}
114
+
115
+ async def main():
116
+ worker = Worker(
117
+ service_name="data-processor",
118
+ service_version="1.0.0",
119
+ coordinator_endpoint="http://localhost:34186"
120
+ )
121
+ await worker.run()
122
+
123
+ if __name__ == "__main__":
124
+ asyncio.run(main())
125
+ ```
126
+ """
127
+
128
    def __init__(
        self,
        service_name: str,
        service_version: str = "1.0.0",
        coordinator_endpoint: Optional[str] = None,
        runtime: str = "standalone",
        metadata: Optional[Dict[str, str]] = None,
        functions: Optional[List] = None,
        workflows: Optional[List] = None,
        entities: Optional[List] = None,
        agents: Optional[List] = None,
        tools: Optional[List] = None,
        auto_register: bool = False,
        auto_register_paths: Optional[List[str]] = None,
        pyproject_path: Optional[str] = None,
    ):
        """Initialize a new Worker with explicit or automatic component registration.

        The Worker supports two registration modes:

        **Explicit Mode (default, production):**
        - Register workflows/agents explicitly, their dependencies are auto-included
        - Optionally register standalone functions/tools for direct API invocation

        **Auto-Registration Mode (development):**
        - Automatically discovers all decorated components in source paths
        - Reads source paths from pyproject.toml or uses explicit paths
        - No need to maintain import lists

        Args:
            service_name: Unique name for this service
            service_version: Version string (semantic versioning recommended)
            coordinator_endpoint: Coordinator endpoint URL (default: from env AGNT5_COORDINATOR_ENDPOINT)
            runtime: Runtime type - "standalone", "docker", "kubernetes", etc.
            metadata: Optional service-level metadata
            functions: List of @function decorated handlers (explicit mode)
            workflows: List of @workflow decorated handlers (explicit mode)
            entities: List of Entity classes (explicit mode)
            agents: List of Agent instances (explicit mode)
            tools: List of Tool instances (explicit mode)
            auto_register: Enable automatic component discovery (default: False)
            auto_register_paths: Explicit source paths to scan (overrides pyproject.toml discovery)
            pyproject_path: Path to pyproject.toml (default: current directory)

        Raises:
            ImportError: If the compiled Rust core (`._core`) cannot be imported.

        Example (explicit mode - production):
            ```python
            from agnt5 import Worker
            from my_service import greet_user, order_fulfillment, ShoppingCart, analyst_agent

            worker = Worker(
                service_name="my-service",
                workflows=[order_fulfillment],
                entities=[ShoppingCart],
                agents=[analyst_agent],
                functions=[greet_user],
            )
            await worker.run()
            ```

        Example (auto-register mode - development):
            ```python
            from agnt5 import Worker

            worker = Worker(
                service_name="my-service",
                auto_register=True,  # Discovers from pyproject.toml
            )
            await worker.run()
            ```
        """
        self.service_name = service_name
        self.service_version = service_version
        self.coordinator_endpoint = coordinator_endpoint
        self.runtime = runtime
        self.metadata = metadata or {}

        # Get tenant_id from environment (required for entity state management)
        import os
        self._tenant_id = os.getenv("AGNT5_TENANT_ID", "default-tenant")

        # Import the compiled Rust worker bindings; failure here means the
        # native extension is missing or built for the wrong platform.
        try:
            from ._core import PyWorker, PyWorkerConfig, PyComponentInfo
            self._PyWorker = PyWorker
            self._PyWorkerConfig = PyWorkerConfig
            self._PyComponentInfo = PyComponentInfo
        except ImportError as e:
            # Capture SDK-level import failure in Sentry before re-raising
            _sentry.capture_exception(
                e,
                context={
                    "service_name": service_name,
                    "service_version": service_version,
                    "error_location": "Worker.__init__",
                    "error_phase": "rust_core_import",
                },
                tags={
                    "sdk_error": "true",
                    "error_type": "import_error",
                    "component": "rust_core",
                },
                level="error",
            )
            raise ImportError(
                f"Failed to import Rust core worker: {e}. "
                "Make sure agnt5 is properly installed with: pip install agnt5"
            )

        # Create Rust worker config (note: `runtime` maps to service_type)
        self._rust_config = self._PyWorkerConfig(
            service_name=service_name,
            service_version=service_version,
            service_type=runtime,
        )

        # Create Rust worker instance
        self._rust_worker = self._PyWorker(self._rust_config)

        # Create worker-scoped entity state adapter with Rust core
        from .entity import EntityStateAdapter
        from ._core import EntityStateManager as RustEntityStateManager

        # Create Rust core for entity state management, partitioned by tenant
        rust_core = RustEntityStateManager(tenant_id=self._tenant_id)

        # Create Python adapter (thin wrapper around Rust core)
        self._entity_state_adapter = EntityStateAdapter(rust_core=rust_core)

        logger.info("Created EntityStateAdapter with Rust core for state management")

        # Create CheckpointClient for step-level memoization (Phase 3).
        # This client is shared across all workflow executions and connects
        # lazily on first use. Failure is non-fatal: memoization is simply
        # disabled and _checkpoint_client stays None.
        try:
            from .checkpoint import CheckpointClient
            self._checkpoint_client = CheckpointClient()
            logger.info("Created CheckpointClient for step-level memoization")
        except Exception as e:
            logger.warning(f"Failed to create CheckpointClient (memoization disabled): {e}")
            self._checkpoint_client = None

        # Initialize Sentry for SDK-level error tracking
        # Telemetry behavior:
        # - Alpha/Beta releases: ENABLED by default (opt-out with AGNT5_DISABLE_SDK_TELEMETRY=true)
        # - Stable releases: DISABLED by default (opt-in with AGNT5_ENABLE_SDK_TELEMETRY=true)
        # This captures SDK bugs, initialization failures, and Python-specific issues
        # NOT user code execution errors (those should be handled by users)
        from .version import _get_version
        sdk_version = _get_version()

        sentry_enabled = _sentry.initialize_sentry(
            service_name=service_name,
            service_version=service_version,
            sdk_version=sdk_version,
        )
        if sentry_enabled:
            # Set service-level context (anonymized)
            _sentry.set_context("service", {
                "name": service_name,  # User's service name (they control this)
                "version": service_version,
                "runtime": runtime,
            })
        else:
            logger.debug("SDK telemetry not enabled")

        # Component registration: auto-discover or explicit
        if auto_register:
            # Warn if explicit components are passed with auto_register=True;
            # they are ignored in this mode.
            if any([functions, workflows, entities, agents, tools]):
                logger.warning(
                    "auto_register=True ignores explicit functions/workflows/entities/agents/tools parameters. "
                    "Remove explicit params or set auto_register=False to use explicit registration."
                )

            # Auto-registration mode: discover from source paths
            if auto_register_paths:
                source_paths = auto_register_paths
                logger.info(f"Auto-registration with explicit paths: {source_paths}")
            else:
                source_paths = self._discover_source_paths(pyproject_path)
                logger.info(f"Auto-registration with discovered paths: {source_paths}")

            # Auto-discover components (will populate _explicit_components)
            self._auto_discover_components(source_paths)
        else:
            # Explicit registration from constructor kwargs
            self._explicit_components = {
                'functions': list(functions or []),
                'workflows': list(workflows or []),
                'entities': list(entities or []),
                'agents': list(agents or []),
                'tools': list(tools or []),
            }

        # Count explicitly registered components
        total_explicit = sum(len(v) for v in self._explicit_components.values())
        logger.info(
            f"Worker initialized: {service_name} v{service_version} (runtime: {runtime}), "
            f"{total_explicit} components explicitly registered"
        )
327
+
328
+ def register_components(
329
+ self,
330
+ functions=None,
331
+ workflows=None,
332
+ entities=None,
333
+ agents=None,
334
+ tools=None,
335
+ ):
336
+ """Register additional components after Worker initialization.
337
+
338
+ This method allows incremental registration of components after the Worker
339
+ has been created. Useful for conditional or dynamic component registration.
340
+
341
+ Args:
342
+ functions: List of functions decorated with @function
343
+ workflows: List of workflows decorated with @workflow
344
+ entities: List of entity classes
345
+ agents: List of agent instances
346
+ tools: List of tool instances
347
+
348
+ Example:
349
+ ```python
350
+ worker = Worker(service_name="my-service")
351
+
352
+ # Register conditionally
353
+ if feature_enabled:
354
+ worker.register_components(workflows=[advanced_workflow])
355
+ ```
356
+ """
357
+ if functions:
358
+ self._explicit_components['functions'].extend(functions)
359
+ logger.debug(f"Incrementally registered {len(functions)} functions")
360
+
361
+ if workflows:
362
+ self._explicit_components['workflows'].extend(workflows)
363
+ logger.debug(f"Incrementally registered {len(workflows)} workflows")
364
+
365
+ if entities:
366
+ self._explicit_components['entities'].extend(entities)
367
+ logger.debug(f"Incrementally registered {len(entities)} entities")
368
+
369
+ if agents:
370
+ self._explicit_components['agents'].extend(agents)
371
+ logger.debug(f"Incrementally registered {len(agents)} agents")
372
+
373
+ if tools:
374
+ self._explicit_components['tools'].extend(tools)
375
+ logger.debug(f"Incrementally registered {len(tools)} tools")
376
+
377
+ total = sum(len(v) for v in self._explicit_components.values())
378
+ logger.info(f"Total components now registered: {total}")
379
+
380
+ def _discover_source_paths(self, pyproject_path: Optional[str] = None) -> List[str]:
381
+ """Discover source paths from pyproject.toml.
382
+
383
+ Reads pyproject.toml to find package source directories using:
384
+ - Hatch: [tool.hatch.build.targets.wheel] packages
385
+ - Maturin: [tool.maturin] python-source
386
+ - Fallback: ["src"] if not found
387
+
388
+ Args:
389
+ pyproject_path: Path to pyproject.toml (default: current directory)
390
+
391
+ Returns:
392
+ List of directory paths to scan (e.g., ["src/agnt5_benchmark"])
393
+ """
394
+ from pathlib import Path
395
+
396
+ # Python 3.11+ has tomllib in stdlib
397
+ try:
398
+ import tomllib
399
+ except ImportError:
400
+ logger.error("tomllib not available (Python 3.11+ required for auto-registration)")
401
+ return ["src"]
402
+
403
+ # Determine pyproject.toml location
404
+ if pyproject_path:
405
+ pyproject_file = Path(pyproject_path)
406
+ else:
407
+ # Look in current directory
408
+ pyproject_file = Path.cwd() / "pyproject.toml"
409
+
410
+ if not pyproject_file.exists():
411
+ logger.warning(
412
+ f"pyproject.toml not found at {pyproject_file}, "
413
+ f"defaulting to 'src/' directory"
414
+ )
415
+ return ["src"]
416
+
417
+ # Parse pyproject.toml
418
+ try:
419
+ with open(pyproject_file, "rb") as f:
420
+ config = tomllib.load(f)
421
+ except Exception as e:
422
+ logger.error(f"Failed to parse pyproject.toml: {e}")
423
+ return ["src"]
424
+
425
+ # Extract source paths based on build system
426
+ source_paths = []
427
+
428
+ # Try Hatch configuration
429
+ if "tool" in config and "hatch" in config["tool"]:
430
+ hatch_config = config["tool"]["hatch"]
431
+ if "build" in hatch_config and "targets" in hatch_config["build"]:
432
+ wheel_config = hatch_config["build"]["targets"].get("wheel", {})
433
+ packages = wheel_config.get("packages", [])
434
+ source_paths.extend(packages)
435
+
436
+ # Try Maturin configuration
437
+ if not source_paths and "tool" in config and "maturin" in config["tool"]:
438
+ maturin_config = config["tool"]["maturin"]
439
+ python_source = maturin_config.get("python-source")
440
+ if python_source:
441
+ source_paths.append(python_source)
442
+
443
+ # Fallback to src/
444
+ if not source_paths:
445
+ logger.info("No source paths in pyproject.toml, defaulting to 'src/'")
446
+ source_paths = ["src"]
447
+
448
+ logger.info(f"Discovered source paths from pyproject.toml: {source_paths}")
449
+ return source_paths
450
+
451
    def _auto_discover_components(self, source_paths: List[str]) -> None:
        """Auto-discover components by importing all Python files in source paths.

        Importing each module triggers the @function/@workflow/@tool/etc.
        decorators, which register components in their module-level
        registries; those registries are then snapshotted into
        ``self._explicit_components``.

        Args:
            source_paths: List of directory paths to scan
        """
        import importlib.util
        import sys
        from pathlib import Path

        logger.info(f"Auto-discovering components in paths: {source_paths}")

        # Counts only freshly imported modules, not ones already in sys.modules.
        total_modules = 0

        for source_path in source_paths:
            path = Path(source_path)

            if not path.exists():
                logger.warning(f"Source path does not exist: {source_path}")
                continue

            # Recursively find all .py files
            for py_file in path.rglob("*.py"):
                # Skip __pycache__ and test files
                if "__pycache__" in str(py_file) or py_file.name.startswith("test_"):
                    continue

                # Convert path to dotted module name, relative to the parent of
                # the scan root, e.g. src/agnt5_benchmark/functions.py ->
                # agnt5_benchmark.functions
                relative_path = py_file.relative_to(path.parent)
                module_parts = list(relative_path.parts[:-1])  # Remove .py extension part
                module_parts.append(relative_path.stem)  # Add filename without .py
                module_name = ".".join(module_parts)

                # Import module (triggers decorators). Modules already present
                # in sys.modules are left alone to avoid double execution.
                try:
                    if module_name in sys.modules:
                        logger.debug(f"Module already imported: {module_name}")
                    else:
                        spec = importlib.util.spec_from_file_location(module_name, py_file)
                        if spec and spec.loader:
                            module = importlib.util.module_from_spec(spec)
                            # Register in sys.modules BEFORE exec so circular
                            # imports within the scanned package resolve.
                            sys.modules[module_name] = module
                            spec.loader.exec_module(module)
                            logger.debug(f"Auto-imported: {module_name}")
                            total_modules += 1
                except Exception as e:
                    # A broken user module should not abort discovery of the rest.
                    logger.warning(f"Failed to import {module_name}: {e}")
                    # Capture SDK auto-registration failures
                    _sentry.capture_exception(
                        e,
                        context={
                            "service_name": self.service_name,
                            "module_name": module_name,
                            "source_path": str(py_file),
                            "error_location": "_auto_discover_components",
                        },
                        tags={
                            "sdk_error": "true",
                            "error_type": "auto_registration_failure",
                        },
                        level="warning",
                    )

        logger.info(f"Auto-imported {total_modules} modules")

        # Collect components from the module-level registries populated by the
        # decorators during import.
        from .agent import AgentRegistry
        from .entity import EntityRegistry
        from .tool import ToolRegistry

        # Extract actual objects from registries
        functions = [cfg.handler for cfg in FunctionRegistry.all().values()]
        workflows = [cfg.handler for cfg in WorkflowRegistry.all().values()]
        entities = [et.entity_class for et in EntityRegistry.all().values()]
        agents = list(AgentRegistry.all().values())
        tools = list(ToolRegistry.all().values())

        self._explicit_components = {
            'functions': functions,
            'workflows': workflows,
            'entities': entities,
            'agents': agents,
            'tools': tools,
        }

        logger.info(
            f"Auto-discovered components: "
            f"{len(functions)} functions, "
            f"{len(workflows)} workflows, "
            f"{len(entities)} entities, "
            f"{len(agents)} agents, "
            f"{len(tools)} tools"
        )
545
+
546
+ def _discover_components(self):
547
+ """Discover explicit components and auto-include their dependencies.
548
+
549
+ Hybrid approach:
550
+ - Explicitly registered workflows/agents are processed
551
+ - Functions called by workflows are auto-included (TODO: implement)
552
+ - Tools used by agents are auto-included
553
+ - Standalone functions/tools can be explicitly registered
554
+
555
+ Returns:
556
+ List of PyComponentInfo instances for all components
557
+ """
558
+ components = []
559
+ import json
560
+
561
+ # Import registries and types
562
+ from .entity import EntityRegistry
563
+ from .tool import ToolRegistry, Tool
564
+
565
+ # Track all components (explicit + auto-included)
566
+ all_functions = set(self._explicit_components['functions'])
567
+ all_tools = set(self._explicit_components['tools'])
568
+
569
+ # Auto-include agent tool dependencies
570
+ for agent in self._explicit_components['agents']:
571
+ if hasattr(agent, 'tools') and agent.tools:
572
+ # Agent.tools is a dict of {tool_name: tool_instance}
573
+ all_tools.update(agent.tools.values())
574
+ logger.debug(
575
+ f"Auto-included {len(agent.tools)} tools from agent '{agent.name}'"
576
+ )
577
+
578
+ # Log registration summary
579
+ explicit_func_count = len(self._explicit_components['functions'])
580
+ explicit_tool_count = len(self._explicit_components['tools'])
581
+ auto_func_count = len(all_functions) - explicit_func_count
582
+ auto_tool_count = len(all_tools) - explicit_tool_count
583
+
584
+ logger.info(
585
+ f"Component registration summary: "
586
+ f"{len(all_functions)} functions ({explicit_func_count} explicit, {auto_func_count} auto-included), "
587
+ f"{len(self._explicit_components['workflows'])} workflows, "
588
+ f"{len(self._explicit_components['entities'])} entities, "
589
+ f"{len(self._explicit_components['agents'])} agents, "
590
+ f"{len(all_tools)} tools ({explicit_tool_count} explicit, {auto_tool_count} auto-included)"
591
+ )
592
+
593
+ # Process functions (explicit + auto-included)
594
+ for func in all_functions:
595
+ config = FunctionRegistry.get(func.__name__)
596
+ if not config:
597
+ logger.warning(f"Function '{func.__name__}' not found in FunctionRegistry")
598
+ continue
599
+
600
+ input_schema_str = json.dumps(config.input_schema) if config.input_schema else None
601
+ output_schema_str = json.dumps(config.output_schema) if config.output_schema else None
602
+ metadata = config.metadata if config.metadata else {}
603
+
604
+ # Serialize retry and backoff policies
605
+ config_dict = {}
606
+ if config.retries:
607
+ config_dict["max_attempts"] = str(config.retries.max_attempts)
608
+ config_dict["initial_interval_ms"] = str(config.retries.initial_interval_ms)
609
+ config_dict["max_interval_ms"] = str(config.retries.max_interval_ms)
610
+
611
+ if config.backoff:
612
+ config_dict["backoff_type"] = config.backoff.type.value
613
+ config_dict["backoff_multiplier"] = str(config.backoff.multiplier)
614
+
615
+ component_info = self._PyComponentInfo(
616
+ name=config.name,
617
+ component_type="function",
618
+ metadata=metadata,
619
+ config=config_dict,
620
+ input_schema=input_schema_str,
621
+ output_schema=output_schema_str,
622
+ definition=None,
623
+ )
624
+ components.append(component_info)
625
+
626
+ # Process workflows
627
+ for workflow in self._explicit_components['workflows']:
628
+ config = WorkflowRegistry.get(workflow.__name__)
629
+ if not config:
630
+ logger.warning(f"Workflow '{workflow.__name__}' not found in WorkflowRegistry")
631
+ continue
632
+
633
+ input_schema_str = json.dumps(config.input_schema) if config.input_schema else None
634
+ output_schema_str = json.dumps(config.output_schema) if config.output_schema else None
635
+ metadata = config.metadata if config.metadata else {}
636
+
637
+ component_info = self._PyComponentInfo(
638
+ name=config.name,
639
+ component_type="workflow",
640
+ metadata=metadata,
641
+ config={},
642
+ input_schema=input_schema_str,
643
+ output_schema=output_schema_str,
644
+ definition=None,
645
+ )
646
+ components.append(component_info)
647
+
648
+ # Process entities
649
+ for entity_class in self._explicit_components['entities']:
650
+ entity_type = EntityRegistry.get(entity_class.__name__)
651
+ if not entity_type:
652
+ logger.warning(f"Entity '{entity_class.__name__}' not found in EntityRegistry")
653
+ continue
654
+
655
+ # Build complete entity definition with state schema and method schemas
656
+ entity_definition = entity_type.build_entity_definition()
657
+ definition_str = json.dumps(entity_definition)
658
+
659
+ # Keep minimal metadata for backward compatibility
660
+ metadata_dict = {
661
+ "methods": json.dumps(list(entity_type._method_schemas.keys())),
662
+ }
663
+
664
+ component_info = self._PyComponentInfo(
665
+ name=entity_type.name,
666
+ component_type="entity",
667
+ metadata=metadata_dict,
668
+ config={},
669
+ input_schema=None, # Entities don't have single input/output schemas
670
+ output_schema=None,
671
+ definition=definition_str, # Complete entity definition with state and methods
672
+ )
673
+ components.append(component_info)
674
+ logger.debug(f"Registered entity '{entity_type.name}' with definition")
675
+
676
+ # Process agents
677
+ from .agent import AgentRegistry
678
+
679
+ for agent in self._explicit_components['agents']:
680
+ # Register agent in AgentRegistry for execution lookup
681
+ AgentRegistry.register(agent)
682
+ logger.debug(f"Registered agent '{agent.name}' in AgentRegistry for execution")
683
+
684
+ input_schema_str = json.dumps(agent.input_schema) if hasattr(agent, 'input_schema') and agent.input_schema else None
685
+ output_schema_str = json.dumps(agent.output_schema) if hasattr(agent, 'output_schema') and agent.output_schema else None
686
+
687
+ metadata_dict = agent.metadata if hasattr(agent, 'metadata') else {}
688
+ if hasattr(agent, 'tools'):
689
+ metadata_dict["tools"] = json.dumps(list(agent.tools.keys()))
690
+
691
+ component_info = self._PyComponentInfo(
692
+ name=agent.name,
693
+ component_type="agent",
694
+ metadata=metadata_dict,
695
+ config={},
696
+ input_schema=input_schema_str,
697
+ output_schema=output_schema_str,
698
+ definition=None,
699
+ )
700
+ components.append(component_info)
701
+
702
+ # Process tools (explicit + auto-included)
703
+ for tool in all_tools:
704
+ # Validate that item is a Tool instance
705
+ if not isinstance(tool, Tool):
706
+ logger.warning(
707
+ f"Skipping non-Tool item in tools collection: {type(tool).__name__}. "
708
+ f"Use @tool decorator or pass Tool instances."
709
+ )
710
+ continue
711
+
712
+ input_schema_str = json.dumps(tool.input_schema) if hasattr(tool, 'input_schema') and tool.input_schema else None
713
+ output_schema_str = json.dumps(tool.output_schema) if hasattr(tool, 'output_schema') and tool.output_schema else None
714
+
715
+ component_info = self._PyComponentInfo(
716
+ name=tool.name,
717
+ component_type="tool",
718
+ metadata={},
719
+ config={},
720
+ input_schema=input_schema_str,
721
+ output_schema=output_schema_str,
722
+ definition=None,
723
+ )
724
+ components.append(component_info)
725
+
726
+ logger.info(f"Discovered {len(components)} total components")
727
+ return components
728
+
729
+ def _create_message_handler(self):
730
+ """Create the message handler that will be called by Rust worker."""
731
+
732
+ def handle_message(request):
733
+ """Handle incoming execution requests - returns coroutine for Rust to await."""
734
+ # Extract request details
735
+ component_name = request.component_name
736
+ component_type = request.component_type
737
+ input_data = request.input_data
738
+
739
+ logger.debug(
740
+ f"Handling {component_type} request: {component_name}, input size: {len(input_data)} bytes"
741
+ )
742
+
743
+ # Import all registries
744
+ from .tool import ToolRegistry
745
+ from .entity import EntityRegistry
746
+ from .agent import AgentRegistry
747
+
748
+ # Route based on component type and return coroutines
749
+ if component_type == "tool":
750
+ tool = ToolRegistry.get(component_name)
751
+ if tool:
752
+ logger.debug(f"Found tool: {component_name}")
753
+ # Return coroutine, don't await it
754
+ return self._execute_tool(tool, input_data, request)
755
+
756
+ elif component_type == "entity":
757
+ entity_type = EntityRegistry.get(component_name)
758
+ if entity_type:
759
+ logger.debug(f"Found entity: {component_name}")
760
+ # Return coroutine, don't await it
761
+ return self._execute_entity(entity_type, input_data, request)
762
+
763
+ elif component_type == "agent":
764
+ agent = AgentRegistry.get(component_name)
765
+ if agent:
766
+ logger.debug(f"Found agent: {component_name}")
767
+ # Return coroutine, don't await it
768
+ return self._execute_agent(agent, input_data, request)
769
+
770
+ elif component_type == "workflow":
771
+ workflow_config = WorkflowRegistry.get(component_name)
772
+ if workflow_config:
773
+ logger.debug(f"Found workflow: {component_name}")
774
+ # Return coroutine, don't await it
775
+ return self._execute_workflow(workflow_config, input_data, request)
776
+
777
+ elif component_type == "function":
778
+ function_config = FunctionRegistry.get(component_name)
779
+ if function_config:
780
+ # Return coroutine, don't await it
781
+ return self._execute_function(function_config, input_data, request)
782
+
783
+ # Not found - need to return an async error response
784
+ error_msg = f"Component '{component_name}' of type '{component_type}' not found"
785
+ logger.error(error_msg)
786
+
787
+ # Create async wrapper for error response
788
+ async def error_response():
789
+ return self._create_error_response(request, error_msg)
790
+
791
+ return error_response()
792
+
793
+ return handle_message
794
+
795
+ def _extract_critical_metadata(self, request) -> Dict[str, str]:
796
+ """
797
+ Extract critical metadata from request that MUST be propagated to response.
798
+
799
+ This ensures journal events are written to the correct tenant partition
800
+ and can be properly replayed. Missing tenant_id causes catastrophic
801
+ event sourcing corruption where events are split across partitions.
802
+
803
+ Returns:
804
+ Dict[str, str]: Metadata with all values normalized to strings for Rust FFI
805
+ """
806
+ metadata = {}
807
+ if hasattr(request, 'metadata') and request.metadata:
808
+ # CRITICAL: Propagate tenant_id to prevent journal corruption
809
+ # Convert to string immediately to ensure Rust FFI compatibility
810
+ if "tenant_id" in request.metadata:
811
+ metadata["tenant_id"] = str(request.metadata["tenant_id"])
812
+ if "deployment_id" in request.metadata:
813
+ metadata["deployment_id"] = str(request.metadata["deployment_id"])
814
+
815
+ # CRITICAL: Normalize all metadata values to strings for Rust FFI (PyO3)
816
+ # PyO3 expects HashMap<String, String> and will fail with bool/int values
817
+ return _normalize_metadata(metadata)
818
+
819
+ async def _execute_function(self, config, input_data: bytes, request):
820
+ """Execute a function handler (supports both regular and streaming functions)."""
821
+ import json
822
+ import inspect
823
+ import time
824
+ from .context import Context
825
+ from ._core import PyExecuteComponentResponse
826
+
827
+ exec_start = time.time()
828
+
829
+ try:
830
+ # Parse input data
831
+ input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
832
+
833
+ # Store trace metadata in contextvar for LM calls to access
834
+ # The Rust worker injects traceparent into request.metadata for trace propagation
835
+ if hasattr(request, 'metadata') and request.metadata:
836
+ _trace_metadata.set(dict(request.metadata))
837
+ logger.debug(f"Trace metadata stored: traceparent={request.metadata.get('traceparent', 'N/A')}")
838
+
839
+ # Extract attempt number from platform request (if provided)
840
+ platform_attempt = getattr(request, 'attempt', 0)
841
+
842
+ # Extract streaming context for real-time SSE log delivery
843
+ is_streaming = getattr(request, 'is_streaming', False)
844
+ tenant_id = request.metadata.get('tenant_id') if hasattr(request, 'metadata') else None
845
+
846
+ # Create FunctionContext with attempt number for retry tracking
847
+ # - If platform_attempt > 0: Platform is orchestrating retries
848
+ # - If platform_attempt == 0: First attempt (or no retry config)
849
+ from .function import FunctionContext
850
+ ctx = FunctionContext(
851
+ run_id=f"{self.service_name}:{config.name}",
852
+ attempt=platform_attempt,
853
+ runtime_context=request.runtime_context,
854
+ retry_policy=config.retries,
855
+ is_streaming=is_streaming,
856
+ tenant_id=tenant_id,
857
+ )
858
+
859
+ # Set context in contextvar so get_current_context() and error handlers can access it
860
+ from .context import set_current_context, _current_context
861
+ token = set_current_context(ctx)
862
+
863
+ # Set up _current_span contextvar for proper trace parent-child linking
864
+ # The Rust worker creates spans (python_component_execution) and passes trace context
865
+ # via runtime_context. We need to set this in Python's _current_span contextvar
866
+ # so that spans created in Python (e.g., agent.calculator) become proper children.
867
+ from .tracing import _current_span, SpanInfo
868
+ span_token = None
869
+ if request.runtime_context:
870
+ trace_id = request.runtime_context.trace_id
871
+ span_id = request.runtime_context.span_id
872
+ if trace_id and span_id:
873
+ span_info = SpanInfo(trace_id=trace_id, span_id=span_id)
874
+ span_token = _current_span.set(span_info)
875
+
876
+ # Execute function directly - Rust bridge handles tracing
877
+ # Note: Removed Python-level span creation to avoid duplicate spans.
878
+ # The Rust worker bridge (sdk-python/rust-src/worker.rs:413-659) already
879
+ # creates a comprehensive OpenTelemetry span with all necessary attributes.
880
+ # See DUPLICATE_SPANS_FIX.md for details.
881
+ #
882
+ # Note on retry handling:
883
+ # - If platform_attempt > 0: Platform is orchestrating retries, execute once
884
+ # - If platform_attempt == 0: Local retry loop in decorator wrapper handles retries
885
+ if input_dict:
886
+ result = config.handler(ctx, **input_dict)
887
+ else:
888
+ result = config.handler(ctx)
889
+
890
+ # Check if result is an async generator BEFORE awaiting
891
+ # Async generators (streaming functions) cannot be awaited directly
892
+ if inspect.isasyncgen(result):
893
+ # result is already an async generator, proceed to streaming handling below
894
+ pass
895
+ elif inspect.iscoroutine(result):
896
+ # Apply timeout if specified in function config
897
+ if hasattr(config, 'timeout_ms') and config.timeout_ms is not None:
898
+ timeout_seconds = config.timeout_ms / 1000.0
899
+ try:
900
+ result = await asyncio.wait_for(result, timeout=timeout_seconds)
901
+ except asyncio.TimeoutError:
902
+ raise asyncio.TimeoutError(
903
+ f"Function '{config.name}' execution timed out after {config.timeout_ms}ms"
904
+ )
905
+ else:
906
+ result = await result
907
+
908
+ # Note: Removed flush_telemetry_py() call here - it was causing 2-second blocking delay!
909
+ # The batch span processor handles flushing automatically with 5s timeout
910
+ # We only need to flush on worker shutdown, not after each function execution
911
+
912
+ # Check if result is an async generator (streaming function)
913
+ if inspect.isasyncgen(result):
914
+ # Streaming function - queue deltas immediately via Rust for real-time delivery
915
+ # Instead of collecting into a list and returning, we send each chunk
916
+ # as it's yielded via the delta queue with 10ms flush interval
917
+ from .events import Event
918
+
919
+ sequence = 0
920
+ has_typed_events = False # Track if user yields Event objects
921
+ first_chunk = True
922
+
923
+ # Extract metadata for delta queue (must be Dict[str, str] for Rust FFI)
924
+ metadata = _normalize_metadata(self._extract_critical_metadata(request))
925
+
926
+ async for chunk in result:
927
+ # Check if chunk is a typed Event
928
+ if isinstance(chunk, Event):
929
+ has_typed_events = True
930
+ # Use the event's fields directly
931
+ event_data = chunk.to_response_fields()
932
+ output_data = event_data.get("output_data", b"")
933
+ # Convert bytes to string for queue_delta
934
+ output_str = output_data.decode("utf-8") if isinstance(output_data, bytes) else str(output_data or "{}")
935
+ self._rust_worker.queue_delta(
936
+ invocation_id=request.invocation_id,
937
+ event_type=event_data.get("event_type", ""),
938
+ output_data=output_str,
939
+ content_index=event_data.get("content_index", 0),
940
+ sequence=sequence,
941
+ metadata=metadata,
942
+ source_timestamp_ns=chunk.source_timestamp_ns,
943
+ )
944
+ else:
945
+ # Regular chunk - wrap with output events
946
+ if first_chunk:
947
+ # Emit output.start event before first chunk
948
+ self._rust_worker.queue_delta(
949
+ invocation_id=request.invocation_id,
950
+ event_type="output.start",
951
+ output_data="{}",
952
+ content_index=0,
953
+ sequence=sequence,
954
+ metadata=metadata,
955
+ source_timestamp_ns=time.time_ns(),
956
+ )
957
+ sequence += 1
958
+ first_chunk = False
959
+
960
+ # Serialize chunk for streaming
961
+ # Strings are passed through directly to avoid double-encoding
962
+ # (functions may yield pre-formatted JSON strings)
963
+ # Other types (dicts, Pydantic models, etc.) are JSON-serialized
964
+ if isinstance(chunk, str):
965
+ chunk_str = chunk
966
+ elif isinstance(chunk, bytes):
967
+ chunk_str = chunk.decode("utf-8")
968
+ else:
969
+ # Use _serialize_result for complex types (dicts, Pydantic models, etc.)
970
+ chunk_data = _serialize_result(chunk)
971
+ chunk_str = chunk_data.decode("utf-8") if isinstance(chunk_data, bytes) else str(chunk_data)
972
+
973
+ # Emit output.delta event
974
+ self._rust_worker.queue_delta(
975
+ invocation_id=request.invocation_id,
976
+ event_type="output.delta",
977
+ output_data=chunk_str,
978
+ content_index=0,
979
+ sequence=sequence,
980
+ metadata=metadata,
981
+ source_timestamp_ns=time.time_ns(),
982
+ )
983
+ sequence += 1
984
+
985
+ # Emit closing events if we had regular chunks
986
+ if not has_typed_events and not first_chunk:
987
+ # Emit output.stop event
988
+ self._rust_worker.queue_delta(
989
+ invocation_id=request.invocation_id,
990
+ event_type="output.stop",
991
+ output_data="{}",
992
+ content_index=0,
993
+ sequence=sequence,
994
+ metadata=metadata,
995
+ source_timestamp_ns=time.time_ns(),
996
+ )
997
+ sequence += 1
998
+
999
+ # Always emit run.completed event
1000
+ self._rust_worker.queue_delta(
1001
+ invocation_id=request.invocation_id,
1002
+ event_type="run.completed",
1003
+ output_data="{}",
1004
+ content_index=0,
1005
+ sequence=sequence,
1006
+ metadata=metadata,
1007
+ source_timestamp_ns=time.time_ns(),
1008
+ )
1009
+
1010
+ logger.debug(f"Streaming function queued {sequence + 1} deltas for real-time delivery")
1011
+ # Return None to signal that streaming was handled via delta queue
1012
+ return None
1013
+ else:
1014
+ # Regular function - await and return single response
1015
+ if inspect.iscoroutine(result):
1016
+ result = await result
1017
+
1018
+ # Serialize result
1019
+ output_data = _serialize_result(result)
1020
+
1021
+ # Extract critical metadata for journal event correlation
1022
+ response_metadata = self._extract_critical_metadata(request)
1023
+
1024
+ # Emit run.completed event with output
1025
+ return PyExecuteComponentResponse(
1026
+ invocation_id=request.invocation_id,
1027
+ success=True,
1028
+ output_data=output_data,
1029
+ state_update=None,
1030
+ error_message=None,
1031
+ metadata=response_metadata if response_metadata else None,
1032
+ event_type="run.completed",
1033
+ content_index=0,
1034
+ sequence=0,
1035
+ attempt=platform_attempt,
1036
+ )
1037
+
1038
+ except Exception as e:
1039
+ # Include exception type for better error messages
1040
+ error_msg = f"{type(e).__name__}: {str(e)}"
1041
+
1042
+ # Capture full stack trace for telemetry
1043
+ import traceback
1044
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
1045
+
1046
+ # Log with full traceback using ctx.logger to ensure run_id correlation
1047
+ from .context import get_current_context
1048
+ current_ctx = get_current_context()
1049
+ error_logger = current_ctx.logger if current_ctx else logger
1050
+ error_logger.error(f"Function execution failed: {error_msg}", exc_info=True)
1051
+
1052
+ # Store stack trace in metadata for observability
1053
+ metadata = {
1054
+ "error_type": type(e).__name__,
1055
+ "stack_trace": stack_trace,
1056
+ "error": True, # Boolean flag for error detection
1057
+ }
1058
+
1059
+ # CRITICAL: Extract critical metadata (including tenant_id) for journal event correlation
1060
+ # This ensures run.failed events are properly emitted by Worker Coordinator
1061
+ critical_metadata = self._extract_critical_metadata(request)
1062
+ metadata.update(critical_metadata)
1063
+
1064
+ # CRITICAL: Normalize metadata to ensure all values are strings (Rust FFI requirement)
1065
+ # PyO3 expects HashMap<String, String>, but we may have booleans or other types
1066
+ normalized_metadata = _normalize_metadata(metadata)
1067
+
1068
+ # Emit run.failed event
1069
+ return PyExecuteComponentResponse(
1070
+ invocation_id=request.invocation_id,
1071
+ success=False,
1072
+ output_data=b"",
1073
+ state_update=None,
1074
+ error_message=error_msg,
1075
+ metadata=normalized_metadata,
1076
+ event_type="run.failed",
1077
+ content_index=0,
1078
+ sequence=0,
1079
+ attempt=getattr(request, 'attempt', 0),
1080
+ )
1081
+
1082
+ finally:
1083
+ # Always reset context to prevent leakage between executions
1084
+ _current_context.reset(token)
1085
+
1086
+ async def _execute_workflow(self, config, input_data: bytes, request):
1087
+ """Execute a workflow handler with automatic replay support."""
1088
+ import json
1089
+ from .workflow import WorkflowEntity, WorkflowContext
1090
+ from .entity import _get_state_adapter, _entity_state_adapter_ctx
1091
+ from .exceptions import WaitingForUserInputException
1092
+ from ._core import PyExecuteComponentResponse
1093
+
1094
+ # Set entity state adapter in context so workflows can use Entities
1095
+ _entity_state_adapter_ctx.set(self._entity_state_adapter)
1096
+
1097
+ try:
1098
+ # Parse input data
1099
+ input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
1100
+
1101
+ # Extract or generate session_id for multi-turn conversation support (for chat workflows)
1102
+ # If session_id is provided, the workflow can maintain conversation context
1103
+ session_id = input_dict.get("session_id")
1104
+
1105
+ if not session_id:
1106
+ session_id = str(uuid.uuid4())
1107
+ logger.info(f"Created new workflow session: {session_id}")
1108
+ else:
1109
+ logger.info(f"Using existing workflow session: {session_id}")
1110
+
1111
+ # Parse replay data from request metadata for crash recovery
1112
+ completed_steps = {}
1113
+ step_events = [] # Raw step_events list for serialization on next pause
1114
+ initial_state = {}
1115
+ user_response = None
1116
+
1117
+ if hasattr(request, 'metadata') and request.metadata:
1118
+ # Parse completed steps for replay (from crash recovery or HITL resume)
1119
+ # Try both formats: completed_steps (dict) and step_events (list from pause)
1120
+ if "completed_steps" in request.metadata:
1121
+ completed_steps_json = request.metadata["completed_steps"]
1122
+ if completed_steps_json:
1123
+ try:
1124
+ completed_steps = json.loads(completed_steps_json)
1125
+ logger.info(f"🔄 Replaying workflow with {len(completed_steps)} cached steps")
1126
+ except json.JSONDecodeError:
1127
+ logger.warning("Failed to parse completed_steps from metadata")
1128
+ elif "step_events" in request.metadata:
1129
+ # Convert step_events list to completed_steps dict for HITL resume
1130
+ step_events_json = request.metadata["step_events"]
1131
+ if step_events_json:
1132
+ try:
1133
+ step_events_list = json.loads(step_events_json)
1134
+ # Convert list format to dict: {step_name: result, ...}
1135
+ for event in step_events_list:
1136
+ if "step_name" in event and "result" in event:
1137
+ completed_steps[event["step_name"]] = event["result"]
1138
+ # Also preserve raw step_events list for serialization on next pause
1139
+ step_events = step_events_list
1140
+ logger.info(f"🔄 Resuming workflow with {len(completed_steps)} completed steps from pause")
1141
+ except json.JSONDecodeError:
1142
+ logger.warning("Failed to parse step_events from metadata")
1143
+
1144
+ # Parse initial workflow state for replay
1145
+ if "workflow_state" in request.metadata:
1146
+ workflow_state_json = request.metadata["workflow_state"]
1147
+ if workflow_state_json:
1148
+ try:
1149
+ initial_state = json.loads(workflow_state_json)
1150
+ logger.info(f"🔄 Loaded workflow state: {len(initial_state)} keys")
1151
+ except json.JSONDecodeError:
1152
+ logger.warning("Failed to parse workflow_state from metadata")
1153
+
1154
+ # Check for user response (workflow resume after pause)
1155
+ if "user_response" in request.metadata:
1156
+ user_response = request.metadata["user_response"]
1157
+ logger.info(f"▶️ Resuming workflow with user response: {user_response}")
1158
+
1159
+ # NEW: Check for agent resume (agent-level HITL)
1160
+ agent_context = None
1161
+ if hasattr(request, 'metadata') and request.metadata:
1162
+ if "agent_context" in request.metadata:
1163
+ agent_context_json = request.metadata["agent_context"]
1164
+ try:
1165
+ agent_context = json.loads(agent_context_json)
1166
+ agent_name = agent_context.get("agent_name", "unknown")
1167
+ iteration = agent_context.get("iteration", 0)
1168
+ logger.info(
1169
+ f"▶️ Resuming agent '{agent_name}' from iteration {iteration} "
1170
+ f"with user response: {user_response}"
1171
+ )
1172
+ except json.JSONDecodeError:
1173
+ logger.warning("Failed to parse agent_context from metadata")
1174
+ agent_context = None
1175
+
1176
+ # Extract session_id and user_id from request for memory scoping
1177
+ # Do this FIRST so we can pass to WorkflowEntity constructor
1178
+ session_id = request.session_id if hasattr(request, 'session_id') and request.session_id else request.invocation_id
1179
+ user_id = request.user_id if hasattr(request, 'user_id') and request.user_id else None
1180
+
1181
+ # Extract streaming context for real-time SSE log delivery
1182
+ is_streaming = getattr(request, 'is_streaming', False)
1183
+ tenant_id = request.metadata.get('tenant_id') if hasattr(request, 'metadata') else None
1184
+
1185
+ # Create WorkflowEntity for state management with memory scoping
1186
+ # Entity key will be scoped based on priority: user_id > session_id > run_id
1187
+ # For session scope, include component_name to enable listing sessions by workflow
1188
+ component_name = getattr(request, 'component_name', None)
1189
+ workflow_entity = WorkflowEntity(
1190
+ run_id=request.invocation_id,
1191
+ session_id=session_id,
1192
+ user_id=user_id,
1193
+ component_name=component_name,
1194
+ )
1195
+
1196
+ # Load replay data into entity if provided
1197
+ if completed_steps:
1198
+ workflow_entity._completed_steps = completed_steps
1199
+ logger.debug(f"Loaded {len(completed_steps)} completed steps into workflow entity")
1200
+
1201
+ # Restore raw step_events list for serialization on next pause
1202
+ # This ensures previous user responses are preserved across multiple resumes
1203
+ if step_events:
1204
+ workflow_entity._step_events = step_events
1205
+ logger.debug(f"Restored {len(step_events)} step events into workflow entity")
1206
+
1207
+ # Inject user response if resuming from pause
1208
+ if user_response:
1209
+ # Restore pause_index from metadata for multi-step HITL
1210
+ # This ensures we inject at the correct position in the pause sequence
1211
+ if hasattr(request, 'metadata') and request.metadata:
1212
+ pause_index_str = request.metadata.get("pause_index", "0")
1213
+ try:
1214
+ workflow_entity._pause_index = int(pause_index_str)
1215
+ logger.debug(f"Restored pause_index={workflow_entity._pause_index} for resume")
1216
+ except ValueError:
1217
+ logger.warning(f"Invalid pause_index in metadata: {pause_index_str}, using 0")
1218
+ workflow_entity._pause_index = 0
1219
+
1220
+ workflow_entity.inject_user_response(user_response)
1221
+ logger.debug(f"Injected user response into workflow entity at pause {workflow_entity._pause_index}")
1222
+
1223
+ # IMPORTANT: Reset pause_index to 0 for replay
1224
+ # The workflow replays from the beginning, so the first wait_for_user
1225
+ # should check at index 0, not at the stored index
1226
+ workflow_entity._pause_index = 0
1227
+ logger.debug("Reset pause_index to 0 for replay")
1228
+
1229
+ if initial_state:
1230
+ # Load initial state into entity's state adapter AND workflow entity's state
1231
+ state_adapter = _get_state_adapter()
1232
+ if hasattr(state_adapter, '_standalone_states'):
1233
+ # Standalone mode - set state directly in adapter
1234
+ state_adapter._standalone_states[workflow_entity._state_key] = initial_state
1235
+ logger.debug(f"Loaded initial state with {len(initial_state)} keys into state adapter (standalone)")
1236
+
1237
+ # Also initialize the workflow entity's internal state with the loaded data
1238
+ # This ensures workflow_entity.state.get() returns the persisted values
1239
+ from .workflow import WorkflowState
1240
+ workflow_entity._state = WorkflowState(initial_state.copy(), workflow_entity)
1241
+ logger.info(f"🔄 Initialized workflow entity state with {len(initial_state)} keys from session")
1242
+
1243
+ # Create checkpoint callback for real-time streaming
1244
+ def checkpoint_callback(checkpoint: dict) -> None:
1245
+ """Send checkpoint to Rust worker queue."""
1246
+ try:
1247
+ # Extract critical metadata for checkpoint routing
1248
+ metadata = self._extract_critical_metadata(request)
1249
+
1250
+ # DEBUG: Log metadata types for troubleshooting PyO3 conversion errors
1251
+ logger.debug(f"Checkpoint metadata types: {[(k, type(v).__name__) for k, v in metadata.items()]}")
1252
+
1253
+ # Get source timestamp (use from checkpoint if provided, otherwise generate now)
1254
+ source_timestamp_ns = checkpoint.get("source_timestamp_ns", time.time_ns())
1255
+
1256
+ # Queue checkpoint via Rust FFI
1257
+ self._rust_worker.queue_workflow_checkpoint(
1258
+ invocation_id=request.invocation_id,
1259
+ checkpoint_type=checkpoint["checkpoint_type"],
1260
+ checkpoint_data=_json.dumps(checkpoint["checkpoint_data"], cls=_ResultEncoder),
1261
+ sequence_number=checkpoint["sequence_number"],
1262
+ metadata=metadata,
1263
+ source_timestamp_ns=source_timestamp_ns,
1264
+ )
1265
+ logger.debug(
1266
+ f"Queued checkpoint: type={checkpoint['checkpoint_type']} "
1267
+ f"seq={checkpoint['sequence_number']}"
1268
+ )
1269
+ except Exception as e:
1270
+ # Checkpoints are critical for durability - failing to persist them
1271
+ # means we cannot guarantee replay/recovery. Re-raise to fail the workflow.
1272
+ logger.error(f"Failed to queue checkpoint: {e}", exc_info=True)
1273
+ logger.error(f"Checkpoint metadata: {metadata}")
1274
+ logger.error(f"Checkpoint type: {checkpoint.get('checkpoint_type')}")
1275
+ raise RuntimeError(
1276
+ f"Failed to queue checkpoint '{checkpoint.get('checkpoint_type')}': {e}. "
1277
+ f"Workflow cannot continue without durable checkpoints."
1278
+ ) from e
1279
+
1280
+ # Create delta callback for forwarding streaming events from nested agents/functions
1281
+ # This is used by WorkflowContext._consume_streaming_result to forward events
1282
+ delta_metadata = _normalize_metadata(self._extract_critical_metadata(request))
1283
+
1284
+ def delta_callback(event_type: str, output_data: str, content_index: int, sequence: int, source_timestamp_ns: int = 0) -> None:
1285
+ """Forward streaming delta event from nested component."""
1286
+ try:
1287
+ # Use provided timestamp or generate one if not provided
1288
+ ts = source_timestamp_ns if source_timestamp_ns > 0 else time.time_ns()
1289
+ self._rust_worker.queue_delta(
1290
+ invocation_id=request.invocation_id,
1291
+ event_type=event_type,
1292
+ output_data=output_data,
1293
+ content_index=content_index,
1294
+ sequence=sequence,
1295
+ metadata=delta_metadata,
1296
+ source_timestamp_ns=ts,
1297
+ )
1298
+ logger.debug(f"Forwarded delta: type={event_type} seq={sequence}")
1299
+ except Exception as e:
1300
+ # Delta forwarding is best-effort - log but don't fail the workflow
1301
+ logger.warning(f"Failed to forward delta event: {e}")
1302
+
1303
+ # Create WorkflowContext with entity, runtime_context, checkpoint callback, and checkpoint client
1304
+ ctx = WorkflowContext(
1305
+ workflow_entity=workflow_entity,
1306
+ run_id=request.invocation_id, # Use unique invocation_id for this execution
1307
+ session_id=session_id, # Session for multi-turn conversations
1308
+ user_id=user_id, # User for long-term memory
1309
+ runtime_context=request.runtime_context,
1310
+ checkpoint_callback=checkpoint_callback,
1311
+ checkpoint_client=self._checkpoint_client, # Phase 3: platform-side memoization
1312
+ is_streaming=is_streaming, # For real-time SSE log delivery
1313
+ tenant_id=tenant_id, # For multi-tenant deployments
1314
+ delta_callback=delta_callback, # For forwarding streaming events from nested components
1315
+ )
1316
+
1317
+ # NEW: Populate agent resume info if this is an agent HITL resume
1318
+ if agent_context and user_response:
1319
+ ctx._agent_resume_info = {
1320
+ "agent_name": agent_context["agent_name"],
1321
+ "agent_context": agent_context,
1322
+ "user_response": user_response,
1323
+ }
1324
+ logger.debug(
1325
+ f"Set agent resume info for '{agent_context['agent_name']}' "
1326
+ f"in workflow context"
1327
+ )
1328
+
1329
+ # Execute workflow directly - Rust bridge handles tracing
1330
+ # Note: Removed Python-level span creation to avoid duplicate spans.
1331
+ # The Rust worker bridge creates comprehensive OpenTelemetry spans.
1332
+ # See DUPLICATE_SPANS_FIX.md for details.
1333
+
1334
+ # CRITICAL: Set context in contextvar so LM/Agent/Tool calls can access it
1335
+ from .context import set_current_context
1336
+ import time as _time
1337
+ token = set_current_context(ctx)
1338
+
1339
+ # Set up _current_span contextvar for proper trace parent-child linking
1340
+ # The Rust worker creates spans and passes trace context via runtime_context.
1341
+ # We need to set this in Python's _current_span contextvar so that spans
1342
+ # created in Python (e.g., agent spans, nested function calls) become proper children.
1343
+ from .tracing import _current_span, SpanInfo
1344
+ span_token = None
1345
+ if request.runtime_context:
1346
+ trace_id = request.runtime_context.trace_id
1347
+ span_id = request.runtime_context.span_id
1348
+ if trace_id and span_id:
1349
+ span_info = SpanInfo(trace_id=trace_id, span_id=span_id)
1350
+ span_token = _current_span.set(span_info)
1351
+
1352
+ workflow_start_time = _time.time()
1353
+ try:
1354
+ # Emit workflow.started checkpoint
1355
+ ctx._send_checkpoint("workflow.started", {
1356
+ "workflow.name": config.name,
1357
+ "run_id": request.invocation_id,
1358
+ "session_id": session_id,
1359
+ "is_replay": bool(completed_steps),
1360
+ })
1361
+
1362
+ # CRITICAL: Flush immediately to ensure workflow.started arrives at platform
1363
+ # BEFORE handler runs. Without this, nested events (agent.started, lm.call.started)
1364
+ # which use direct journal writes would arrive before workflow.started which is queued.
1365
+ self._rust_worker.flush_workflow_checkpoints()
1366
+
1367
+ if input_dict:
1368
+ result = await config.handler(ctx, **input_dict)
1369
+ else:
1370
+ result = await config.handler(ctx)
1371
+
1372
+ # Serialize result BEFORE emitting workflow.completed
1373
+ # This ensures serialization errors trigger workflow.failed, not run.failed
1374
+ output_data = _serialize_result(result)
1375
+
1376
+ # Emit workflow.completed checkpoint
1377
+ workflow_duration_ms = int((_time.time() - workflow_start_time) * 1000)
1378
+ ctx._send_checkpoint("workflow.completed", {
1379
+ "workflow.name": config.name,
1380
+ "run_id": request.invocation_id,
1381
+ "duration_ms": workflow_duration_ms,
1382
+ "steps_count": len(ctx._workflow_entity._step_events),
1383
+ })
1384
+
1385
+ # Note: Workflow entity persistence is handled by the @workflow decorator wrapper
1386
+ # which persists before returning. No need to persist here.
1387
+ except Exception as workflow_error:
1388
+ # Emit workflow.failed checkpoint
1389
+ workflow_duration_ms = int((_time.time() - workflow_start_time) * 1000)
1390
+ ctx._send_checkpoint("workflow.failed", {
1391
+ "workflow.name": config.name,
1392
+ "run_id": request.invocation_id,
1393
+ "duration_ms": workflow_duration_ms,
1394
+ "error": str(workflow_error),
1395
+ "error_type": type(workflow_error).__name__,
1396
+ })
1397
+ raise
1398
+ finally:
1399
+ # Always reset context to prevent leakage
1400
+ from .context import _current_context
1401
+ _current_context.reset(token)
1402
+
1403
+ # Note: Removed flush_telemetry_py() call here - it was causing 2-second blocking delay!
1404
+ # The batch span processor handles flushing automatically with 5s timeout
1405
+
1406
+ # Collect workflow execution metadata for durability
1407
+ metadata = {}
1408
+
1409
+ # CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
1410
+ # Missing tenant_id causes events to be written to wrong partition
1411
+ critical_metadata = self._extract_critical_metadata(request)
1412
+ metadata.update(critical_metadata)
1413
+
1414
+ # Add step events to metadata (for workflow durability)
1415
+ # Access _step_events from the workflow entity, not the context
1416
+ step_events = ctx._workflow_entity._step_events
1417
+ if step_events:
1418
+ metadata["step_events"] = json.dumps(step_events)
1419
+ logger.debug(f"Workflow has {len(step_events)} recorded steps")
1420
+
1421
+ # Add final state snapshot to metadata (if state was used)
1422
+ # Check if _state was initialized without triggering property getter
1423
+ if hasattr(ctx, '_workflow_entity') and ctx._workflow_entity._state is not None:
1424
+ if ctx._workflow_entity._state.has_changes():
1425
+ state_snapshot = ctx._workflow_entity._state.get_state_snapshot()
1426
+ metadata["workflow_state"] = json.dumps(state_snapshot)
1427
+ logger.debug(f"Workflow state snapshot: {state_snapshot}")
1428
+
1429
+ # AUDIT TRAIL: Serialize complete state change history for replay and debugging
1430
+ # This captures all intermediate state mutations, not just final snapshot
1431
+ state_changes = ctx._workflow_entity._state_changes
1432
+ logger.info(f"🔍 DEBUG: _state_changes list has {len(state_changes)} entries")
1433
+ if state_changes:
1434
+ metadata["state_changes"] = json.dumps(state_changes)
1435
+ logger.info(f"✅ Serialized {len(state_changes)} state changes to metadata")
1436
+ else:
1437
+ logger.warning("⚠️ _state_changes list is empty - no state change history captured")
1438
+
1439
+ # CRITICAL: Persist workflow entity state to platform
1440
+ # This stores the WorkflowEntity as a first-class entity with proper versioning
1441
+ try:
1442
+ logger.info(f"🔍 DEBUG: About to call _persist_state() for run {request.invocation_id}")
1443
+ await ctx._workflow_entity._persist_state()
1444
+ logger.info(f"✅ Successfully persisted WorkflowEntity state for run {request.invocation_id}")
1445
+ except Exception as persist_error:
1446
+ logger.error(f"❌ Failed to persist WorkflowEntity state (non-fatal): {persist_error}", exc_info=True)
1447
+ # Continue anyway - persistence failure shouldn't fail the workflow
1448
+
1449
+ logger.info(f"Workflow completed successfully with {len(step_events)} steps")
1450
+
1451
+ # Add session_id to metadata for multi-turn conversation support
1452
+ metadata["session_id"] = session_id
1453
+
1454
+ # CRITICAL: Flush all buffered checkpoints before returning response
1455
+ # This ensures checkpoints arrive at platform BEFORE run.completed event
1456
+ try:
1457
+ flushed_count = self._rust_worker.flush_workflow_checkpoints()
1458
+ if flushed_count > 0:
1459
+ logger.info(f"✅ Flushed {flushed_count} checkpoints before completion")
1460
+ except Exception as flush_error:
1461
+ logger.error(f"Failed to flush checkpoints: {flush_error}", exc_info=True)
1462
+ # Continue anyway - checkpoint flushing is best-effort
1463
+
1464
+ return PyExecuteComponentResponse(
1465
+ invocation_id=request.invocation_id,
1466
+ success=True,
1467
+ output_data=output_data,
1468
+ state_update=None, # Not used for workflows (use metadata instead)
1469
+ error_message=None,
1470
+ metadata=metadata if metadata else None, # Include step events + state + session_id
1471
+ event_type="run.completed",
1472
+ content_index=0,
1473
+ sequence=0,
1474
+ attempt=getattr(request, 'attempt', 0),
1475
+ )
1476
+
1477
+ except WaitingForUserInputException as e:
1478
+ # Workflow or agent paused for user input
1479
+ pause_type = "agent" if e.agent_context else "workflow"
1480
+ logger.info(f"⏸️ {pause_type.capitalize()} paused waiting for user input: {e.question}")
1481
+
1482
+ # Collect metadata for pause state
1483
+ # Note: All metadata values must be strings for Rust FFI
1484
+ pause_metadata = {
1485
+ "status": "awaiting_user_input",
1486
+ "question": e.question,
1487
+ "input_type": e.input_type,
1488
+ "pause_type": pause_type, # NEW: Indicates workflow vs agent pause
1489
+ "pause_index": str(e.pause_index), # Store pause index for multi-step HITL
1490
+ }
1491
+
1492
+ # CRITICAL: Propagate tenant_id even when pausing
1493
+ critical_metadata = self._extract_critical_metadata(request)
1494
+ pause_metadata.update(critical_metadata)
1495
+
1496
+ # Add optional fields only if they exist
1497
+ if e.options:
1498
+ pause_metadata["options"] = json.dumps(e.options)
1499
+ if e.checkpoint_state:
1500
+ pause_metadata["checkpoint_state"] = json.dumps(e.checkpoint_state)
1501
+ if session_id:
1502
+ pause_metadata["session_id"] = session_id
1503
+
1504
+ # NEW: Store agent execution state if present
1505
+ if e.agent_context:
1506
+ pause_metadata["agent_context"] = json.dumps(e.agent_context)
1507
+ logger.debug(
1508
+ f"Agent '{e.agent_context['agent_name']}' paused at "
1509
+ f"iteration {e.agent_context['iteration']}"
1510
+ )
1511
+
1512
+ # Add step events to pause metadata for durability
1513
+ step_events = ctx._workflow_entity._step_events
1514
+ if step_events:
1515
+ pause_metadata["step_events"] = json.dumps(step_events)
1516
+ logger.debug(f"Paused workflow has {len(step_events)} recorded steps")
1517
+
1518
+ # Add current workflow state to pause metadata
1519
+ if hasattr(ctx, '_workflow_entity') and ctx._workflow_entity._state is not None:
1520
+ if ctx._workflow_entity._state.has_changes():
1521
+ state_snapshot = ctx._workflow_entity._state.get_state_snapshot()
1522
+ pause_metadata["workflow_state"] = json.dumps(state_snapshot)
1523
+ logger.debug(f"Paused workflow state snapshot: {state_snapshot}")
1524
+
1525
+ # AUDIT TRAIL: Also include state change history for paused workflows
1526
+ state_changes = ctx._workflow_entity._state_changes
1527
+ if state_changes:
1528
+ pause_metadata["state_changes"] = json.dumps(state_changes)
1529
+ logger.debug(f"Paused workflow has {len(state_changes)} state changes in history")
1530
+
1531
+ # Return "success" with awaiting_user_input metadata
1532
+ # The output contains the question details for the client
1533
+ output = {
1534
+ "question": e.question,
1535
+ "input_type": e.input_type,
1536
+ "options": e.options,
1537
+ }
1538
+ output_data = _serialize_result(output)
1539
+
1540
+ # Emit run.paused event for HITL (human-in-the-loop)
1541
+ return PyExecuteComponentResponse(
1542
+ invocation_id=request.invocation_id,
1543
+ success=True, # This is a valid pause state, not an error
1544
+ output_data=output_data,
1545
+ state_update=None,
1546
+ error_message=None,
1547
+ metadata=pause_metadata,
1548
+ event_type="run.paused",
1549
+ content_index=0,
1550
+ sequence=0,
1551
+ attempt=getattr(request, 'attempt', 0),
1552
+ )
1553
+
1554
+ except Exception as e:
1555
+ # Include exception type for better error messages
1556
+ error_msg = f"{type(e).__name__}: {str(e)}"
1557
+
1558
+ # Capture full stack trace for telemetry
1559
+ import traceback
1560
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
1561
+
1562
+ # Log with full traceback
1563
+ logger.error(f"Workflow execution failed: {error_msg}", exc_info=True)
1564
+
1565
+ # CRITICAL: Flush all buffered checkpoints before returning error response
1566
+ # This ensures workflow.failed checkpoint arrives at platform BEFORE run.failed event
1567
+ # Without this, SSE clients may not receive workflow.failed events
1568
+ try:
1569
+ flushed_count = self._rust_worker.flush_workflow_checkpoints()
1570
+ if flushed_count > 0:
1571
+ logger.info(f"✅ Flushed {flushed_count} checkpoints before error response")
1572
+ except Exception as flush_error:
1573
+ logger.error(f"Failed to flush checkpoints in error path: {flush_error}", exc_info=True)
1574
+ # Continue anyway - checkpoint flushing is best-effort
1575
+
1576
+ # Store error metadata for observability
1577
+ metadata = {
1578
+ "error_type": type(e).__name__,
1579
+ "stack_trace": stack_trace,
1580
+ "error": True,
1581
+ }
1582
+
1583
+ # Extract critical metadata for journal correlation (if available)
1584
+ critical_metadata = self._extract_critical_metadata(request)
1585
+ metadata.update(critical_metadata)
1586
+
1587
+ # Normalize metadata for Rust FFI compatibility
1588
+ normalized_metadata = _normalize_metadata(metadata)
1589
+
1590
+ # Emit run.failed event
1591
+ return PyExecuteComponentResponse(
1592
+ invocation_id=request.invocation_id,
1593
+ success=False,
1594
+ output_data=b"",
1595
+ state_update=None,
1596
+ error_message=error_msg,
1597
+ metadata=normalized_metadata,
1598
+ event_type="run.failed",
1599
+ content_index=0,
1600
+ sequence=0,
1601
+ attempt=getattr(request, 'attempt', 0),
1602
+ )
1603
+
1604
+ async def _execute_tool(self, tool, input_data: bytes, request):
1605
+ """Execute a tool handler."""
1606
+ import json
1607
+ from .context import Context
1608
+ from ._core import PyExecuteComponentResponse
1609
+
1610
+ try:
1611
+ # Parse input data
1612
+ input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
1613
+
1614
+ # Create context with runtime_context for trace correlation
1615
+ ctx = Context(
1616
+ run_id=f"{self.service_name}:{tool.name}",
1617
+ runtime_context=request.runtime_context,
1618
+ )
1619
+
1620
+ # Set context in contextvar so get_current_context() and error handlers can access it
1621
+ from .context import set_current_context, _current_context
1622
+ token = set_current_context(ctx)
1623
+
1624
+ # Execute tool
1625
+ result = await tool.invoke(ctx, **input_dict)
1626
+
1627
+ # Serialize result
1628
+ output_data = _serialize_result(result)
1629
+
1630
+ return PyExecuteComponentResponse(
1631
+ invocation_id=request.invocation_id,
1632
+ success=True,
1633
+ output_data=output_data,
1634
+ state_update=None,
1635
+ error_message=None,
1636
+ metadata=None,
1637
+ event_type="run.completed",
1638
+ content_index=0,
1639
+ sequence=0,
1640
+ attempt=getattr(request, 'attempt', 0),
1641
+ )
1642
+
1643
+ except Exception as e:
1644
+ # Include exception type for better error messages
1645
+ error_msg = f"{type(e).__name__}: {str(e)}"
1646
+
1647
+ # Capture full stack trace for telemetry
1648
+ import traceback
1649
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
1650
+
1651
+ # Log with full traceback using ctx.logger to ensure run_id correlation
1652
+ from .context import get_current_context
1653
+ current_ctx = get_current_context()
1654
+ error_logger = current_ctx.logger if current_ctx else logger
1655
+ error_logger.error(f"Tool execution failed: {error_msg}", exc_info=True)
1656
+
1657
+ # Store error metadata for observability
1658
+ metadata = {
1659
+ "error_type": type(e).__name__,
1660
+ "stack_trace": stack_trace,
1661
+ "error": True,
1662
+ }
1663
+
1664
+ # CRITICAL: Extract critical metadata (including tenant_id) for journal event correlation
1665
+ critical_metadata = self._extract_critical_metadata(request)
1666
+ metadata.update(critical_metadata)
1667
+
1668
+ # Normalize metadata for Rust FFI compatibility
1669
+ normalized_metadata = _normalize_metadata(metadata)
1670
+
1671
+ # Emit run.failed event
1672
+ return PyExecuteComponentResponse(
1673
+ invocation_id=request.invocation_id,
1674
+ success=False,
1675
+ output_data=b"",
1676
+ state_update=None,
1677
+ error_message=error_msg,
1678
+ metadata=normalized_metadata,
1679
+ event_type="run.failed",
1680
+ content_index=0,
1681
+ sequence=0,
1682
+ attempt=getattr(request, 'attempt', 0),
1683
+ )
1684
+
1685
+ finally:
1686
+ # Always reset context to prevent leakage between executions
1687
+ _current_context.reset(token)
1688
+
1689
+ async def _execute_entity(self, entity_type, input_data: bytes, request):
1690
+ """Execute an entity method."""
1691
+ import json
1692
+ from .context import Context
1693
+ from .entity import EntityType, Entity, _entity_state_adapter_ctx
1694
+ from ._core import PyExecuteComponentResponse
1695
+
1696
+ # Set entity state adapter in context for Entity instances to access
1697
+ _entity_state_adapter_ctx.set(self._entity_state_adapter)
1698
+
1699
+ try:
1700
+ # Parse input data
1701
+ input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
1702
+
1703
+ # Extract entity key and method name from input
1704
+ entity_key = input_dict.pop("key", None)
1705
+ method_name = input_dict.pop("method", None)
1706
+
1707
+ if not entity_key:
1708
+ raise ValueError("Entity invocation requires 'key' parameter")
1709
+ if not method_name:
1710
+ raise ValueError("Entity invocation requires 'method' parameter")
1711
+
1712
+ # Create context for logging and tracing
1713
+ ctx = Context(
1714
+ run_id=f"{self.service_name}:{entity_type.name}:{entity_key}",
1715
+ runtime_context=request.runtime_context,
1716
+ )
1717
+
1718
+ # Set context in contextvar so get_current_context() and error handlers can access it
1719
+ from .context import set_current_context, _current_context
1720
+ token = set_current_context(ctx)
1721
+
1722
+ # Note: State loading is now handled automatically by the entity method wrapper
1723
+ # via EntityStateAdapter which uses the Rust core for cache + platform persistence
1724
+
1725
+ # Create entity instance using the stored class reference
1726
+ entity_instance = entity_type.entity_class(key=entity_key)
1727
+
1728
+ # Get method
1729
+ if not hasattr(entity_instance, method_name):
1730
+ raise ValueError(f"Entity '{entity_type.name}' has no method '{method_name}'")
1731
+
1732
+ method = getattr(entity_instance, method_name)
1733
+
1734
+ # Execute method (entity method wrapper handles state load/save automatically)
1735
+ result = await method(**input_dict)
1736
+
1737
+ # Serialize result
1738
+ output_data = _serialize_result(result)
1739
+
1740
+ # Note: State persistence is now handled automatically by the entity method wrapper
1741
+ # via EntityStateAdapter which uses Rust core for optimistic locking + version tracking
1742
+
1743
+ # CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
1744
+ metadata = self._extract_critical_metadata(request)
1745
+
1746
+ return PyExecuteComponentResponse(
1747
+ invocation_id=request.invocation_id,
1748
+ success=True,
1749
+ output_data=output_data,
1750
+ state_update=None, # TODO: Use structured StateUpdate object
1751
+ error_message=None,
1752
+ metadata=metadata if metadata else None, # Include state in metadata for Worker Coordinator
1753
+ event_type="run.completed",
1754
+ content_index=0,
1755
+ sequence=0,
1756
+ attempt=getattr(request, 'attempt', 0),
1757
+ )
1758
+
1759
+ except Exception as e:
1760
+ # Include exception type for better error messages
1761
+ error_msg = f"{type(e).__name__}: {str(e)}"
1762
+
1763
+ # Capture full stack trace for telemetry
1764
+ import traceback
1765
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
1766
+
1767
+ # Log with full traceback using ctx.logger to ensure run_id correlation
1768
+ from .context import get_current_context
1769
+ current_ctx = get_current_context()
1770
+ error_logger = current_ctx.logger if current_ctx else logger
1771
+ error_logger.error(f"Entity execution failed: {error_msg}", exc_info=True)
1772
+
1773
+ # Store error metadata for observability
1774
+ metadata = {
1775
+ "error_type": type(e).__name__,
1776
+ "stack_trace": stack_trace,
1777
+ "error": True,
1778
+ }
1779
+
1780
+ # Extract critical metadata for journal correlation (if available)
1781
+ critical_metadata = self._extract_critical_metadata(request)
1782
+ metadata.update(critical_metadata)
1783
+
1784
+ # Normalize metadata for Rust FFI compatibility
1785
+ normalized_metadata = _normalize_metadata(metadata)
1786
+
1787
+ # Emit run.failed event
1788
+ return PyExecuteComponentResponse(
1789
+ invocation_id=request.invocation_id,
1790
+ success=False,
1791
+ output_data=b"",
1792
+ state_update=None,
1793
+ error_message=error_msg,
1794
+ metadata=normalized_metadata,
1795
+ event_type="run.failed",
1796
+ content_index=0,
1797
+ sequence=0,
1798
+ attempt=getattr(request, 'attempt', 0),
1799
+ )
1800
+
1801
+ finally:
1802
+ # Always reset context to prevent leakage between executions
1803
+ _current_context.reset(token)
1804
+
1805
+ async def _execute_agent(self, agent, input_data: bytes, request):
1806
+ """Execute an agent with session support for multi-turn conversations."""
1807
+ import json
1808
+ import uuid
1809
+ from .agent import AgentContext
1810
+ from .entity import _entity_state_adapter_ctx
1811
+ from ._core import PyExecuteComponentResponse
1812
+
1813
+ # Set entity state adapter in context so AgentContext can access it
1814
+ _entity_state_adapter_ctx.set(self._entity_state_adapter)
1815
+
1816
+ try:
1817
+ # Parse input data
1818
+ input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
1819
+
1820
+ # Extract user message
1821
+ user_message = input_dict.get("message", "")
1822
+ if not user_message:
1823
+ raise ValueError("Agent invocation requires 'message' parameter")
1824
+
1825
+ # Extract or generate session_id for multi-turn conversation support
1826
+ # If session_id is provided, the agent will load previous conversation history
1827
+ # If not provided, a new session is created with auto-generated ID
1828
+ session_id = input_dict.get("session_id")
1829
+
1830
+ if not session_id:
1831
+ session_id = str(uuid.uuid4())
1832
+ logger.info(f"Created new agent session: {session_id}")
1833
+ else:
1834
+ logger.info(f"Using existing agent session: {session_id}")
1835
+
1836
+ # Extract streaming context for real-time SSE log delivery
1837
+ is_streaming = getattr(request, 'is_streaming', False)
1838
+ tenant_id = request.metadata.get('tenant_id') if hasattr(request, 'metadata') else None
1839
+
1840
+ # Create AgentContext with session support for conversation persistence
1841
+ # AgentContext automatically loads/saves conversation history based on session_id
1842
+ ctx = AgentContext(
1843
+ run_id=request.invocation_id,
1844
+ agent_name=agent.name,
1845
+ session_id=session_id,
1846
+ runtime_context=request.runtime_context,
1847
+ is_streaming=is_streaming,
1848
+ tenant_id=tenant_id,
1849
+ )
1850
+
1851
+ # Set context in contextvar so get_current_context() and error handlers can access it
1852
+ from .context import set_current_context, _current_context
1853
+ token = set_current_context(ctx)
1854
+
1855
+ # Execute agent - now returns an async generator for streaming
1856
+ result = agent.run(user_message, context=ctx)
1857
+
1858
+ # Agent.run() always returns an async generator
1859
+ # Queue each event via delta queue for real-time delivery
1860
+ import inspect
1861
+ if inspect.isasyncgen(result):
1862
+ from .events import Event, EventType
1863
+
1864
+ sequence = 0
1865
+ final_output = None
1866
+ final_tool_calls = []
1867
+ handoff_to = None
1868
+
1869
+ # Extract metadata for delta queue (must be Dict[str, str] for Rust FFI)
1870
+ metadata = _normalize_metadata(self._extract_critical_metadata(request))
1871
+ metadata["session_id"] = session_id # Include session for UI
1872
+
1873
+ async for event in result:
1874
+ if isinstance(event, Event):
1875
+ # Queue the event via delta queue
1876
+ event_data = event.to_response_fields()
1877
+ output_data = event_data.get("output_data", b"")
1878
+ output_str = output_data.decode("utf-8") if isinstance(output_data, bytes) else str(output_data or "{}")
1879
+
1880
+ self._rust_worker.queue_delta(
1881
+ invocation_id=request.invocation_id,
1882
+ event_type=event_data.get("event_type", ""),
1883
+ output_data=output_str,
1884
+ content_index=event_data.get("content_index", 0),
1885
+ sequence=sequence,
1886
+ metadata=metadata,
1887
+ source_timestamp_ns=event.source_timestamp_ns,
1888
+ )
1889
+ sequence += 1
1890
+
1891
+ # Capture final result from agent.completed event
1892
+ if event.event_type == EventType.AGENT_COMPLETED:
1893
+ final_output = event.data.get("output", "")
1894
+ final_tool_calls = event.data.get("tool_calls", [])
1895
+ handoff_to = event.data.get("handoff_to")
1896
+
1897
+ # Emit run.completed event with the final agent result
1898
+ final_result = {
1899
+ "output": final_output,
1900
+ "tool_calls": final_tool_calls,
1901
+ }
1902
+ if handoff_to:
1903
+ final_result["handoff_to"] = handoff_to
1904
+
1905
+ self._rust_worker.queue_delta(
1906
+ invocation_id=request.invocation_id,
1907
+ event_type="run.completed",
1908
+ output_data=json.dumps(final_result),
1909
+ content_index=0,
1910
+ sequence=sequence,
1911
+ metadata=metadata,
1912
+ source_timestamp_ns=time.time_ns(),
1913
+ )
1914
+
1915
+ logger.debug(f"Agent streaming queued {sequence + 1} deltas for real-time delivery")
1916
+ # Return None to signal that streaming was handled via delta queue
1917
+ return None
1918
+ else:
1919
+ # Fallback for non-generator (shouldn't happen but handle gracefully)
1920
+ if inspect.iscoroutine(result):
1921
+ agent_result = await result
1922
+ else:
1923
+ agent_result = result
1924
+
1925
+ # Build response with agent output and tool calls
1926
+ result = {
1927
+ "output": agent_result.output,
1928
+ "tool_calls": agent_result.tool_calls,
1929
+ }
1930
+
1931
+ # Serialize result
1932
+ output_data = _serialize_result(result)
1933
+
1934
+ # CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
1935
+ metadata = self._extract_critical_metadata(request)
1936
+ # Also include session_id for UI to persist conversation
1937
+ metadata["session_id"] = session_id
1938
+
1939
+ return PyExecuteComponentResponse(
1940
+ invocation_id=request.invocation_id,
1941
+ success=True,
1942
+ output_data=output_data,
1943
+ state_update=None,
1944
+ error_message=None,
1945
+ metadata=metadata if metadata else None,
1946
+ event_type="run.completed",
1947
+ content_index=0,
1948
+ sequence=0,
1949
+ attempt=getattr(request, 'attempt', 0),
1950
+ )
1951
+
1952
+ except Exception as e:
1953
+ # Include exception type for better error messages
1954
+ error_msg = f"{type(e).__name__}: {str(e)}"
1955
+
1956
+ # Capture full stack trace for telemetry
1957
+ import traceback
1958
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
1959
+
1960
+ # Log with full traceback using ctx.logger to ensure run_id correlation
1961
+ from .context import get_current_context
1962
+ current_ctx = get_current_context()
1963
+ error_logger = current_ctx.logger if current_ctx else logger
1964
+ error_logger.error(f"Agent execution failed: {error_msg}", exc_info=True)
1965
+
1966
+ # Store error metadata for observability
1967
+ metadata = {
1968
+ "error_type": type(e).__name__,
1969
+ "stack_trace": stack_trace,
1970
+ "error": True,
1971
+ }
1972
+
1973
+ # Extract critical metadata for journal correlation (if available)
1974
+ critical_metadata = self._extract_critical_metadata(request)
1975
+ metadata.update(critical_metadata)
1976
+
1977
+ # Normalize metadata for Rust FFI compatibility
1978
+ normalized_metadata = _normalize_metadata(metadata)
1979
+
1980
+ # Emit run.failed event
1981
+ return PyExecuteComponentResponse(
1982
+ invocation_id=request.invocation_id,
1983
+ success=False,
1984
+ output_data=b"",
1985
+ state_update=None,
1986
+ error_message=error_msg,
1987
+ metadata=normalized_metadata,
1988
+ event_type="run.failed",
1989
+ content_index=0,
1990
+ sequence=0,
1991
+ attempt=getattr(request, 'attempt', 0),
1992
+ )
1993
+
1994
+ finally:
1995
+ # Always reset context to prevent leakage between executions
1996
+ _current_context.reset(token)
1997
+
1998
+ def _create_error_response(self, request, error_message: str):
1999
+ """Create an error response."""
2000
+ from ._core import PyExecuteComponentResponse
2001
+
2002
+ # Emit run.failed event
2003
+ return PyExecuteComponentResponse(
2004
+ invocation_id=request.invocation_id,
2005
+ success=False,
2006
+ output_data=b"",
2007
+ state_update=None,
2008
+ error_message=error_message,
2009
+ metadata=None,
2010
+ event_type="run.failed",
2011
+ content_index=0,
2012
+ sequence=0,
2013
+ attempt=getattr(request, 'attempt', 0),
2014
+ )
2015
+
2016
+ async def run(self):
2017
+ """Run the worker (register and start message loop).
2018
+
2019
+ This method will:
2020
+ 1. Discover all registered @function and @workflow handlers
2021
+ 2. Register with the coordinator
2022
+ 3. Create a shared Python event loop for all function executions
2023
+ 4. Enter the message processing loop
2024
+ 5. Block until shutdown
2025
+
2026
+ This is the main entry point for your worker service.
2027
+ """
2028
+ try:
2029
+ logger.info(f"Starting worker: {self.service_name}")
2030
+
2031
+ # Discover components
2032
+ components = self._discover_components()
2033
+
2034
+ # Set components on Rust worker
2035
+ self._rust_worker.set_components(components)
2036
+
2037
+ # Set metadata
2038
+ if self.metadata:
2039
+ self._rust_worker.set_service_metadata(self.metadata)
2040
+
2041
+ # Configure entity state manager on Rust worker for database persistence
2042
+ logger.info("Configuring Rust EntityStateManager for database persistence")
2043
+ # Access the Rust core from the adapter
2044
+ if hasattr(self._entity_state_adapter, '_rust_core') and self._entity_state_adapter._rust_core:
2045
+ self._rust_worker.set_entity_state_manager(self._entity_state_adapter._rust_core)
2046
+ logger.info("Successfully configured Rust EntityStateManager")
2047
+
2048
+ # Get the current event loop to pass to Rust for concurrent Python async execution
2049
+ # This allows Rust to execute Python async functions on the same event loop
2050
+ # without spawn_blocking overhead, enabling true concurrency
2051
+ loop = asyncio.get_running_loop()
2052
+ logger.info("Passing Python event loop to Rust worker for concurrent execution")
2053
+
2054
+ # Set event loop on Rust worker
2055
+ self._rust_worker.set_event_loop(loop)
2056
+
2057
+ # Set message handler
2058
+ handler = self._create_message_handler()
2059
+ self._rust_worker.set_message_handler(handler)
2060
+
2061
+ # Initialize worker
2062
+ self._rust_worker.initialize()
2063
+
2064
+ logger.info("Worker registered successfully, entering message loop...")
2065
+
2066
+ # Run worker (this will block until shutdown)
2067
+ await self._rust_worker.run()
2068
+
2069
+ except Exception as e:
2070
+ # Capture SDK-level startup/runtime failures
2071
+ logger.error(f"Worker failed to start or encountered critical error: {e}", exc_info=True)
2072
+ _sentry.capture_exception(
2073
+ e,
2074
+ context={
2075
+ "service_name": self.service_name,
2076
+ "service_version": self.service_version,
2077
+ "error_location": "Worker.run",
2078
+ "error_phase": "worker_lifecycle",
2079
+ },
2080
+ tags={
2081
+ "sdk_error": "true",
2082
+ "error_type": "worker_failure",
2083
+ "severity": "critical",
2084
+ },
2085
+ level="error",
2086
+ )
2087
+ raise
2088
+
2089
+ finally:
2090
+ # Flush Sentry events before shutdown
2091
+ logger.info("Flushing Sentry events before shutdown...")
2092
+ _sentry.flush(timeout=5.0)
2093
+
2094
+ logger.info("Worker shutdown complete")