agnt5-0.2.8a10-cp310-abi3-manylinux_2_34_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of agnt5 might be problematic.

agnt5/worker.py ADDED
@@ -0,0 +1,1619 @@
1
+ """Worker implementation for AGNT5 SDK."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import contextvars
7
+ import logging
8
+ import uuid
9
+ from typing import Any, Dict, List, Optional
10
+
11
+ from .function import FunctionRegistry
12
+ from .workflow import WorkflowRegistry
13
+ from ._telemetry import setup_module_logger
14
+
15
+ logger = setup_module_logger(__name__)
16
+
17
+
18
+ def _normalize_metadata(metadata: Dict[str, Any]) -> Dict[str, str]:
19
+ """
20
+ Convert metadata dictionary to Dict[str, str] for Rust FFI compatibility.
21
+
22
+ PyO3 requires HashMap<String, String>, but Python code may include booleans,
23
+ integers, or other types. This helper ensures all values are strings.
24
+
25
+ Args:
26
+ metadata: Dictionary with potentially mixed types
27
+
28
+ Returns:
29
+ Dictionary with all string values
30
+
31
+ Example:
32
+ >>> _normalize_metadata({"error": True, "count": 42, "msg": "hello"})
33
+ {"error": "true", "count": "42", "msg": "hello"}
34
+ """
35
+ normalized = {}
36
+ for key, value in metadata.items():
37
+ if isinstance(value, str):
38
+ normalized[key] = value
39
+ elif isinstance(value, bool):
40
+ # Convert bool to lowercase string for JSON compatibility
41
+ normalized[key] = str(value).lower()
42
+ elif value is None:
43
+ normalized[key] = ""
44
+ else:
45
+ # Convert any other type to string representation
46
+ normalized[key] = str(value)
47
+ return normalized
48
+
49
+ # Context variable to store trace metadata for propagation to LM calls
50
+ # This allows Rust LM layer to access traceparent without explicit parameter passing
51
+ _trace_metadata: contextvars.ContextVar[Dict[str, str]] = contextvars.ContextVar(
52
+ '_trace_metadata', default={}
53
+ )
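
For illustration only, a sketch of how code running inside an execution could read this context variable; the `outbound_headers` helper below is hypothetical (the real consumer is the Rust LM layer) and is shown only to make the propagation mechanism concrete.

```python
# Hypothetical reader of _trace_metadata (the real consumer is the Rust LM
# layer); illustrates how the contextvar carries traceparent across calls.
def outbound_headers() -> dict:
    headers: dict = {}
    traceparent = _trace_metadata.get().get("traceparent")
    if traceparent:
        headers["traceparent"] = traceparent
    return headers
```
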
54
+
55
+
56
+ class Worker:
57
+ """AGNT5 Worker for registering and running functions/workflows with the coordinator.
58
+
59
+ The Worker class manages the lifecycle of your service, including:
60
+ - Registration with the AGNT5 coordinator
61
+ - Automatic discovery of @function and @workflow decorated handlers
62
+ - Message handling and execution
63
+ - Health monitoring
64
+
65
+ Example:
66
+ ```python
67
+ from agnt5 import Worker, function
68
+
69
+ @function
70
+ async def process_data(ctx: Context, data: str) -> dict:
71
+ return {"result": data.upper()}
72
+
73
+ async def main():
74
+ worker = Worker(
75
+ service_name="data-processor",
76
+ service_version="1.0.0",
77
+ coordinator_endpoint="http://localhost:34186"
78
+ )
79
+ await worker.run()
80
+
81
+ if __name__ == "__main__":
82
+ asyncio.run(main())
83
+ ```
84
+ """
85
+
86
+ def __init__(
87
+ self,
88
+ service_name: str,
89
+ service_version: str = "1.0.0",
90
+ coordinator_endpoint: Optional[str] = None,
91
+ runtime: str = "standalone",
92
+ metadata: Optional[Dict[str, str]] = None,
93
+ functions: Optional[List] = None,
94
+ workflows: Optional[List] = None,
95
+ entities: Optional[List] = None,
96
+ agents: Optional[List] = None,
97
+ tools: Optional[List] = None,
98
+ auto_register: bool = False,
99
+ auto_register_paths: Optional[List[str]] = None,
100
+ pyproject_path: Optional[str] = None,
101
+ ):
102
+ """Initialize a new Worker with explicit or automatic component registration.
103
+
104
+ The Worker supports two registration modes:
105
+
106
+ **Explicit Mode (default, production):**
107
+ - Register workflows/agents explicitly; their dependencies are auto-included
108
+ - Optionally register standalone functions/tools for direct API invocation
109
+
110
+ **Auto-Registration Mode (development):**
111
+ - Automatically discovers all decorated components in source paths
112
+ - Reads source paths from pyproject.toml or uses explicit paths
113
+ - No need to maintain import lists
114
+
115
+ Args:
116
+ service_name: Unique name for this service
117
+ service_version: Version string (semantic versioning recommended)
118
+ coordinator_endpoint: Coordinator endpoint URL (default: from env AGNT5_COORDINATOR_ENDPOINT)
119
+ runtime: Runtime type - "standalone", "docker", "kubernetes", etc.
120
+ metadata: Optional service-level metadata
121
+ functions: List of @function decorated handlers (explicit mode)
122
+ workflows: List of @workflow decorated handlers (explicit mode)
123
+ entities: List of Entity classes (explicit mode)
124
+ agents: List of Agent instances (explicit mode)
125
+ tools: List of Tool instances (explicit mode)
126
+ auto_register: Enable automatic component discovery (default: False)
127
+ auto_register_paths: Explicit source paths to scan (overrides pyproject.toml discovery)
128
+ pyproject_path: Path to pyproject.toml (default: current directory)
129
+
130
+ Example (explicit mode - production):
131
+ ```python
132
+ from agnt5 import Worker
133
+ from my_service import greet_user, order_fulfillment, ShoppingCart, analyst_agent
134
+
135
+ worker = Worker(
136
+ service_name="my-service",
137
+ workflows=[order_fulfillment],
138
+ entities=[ShoppingCart],
139
+ agents=[analyst_agent],
140
+ functions=[greet_user],
141
+ )
142
+ await worker.run()
143
+ ```
144
+
145
+ Example (auto-register mode - development):
146
+ ```python
147
+ from agnt5 import Worker
148
+
149
+ worker = Worker(
150
+ service_name="my-service",
151
+ auto_register=True, # Discovers from pyproject.toml
152
+ )
153
+ await worker.run()
154
+ ```
155
+ """
156
+ self.service_name = service_name
157
+ self.service_version = service_version
158
+ self.coordinator_endpoint = coordinator_endpoint
159
+ self.runtime = runtime
160
+ self.metadata = metadata or {}
161
+
162
+ # Get tenant_id from environment (required for entity state management)
163
+ import os
164
+ self._tenant_id = os.getenv("AGNT5_TENANT_ID", "default-tenant")
165
+
166
+ # Import Rust worker
167
+ try:
168
+ from ._core import PyWorker, PyWorkerConfig, PyComponentInfo
169
+ self._PyWorker = PyWorker
170
+ self._PyWorkerConfig = PyWorkerConfig
171
+ self._PyComponentInfo = PyComponentInfo
172
+ except ImportError as e:
173
+ raise ImportError(
174
+ f"Failed to import Rust core worker: {e}. "
175
+ "Make sure agnt5 is properly installed with: pip install agnt5"
176
+ )
177
+
178
+ # Create Rust worker config
179
+ self._rust_config = self._PyWorkerConfig(
180
+ service_name=service_name,
181
+ service_version=service_version,
182
+ service_type=runtime,
183
+ )
184
+
185
+ # Create Rust worker instance
186
+ self._rust_worker = self._PyWorker(self._rust_config)
187
+
188
+ # Create worker-scoped entity state adapter with Rust core
189
+ from .entity import EntityStateAdapter
190
+ from ._core import EntityStateManager as RustEntityStateManager
191
+
192
+ # Create Rust core for entity state management
193
+ rust_core = RustEntityStateManager(tenant_id=self._tenant_id)
194
+
195
+ # Create Python adapter (thin wrapper around Rust core)
196
+ self._entity_state_adapter = EntityStateAdapter(rust_core=rust_core)
197
+
198
+ logger.info("Created EntityStateAdapter with Rust core for state management")
199
+
200
+ # Component registration: auto-discover or explicit
201
+ if auto_register:
202
+ # Auto-registration mode: discover from source paths
203
+ if auto_register_paths:
204
+ source_paths = auto_register_paths
205
+ logger.info(f"Auto-registration with explicit paths: {source_paths}")
206
+ else:
207
+ source_paths = self._discover_source_paths(pyproject_path)
208
+ logger.info(f"Auto-registration with discovered paths: {source_paths}")
209
+
210
+ # Auto-discover components (will populate _explicit_components)
211
+ self._auto_discover_components(source_paths)
212
+ else:
213
+ # Explicit registration from constructor kwargs
214
+ self._explicit_components = {
215
+ 'functions': list(functions or []),
216
+ 'workflows': list(workflows or []),
217
+ 'entities': list(entities or []),
218
+ 'agents': list(agents or []),
219
+ 'tools': list(tools or []),
220
+ }
221
+
222
+ # Count explicitly registered components
223
+ total_explicit = sum(len(v) for v in self._explicit_components.values())
224
+ logger.info(
225
+ f"Worker initialized: {service_name} v{service_version} (runtime: {runtime}), "
226
+ f"{total_explicit} components explicitly registered"
227
+ )
228
+
229
+ def register_components(
230
+ self,
231
+ functions=None,
232
+ workflows=None,
233
+ entities=None,
234
+ agents=None,
235
+ tools=None,
236
+ ):
237
+ """Register additional components after Worker initialization.
238
+
239
+ This method allows incremental registration of components after the Worker
240
+ has been created. Useful for conditional or dynamic component registration.
241
+
242
+ Args:
243
+ functions: List of functions decorated with @function
244
+ workflows: List of workflows decorated with @workflow
245
+ entities: List of entity classes
246
+ agents: List of agent instances
247
+ tools: List of tool instances
248
+
249
+ Example:
250
+ ```python
251
+ worker = Worker(service_name="my-service")
252
+
253
+ # Register conditionally
254
+ if feature_enabled:
255
+ worker.register_components(workflows=[advanced_workflow])
256
+ ```
257
+ """
258
+ if functions:
259
+ self._explicit_components['functions'].extend(functions)
260
+ logger.debug(f"Incrementally registered {len(functions)} functions")
261
+
262
+ if workflows:
263
+ self._explicit_components['workflows'].extend(workflows)
264
+ logger.debug(f"Incrementally registered {len(workflows)} workflows")
265
+
266
+ if entities:
267
+ self._explicit_components['entities'].extend(entities)
268
+ logger.debug(f"Incrementally registered {len(entities)} entities")
269
+
270
+ if agents:
271
+ self._explicit_components['agents'].extend(agents)
272
+ logger.debug(f"Incrementally registered {len(agents)} agents")
273
+
274
+ if tools:
275
+ self._explicit_components['tools'].extend(tools)
276
+ logger.debug(f"Incrementally registered {len(tools)} tools")
277
+
278
+ total = sum(len(v) for v in self._explicit_components.values())
279
+ logger.info(f"Total components now registered: {total}")
280
+
281
+ def _discover_source_paths(self, pyproject_path: Optional[str] = None) -> List[str]:
282
+ """Discover source paths from pyproject.toml.
283
+
284
+ Reads pyproject.toml to find package source directories using:
285
+ - Hatch: [tool.hatch.build.targets.wheel] packages
286
+ - Maturin: [tool.maturin] python-source
287
+ - Fallback: ["src"] if not found
288
+
289
+ Args:
290
+ pyproject_path: Path to pyproject.toml (default: current directory)
291
+
292
+ Returns:
293
+ List of directory paths to scan (e.g., ["src/agnt5_benchmark"])
294
+ """
295
+ from pathlib import Path
296
+
297
+ # Python 3.11+ has tomllib in stdlib
298
+ try:
299
+ import tomllib
300
+ except ImportError:
301
+ logger.error("tomllib not available (Python 3.11+ required for auto-registration)")
302
+ return ["src"]
303
+
304
+ # Determine pyproject.toml location
305
+ if pyproject_path:
306
+ pyproject_file = Path(pyproject_path)
307
+ else:
308
+ # Look in current directory
309
+ pyproject_file = Path.cwd() / "pyproject.toml"
310
+
311
+ if not pyproject_file.exists():
312
+ logger.warning(
313
+ f"pyproject.toml not found at {pyproject_file}, "
314
+ f"defaulting to 'src/' directory"
315
+ )
316
+ return ["src"]
317
+
318
+ # Parse pyproject.toml
319
+ try:
320
+ with open(pyproject_file, "rb") as f:
321
+ config = tomllib.load(f)
322
+ except Exception as e:
323
+ logger.error(f"Failed to parse pyproject.toml: {e}")
324
+ return ["src"]
325
+
326
+ # Extract source paths based on build system
327
+ source_paths = []
328
+
329
+ # Try Hatch configuration
330
+ if "tool" in config and "hatch" in config["tool"]:
331
+ hatch_config = config["tool"]["hatch"]
332
+ if "build" in hatch_config and "targets" in hatch_config["build"]:
333
+ wheel_config = hatch_config["build"]["targets"].get("wheel", {})
334
+ packages = wheel_config.get("packages", [])
335
+ source_paths.extend(packages)
336
+
337
+ # Try Maturin configuration
338
+ if not source_paths and "tool" in config and "maturin" in config["tool"]:
339
+ maturin_config = config["tool"]["maturin"]
340
+ python_source = maturin_config.get("python-source")
341
+ if python_source:
342
+ source_paths.append(python_source)
343
+
344
+ # Fallback to src/
345
+ if not source_paths:
346
+ logger.info("No source paths in pyproject.toml, defaulting to 'src/'")
347
+ source_paths = ["src"]
348
+
349
+ logger.info(f"Discovered source paths from pyproject.toml: {source_paths}")
350
+ return source_paths
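
As a concrete illustration (hypothetical project layout, not part of the SDK), a Hatch-configured pyproject.toml parses via `tomllib` into a dict like the one below, from which this method returns `["src/agnt5_benchmark"]`:

```python
# What tomllib.load() would yield for a hypothetical Hatch project; the lookup
# above reads tool.hatch.build.targets.wheel.packages.
config = {
    "tool": {
        "hatch": {
            "build": {"targets": {"wheel": {"packages": ["src/agnt5_benchmark"]}}}
        }
    }
}
# A Maturin project would instead carry {"tool": {"maturin": {"python-source": "python"}}}.
```
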
351
+
352
+ def _auto_discover_components(self, source_paths: List[str]) -> None:
353
+ """Auto-discover components by importing all Python files in source paths.
354
+
355
+ Args:
356
+ source_paths: List of directory paths to scan
357
+ """
358
+ import importlib.util
359
+ import sys
360
+ from pathlib import Path
361
+
362
+ logger.info(f"Auto-discovering components in paths: {source_paths}")
363
+
364
+ total_modules = 0
365
+
366
+ for source_path in source_paths:
367
+ path = Path(source_path)
368
+
369
+ if not path.exists():
370
+ logger.warning(f"Source path does not exist: {source_path}")
371
+ continue
372
+
373
+ # Recursively find all .py files
374
+ for py_file in path.rglob("*.py"):
375
+ # Skip __pycache__ and test files
376
+ if "__pycache__" in str(py_file) or py_file.name.startswith("test_"):
377
+ continue
378
+
379
+ # Convert path to module name
380
+ # e.g., src/agnt5_benchmark/functions.py -> agnt5_benchmark.functions
381
+ relative_path = py_file.relative_to(path.parent)
382
+ module_parts = list(relative_path.parts[:-1]) # Remove .py extension part
383
+ module_parts.append(relative_path.stem) # Add filename without .py
384
+ module_name = ".".join(module_parts)
385
+
386
+ # Import module (triggers decorators)
387
+ try:
388
+ if module_name in sys.modules:
389
+ logger.debug(f"Module already imported: {module_name}")
390
+ else:
391
+ spec = importlib.util.spec_from_file_location(module_name, py_file)
392
+ if spec and spec.loader:
393
+ module = importlib.util.module_from_spec(spec)
394
+ sys.modules[module_name] = module
395
+ spec.loader.exec_module(module)
396
+ logger.debug(f"Auto-imported: {module_name}")
397
+ total_modules += 1
398
+ except Exception as e:
399
+ logger.warning(f"Failed to import {module_name}: {e}")
400
+
401
+ logger.info(f"Auto-imported {total_modules} modules")
402
+
403
+ # Collect components from registries
404
+ from .agent import AgentRegistry
405
+ from .entity import EntityRegistry
406
+ from .tool import ToolRegistry
407
+
408
+ # Extract actual objects from registries
409
+ functions = [cfg.handler for cfg in FunctionRegistry.all().values()]
410
+ workflows = [cfg.handler for cfg in WorkflowRegistry.all().values()]
411
+ entities = [et.entity_class for et in EntityRegistry.all().values()]
412
+ agents = list(AgentRegistry.all().values())
413
+ tools = list(ToolRegistry.all().values())
414
+
415
+ self._explicit_components = {
416
+ 'functions': functions,
417
+ 'workflows': workflows,
418
+ 'entities': entities,
419
+ 'agents': agents,
420
+ 'tools': tools,
421
+ }
422
+
423
+ logger.info(
424
+ f"Auto-discovered components: "
425
+ f"{len(functions)} functions, "
426
+ f"{len(workflows)} workflows, "
427
+ f"{len(entities)} entities, "
428
+ f"{len(agents)} agents, "
429
+ f"{len(tools)} tools"
430
+ )
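
To make the path-to-module mapping above concrete, here is the same derivation traced by hand for one hypothetical file, assuming a discovered source path of `src/agnt5_benchmark`:

```python
# Hypothetical walk-through of the module-name derivation used in the loop above.
from pathlib import Path

path = Path("src/agnt5_benchmark")                  # a discovered source path
py_file = Path("src/agnt5_benchmark/functions.py")  # a file found by rglob("*.py")
relative_path = py_file.relative_to(path.parent)    # agnt5_benchmark/functions.py
module_parts = list(relative_path.parts[:-1])       # ['agnt5_benchmark']
module_parts.append(relative_path.stem)             # ['agnt5_benchmark', 'functions']
module_name = ".".join(module_parts)                # 'agnt5_benchmark.functions'
```
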
431
+
432
+ def _discover_components(self):
433
+ """Discover explicit components and auto-include their dependencies.
434
+
435
+ Hybrid approach:
436
+ - Explicitly registered workflows/agents are processed
437
+ - Functions called by workflows are auto-included (TODO: implement)
438
+ - Tools used by agents are auto-included
439
+ - Standalone functions/tools can be explicitly registered
440
+
441
+ Returns:
442
+ List of PyComponentInfo instances for all components
443
+ """
444
+ components = []
445
+ import json
446
+
447
+ # Import registries
448
+ from .entity import EntityRegistry
449
+ from .tool import ToolRegistry
450
+
451
+ # Track all components (explicit + auto-included)
452
+ all_functions = set(self._explicit_components['functions'])
453
+ all_tools = set(self._explicit_components['tools'])
454
+
455
+ # Auto-include agent tool dependencies
456
+ for agent in self._explicit_components['agents']:
457
+ if hasattr(agent, 'tools') and agent.tools:
458
+ # Agent.tools is a dict of {tool_name: tool_instance}
459
+ all_tools.update(agent.tools.values())
460
+ logger.debug(
461
+ f"Auto-included {len(agent.tools)} tools from agent '{agent.name}'"
462
+ )
463
+
464
+ # Log registration summary
465
+ explicit_func_count = len(self._explicit_components['functions'])
466
+ explicit_tool_count = len(self._explicit_components['tools'])
467
+ auto_func_count = len(all_functions) - explicit_func_count
468
+ auto_tool_count = len(all_tools) - explicit_tool_count
469
+
470
+ logger.info(
471
+ f"Component registration summary: "
472
+ f"{len(all_functions)} functions ({explicit_func_count} explicit, {auto_func_count} auto-included), "
473
+ f"{len(self._explicit_components['workflows'])} workflows, "
474
+ f"{len(self._explicit_components['entities'])} entities, "
475
+ f"{len(self._explicit_components['agents'])} agents, "
476
+ f"{len(all_tools)} tools ({explicit_tool_count} explicit, {auto_tool_count} auto-included)"
477
+ )
478
+
479
+ # Process functions (explicit + auto-included)
480
+ for func in all_functions:
481
+ config = FunctionRegistry.get(func.__name__)
482
+ if not config:
483
+ logger.warning(f"Function '{func.__name__}' not found in FunctionRegistry")
484
+ continue
485
+
486
+ input_schema_str = json.dumps(config.input_schema) if config.input_schema else None
487
+ output_schema_str = json.dumps(config.output_schema) if config.output_schema else None
488
+ metadata = config.metadata if config.metadata else {}
489
+
490
+ # Serialize retry and backoff policies
491
+ config_dict = {}
492
+ if config.retries:
493
+ config_dict["max_attempts"] = str(config.retries.max_attempts)
494
+ config_dict["initial_interval_ms"] = str(config.retries.initial_interval_ms)
495
+ config_dict["max_interval_ms"] = str(config.retries.max_interval_ms)
496
+
497
+ if config.backoff:
498
+ config_dict["backoff_type"] = config.backoff.type.value
499
+ config_dict["backoff_multiplier"] = str(config.backoff.multiplier)
500
+
501
+ component_info = self._PyComponentInfo(
502
+ name=config.name,
503
+ component_type="function",
504
+ metadata=metadata,
505
+ config=config_dict,
506
+ input_schema=input_schema_str,
507
+ output_schema=output_schema_str,
508
+ definition=None,
509
+ )
510
+ components.append(component_info)
511
+
512
+ # Process workflows
513
+ for workflow in self._explicit_components['workflows']:
514
+ config = WorkflowRegistry.get(workflow.__name__)
515
+ if not config:
516
+ logger.warning(f"Workflow '{workflow.__name__}' not found in WorkflowRegistry")
517
+ continue
518
+
519
+ input_schema_str = json.dumps(config.input_schema) if config.input_schema else None
520
+ output_schema_str = json.dumps(config.output_schema) if config.output_schema else None
521
+ metadata = config.metadata if config.metadata else {}
522
+
523
+ component_info = self._PyComponentInfo(
524
+ name=config.name,
525
+ component_type="workflow",
526
+ metadata=metadata,
527
+ config={},
528
+ input_schema=input_schema_str,
529
+ output_schema=output_schema_str,
530
+ definition=None,
531
+ )
532
+ components.append(component_info)
533
+
534
+ # Process entities
535
+ for entity_class in self._explicit_components['entities']:
536
+ entity_type = EntityRegistry.get(entity_class.__name__)
537
+ if not entity_type:
538
+ logger.warning(f"Entity '{entity_class.__name__}' not found in EntityRegistry")
539
+ continue
540
+
541
+ # Build complete entity definition with state schema and method schemas
542
+ entity_definition = entity_type.build_entity_definition()
543
+ definition_str = json.dumps(entity_definition)
544
+
545
+ # Keep minimal metadata for backward compatibility
546
+ metadata_dict = {
547
+ "methods": json.dumps(list(entity_type._method_schemas.keys())),
548
+ }
549
+
550
+ component_info = self._PyComponentInfo(
551
+ name=entity_type.name,
552
+ component_type="entity",
553
+ metadata=metadata_dict,
554
+ config={},
555
+ input_schema=None, # Entities don't have single input/output schemas
556
+ output_schema=None,
557
+ definition=definition_str, # Complete entity definition with state and methods
558
+ )
559
+ components.append(component_info)
560
+ logger.debug(f"Registered entity '{entity_type.name}' with definition")
561
+
562
+ # Process agents
563
+ from .agent import AgentRegistry
564
+
565
+ for agent in self._explicit_components['agents']:
566
+ # Register agent in AgentRegistry for execution lookup
567
+ AgentRegistry.register(agent)
568
+ logger.debug(f"Registered agent '{agent.name}' in AgentRegistry for execution")
569
+
570
+ input_schema_str = json.dumps(agent.input_schema) if hasattr(agent, 'input_schema') and agent.input_schema else None
571
+ output_schema_str = json.dumps(agent.output_schema) if hasattr(agent, 'output_schema') and agent.output_schema else None
572
+
573
+ metadata_dict = agent.metadata if hasattr(agent, 'metadata') else {}
574
+ if hasattr(agent, 'tools'):
575
+ metadata_dict["tools"] = json.dumps(list(agent.tools.keys()))
576
+
577
+ component_info = self._PyComponentInfo(
578
+ name=agent.name,
579
+ component_type="agent",
580
+ metadata=metadata_dict,
581
+ config={},
582
+ input_schema=input_schema_str,
583
+ output_schema=output_schema_str,
584
+ definition=None,
585
+ )
586
+ components.append(component_info)
587
+
588
+ # Process tools (explicit + auto-included)
589
+ for tool in all_tools:
590
+ input_schema_str = json.dumps(tool.input_schema) if hasattr(tool, 'input_schema') and tool.input_schema else None
591
+ output_schema_str = json.dumps(tool.output_schema) if hasattr(tool, 'output_schema') and tool.output_schema else None
592
+
593
+ component_info = self._PyComponentInfo(
594
+ name=tool.name,
595
+ component_type="tool",
596
+ metadata={},
597
+ config={},
598
+ input_schema=input_schema_str,
599
+ output_schema=output_schema_str,
600
+ definition=None,
601
+ )
602
+ components.append(component_info)
603
+
604
+ logger.info(f"Discovered {len(components)} total components")
605
+ return components
606
+
607
+ def _create_message_handler(self):
608
+ """Create the message handler that will be called by Rust worker."""
609
+
610
+ def handle_message(request):
611
+ """Handle incoming execution requests - returns coroutine for Rust to await."""
612
+ # Extract request details
613
+ component_name = request.component_name
614
+ component_type = request.component_type
615
+ input_data = request.input_data
616
+
617
+ logger.debug(
618
+ f"Handling {component_type} request: {component_name}, input size: {len(input_data)} bytes"
619
+ )
620
+
621
+ # Import all registries
622
+ from .tool import ToolRegistry
623
+ from .entity import EntityRegistry
624
+ from .agent import AgentRegistry
625
+
626
+ # Route based on component type and return coroutines
627
+ if component_type == "tool":
628
+ tool = ToolRegistry.get(component_name)
629
+ if tool:
630
+ logger.debug(f"Found tool: {component_name}")
631
+ # Return coroutine, don't await it
632
+ return self._execute_tool(tool, input_data, request)
633
+
634
+ elif component_type == "entity":
635
+ entity_type = EntityRegistry.get(component_name)
636
+ if entity_type:
637
+ logger.debug(f"Found entity: {component_name}")
638
+ # Return coroutine, don't await it
639
+ return self._execute_entity(entity_type, input_data, request)
640
+
641
+ elif component_type == "agent":
642
+ agent = AgentRegistry.get(component_name)
643
+ if agent:
644
+ logger.debug(f"Found agent: {component_name}")
645
+ # Return coroutine, don't await it
646
+ return self._execute_agent(agent, input_data, request)
647
+
648
+ elif component_type == "workflow":
649
+ workflow_config = WorkflowRegistry.get(component_name)
650
+ if workflow_config:
651
+ logger.debug(f"Found workflow: {component_name}")
652
+ # Return coroutine, don't await it
653
+ return self._execute_workflow(workflow_config, input_data, request)
654
+
655
+ elif component_type == "function":
656
+ function_config = FunctionRegistry.get(component_name)
657
+ if function_config:
658
+ # Return coroutine, don't await it
659
+ return self._execute_function(function_config, input_data, request)
660
+
661
+ # Not found - need to return an async error response
662
+ error_msg = f"Component '{component_name}' of type '{component_type}' not found"
663
+ logger.error(error_msg)
664
+
665
+ # Create async wrapper for error response
666
+ async def error_response():
667
+ return self._create_error_response(request, error_msg)
668
+
669
+ return error_response()
670
+
671
+ return handle_message
672
+
673
+ def _extract_critical_metadata(self, request) -> Dict[str, str]:
674
+ """
675
+ Extract critical metadata from request that MUST be propagated to response.
676
+
677
+ This ensures journal events are written to the correct tenant partition
678
+ and can be properly replayed. Missing tenant_id causes catastrophic
679
+ event sourcing corruption where events are split across partitions.
680
+
681
+ Returns:
682
+ Dict[str, str]: Metadata with all values normalized to strings for Rust FFI
683
+ """
684
+ metadata = {}
685
+ if hasattr(request, 'metadata') and request.metadata:
686
+ # CRITICAL: Propagate tenant_id to prevent journal corruption
687
+ # Convert to string immediately to ensure Rust FFI compatibility
688
+ if "tenant_id" in request.metadata:
689
+ metadata["tenant_id"] = str(request.metadata["tenant_id"])
690
+ if "deployment_id" in request.metadata:
691
+ metadata["deployment_id"] = str(request.metadata["deployment_id"])
692
+
693
+ # CRITICAL: Normalize all metadata values to strings for Rust FFI (PyO3)
694
+ # PyO3 expects HashMap<String, String> and will fail with bool/int values
695
+ return _normalize_metadata(metadata)
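
For example (hypothetical values), a request carrying mixed-type metadata is reduced to just the correlation keys, with every value stringified:

```python
# Hypothetical request metadata and the result _extract_critical_metadata
# would produce: only tenant_id / deployment_id survive, as strings.
request_metadata = {"tenant_id": "acme", "deployment_id": 42, "traceparent": "00-abc..."}
# -> {"tenant_id": "acme", "deployment_id": "42"}
```
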
696
+
697
+ async def _execute_function(self, config, input_data: bytes, request):
698
+ """Execute a function handler (supports both regular and streaming functions)."""
699
+ import json
700
+ import inspect
701
+ import time
702
+ from .context import Context
703
+ from ._core import PyExecuteComponentResponse
704
+
705
+ exec_start = time.time()
706
+
707
+ try:
708
+ # Parse input data
709
+ input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
710
+
711
+ # Store trace metadata in contextvar for LM calls to access
712
+ # The Rust worker injects traceparent into request.metadata for trace propagation
713
+ if hasattr(request, 'metadata') and request.metadata:
714
+ _trace_metadata.set(dict(request.metadata))
715
+ logger.debug(f"Trace metadata stored: traceparent={request.metadata.get('traceparent', 'N/A')}")
716
+
717
+ # Extract attempt number from platform request (if provided)
718
+ platform_attempt = getattr(request, 'attempt', 0)
719
+
720
+ # Create FunctionContext with attempt number for retry tracking
721
+ # - If platform_attempt > 0: Platform is orchestrating retries
722
+ # - If platform_attempt == 0: First attempt (or no retry config)
723
+ from .function import FunctionContext
724
+ ctx = FunctionContext(
725
+ run_id=f"{self.service_name}:{config.name}",
726
+ attempt=platform_attempt,
727
+ runtime_context=request.runtime_context,
728
+ retry_policy=config.retries,
729
+ )
730
+
731
+ # Set context in contextvar so get_current_context() and error handlers can access it
732
+ from .context import set_current_context, _current_context
733
+ token = set_current_context(ctx)
734
+
735
+ # Execute function directly - Rust bridge handles tracing
736
+ # Note: Removed Python-level span creation to avoid duplicate spans.
737
+ # The Rust worker bridge (sdk-python/rust-src/worker.rs:413-659) already
738
+ # creates a comprehensive OpenTelemetry span with all necessary attributes.
739
+ # See DUPLICATE_SPANS_FIX.md for details.
740
+ #
741
+ # Note on retry handling:
742
+ # - If platform_attempt > 0: Platform is orchestrating retries, execute once
743
+ # - If platform_attempt == 0: Local retry loop in decorator wrapper handles retries
744
+ if input_dict:
745
+ result = config.handler(ctx, **input_dict)
746
+ else:
747
+ result = config.handler(ctx)
748
+
749
+ # Note: Removed flush_telemetry_py() call here - it was causing 2-second blocking delay!
750
+ # The batch span processor handles flushing automatically with 5s timeout
751
+ # We only need to flush on worker shutdown, not after each function execution
752
+
753
+ # Check if result is an async generator (streaming function)
754
+ if inspect.isasyncgen(result):
755
+ # Streaming function - return list of responses
756
+ # Rust bridge will send each response separately to coordinator
757
+ responses = []
758
+ chunk_index = 0
759
+
760
+ async for chunk in result:
761
+ # Serialize chunk
762
+ chunk_data = json.dumps(chunk).encode("utf-8")
763
+
764
+ responses.append(PyExecuteComponentResponse(
765
+ invocation_id=request.invocation_id,
766
+ success=True,
767
+ output_data=chunk_data,
768
+ state_update=None,
769
+ error_message=None,
770
+ metadata=None,
771
+ is_chunk=True,
772
+ done=False,
773
+ chunk_index=chunk_index,
774
+ attempt=platform_attempt,
775
+ ))
776
+ chunk_index += 1
777
+
778
+ # Add final "done" marker
779
+ responses.append(PyExecuteComponentResponse(
780
+ invocation_id=request.invocation_id,
781
+ success=True,
782
+ output_data=b"",
783
+ state_update=None,
784
+ error_message=None,
785
+ metadata=None,
786
+ is_chunk=True,
787
+ done=True,
788
+ chunk_index=chunk_index,
789
+ attempt=platform_attempt,
790
+ ))
791
+
792
+ logger.debug(f"Streaming function produced {len(responses)} chunks")
793
+ return responses
794
+ else:
795
+ # Regular function - await and return single response
796
+ if inspect.iscoroutine(result):
797
+ result = await result
798
+
799
+ # Serialize result
800
+ output_data = json.dumps(result).encode("utf-8")
801
+
802
+ # Extract critical metadata for journal event correlation
803
+ response_metadata = self._extract_critical_metadata(request)
804
+
805
+ return PyExecuteComponentResponse(
806
+ invocation_id=request.invocation_id,
807
+ success=True,
808
+ output_data=output_data,
809
+ state_update=None,
810
+ error_message=None,
811
+ metadata=response_metadata if response_metadata else None,
812
+ is_chunk=False,
813
+ done=True,
814
+ chunk_index=0,
815
+ attempt=platform_attempt,
816
+ )
817
+
818
+ except Exception as e:
819
+ # Include exception type for better error messages
820
+ error_msg = f"{type(e).__name__}: {str(e)}"
821
+
822
+ # Capture full stack trace for telemetry
823
+ import traceback
824
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
825
+
826
+ # Log with full traceback using ctx.logger to ensure run_id correlation
827
+ from .context import get_current_context
828
+ current_ctx = get_current_context()
829
+ error_logger = current_ctx.logger if current_ctx else logger
830
+ error_logger.error(f"Function execution failed: {error_msg}", exc_info=True)
831
+
832
+ # Store stack trace in metadata for observability
833
+ metadata = {
834
+ "error_type": type(e).__name__,
835
+ "stack_trace": stack_trace,
836
+ "error": True, # Boolean flag for error detection
837
+ }
838
+
839
+ # CRITICAL: Extract critical metadata (including tenant_id) for journal event correlation
840
+ # This ensures run.failed events are properly emitted by Worker Coordinator
841
+ critical_metadata = self._extract_critical_metadata(request)
842
+ metadata.update(critical_metadata)
843
+
844
+ # CRITICAL: Normalize metadata to ensure all values are strings (Rust FFI requirement)
845
+ # PyO3 expects HashMap<String, String>, but we may have booleans or other types
846
+ normalized_metadata = _normalize_metadata(metadata)
847
+
848
+ return PyExecuteComponentResponse(
849
+ invocation_id=request.invocation_id,
850
+ success=False,
851
+ output_data=b"",
852
+ state_update=None,
853
+ error_message=error_msg,
854
+ metadata=normalized_metadata,
855
+ is_chunk=False,
856
+ done=True,
857
+ chunk_index=0,
858
+ attempt=getattr(request, 'attempt', 0),
859
+ )
860
+
861
+ finally:
862
+ # Always reset context to prevent leakage between executions
863
+ _current_context.reset(token)
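
For context, a streaming handler on the application side could look like the sketch below; it is hypothetical and assumes the `@function` decorator passes async generators through unchanged, as the `inspect.isasyncgen` branch above implies. Each yielded value becomes one `is_chunk=True` response, followed by an empty `done=True` marker.

```python
# Hypothetical streaming @function handler (assumes async generators are
# supported by the decorator, as the isasyncgen branch above implies).
from agnt5 import function

@function
async def tokenize(ctx, text: str):
    for word in text.split():
        yield {"token": word}   # each yield -> one is_chunk=True response
```
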
864
+
865
+ async def _execute_workflow(self, config, input_data: bytes, request):
866
+ """Execute a workflow handler with automatic replay support."""
867
+ import json
868
+ from .workflow import WorkflowEntity, WorkflowContext
869
+ from .entity import _get_state_adapter, _entity_state_adapter_ctx
870
+ from .exceptions import WaitingForUserInputException
871
+ from ._core import PyExecuteComponentResponse
872
+
873
+ # Set entity state adapter in context so workflows can use Entities
874
+ _entity_state_adapter_ctx.set(self._entity_state_adapter)
875
+
876
+ try:
877
+ # Parse input data
878
+ input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
879
+
880
+ # Extract or generate session_id for multi-turn conversation support (for chat workflows)
881
+ # If session_id is provided, the workflow can maintain conversation context
882
+ session_id = input_dict.get("session_id")
883
+
884
+ if not session_id:
885
+ session_id = str(uuid.uuid4())
886
+ logger.info(f"Created new workflow session: {session_id}")
887
+ else:
888
+ logger.info(f"Using existing workflow session: {session_id}")
889
+
890
+ # Parse replay data from request metadata for crash recovery
891
+ completed_steps = {}
892
+ initial_state = {}
893
+ user_response = None
894
+
895
+ if hasattr(request, 'metadata') and request.metadata:
896
+ # Parse completed steps for replay
897
+ if "completed_steps" in request.metadata:
898
+ completed_steps_json = request.metadata["completed_steps"]
899
+ if completed_steps_json:
900
+ try:
901
+ completed_steps = json.loads(completed_steps_json)
902
+ logger.info(f"🔄 Replaying workflow with {len(completed_steps)} cached steps")
903
+ except json.JSONDecodeError:
904
+ logger.warning("Failed to parse completed_steps from metadata")
905
+
906
+ # Parse initial workflow state for replay
907
+ if "workflow_state" in request.metadata:
908
+ workflow_state_json = request.metadata["workflow_state"]
909
+ if workflow_state_json:
910
+ try:
911
+ initial_state = json.loads(workflow_state_json)
912
+ logger.info(f"🔄 Loaded workflow state: {len(initial_state)} keys")
913
+ except json.JSONDecodeError:
914
+ logger.warning("Failed to parse workflow_state from metadata")
915
+
916
+ # Check for user response (workflow resume after pause)
917
+ if "user_response" in request.metadata:
918
+ user_response = request.metadata["user_response"]
919
+ logger.info(f"▶️ Resuming workflow with user response: {user_response}")
920
+
921
+ # NEW: Check for agent resume (agent-level HITL)
922
+ agent_context = None
923
+ if hasattr(request, 'metadata') and request.metadata:
924
+ if "agent_context" in request.metadata:
925
+ agent_context_json = request.metadata["agent_context"]
926
+ try:
927
+ agent_context = json.loads(agent_context_json)
928
+ agent_name = agent_context.get("agent_name", "unknown")
929
+ iteration = agent_context.get("iteration", 0)
930
+ logger.info(
931
+ f"▶️ Resuming agent '{agent_name}' from iteration {iteration} "
932
+ f"with user response: {user_response}"
933
+ )
934
+ except json.JSONDecodeError:
935
+ logger.warning("Failed to parse agent_context from metadata")
936
+ agent_context = None
937
+
938
+ # Extract session_id and user_id from request for memory scoping
939
+ # Do this FIRST so we can pass to WorkflowEntity constructor
940
+ session_id = request.session_id if hasattr(request, 'session_id') and request.session_id else request.invocation_id
941
+ user_id = request.user_id if hasattr(request, 'user_id') and request.user_id else None
942
+
943
+ # Create WorkflowEntity for state management with memory scoping
944
+ # Entity key will be scoped based on priority: user_id > session_id > run_id
945
+ workflow_entity = WorkflowEntity(
946
+ run_id=request.invocation_id,
947
+ session_id=session_id,
948
+ user_id=user_id,
949
+ )
950
+
951
+ # Load replay data into entity if provided
952
+ if completed_steps:
953
+ workflow_entity._completed_steps = completed_steps
954
+ logger.debug(f"Loaded {len(completed_steps)} completed steps into workflow entity")
955
+
956
+ # Inject user response if resuming from pause
957
+ if user_response:
958
+ workflow_entity.inject_user_response(user_response)
959
+ logger.debug(f"Injected user response into workflow entity")
960
+
961
+ if initial_state:
962
+ # Load initial state into entity's state adapter
963
+ state_adapter = _get_state_adapter()
964
+ if hasattr(state_adapter, '_standalone_states'):
965
+ # Standalone mode - set state directly
966
+ state_adapter._standalone_states[workflow_entity._state_key] = initial_state
967
+ logger.debug(f"Loaded initial state with {len(initial_state)} keys into workflow entity (standalone)")
968
+ else:
969
+ # Production mode - state is managed by Rust core
970
+ logger.debug(f"Initial state will be loaded from platform (production mode)")
971
+
972
+ # Create checkpoint callback for real-time streaming
973
+ def checkpoint_callback(checkpoint: dict) -> None:
974
+ """Send checkpoint to Rust worker queue."""
975
+ try:
976
+ # Extract critical metadata for checkpoint routing
977
+ metadata = self._extract_critical_metadata(request)
978
+
979
+ # DEBUG: Log metadata types for troubleshooting PyO3 conversion errors
980
+ logger.debug(f"Checkpoint metadata types: {[(k, type(v).__name__) for k, v in metadata.items()]}")
981
+
982
+ # Queue checkpoint via Rust FFI
983
+ self._rust_worker.queue_workflow_checkpoint(
984
+ invocation_id=request.invocation_id,
985
+ checkpoint_type=checkpoint["checkpoint_type"],
986
+ checkpoint_data=json.dumps(checkpoint["checkpoint_data"]),
987
+ sequence_number=checkpoint["sequence_number"],
988
+ metadata=metadata,
989
+ )
990
+ logger.debug(
991
+ f"Queued checkpoint: type={checkpoint['checkpoint_type']} "
992
+ f"seq={checkpoint['sequence_number']}"
993
+ )
994
+ except Exception as e:
995
+ logger.error(f"Failed to queue checkpoint: {e}", exc_info=True)
996
+ logger.error(f"Checkpoint metadata causing error: {metadata}")
997
+ logger.error(f"Checkpoint data: {checkpoint}")
998
+
999
+ # Create WorkflowContext with entity, runtime_context, and checkpoint callback
1000
+ ctx = WorkflowContext(
1001
+ workflow_entity=workflow_entity,
1002
+ run_id=request.invocation_id, # Use unique invocation_id for this execution
1003
+ session_id=session_id, # Session for multi-turn conversations
1004
+ user_id=user_id, # User for long-term memory
1005
+ runtime_context=request.runtime_context,
1006
+ checkpoint_callback=checkpoint_callback,
1007
+ )
1008
+
1009
+ # NEW: Populate agent resume info if this is an agent HITL resume
1010
+ if agent_context and user_response:
1011
+ ctx._agent_resume_info = {
1012
+ "agent_name": agent_context["agent_name"],
1013
+ "agent_context": agent_context,
1014
+ "user_response": user_response,
1015
+ }
1016
+ logger.debug(
1017
+ f"Set agent resume info for '{agent_context['agent_name']}' "
1018
+ f"in workflow context"
1019
+ )
1020
+
1021
+ # Execute workflow directly - Rust bridge handles tracing
1022
+ # Note: Removed Python-level span creation to avoid duplicate spans.
1023
+ # The Rust worker bridge creates comprehensive OpenTelemetry spans.
1024
+ # See DUPLICATE_SPANS_FIX.md for details.
1025
+
1026
+ # CRITICAL: Set context in contextvar so LM/Agent/Tool calls can access it
1027
+ from .context import set_current_context
1028
+ token = set_current_context(ctx)
1029
+ try:
1030
+ if input_dict:
1031
+ result = await config.handler(ctx, **input_dict)
1032
+ else:
1033
+ result = await config.handler(ctx)
1034
+
1035
+ # Note: Workflow entity persistence is handled by the @workflow decorator wrapper
1036
+ # which persists before returning. No need to persist here.
1037
+ finally:
1038
+ # Always reset context to prevent leakage
1039
+ from .context import _current_context
1040
+ _current_context.reset(token)
1041
+
1042
+ # Note: Removed flush_telemetry_py() call here - it was causing 2-second blocking delay!
1043
+ # The batch span processor handles flushing automatically with 5s timeout
1044
+
1045
+ # Serialize result
1046
+ output_data = json.dumps(result).encode("utf-8")
1047
+
1048
+ # Collect workflow execution metadata for durability
1049
+ metadata = {}
1050
+
1051
+ # CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
1052
+ # Missing tenant_id causes events to be written to wrong partition
1053
+ critical_metadata = self._extract_critical_metadata(request)
1054
+ metadata.update(critical_metadata)
1055
+
1056
+ # Add step events to metadata (for workflow durability)
1057
+ # Access _step_events from the workflow entity, not the context
1058
+ step_events = ctx._workflow_entity._step_events
1059
+ if step_events:
1060
+ metadata["step_events"] = json.dumps(step_events)
1061
+ logger.debug(f"Workflow has {len(step_events)} recorded steps")
1062
+
1063
+ # Add final state snapshot to metadata (if state was used)
1064
+ # Check if _state was initialized without triggering property getter
1065
+ if hasattr(ctx, '_workflow_entity') and ctx._workflow_entity._state is not None:
1066
+ if ctx._workflow_entity._state.has_changes():
1067
+ state_snapshot = ctx._workflow_entity._state.get_state_snapshot()
1068
+ metadata["workflow_state"] = json.dumps(state_snapshot)
1069
+ logger.debug(f"Workflow state snapshot: {state_snapshot}")
1070
+
1071
+ # AUDIT TRAIL: Serialize complete state change history for replay and debugging
1072
+ # This captures all intermediate state mutations, not just final snapshot
1073
+ state_changes = ctx._workflow_entity._state_changes
1074
+ logger.info(f"🔍 DEBUG: _state_changes list has {len(state_changes)} entries")
1075
+ if state_changes:
1076
+ metadata["state_changes"] = json.dumps(state_changes)
1077
+ logger.info(f"✅ Serialized {len(state_changes)} state changes to metadata")
1078
+ else:
1079
+ logger.warning("⚠️ _state_changes list is empty - no state change history captured")
1080
+
1081
+ # CRITICAL: Persist workflow entity state to platform
1082
+ # This stores the WorkflowEntity as a first-class entity with proper versioning
1083
+ try:
1084
+ logger.info(f"🔍 DEBUG: About to call _persist_state() for run {request.invocation_id}")
1085
+ await ctx._workflow_entity._persist_state()
1086
+ logger.info(f"✅ Successfully persisted WorkflowEntity state for run {request.invocation_id}")
1087
+ except Exception as persist_error:
1088
+ logger.error(f"❌ Failed to persist WorkflowEntity state (non-fatal): {persist_error}", exc_info=True)
1089
+ # Continue anyway - persistence failure shouldn't fail the workflow
1090
+
1091
+ logger.info(f"Workflow completed successfully with {len(step_events)} steps")
1092
+
1093
+ # Add session_id to metadata for multi-turn conversation support
1094
+ metadata["session_id"] = session_id
1095
+
1096
+ # CRITICAL: Flush all buffered checkpoints before returning response
1097
+ # This ensures checkpoints arrive at platform BEFORE run.completed event
1098
+ try:
1099
+ flushed_count = self._rust_worker.flush_workflow_checkpoints()
1100
+ if flushed_count > 0:
1101
+ logger.info(f"✅ Flushed {flushed_count} checkpoints before completion")
1102
+ except Exception as flush_error:
1103
+ logger.error(f"Failed to flush checkpoints: {flush_error}", exc_info=True)
1104
+ # Continue anyway - checkpoint flushing is best-effort
1105
+
1106
+ return PyExecuteComponentResponse(
1107
+ invocation_id=request.invocation_id,
1108
+ success=True,
1109
+ output_data=output_data,
1110
+ state_update=None, # Not used for workflows (use metadata instead)
1111
+ error_message=None,
1112
+ metadata=metadata if metadata else None, # Include step events + state + session_id
1113
+ is_chunk=False,
1114
+ done=True,
1115
+ chunk_index=0,
1116
+ attempt=getattr(request, 'attempt', 0),
1117
+ )
1118
+
1119
+ except WaitingForUserInputException as e:
1120
+ # Workflow or agent paused for user input
1121
+ pause_type = "agent" if e.agent_context else "workflow"
1122
+ logger.info(f"⏸️ {pause_type.capitalize()} paused waiting for user input: {e.question}")
1123
+
1124
+ # Collect metadata for pause state
1125
+ # Note: All metadata values must be strings for Rust FFI
1126
+ pause_metadata = {
1127
+ "status": "awaiting_user_input",
1128
+ "question": e.question,
1129
+ "input_type": e.input_type,
1130
+ "pause_type": pause_type, # NEW: Indicates workflow vs agent pause
1131
+ }
1132
+
1133
+ # CRITICAL: Propagate tenant_id even when pausing
1134
+ critical_metadata = self._extract_critical_metadata(request)
1135
+ pause_metadata.update(critical_metadata)
1136
+
1137
+ # Add optional fields only if they exist
1138
+ if e.options:
1139
+ pause_metadata["options"] = json.dumps(e.options)
1140
+ if e.checkpoint_state:
1141
+ pause_metadata["checkpoint_state"] = json.dumps(e.checkpoint_state)
1142
+ if session_id:
1143
+ pause_metadata["session_id"] = session_id
1144
+
1145
+ # NEW: Store agent execution state if present
1146
+ if e.agent_context:
1147
+ pause_metadata["agent_context"] = json.dumps(e.agent_context)
1148
+ logger.debug(
1149
+ f"Agent '{e.agent_context['agent_name']}' paused at "
1150
+ f"iteration {e.agent_context['iteration']}"
1151
+ )
1152
+
1153
+ # Add step events to pause metadata for durability
1154
+ step_events = ctx._workflow_entity._step_events
1155
+ if step_events:
1156
+ pause_metadata["step_events"] = json.dumps(step_events)
1157
+ logger.debug(f"Paused workflow has {len(step_events)} recorded steps")
1158
+
1159
+ # Add current workflow state to pause metadata
1160
+ if hasattr(ctx, '_workflow_entity') and ctx._workflow_entity._state is not None:
1161
+ if ctx._workflow_entity._state.has_changes():
1162
+ state_snapshot = ctx._workflow_entity._state.get_state_snapshot()
1163
+ pause_metadata["workflow_state"] = json.dumps(state_snapshot)
1164
+ logger.debug(f"Paused workflow state snapshot: {state_snapshot}")
1165
+
1166
+ # AUDIT TRAIL: Also include state change history for paused workflows
1167
+ state_changes = ctx._workflow_entity._state_changes
1168
+ if state_changes:
1169
+ pause_metadata["state_changes"] = json.dumps(state_changes)
1170
+ logger.debug(f"Paused workflow has {len(state_changes)} state changes in history")
1171
+
1172
+ # Return "success" with awaiting_user_input metadata
1173
+ # The output contains the question details for the client
1174
+ output = {
1175
+ "question": e.question,
1176
+ "input_type": e.input_type,
1177
+ "options": e.options,
1178
+ }
1179
+ output_data = json.dumps(output).encode("utf-8")
1180
+
1181
+ return PyExecuteComponentResponse(
1182
+ invocation_id=request.invocation_id,
1183
+ success=True, # This is a valid pause state, not an error
1184
+ output_data=output_data,
1185
+ state_update=None,
1186
+ error_message=None,
1187
+ metadata=pause_metadata,
1188
+ is_chunk=False,
1189
+ done=True,
1190
+ chunk_index=0,
1191
+ attempt=getattr(request, 'attempt', 0),
1192
+ )
1193
+
1194
+ except Exception as e:
1195
+ # Include exception type for better error messages
1196
+ error_msg = f"{type(e).__name__}: {str(e)}"
1197
+
1198
+ # Capture full stack trace for telemetry
1199
+ import traceback
1200
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
1201
+
1202
+ # Log with full traceback
1203
+ logger.error(f"Workflow execution failed: {error_msg}", exc_info=True)
1204
+
1205
+ # Store error metadata for observability
1206
+ metadata = {
1207
+ "error_type": type(e).__name__,
1208
+ "stack_trace": stack_trace,
1209
+ "error": True,
1210
+ }
1211
+
1212
+ # Extract critical metadata for journal correlation (if available)
1213
+ critical_metadata = self._extract_critical_metadata(request)
1214
+ metadata.update(critical_metadata)
1215
+
1216
+ # Normalize metadata for Rust FFI compatibility
1217
+ normalized_metadata = _normalize_metadata(metadata)
1218
+
1219
+ return PyExecuteComponentResponse(
1220
+ invocation_id=request.invocation_id,
1221
+ success=False,
1222
+ output_data=b"",
1223
+ state_update=None,
1224
+ error_message=error_msg,
1225
+ metadata=normalized_metadata,
1226
+ is_chunk=False,
1227
+ done=True,
1228
+ chunk_index=0,
1229
+ attempt=getattr(request, 'attempt', 0),
1230
+ )
1231
+
1232
+ async def _execute_tool(self, tool, input_data: bytes, request):
1233
+ """Execute a tool handler."""
1234
+ import json
1235
+ from .context import Context
1236
+ from ._core import PyExecuteComponentResponse
1237
+
1238
+ try:
1239
+ # Parse input data
1240
+ input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
1241
+
1242
+ # Create context with runtime_context for trace correlation
1243
+ ctx = Context(
1244
+ run_id=f"{self.service_name}:{tool.name}",
1245
+ runtime_context=request.runtime_context,
1246
+ )
1247
+
1248
+ # Set context in contextvar so get_current_context() and error handlers can access it
1249
+ from .context import set_current_context, _current_context
1250
+ token = set_current_context(ctx)
1251
+
1252
+ # Execute tool
1253
+ result = await tool.invoke(ctx, **input_dict)
1254
+
1255
+ # Serialize result
1256
+ output_data = json.dumps(result).encode("utf-8")
1257
+
1258
+ return PyExecuteComponentResponse(
1259
+ invocation_id=request.invocation_id,
1260
+ success=True,
1261
+ output_data=output_data,
1262
+ state_update=None,
1263
+ error_message=None,
1264
+ metadata=None,
1265
+ is_chunk=False,
1266
+ done=True,
1267
+ chunk_index=0,
1268
+ attempt=getattr(request, 'attempt', 0),
1269
+ )
1270
+
1271
+ except Exception as e:
1272
+ # Include exception type for better error messages
1273
+ error_msg = f"{type(e).__name__}: {str(e)}"
1274
+
1275
+ # Capture full stack trace for telemetry
1276
+ import traceback
1277
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
1278
+
1279
+ # Log with full traceback using ctx.logger to ensure run_id correlation
1280
+ from .context import get_current_context
1281
+ current_ctx = get_current_context()
1282
+ error_logger = current_ctx.logger if current_ctx else logger
1283
+ error_logger.error(f"Tool execution failed: {error_msg}", exc_info=True)
1284
+
1285
+ # Store error metadata for observability
1286
+ metadata = {
1287
+ "error_type": type(e).__name__,
1288
+ "stack_trace": stack_trace,
1289
+ "error": True,
1290
+ }
1291
+
1292
+ # CRITICAL: Extract critical metadata (including tenant_id) for journal event correlation
1293
+ critical_metadata = self._extract_critical_metadata(request)
1294
+ metadata.update(critical_metadata)
1295
+
1296
+ # Normalize metadata for Rust FFI compatibility
1297
+ normalized_metadata = _normalize_metadata(metadata)
1298
+
1299
+ return PyExecuteComponentResponse(
1300
+ invocation_id=request.invocation_id,
1301
+ success=False,
1302
+ output_data=b"",
1303
+ state_update=None,
1304
+ error_message=error_msg,
1305
+ metadata=normalized_metadata,
1306
+ is_chunk=False,
1307
+ done=True,
1308
+ chunk_index=0,
1309
+ attempt=getattr(request, 'attempt', 0),
1310
+ )
1311
+
1312
+ finally:
1313
+ # Always reset context to prevent leakage between executions
1314
+ _current_context.reset(token)
1315
+
1316
+ async def _execute_entity(self, entity_type, input_data: bytes, request):
1317
+ """Execute an entity method."""
1318
+ import json
1319
+ from .context import Context
1320
+ from .entity import EntityType, Entity, _entity_state_adapter_ctx
1321
+ from ._core import PyExecuteComponentResponse
1322
+
1323
+ # Set entity state adapter in context for Entity instances to access
1324
+ _entity_state_adapter_ctx.set(self._entity_state_adapter)
1325
+
1326
+ try:
1327
+ # Parse input data
1328
+ input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
1329
+
1330
+ # Extract entity key and method name from input
1331
+ entity_key = input_dict.pop("key", None)
1332
+ method_name = input_dict.pop("method", None)
1333
+
1334
+ if not entity_key:
1335
+ raise ValueError("Entity invocation requires 'key' parameter")
1336
+ if not method_name:
1337
+ raise ValueError("Entity invocation requires 'method' parameter")
1338
+
1339
+ # Create context for logging and tracing
1340
+ ctx = Context(
1341
+ run_id=f"{self.service_name}:{entity_type.name}:{entity_key}",
1342
+ runtime_context=request.runtime_context,
1343
+ )
1344
+
1345
+ # Set context in contextvar so get_current_context() and error handlers can access it
1346
+ from .context import set_current_context, _current_context
1347
+ token = set_current_context(ctx)
1348
+
1349
+ # Note: State loading is now handled automatically by the entity method wrapper
1350
+ # via EntityStateAdapter which uses the Rust core for cache + platform persistence
1351
+
1352
+ # Create entity instance using the stored class reference
1353
+ entity_instance = entity_type.entity_class(key=entity_key)
1354
+
1355
+ # Get method
1356
+ if not hasattr(entity_instance, method_name):
1357
+ raise ValueError(f"Entity '{entity_type.name}' has no method '{method_name}'")
1358
+
1359
+ method = getattr(entity_instance, method_name)
1360
+
1361
+ # Execute method (entity method wrapper handles state load/save automatically)
1362
+ result = await method(**input_dict)
1363
+
1364
+ # Serialize result
1365
+ output_data = json.dumps(result).encode("utf-8")
1366
+
1367
+ # Note: State persistence is now handled automatically by the entity method wrapper
1368
+ # via EntityStateAdapter which uses Rust core for optimistic locking + version tracking
1369
+
1370
+ # CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
1371
+ metadata = self._extract_critical_metadata(request)
1372
+
1373
+ return PyExecuteComponentResponse(
1374
+ invocation_id=request.invocation_id,
1375
+ success=True,
1376
+ output_data=output_data,
1377
+ state_update=None, # TODO: Use structured StateUpdate object
1378
+ error_message=None,
1379
+ metadata=metadata if metadata else None, # Include state in metadata for Worker Coordinator
1380
+ is_chunk=False,
1381
+ done=True,
1382
+ chunk_index=0,
1383
+ attempt=getattr(request, 'attempt', 0),
1384
+ )
1385
+
1386
+ except Exception as e:
1387
+ # Include exception type for better error messages
1388
+ error_msg = f"{type(e).__name__}: {str(e)}"
1389
+
1390
+ # Capture full stack trace for telemetry
1391
+ import traceback
1392
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
1393
+
1394
+ # Log with full traceback using ctx.logger to ensure run_id correlation
1395
+ from .context import get_current_context
1396
+ current_ctx = get_current_context()
1397
+ error_logger = current_ctx.logger if current_ctx else logger
1398
+ error_logger.error(f"Entity execution failed: {error_msg}", exc_info=True)
1399
+
1400
+ # Store error metadata for observability
1401
+ metadata = {
1402
+ "error_type": type(e).__name__,
1403
+ "stack_trace": stack_trace,
1404
+ "error": True,
1405
+ }
1406
+
1407
+ # Extract critical metadata for journal correlation (if available)
1408
+ critical_metadata = self._extract_critical_metadata(request)
1409
+ metadata.update(critical_metadata)
1410
+
1411
+ # Normalize metadata for Rust FFI compatibility
1412
+ normalized_metadata = _normalize_metadata(metadata)
1413
+
1414
+ return PyExecuteComponentResponse(
1415
+ invocation_id=request.invocation_id,
1416
+ success=False,
1417
+ output_data=b"",
1418
+ state_update=None,
1419
+ error_message=error_msg,
1420
+ metadata=normalized_metadata,
1421
+ is_chunk=False,
1422
+ done=True,
1423
+ chunk_index=0,
1424
+ attempt=getattr(request, 'attempt', 0),
1425
+ )
1426
+
1427
+ finally:
1428
+ # Always reset context to prevent leakage between executions
1429
+ _current_context.reset(token)
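
The input payload this method expects can be sketched as follows (hypothetical entity and method names): `key` selects the entity instance, `method` selects the handler, and the remaining fields are passed through as keyword arguments.

```python
# Hypothetical entity invocation payload consumed by _execute_entity.
payload = {"key": "cart-123", "method": "add_item", "sku": "ABC-1", "qty": 2}
```
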
1430
+
1431
+ async def _execute_agent(self, agent, input_data: bytes, request):
1432
+ """Execute an agent with session support for multi-turn conversations."""
1433
+ import json
1434
+ import uuid
1435
+ from .agent import AgentContext
1436
+ from .entity import _entity_state_adapter_ctx
1437
+ from ._core import PyExecuteComponentResponse
1438
+
1439
+ # Set entity state adapter in context so AgentContext can access it
1440
+ _entity_state_adapter_ctx.set(self._entity_state_adapter)
1441
+
1442
+ try:
1443
+ # Parse input data
1444
+ input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
1445
+
1446
+ # Extract user message
1447
+ user_message = input_dict.get("message", "")
1448
+ if not user_message:
1449
+ raise ValueError("Agent invocation requires 'message' parameter")
1450
+
1451
+ # Extract or generate session_id for multi-turn conversation support
1452
+ # If session_id is provided, the agent will load previous conversation history
1453
+ # If not provided, a new session is created with auto-generated ID
1454
+ session_id = input_dict.get("session_id")
1455
+
1456
+ if not session_id:
1457
+ session_id = str(uuid.uuid4())
1458
+ logger.info(f"Created new agent session: {session_id}")
1459
+ else:
1460
+ logger.info(f"Using existing agent session: {session_id}")
1461
+
1462
+ # Create AgentContext with session support for conversation persistence
1463
+ # AgentContext automatically loads/saves conversation history based on session_id
1464
+ ctx = AgentContext(
1465
+ run_id=request.invocation_id,
1466
+ agent_name=agent.name,
1467
+ session_id=session_id,
1468
+ runtime_context=request.runtime_context,
1469
+ )
1470
+
1471
+ # Set context in contextvar so get_current_context() and error handlers can access it
1472
+ from .context import set_current_context, _current_context
1473
+ token = set_current_context(ctx)
1474
+
1475
+ # Execute agent - conversation history is automatically included
1476
+ agent_result = await agent.run(user_message, context=ctx)
1477
+
1478
+ # Build response with agent output and tool calls
1479
+ result = {
1480
+ "output": agent_result.output,
1481
+ "tool_calls": agent_result.tool_calls,
1482
+ }
1483
+
1484
+ # Serialize result
1485
+ output_data = json.dumps(result).encode("utf-8")
1486
+
1487
+ # CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
1488
+ metadata = self._extract_critical_metadata(request)
1489
+ # Also include session_id for UI to persist conversation
1490
+ metadata["session_id"] = session_id
1491
+
1492
+ return PyExecuteComponentResponse(
1493
+ invocation_id=request.invocation_id,
1494
+ success=True,
1495
+ output_data=output_data,
1496
+ state_update=None,
1497
+ error_message=None,
1498
+ metadata=metadata if metadata else None,
1499
+ is_chunk=False,
1500
+ done=True,
1501
+ chunk_index=0,
1502
+ attempt=getattr(request, 'attempt', 0),
1503
+ )
1504
+
1505
+ except Exception as e:
1506
+ # Include exception type for better error messages
1507
+ error_msg = f"{type(e).__name__}: {str(e)}"
1508
+
1509
+ # Capture full stack trace for telemetry
1510
+ import traceback
1511
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
1512
+
1513
+ # Log with full traceback using ctx.logger to ensure run_id correlation
1514
+ from .context import get_current_context
1515
+ current_ctx = get_current_context()
1516
+ error_logger = current_ctx.logger if current_ctx else logger
1517
+ error_logger.error(f"Agent execution failed: {error_msg}", exc_info=True)
1518
+
1519
+ # Store error metadata for observability
1520
+ metadata = {
1521
+ "error_type": type(e).__name__,
1522
+ "stack_trace": stack_trace,
1523
+ "error": True,
1524
+ }
1525
+
1526
+ # Extract critical metadata for journal correlation (if available)
1527
+ critical_metadata = self._extract_critical_metadata(request)
1528
+ metadata.update(critical_metadata)
1529
+
1530
+ # Normalize metadata for Rust FFI compatibility
1531
+ normalized_metadata = _normalize_metadata(metadata)
1532
+
1533
+ return PyExecuteComponentResponse(
1534
+ invocation_id=request.invocation_id,
1535
+ success=False,
1536
+ output_data=b"",
1537
+ state_update=None,
1538
+ error_message=error_msg,
1539
+ metadata=normalized_metadata,
1540
+ is_chunk=False,
1541
+ done=True,
1542
+ chunk_index=0,
1543
+ attempt=getattr(request, 'attempt', 0),
1544
+ )
1545
+
1546
+ finally:
1547
+ # Always reset context to prevent leakage between executions
1548
+ _current_context.reset(token)
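
The agent input payload follows the same pattern (hypothetical values): the first call omits `session_id` and receives a generated one back in the response metadata; passing it on the next call lets AgentContext reload the conversation history.

```python
# Hypothetical multi-turn agent invocation payloads.
first_turn = {"message": "Summarize yesterday's sales"}     # new session auto-generated
second_turn = {"message": "Break that down by region",
               "session_id": "6f1c0c9e-..."}                # session_id from prior response metadata
```
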
1549
+
1550
+ def _create_error_response(self, request, error_message: str):
1551
+ """Create an error response."""
1552
+ from ._core import PyExecuteComponentResponse
1553
+
1554
+ return PyExecuteComponentResponse(
1555
+ invocation_id=request.invocation_id,
1556
+ success=False,
1557
+ output_data=b"",
1558
+ state_update=None,
1559
+ error_message=error_message,
1560
+ metadata=None,
1561
+ is_chunk=False,
1562
+ done=True,
1563
+ chunk_index=0,
1564
+ attempt=getattr(request, 'attempt', 0),
1565
+ )
1566
+
1567
+ async def run(self):
1568
+ """Run the worker (register and start message loop).
1569
+
1570
+ This method will:
1571
+ 1. Discover all registered @function and @workflow handlers
1572
+ 2. Register with the coordinator
1573
+ 3. Create a shared Python event loop for all function executions
1574
+ 4. Enter the message processing loop
1575
+ 5. Block until shutdown
1576
+
1577
+ This is the main entry point for your worker service.
1578
+ """
1579
+ logger.info(f"Starting worker: {self.service_name}")
1580
+
1581
+ # Discover components
1582
+ components = self._discover_components()
1583
+
1584
+ # Set components on Rust worker
1585
+ self._rust_worker.set_components(components)
1586
+
1587
+ # Set metadata
1588
+ if self.metadata:
1589
+ self._rust_worker.set_service_metadata(self.metadata)
1590
+
1591
+ # Configure entity state manager on Rust worker for database persistence
1592
+ logger.info("Configuring Rust EntityStateManager for database persistence")
1593
+ # Access the Rust core from the adapter
1594
+ if hasattr(self._entity_state_adapter, '_rust_core') and self._entity_state_adapter._rust_core:
1595
+ self._rust_worker.set_entity_state_manager(self._entity_state_adapter._rust_core)
1596
+ logger.info("Successfully configured Rust EntityStateManager")
1597
+
1598
+ # Get the current event loop to pass to Rust for concurrent Python async execution
1599
+ # This allows Rust to execute Python async functions on the same event loop
1600
+ # without spawn_blocking overhead, enabling true concurrency
1601
+ loop = asyncio.get_running_loop()
1602
+ logger.info("Passing Python event loop to Rust worker for concurrent execution")
1603
+
1604
+ # Set event loop on Rust worker
1605
+ self._rust_worker.set_event_loop(loop)
1606
+
1607
+ # Set message handler
1608
+ handler = self._create_message_handler()
1609
+ self._rust_worker.set_message_handler(handler)
1610
+
1611
+ # Initialize worker
1612
+ self._rust_worker.initialize()
1613
+
1614
+ logger.info("Worker registered successfully, entering message loop...")
1615
+
1616
+ # Run worker (this will block until shutdown)
1617
+ await self._rust_worker.run()
1618
+
1619
+ logger.info("Worker shutdown complete")