agnt5 0.2.8a10__cp310-abi3-manylinux_2_34_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of agnt5 might be problematic; see the advisory details in the registry listing.
- agnt5/__init__.py +91 -0
- agnt5/_compat.py +16 -0
- agnt5/_core.abi3.so +0 -0
- agnt5/_retry_utils.py +169 -0
- agnt5/_schema_utils.py +312 -0
- agnt5/_telemetry.py +182 -0
- agnt5/agent.py +1685 -0
- agnt5/client.py +741 -0
- agnt5/context.py +178 -0
- agnt5/entity.py +795 -0
- agnt5/exceptions.py +102 -0
- agnt5/function.py +321 -0
- agnt5/lm.py +813 -0
- agnt5/tool.py +648 -0
- agnt5/tracing.py +196 -0
- agnt5/types.py +110 -0
- agnt5/version.py +19 -0
- agnt5/worker.py +1619 -0
- agnt5/workflow.py +1048 -0
- agnt5-0.2.8a10.dist-info/METADATA +25 -0
- agnt5-0.2.8a10.dist-info/RECORD +22 -0
- agnt5-0.2.8a10.dist-info/WHEEL +4 -0
agnt5/worker.py
ADDED
|
@@ -0,0 +1,1619 @@
|
|
|
1
|
+
"""Worker implementation for AGNT5 SDK."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import contextvars
|
|
7
|
+
import logging
|
|
8
|
+
import uuid
|
|
9
|
+
from typing import Any, Dict, List, Optional
|
|
10
|
+
|
|
11
|
+
from .function import FunctionRegistry
|
|
12
|
+
from .workflow import WorkflowRegistry
|
|
13
|
+
from ._telemetry import setup_module_logger
|
|
14
|
+
|
|
15
|
+
logger = setup_module_logger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _normalize_metadata(metadata: Dict[str, Any]) -> Dict[str, str]:
|
|
19
|
+
"""
|
|
20
|
+
Convert metadata dictionary to Dict[str, str] for Rust FFI compatibility.
|
|
21
|
+
|
|
22
|
+
PyO3 requires HashMap<String, String>, but Python code may include booleans,
|
|
23
|
+
integers, or other types. This helper ensures all values are strings.
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
metadata: Dictionary with potentially mixed types
|
|
27
|
+
|
|
28
|
+
Returns:
|
|
29
|
+
Dictionary with all string values
|
|
30
|
+
|
|
31
|
+
Example:
|
|
32
|
+
>>> _normalize_metadata({"error": True, "count": 42, "msg": "hello"})
|
|
33
|
+
{"error": "true", "count": "42", "msg": "hello"}
|
|
34
|
+
"""
|
|
35
|
+
normalized = {}
|
|
36
|
+
for key, value in metadata.items():
|
|
37
|
+
if isinstance(value, str):
|
|
38
|
+
normalized[key] = value
|
|
39
|
+
elif isinstance(value, bool):
|
|
40
|
+
# Convert bool to lowercase string for JSON compatibility
|
|
41
|
+
normalized[key] = str(value).lower()
|
|
42
|
+
elif value is None:
|
|
43
|
+
normalized[key] = ""
|
|
44
|
+
else:
|
|
45
|
+
# Convert any other type to string representation
|
|
46
|
+
normalized[key] = str(value)
|
|
47
|
+
return normalized
|
|
48
|
+
|
|
49
|
+
# Context variable to store trace metadata for propagation to LM calls.
# This allows the Rust LM layer to access traceparent without explicit
# parameter passing.
# NOTE(review): the default is a single shared mutable dict. Code should only
# replace it via .set() (as the worker does); mutating the returned default in
# place would leak state across contexts -- confirm no caller does that.
_trace_metadata: contextvars.ContextVar[Dict[str, str]] = contextvars.ContextVar(
    '_trace_metadata', default={}
)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class Worker:
|
|
57
|
+
"""AGNT5 Worker for registering and running functions/workflows with the coordinator.
|
|
58
|
+
|
|
59
|
+
The Worker class manages the lifecycle of your service, including:
|
|
60
|
+
- Registration with the AGNT5 coordinator
|
|
61
|
+
- Automatic discovery of @function and @workflow decorated handlers
|
|
62
|
+
- Message handling and execution
|
|
63
|
+
- Health monitoring
|
|
64
|
+
|
|
65
|
+
Example:
|
|
66
|
+
```python
|
|
67
|
+
from agnt5 import Worker, function
|
|
68
|
+
|
|
69
|
+
@function
|
|
70
|
+
async def process_data(ctx: Context, data: str) -> dict:
|
|
71
|
+
return {"result": data.upper()}
|
|
72
|
+
|
|
73
|
+
async def main():
|
|
74
|
+
worker = Worker(
|
|
75
|
+
service_name="data-processor",
|
|
76
|
+
service_version="1.0.0",
|
|
77
|
+
coordinator_endpoint="http://localhost:34186"
|
|
78
|
+
)
|
|
79
|
+
await worker.run()
|
|
80
|
+
|
|
81
|
+
if __name__ == "__main__":
|
|
82
|
+
asyncio.run(main())
|
|
83
|
+
```
|
|
84
|
+
"""
|
|
85
|
+
|
|
86
|
+
def __init__(
    self,
    service_name: str,
    service_version: str = "1.0.0",
    coordinator_endpoint: Optional[str] = None,
    runtime: str = "standalone",
    metadata: Optional[Dict[str, str]] = None,
    functions: Optional[List] = None,
    workflows: Optional[List] = None,
    entities: Optional[List] = None,
    agents: Optional[List] = None,
    tools: Optional[List] = None,
    auto_register: bool = False,
    auto_register_paths: Optional[List[str]] = None,
    pyproject_path: Optional[str] = None,
):
    """Initialize a new Worker with explicit or automatic component registration.

    The Worker supports two registration modes:

    **Explicit Mode (default, production):**
    - Register workflows/agents explicitly, their dependencies are auto-included
    - Optionally register standalone functions/tools for direct API invocation

    **Auto-Registration Mode (development):**
    - Automatically discovers all decorated components in source paths
    - Reads source paths from pyproject.toml or uses explicit paths
    - No need to maintain import lists

    Args:
        service_name: Unique name for this service
        service_version: Version string (semantic versioning recommended)
        coordinator_endpoint: Coordinator endpoint URL (default: from env AGNT5_COORDINATOR_ENDPOINT)
        runtime: Runtime type - "standalone", "docker", "kubernetes", etc.
        metadata: Optional service-level metadata
        functions: List of @function decorated handlers (explicit mode)
        workflows: List of @workflow decorated handlers (explicit mode)
        entities: List of Entity classes (explicit mode)
        agents: List of Agent instances (explicit mode)
        tools: List of Tool instances (explicit mode)
        auto_register: Enable automatic component discovery (default: False)
        auto_register_paths: Explicit source paths to scan (overrides pyproject.toml discovery)
        pyproject_path: Path to pyproject.toml (default: current directory)

    Raises:
        ImportError: If the compiled Rust core extension cannot be imported.

    Example (explicit mode - production):
        ```python
        from agnt5 import Worker
        from my_service import greet_user, order_fulfillment, ShoppingCart, analyst_agent

        worker = Worker(
            service_name="my-service",
            workflows=[order_fulfillment],
            entities=[ShoppingCart],
            agents=[analyst_agent],
            functions=[greet_user],
        )
        await worker.run()
        ```

    Example (auto-register mode - development):
        ```python
        from agnt5 import Worker

        worker = Worker(
            service_name="my-service",
            auto_register=True,  # Discovers from pyproject.toml
        )
        await worker.run()
        ```
    """
    # Plain attribute assignments; coordinator_endpoint is stored as-is here
    # (not validated or resolved against the environment in this constructor).
    self.service_name = service_name
    self.service_version = service_version
    self.coordinator_endpoint = coordinator_endpoint
    self.runtime = runtime
    self.metadata = metadata or {}

    # Get tenant_id from environment (required for entity state management)
    import os
    self._tenant_id = os.getenv("AGNT5_TENANT_ID", "default-tenant")

    # Import Rust worker. The classes are cached on the instance so later
    # methods (e.g. _discover_components) can construct FFI objects without
    # re-importing.
    try:
        from ._core import PyWorker, PyWorkerConfig, PyComponentInfo
        self._PyWorker = PyWorker
        self._PyWorkerConfig = PyWorkerConfig
        self._PyComponentInfo = PyComponentInfo
    except ImportError as e:
        # Re-raised with install guidance; implicit exception chaining keeps
        # the original traceback.
        raise ImportError(
            f"Failed to import Rust core worker: {e}. "
            "Make sure agnt5 is properly installed with: pip install agnt5"
        )

    # Create Rust worker config. Note the Python "runtime" argument maps to
    # the FFI field named "service_type".
    self._rust_config = self._PyWorkerConfig(
        service_name=service_name,
        service_version=service_version,
        service_type=runtime,
    )

    # Create Rust worker instance
    self._rust_worker = self._PyWorker(self._rust_config)

    # Create worker-scoped entity state adapter with Rust core
    from .entity import EntityStateAdapter
    from ._core import EntityStateManager as RustEntityStateManager

    # Create Rust core for entity state management, partitioned by tenant.
    rust_core = RustEntityStateManager(tenant_id=self._tenant_id)

    # Create Python adapter (thin wrapper around Rust core)
    self._entity_state_adapter = EntityStateAdapter(rust_core=rust_core)

    logger.info("Created EntityStateAdapter with Rust core for state management")

    # Component registration: auto-discover or explicit. Both branches end
    # with self._explicit_components populated.
    if auto_register:
        # Auto-registration mode: discover from source paths
        if auto_register_paths:
            source_paths = auto_register_paths
            logger.info(f"Auto-registration with explicit paths: {source_paths}")
        else:
            source_paths = self._discover_source_paths(pyproject_path)
            logger.info(f"Auto-registration with discovered paths: {source_paths}")

        # Auto-discover components (will populate _explicit_components)
        self._auto_discover_components(source_paths)
    else:
        # Explicit registration from constructor kwargs; list() copies guard
        # against callers mutating their argument lists afterwards.
        self._explicit_components = {
            'functions': list(functions or []),
            'workflows': list(workflows or []),
            'entities': list(entities or []),
            'agents': list(agents or []),
            'tools': list(tools or []),
        }

    # Count explicitly registered components
    total_explicit = sum(len(v) for v in self._explicit_components.values())
    logger.info(
        f"Worker initialized: {service_name} v{service_version} (runtime: {runtime}), "
        f"{total_explicit} components explicitly registered"
    )
|
|
228
|
+
|
|
229
|
+
def register_components(
    self,
    functions=None,
    workflows=None,
    entities=None,
    agents=None,
    tools=None,
):
    """Register additional components after Worker initialization.

    Appends the given components to the Worker's existing registration
    lists, so it can be called repeatedly for conditional or dynamic
    registration.

    Args:
        functions: List of functions decorated with @function
        workflows: List of workflows decorated with @workflow
        entities: List of entity classes
        agents: List of agent instances
        tools: List of tool instances

    Example:
        ```python
        worker = Worker(service_name="my-service")

        # Register conditionally
        if feature_enabled:
            worker.register_components(workflows=[advanced_workflow])
        ```
    """
    # Pair each category key with the corresponding argument; the key doubles
    # as the human-readable noun in the debug message.
    additions = {
        'functions': functions,
        'workflows': workflows,
        'entities': entities,
        'agents': agents,
        'tools': tools,
    }

    for category, items in additions.items():
        if items:
            self._explicit_components[category].extend(items)
            logger.debug(f"Incrementally registered {len(items)} {category}")

    total = sum(len(v) for v in self._explicit_components.values())
    logger.info(f"Total components now registered: {total}")
|
|
280
|
+
|
|
281
|
+
def _discover_source_paths(self, pyproject_path: Optional[str] = None) -> List[str]:
    """Discover source paths from pyproject.toml.

    Reads pyproject.toml to find package source directories using:
    - Hatch: [tool.hatch.build.targets.wheel] packages
    - Maturin: [tool.maturin] python-source
    - Fallback: ["src"] if not found

    Args:
        pyproject_path: Path to pyproject.toml (default: current directory)

    Returns:
        List of directory paths to scan (e.g., ["src/agnt5_benchmark"])
    """
    from pathlib import Path

    # Python 3.11+ has tomllib in stdlib; without it we cannot parse the file.
    try:
        import tomllib
    except ImportError:
        logger.error("tomllib not available (Python 3.11+ required for auto-registration)")
        return ["src"]

    # Resolve the pyproject.toml location (explicit path wins over CWD).
    pyproject_file = (
        Path(pyproject_path) if pyproject_path else Path.cwd() / "pyproject.toml"
    )

    if not pyproject_file.exists():
        logger.warning(
            f"pyproject.toml not found at {pyproject_file}, "
            f"defaulting to 'src/' directory"
        )
        return ["src"]

    # Parse pyproject.toml; any parse failure degrades to the default.
    try:
        with open(pyproject_file, "rb") as f:
            config = tomllib.load(f)
    except Exception as e:
        logger.error(f"Failed to parse pyproject.toml: {e}")
        return ["src"]

    discovered: List[str] = []
    tool_section = config.get("tool", {})

    # Hatch configuration takes precedence.
    if "hatch" in tool_section:
        hatch_cfg = tool_section["hatch"]
        if "build" in hatch_cfg and "targets" in hatch_cfg["build"]:
            wheel_cfg = hatch_cfg["build"]["targets"].get("wheel", {})
            discovered.extend(wheel_cfg.get("packages", []))

    # Maturin configuration is consulted only when Hatch yielded nothing.
    if not discovered and "maturin" in tool_section:
        python_src = tool_section["maturin"].get("python-source")
        if python_src:
            discovered.append(python_src)

    # Final fallback to src/.
    if not discovered:
        logger.info("No source paths in pyproject.toml, defaulting to 'src/'")
        discovered = ["src"]

    logger.info(f"Discovered source paths from pyproject.toml: {discovered}")
    return discovered
|
|
351
|
+
|
|
352
|
+
def _auto_discover_components(self, source_paths: List[str]) -> None:
    """Auto-discover components by importing all Python files in source paths.

    Importing each module triggers the @function/@workflow/etc. decorators,
    which populate the global registries; the discovered objects are then
    copied out of those registries into ``self._explicit_components``.

    Args:
        source_paths: List of directory paths to scan
    """
    import importlib.util
    import sys
    from pathlib import Path

    logger.info(f"Auto-discovering components in paths: {source_paths}")

    total_modules = 0

    for source_path in source_paths:
        path = Path(source_path)

        if not path.exists():
            logger.warning(f"Source path does not exist: {source_path}")
            continue

        # Recursively find all .py files
        for py_file in path.rglob("*.py"):
            # Skip __pycache__ and test files
            if "__pycache__" in str(py_file) or py_file.name.startswith("test_"):
                continue

            # Convert path to module name
            # e.g., src/agnt5_benchmark/functions.py -> agnt5_benchmark.functions
            # NOTE(review): relative_to(path.parent) keeps the scanned directory's
            # own name in the module path, so for the bare "src" fallback this
            # yields "src.<pkg>.<mod>" rather than "<pkg>.<mod>" -- confirm intended.
            relative_path = py_file.relative_to(path.parent)
            module_parts = list(relative_path.parts[:-1])  # Remove .py extension part
            module_parts.append(relative_path.stem)  # Add filename without .py
            module_name = ".".join(module_parts)

            # Import module (triggers decorators)
            try:
                if module_name in sys.modules:
                    logger.debug(f"Module already imported: {module_name}")
                else:
                    spec = importlib.util.spec_from_file_location(module_name, py_file)
                    if spec and spec.loader:
                        module = importlib.util.module_from_spec(spec)
                        # Register in sys.modules before exec so relative
                        # imports within the module can resolve it.
                        sys.modules[module_name] = module
                        spec.loader.exec_module(module)
                        logger.debug(f"Auto-imported: {module_name}")
                        total_modules += 1
            except Exception as e:
                # Best-effort discovery: a broken module is logged and skipped.
                logger.warning(f"Failed to import {module_name}: {e}")

    logger.info(f"Auto-imported {total_modules} modules")

    # Collect components from registries
    from .agent import AgentRegistry
    from .entity import EntityRegistry
    from .tool import ToolRegistry

    # Extract actual objects from registries (registries store config/type
    # wrappers; the raw handler/class is what downstream code expects).
    functions = [cfg.handler for cfg in FunctionRegistry.all().values()]
    workflows = [cfg.handler for cfg in WorkflowRegistry.all().values()]
    entities = [et.entity_class for et in EntityRegistry.all().values()]
    agents = list(AgentRegistry.all().values())
    tools = list(ToolRegistry.all().values())

    self._explicit_components = {
        'functions': functions,
        'workflows': workflows,
        'entities': entities,
        'agents': agents,
        'tools': tools,
    }

    logger.info(
        f"Auto-discovered components: "
        f"{len(functions)} functions, "
        f"{len(workflows)} workflows, "
        f"{len(entities)} entities, "
        f"{len(agents)} agents, "
        f"{len(tools)} tools"
    )
|
|
431
|
+
|
|
432
|
+
def _discover_components(self):
    """Discover explicit components and auto-include their dependencies.

    Hybrid approach:
    - Explicitly registered workflows/agents are processed
    - Functions called by workflows are auto-included (TODO: implement)
    - Tools used by agents are auto-included
    - Standalone functions/tools can be explicitly registered

    Returns:
        List of PyComponentInfo instances for all components
    """
    components = []
    import json

    # Import registries
    from .entity import EntityRegistry

    # Track all components (explicit + auto-included); sets deduplicate
    # anything registered both explicitly and via an agent.
    all_functions = set(self._explicit_components['functions'])
    all_tools = set(self._explicit_components['tools'])

    # Auto-include agent tool dependencies
    for agent in self._explicit_components['agents']:
        if hasattr(agent, 'tools') and agent.tools:
            # Agent.tools is a dict of {tool_name: tool_instance}
            all_tools.update(agent.tools.values())
            logger.debug(
                f"Auto-included {len(agent.tools)} tools from agent '{agent.name}'"
            )

    # Log registration summary
    explicit_func_count = len(self._explicit_components['functions'])
    explicit_tool_count = len(self._explicit_components['tools'])
    auto_func_count = len(all_functions) - explicit_func_count
    auto_tool_count = len(all_tools) - explicit_tool_count

    logger.info(
        f"Component registration summary: "
        f"{len(all_functions)} functions ({explicit_func_count} explicit, {auto_func_count} auto-included), "
        f"{len(self._explicit_components['workflows'])} workflows, "
        f"{len(self._explicit_components['entities'])} entities, "
        f"{len(self._explicit_components['agents'])} agents, "
        f"{len(all_tools)} tools ({explicit_tool_count} explicit, {auto_tool_count} auto-included)"
    )

    # Process functions (explicit + auto-included)
    for func in all_functions:
        config = FunctionRegistry.get(func.__name__)
        if not config:
            logger.warning(f"Function '{func.__name__}' not found in FunctionRegistry")
            continue

        input_schema_str = json.dumps(config.input_schema) if config.input_schema else None
        output_schema_str = json.dumps(config.output_schema) if config.output_schema else None
        metadata = config.metadata if config.metadata else {}

        # Serialize retry and backoff policies as string values for the
        # HashMap<String, String> expected by the Rust FFI.
        config_dict = {}
        if config.retries:
            config_dict["max_attempts"] = str(config.retries.max_attempts)
            config_dict["initial_interval_ms"] = str(config.retries.initial_interval_ms)
            config_dict["max_interval_ms"] = str(config.retries.max_interval_ms)

        if config.backoff:
            config_dict["backoff_type"] = config.backoff.type.value
            config_dict["backoff_multiplier"] = str(config.backoff.multiplier)

        component_info = self._PyComponentInfo(
            name=config.name,
            component_type="function",
            metadata=metadata,
            config=config_dict,
            input_schema=input_schema_str,
            output_schema=output_schema_str,
            definition=None,
        )
        components.append(component_info)

    # Process workflows
    for workflow in self._explicit_components['workflows']:
        config = WorkflowRegistry.get(workflow.__name__)
        if not config:
            logger.warning(f"Workflow '{workflow.__name__}' not found in WorkflowRegistry")
            continue

        input_schema_str = json.dumps(config.input_schema) if config.input_schema else None
        output_schema_str = json.dumps(config.output_schema) if config.output_schema else None
        metadata = config.metadata if config.metadata else {}

        component_info = self._PyComponentInfo(
            name=config.name,
            component_type="workflow",
            metadata=metadata,
            config={},
            input_schema=input_schema_str,
            output_schema=output_schema_str,
            definition=None,
        )
        components.append(component_info)

    # Process entities
    for entity_class in self._explicit_components['entities']:
        entity_type = EntityRegistry.get(entity_class.__name__)
        if not entity_type:
            logger.warning(f"Entity '{entity_class.__name__}' not found in EntityRegistry")
            continue

        # Build complete entity definition with state schema and method schemas
        entity_definition = entity_type.build_entity_definition()
        definition_str = json.dumps(entity_definition)

        # Keep minimal metadata for backward compatibility
        metadata_dict = {
            "methods": json.dumps(list(entity_type._method_schemas.keys())),
        }

        component_info = self._PyComponentInfo(
            name=entity_type.name,
            component_type="entity",
            metadata=metadata_dict,
            config={},
            input_schema=None,  # Entities don't have single input/output schemas
            output_schema=None,
            definition=definition_str,  # Complete entity definition with state and methods
        )
        components.append(component_info)
        logger.debug(f"Registered entity '{entity_type.name}' with definition")

    # Process agents
    from .agent import AgentRegistry

    for agent in self._explicit_components['agents']:
        # Register agent in AgentRegistry for execution lookup
        AgentRegistry.register(agent)
        logger.debug(f"Registered agent '{agent.name}' in AgentRegistry for execution")

        input_schema_str = json.dumps(agent.input_schema) if hasattr(agent, 'input_schema') and agent.input_schema else None
        output_schema_str = json.dumps(agent.output_schema) if hasattr(agent, 'output_schema') and agent.output_schema else None

        # BUGFIX: copy the agent's metadata instead of aliasing it. The
        # original code assigned agent.metadata directly and then wrote the
        # "tools" key into it, mutating the agent object in place (and
        # crashing with TypeError if agent.metadata was None).
        metadata_dict = dict(agent.metadata) if hasattr(agent, 'metadata') and agent.metadata else {}
        if hasattr(agent, 'tools'):
            metadata_dict["tools"] = json.dumps(list(agent.tools.keys()))

        component_info = self._PyComponentInfo(
            name=agent.name,
            component_type="agent",
            metadata=metadata_dict,
            config={},
            input_schema=input_schema_str,
            output_schema=output_schema_str,
            definition=None,
        )
        components.append(component_info)

    # Process tools (explicit + auto-included)
    for tool in all_tools:
        input_schema_str = json.dumps(tool.input_schema) if hasattr(tool, 'input_schema') and tool.input_schema else None
        output_schema_str = json.dumps(tool.output_schema) if hasattr(tool, 'output_schema') and tool.output_schema else None

        component_info = self._PyComponentInfo(
            name=tool.name,
            component_type="tool",
            metadata={},
            config={},
            input_schema=input_schema_str,
            output_schema=output_schema_str,
            definition=None,
        )
        components.append(component_info)

    logger.info(f"Discovered {len(components)} total components")
    return components
|
|
606
|
+
|
|
607
|
+
def _create_message_handler(self):
    """Build the callback the Rust worker invokes for each execution request.

    The returned callable never awaits anything itself: it resolves the
    requested component and hands back an un-awaited coroutine for the Rust
    side to drive.
    """

    def handle_message(request):
        """Handle incoming execution requests - returns coroutine for Rust to await."""
        # Pull the routing fields off the request up front.
        component_name = request.component_name
        component_type = request.component_type
        input_data = request.input_data

        logger.debug(
            f"Handling {component_type} request: {component_name}, input size: {len(input_data)} bytes"
        )

        # Import all registries
        from .tool import ToolRegistry
        from .entity import EntityRegistry
        from .agent import AgentRegistry

        # Dispatch table: each component type maps to its registry and the
        # executor coroutine factory; all executors share the
        # (component, input_data, request) call shape.
        routes = {
            "tool": (ToolRegistry, self._execute_tool),
            "entity": (EntityRegistry, self._execute_entity),
            "agent": (AgentRegistry, self._execute_agent),
            "workflow": (WorkflowRegistry, self._execute_workflow),
            "function": (FunctionRegistry, self._execute_function),
        }

        route = routes.get(component_type)
        if route is not None:
            registry, executor = route
            component = registry.get(component_name)
            if component:
                # The function route historically logged no "Found" line.
                if component_type != "function":
                    logger.debug(f"Found {component_type}: {component_name}")
                # Return coroutine, don't await it
                return executor(component, input_data, request)

        # Unknown type or missing component: wrap the error response in a
        # coroutine so the caller can await it uniformly.
        error_msg = f"Component '{component_name}' of type '{component_type}' not found"
        logger.error(error_msg)

        async def error_response():
            return self._create_error_response(request, error_msg)

        return error_response()

    return handle_message
|
|
672
|
+
|
|
673
|
+
def _extract_critical_metadata(self, request) -> Dict[str, str]:
    """
    Extract critical metadata from request that MUST be propagated to response.

    This ensures journal events are written to the correct tenant partition
    and can be properly replayed. Missing tenant_id causes catastrophic
    event sourcing corruption where events are split across partitions.

    Returns:
        Dict[str, str]: Metadata with all values normalized to strings for Rust FFI
    """
    propagated: Dict[str, str] = {}
    incoming = getattr(request, 'metadata', None)

    if incoming:
        # CRITICAL: tenant_id (and deployment_id) must survive the round
        # trip to prevent journal corruption; stringify immediately for
        # Rust FFI compatibility.
        for key in ("tenant_id", "deployment_id"):
            if key in incoming:
                propagated[key] = str(incoming[key])

    # CRITICAL: PyO3 expects HashMap<String, String> and will fail on
    # bool/int values, so normalize everything before crossing the FFI.
    return _normalize_metadata(propagated)
|
|
696
|
+
|
|
697
|
+
async def _execute_function(self, config, input_data: bytes, request):
    """Execute a function handler (supports both regular and streaming functions).

    Args:
        config: Registered function configuration (provides ``name``,
            ``handler`` and ``retries``).
        input_data: JSON-encoded keyword arguments for the handler; may be empty.
        request: Platform execution request (``invocation_id``, ``metadata``,
            ``runtime_context``, ``attempt``).

    Returns:
        A single ``PyExecuteComponentResponse`` for regular functions, or a
        list of chunk responses terminated by a ``done`` marker for streaming
        (async-generator) functions. Failures are reported as a
        ``success=False`` response, never raised to the caller.
    """
    import json
    import inspect
    from .function import FunctionContext
    from .context import set_current_context, get_current_context, _current_context
    from ._core import PyExecuteComponentResponse

    # BUGFIX: initialize before the try block. Previously `token` (and the
    # `_current_context` import) were bound inside the try, so a failure
    # before set_current_context() — e.g. malformed JSON input — made the
    # finally clause raise UnboundLocalError and mask the original error.
    token = None

    try:
        # Parse input data
        input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}

        # Store trace metadata in a contextvar for LM calls to access.
        # The Rust worker injects traceparent into request.metadata for trace propagation.
        if hasattr(request, 'metadata') and request.metadata:
            _trace_metadata.set(dict(request.metadata))
            logger.debug(f"Trace metadata stored: traceparent={request.metadata.get('traceparent', 'N/A')}")

        # Attempt number from the platform:
        # - > 0: the platform is orchestrating retries
        # - == 0: first attempt (or local retry loop in the decorator wrapper)
        platform_attempt = getattr(request, 'attempt', 0)

        ctx = FunctionContext(
            run_id=f"{self.service_name}:{config.name}",
            attempt=platform_attempt,
            runtime_context=request.runtime_context,
            retry_policy=config.retries,
        )

        # Install context so get_current_context() and error handlers can access it.
        token = set_current_context(ctx)

        # Execute the handler directly — the Rust worker bridge already creates
        # a comprehensive OpenTelemetry span, so no Python-level span is created
        # here (avoids duplicate spans; see DUPLICATE_SPANS_FIX.md).
        if input_dict:
            result = config.handler(ctx, **input_dict)
        else:
            result = config.handler(ctx)

        # Note: no flush_telemetry here — the batch span processor flushes
        # automatically; flushing per-call blocked for ~2 seconds.

        if inspect.isasyncgen(result):
            # Streaming function — collect one response per chunk; the Rust
            # bridge sends each response separately to the coordinator.
            responses = []
            chunk_index = 0

            async for chunk in result:
                chunk_data = json.dumps(chunk).encode("utf-8")
                responses.append(PyExecuteComponentResponse(
                    invocation_id=request.invocation_id,
                    success=True,
                    output_data=chunk_data,
                    state_update=None,
                    error_message=None,
                    metadata=None,
                    is_chunk=True,
                    done=False,
                    chunk_index=chunk_index,
                    attempt=platform_attempt,
                ))
                chunk_index += 1

            # Final "done" marker so the consumer knows the stream is complete.
            responses.append(PyExecuteComponentResponse(
                invocation_id=request.invocation_id,
                success=True,
                output_data=b"",
                state_update=None,
                error_message=None,
                metadata=None,
                is_chunk=True,
                done=True,
                chunk_index=chunk_index,
                attempt=platform_attempt,
            ))

            logger.debug(f"Streaming function produced {len(responses)} chunks")
            return responses

        # Regular function — await the coroutine (if any) and return one response.
        if inspect.iscoroutine(result):
            result = await result

        output_data = json.dumps(result).encode("utf-8")

        # Propagate critical metadata (tenant_id, deployment_id) so journal
        # events are correlated to the correct partition.
        response_metadata = self._extract_critical_metadata(request)

        return PyExecuteComponentResponse(
            invocation_id=request.invocation_id,
            success=True,
            output_data=output_data,
            state_update=None,
            error_message=None,
            metadata=response_metadata if response_metadata else None,
            is_chunk=False,
            done=True,
            chunk_index=0,
            attempt=platform_attempt,
        )

    except Exception as e:
        # Include the exception type for better error messages.
        error_msg = f"{type(e).__name__}: {str(e)}"

        # Capture the full stack trace for telemetry.
        import traceback
        stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))

        # Log via ctx.logger when available to keep run_id correlation.
        current_ctx = get_current_context()
        error_logger = current_ctx.logger if current_ctx else logger
        error_logger.error(f"Function execution failed: {error_msg}", exc_info=True)

        metadata = {
            "error_type": type(e).__name__,
            "stack_trace": stack_trace,
            "error": True,  # Boolean flag for error detection
        }

        # CRITICAL: propagate tenant_id so run.failed events are emitted by
        # the Worker Coordinator against the correct partition.
        metadata.update(self._extract_critical_metadata(request))

        # CRITICAL: normalize to Dict[str, str] — PyO3 expects
        # HashMap<String, String> and fails on bool/int values.
        normalized_metadata = _normalize_metadata(metadata)

        return PyExecuteComponentResponse(
            invocation_id=request.invocation_id,
            success=False,
            output_data=b"",
            state_update=None,
            error_message=error_msg,
            metadata=normalized_metadata,
            is_chunk=False,
            done=True,
            chunk_index=0,
            attempt=getattr(request, 'attempt', 0),
        )

    finally:
        # Always reset the context to prevent leakage between executions;
        # guard against failures that happened before the context was installed.
        if token is not None:
            _current_context.reset(token)
|
|
864
|
+
|
|
865
|
+
async def _execute_workflow(self, config, input_data: bytes, request):
    """Execute a workflow handler with automatic replay support.

    Handles three entry modes visible in this body:
    - fresh execution (no replay metadata),
    - crash-recovery replay (``completed_steps`` / ``workflow_state`` in
      request.metadata),
    - human-in-the-loop resume (``user_response`` and optionally
      ``agent_context`` in request.metadata).

    Returns a PyExecuteComponentResponse: success with step/state metadata,
    a "paused" success response when WaitingForUserInputException is raised,
    or a failure response for any other exception.
    """
    import json
    from .workflow import WorkflowEntity, WorkflowContext
    from .entity import _get_state_adapter, _entity_state_adapter_ctx
    from .exceptions import WaitingForUserInputException
    from ._core import PyExecuteComponentResponse

    # Set entity state adapter in context so workflows can use Entities
    _entity_state_adapter_ctx.set(self._entity_state_adapter)

    try:
        # Parse input data
        input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}

        # Extract or generate session_id for multi-turn conversation support.
        # NOTE(review): this value (and the log lines below) is overwritten
        # unconditionally further down from request.session_id/invocation_id,
        # so this first assignment only affects logging — confirm intent.
        session_id = input_dict.get("session_id")

        if not session_id:
            session_id = str(uuid.uuid4())
            logger.info(f"Created new workflow session: {session_id}")
        else:
            logger.info(f"Using existing workflow session: {session_id}")

        # Parse replay data from request metadata for crash recovery
        completed_steps = {}
        initial_state = {}
        user_response = None

        if hasattr(request, 'metadata') and request.metadata:
            # Parse completed steps for replay (best-effort: bad JSON is logged, not fatal)
            if "completed_steps" in request.metadata:
                completed_steps_json = request.metadata["completed_steps"]
                if completed_steps_json:
                    try:
                        completed_steps = json.loads(completed_steps_json)
                        logger.info(f"🔄 Replaying workflow with {len(completed_steps)} cached steps")
                    except json.JSONDecodeError:
                        logger.warning("Failed to parse completed_steps from metadata")

            # Parse initial workflow state for replay
            if "workflow_state" in request.metadata:
                workflow_state_json = request.metadata["workflow_state"]
                if workflow_state_json:
                    try:
                        initial_state = json.loads(workflow_state_json)
                        logger.info(f"🔄 Loaded workflow state: {len(initial_state)} keys")
                    except json.JSONDecodeError:
                        logger.warning("Failed to parse workflow_state from metadata")

            # Check for user response (workflow resume after pause)
            if "user_response" in request.metadata:
                user_response = request.metadata["user_response"]
                logger.info(f"▶️ Resuming workflow with user response: {user_response}")

        # Check for agent resume (agent-level human-in-the-loop)
        agent_context = None
        if hasattr(request, 'metadata') and request.metadata:
            if "agent_context" in request.metadata:
                agent_context_json = request.metadata["agent_context"]
                try:
                    agent_context = json.loads(agent_context_json)
                    agent_name = agent_context.get("agent_name", "unknown")
                    iteration = agent_context.get("iteration", 0)
                    logger.info(
                        f"▶️ Resuming agent '{agent_name}' from iteration {iteration} "
                        f"with user response: {user_response}"
                    )
                except json.JSONDecodeError:
                    logger.warning("Failed to parse agent_context from metadata")
                    agent_context = None

        # Extract session_id and user_id from the request for memory scoping.
        # Done FIRST so they can be passed to the WorkflowEntity constructor.
        session_id = request.session_id if hasattr(request, 'session_id') and request.session_id else request.invocation_id
        user_id = request.user_id if hasattr(request, 'user_id') and request.user_id else None

        # Create WorkflowEntity for state management with memory scoping.
        # Entity key is scoped by priority: user_id > session_id > run_id.
        workflow_entity = WorkflowEntity(
            run_id=request.invocation_id,
            session_id=session_id,
            user_id=user_id,
        )

        # Load replay data into the entity if provided
        if completed_steps:
            workflow_entity._completed_steps = completed_steps
            logger.debug(f"Loaded {len(completed_steps)} completed steps into workflow entity")

        # Inject user response if resuming from a pause
        if user_response:
            workflow_entity.inject_user_response(user_response)
            logger.debug(f"Injected user response into workflow entity")

        if initial_state:
            # Load initial state into the entity's state adapter
            state_adapter = _get_state_adapter()
            if hasattr(state_adapter, '_standalone_states'):
                # Standalone mode - set state directly in the in-process store
                state_adapter._standalone_states[workflow_entity._state_key] = initial_state
                logger.debug(f"Loaded initial state with {len(initial_state)} keys into workflow entity (standalone)")
            else:
                # Production mode - state is managed by the Rust core
                logger.debug(f"Initial state will be loaded from platform (production mode)")

        # Checkpoint callback for real-time streaming of workflow progress.
        def checkpoint_callback(checkpoint: dict) -> None:
            """Send checkpoint to Rust worker queue (best-effort; errors are logged)."""
            try:
                # Extract critical metadata for checkpoint routing
                metadata = self._extract_critical_metadata(request)

                # DEBUG: Log metadata types for troubleshooting PyO3 conversion errors
                logger.debug(f"Checkpoint metadata types: {[(k, type(v).__name__) for k, v in metadata.items()]}")

                # Queue checkpoint via Rust FFI
                self._rust_worker.queue_workflow_checkpoint(
                    invocation_id=request.invocation_id,
                    checkpoint_type=checkpoint["checkpoint_type"],
                    checkpoint_data=json.dumps(checkpoint["checkpoint_data"]),
                    sequence_number=checkpoint["sequence_number"],
                    metadata=metadata,
                )
                logger.debug(
                    f"Queued checkpoint: type={checkpoint['checkpoint_type']} "
                    f"seq={checkpoint['sequence_number']}"
                )
            except Exception as e:
                # NOTE(review): `metadata` is unbound here when
                # _extract_critical_metadata itself raised — this log line
                # would then raise; confirm and guard if needed.
                logger.error(f"Failed to queue checkpoint: {e}", exc_info=True)
                logger.error(f"Checkpoint metadata causing error: {metadata}")
                logger.error(f"Checkpoint data: {checkpoint}")

        # Create WorkflowContext with entity, runtime_context, and checkpoint callback
        ctx = WorkflowContext(
            workflow_entity=workflow_entity,
            run_id=request.invocation_id,  # Use unique invocation_id for this execution
            session_id=session_id,  # Session for multi-turn conversations
            user_id=user_id,  # User for long-term memory
            runtime_context=request.runtime_context,
            checkpoint_callback=checkpoint_callback,
        )

        # Populate agent resume info if this is an agent HITL resume
        if agent_context and user_response:
            ctx._agent_resume_info = {
                "agent_name": agent_context["agent_name"],
                "agent_context": agent_context,
                "user_response": user_response,
            }
            logger.debug(
                f"Set agent resume info for '{agent_context['agent_name']}' "
                f"in workflow context"
            )

        # Execute workflow directly - the Rust bridge creates the OpenTelemetry
        # spans, so no Python-level span is created here (avoids duplicates;
        # see DUPLICATE_SPANS_FIX.md).

        # CRITICAL: set the context in a contextvar so LM/Agent/Tool calls can access it
        from .context import set_current_context
        token = set_current_context(ctx)
        try:
            if input_dict:
                result = await config.handler(ctx, **input_dict)
            else:
                result = await config.handler(ctx)

            # Note: workflow entity persistence is handled by the @workflow
            # decorator wrapper, which persists before returning.
        finally:
            # Always reset context to prevent leakage
            from .context import _current_context
            _current_context.reset(token)

        # Note: no flush_telemetry here — the batch span processor flushes
        # automatically; per-call flushing blocked for ~2 seconds.

        # Serialize result
        output_data = json.dumps(result).encode("utf-8")

        # Collect workflow execution metadata for durability
        metadata = {}

        # CRITICAL: propagate tenant_id and deployment_id to prevent journal
        # corruption — missing tenant_id writes events to the wrong partition.
        critical_metadata = self._extract_critical_metadata(request)
        metadata.update(critical_metadata)

        # Add step events to metadata (for workflow durability).
        # _step_events lives on the workflow entity, not the context.
        step_events = ctx._workflow_entity._step_events
        if step_events:
            metadata["step_events"] = json.dumps(step_events)
            logger.debug(f"Workflow has {len(step_events)} recorded steps")

        # Add final state snapshot to metadata (if state was used).
        # Check _state directly to avoid triggering the lazy property getter.
        if hasattr(ctx, '_workflow_entity') and ctx._workflow_entity._state is not None:
            if ctx._workflow_entity._state.has_changes():
                state_snapshot = ctx._workflow_entity._state.get_state_snapshot()
                metadata["workflow_state"] = json.dumps(state_snapshot)
                logger.debug(f"Workflow state snapshot: {state_snapshot}")

        # AUDIT TRAIL: serialize the complete state change history for replay
        # and debugging — all intermediate mutations, not just the final snapshot.
        state_changes = ctx._workflow_entity._state_changes
        logger.info(f"🔍 DEBUG: _state_changes list has {len(state_changes)} entries")
        if state_changes:
            metadata["state_changes"] = json.dumps(state_changes)
            logger.info(f"✅ Serialized {len(state_changes)} state changes to metadata")
        else:
            logger.warning("⚠️ _state_changes list is empty - no state change history captured")

        # CRITICAL: persist workflow entity state to the platform — stores the
        # WorkflowEntity as a first-class entity with proper versioning.
        try:
            logger.info(f"🔍 DEBUG: About to call _persist_state() for run {request.invocation_id}")
            await ctx._workflow_entity._persist_state()
            logger.info(f"✅ Successfully persisted WorkflowEntity state for run {request.invocation_id}")
        except Exception as persist_error:
            logger.error(f"❌ Failed to persist WorkflowEntity state (non-fatal): {persist_error}", exc_info=True)
            # Continue anyway - persistence failure shouldn't fail the workflow

        logger.info(f"Workflow completed successfully with {len(step_events)} steps")

        # Add session_id to metadata for multi-turn conversation support
        metadata["session_id"] = session_id

        # CRITICAL: flush all buffered checkpoints before returning the response
        # so checkpoints arrive at the platform BEFORE the run.completed event.
        try:
            flushed_count = self._rust_worker.flush_workflow_checkpoints()
            if flushed_count > 0:
                logger.info(f"✅ Flushed {flushed_count} checkpoints before completion")
        except Exception as flush_error:
            logger.error(f"Failed to flush checkpoints: {flush_error}", exc_info=True)
            # Continue anyway - checkpoint flushing is best-effort

        return PyExecuteComponentResponse(
            invocation_id=request.invocation_id,
            success=True,
            output_data=output_data,
            state_update=None,  # Not used for workflows (use metadata instead)
            error_message=None,
            metadata=metadata if metadata else None,  # Include step events + state + session_id
            is_chunk=False,
            done=True,
            chunk_index=0,
            attempt=getattr(request, 'attempt', 0),
        )

    except WaitingForUserInputException as e:
        # Workflow or agent paused for user input — reported as success with
        # "awaiting_user_input" metadata, not as an error.
        pause_type = "agent" if e.agent_context else "workflow"
        logger.info(f"⏸️ {pause_type.capitalize()} paused waiting for user input: {e.question}")

        # Collect metadata for the pause state.
        # NOTE(review): unlike the error path, this dict is not passed through
        # _normalize_metadata before crossing the FFI — values here appear to
        # be strings already, but confirm for e.question/e.input_type.
        pause_metadata = {
            "status": "awaiting_user_input",
            "question": e.question,
            "input_type": e.input_type,
            "pause_type": pause_type,  # Indicates workflow vs agent pause
        }

        # CRITICAL: propagate tenant_id even when pausing
        critical_metadata = self._extract_critical_metadata(request)
        pause_metadata.update(critical_metadata)

        # Add optional fields only if they exist
        if e.options:
            pause_metadata["options"] = json.dumps(e.options)
        if e.checkpoint_state:
            pause_metadata["checkpoint_state"] = json.dumps(e.checkpoint_state)
        if session_id:
            pause_metadata["session_id"] = session_id

        # Store agent execution state if present (agent-level HITL)
        if e.agent_context:
            pause_metadata["agent_context"] = json.dumps(e.agent_context)
            logger.debug(
                f"Agent '{e.agent_context['agent_name']}' paused at "
                f"iteration {e.agent_context['iteration']}"
            )

        # Add step events to pause metadata for durability
        step_events = ctx._workflow_entity._step_events
        if step_events:
            pause_metadata["step_events"] = json.dumps(step_events)
            logger.debug(f"Paused workflow has {len(step_events)} recorded steps")

        # Add the current workflow state to the pause metadata
        if hasattr(ctx, '_workflow_entity') and ctx._workflow_entity._state is not None:
            if ctx._workflow_entity._state.has_changes():
                state_snapshot = ctx._workflow_entity._state.get_state_snapshot()
                pause_metadata["workflow_state"] = json.dumps(state_snapshot)
                logger.debug(f"Paused workflow state snapshot: {state_snapshot}")

        # AUDIT TRAIL: also include state change history for paused workflows
        state_changes = ctx._workflow_entity._state_changes
        if state_changes:
            pause_metadata["state_changes"] = json.dumps(state_changes)
            logger.debug(f"Paused workflow has {len(state_changes)} state changes in history")

        # Return "success" with awaiting_user_input metadata; the output
        # carries the question details for the client.
        output = {
            "question": e.question,
            "input_type": e.input_type,
            "options": e.options,
        }
        output_data = json.dumps(output).encode("utf-8")

        return PyExecuteComponentResponse(
            invocation_id=request.invocation_id,
            success=True,  # This is a valid pause state, not an error
            output_data=output_data,
            state_update=None,
            error_message=None,
            metadata=pause_metadata,
            is_chunk=False,
            done=True,
            chunk_index=0,
            attempt=getattr(request, 'attempt', 0),
        )

    except Exception as e:
        # Include the exception type for better error messages
        error_msg = f"{type(e).__name__}: {str(e)}"

        # Capture the full stack trace for telemetry
        import traceback
        stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))

        # Log with full traceback
        logger.error(f"Workflow execution failed: {error_msg}", exc_info=True)

        # Store error metadata for observability
        metadata = {
            "error_type": type(e).__name__,
            "stack_trace": stack_trace,
            "error": True,
        }

        # Extract critical metadata for journal correlation (if available)
        critical_metadata = self._extract_critical_metadata(request)
        metadata.update(critical_metadata)

        # Normalize metadata for Rust FFI compatibility (PyO3 HashMap<String, String>)
        normalized_metadata = _normalize_metadata(metadata)

        return PyExecuteComponentResponse(
            invocation_id=request.invocation_id,
            success=False,
            output_data=b"",
            state_update=None,
            error_message=error_msg,
            metadata=normalized_metadata,
            is_chunk=False,
            done=True,
            chunk_index=0,
            attempt=getattr(request, 'attempt', 0),
        )
|
|
1231
|
+
|
|
1232
|
+
async def _execute_tool(self, tool, input_data: bytes, request):
    """Execute a tool handler.

    Args:
        tool: Registered tool (provides ``name`` and async ``invoke``).
        input_data: JSON-encoded keyword arguments for the tool; may be empty.
        request: Platform execution request (``invocation_id``,
            ``runtime_context``, ``attempt``).

    Returns:
        PyExecuteComponentResponse: success with the JSON-serialized result,
        or a ``success=False`` response carrying the error and stack trace.
    """
    import json
    from .context import Context, set_current_context, get_current_context, _current_context
    from ._core import PyExecuteComponentResponse

    # BUGFIX: initialize before the try block. Previously `token` (and the
    # `_current_context` import) were bound inside the try, so a failure
    # before set_current_context() — e.g. malformed JSON input — made the
    # finally clause raise UnboundLocalError and mask the original error.
    token = None

    try:
        # Parse input data
        input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}

        # Create context with runtime_context for trace correlation
        ctx = Context(
            run_id=f"{self.service_name}:{tool.name}",
            runtime_context=request.runtime_context,
        )

        # Install context so get_current_context() and error handlers can access it.
        token = set_current_context(ctx)

        # Execute tool
        result = await tool.invoke(ctx, **input_dict)

        # Serialize result
        output_data = json.dumps(result).encode("utf-8")

        return PyExecuteComponentResponse(
            invocation_id=request.invocation_id,
            success=True,
            # NOTE(review): success responses here omit critical metadata
            # (tenant_id), unlike the function/entity executors — confirm
            # whether tool results need journal correlation too.
            output_data=output_data,
            state_update=None,
            error_message=None,
            metadata=None,
            is_chunk=False,
            done=True,
            chunk_index=0,
            attempt=getattr(request, 'attempt', 0),
        )

    except Exception as e:
        # Include the exception type for better error messages
        error_msg = f"{type(e).__name__}: {str(e)}"

        # Capture the full stack trace for telemetry
        import traceback
        stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))

        # Log via ctx.logger when available to keep run_id correlation.
        current_ctx = get_current_context()
        error_logger = current_ctx.logger if current_ctx else logger
        error_logger.error(f"Tool execution failed: {error_msg}", exc_info=True)

        # Store error metadata for observability
        metadata = {
            "error_type": type(e).__name__,
            "stack_trace": stack_trace,
            "error": True,
        }

        # CRITICAL: propagate tenant_id so journal events correlate correctly.
        metadata.update(self._extract_critical_metadata(request))

        # Normalize to Dict[str, str] for the Rust FFI (PyO3 HashMap<String, String>).
        normalized_metadata = _normalize_metadata(metadata)

        return PyExecuteComponentResponse(
            invocation_id=request.invocation_id,
            success=False,
            output_data=b"",
            state_update=None,
            error_message=error_msg,
            metadata=normalized_metadata,
            is_chunk=False,
            done=True,
            chunk_index=0,
            attempt=getattr(request, 'attempt', 0),
        )

    finally:
        # Always reset the context to prevent leakage between executions;
        # guard against failures that happened before the context was installed.
        if token is not None:
            _current_context.reset(token)
|
|
1315
|
+
|
|
1316
|
+
async def _execute_entity(self, entity_type, input_data: bytes, request):
    """Execute an entity method.

    The input payload must contain ``key`` (entity instance key) and
    ``method`` (method name); the remaining keys are passed as keyword
    arguments to the method. State load/save is handled automatically by
    the entity method wrapper via EntityStateAdapter (Rust core handles
    caching, optimistic locking and version tracking).

    Returns:
        PyExecuteComponentResponse: success with the JSON-serialized result
        and critical metadata, or a ``success=False`` error response.
    """
    import json
    from .context import Context, set_current_context, get_current_context, _current_context
    from .entity import _entity_state_adapter_ctx
    from ._core import PyExecuteComponentResponse

    # Make the state adapter reachable from Entity instances during this call.
    _entity_state_adapter_ctx.set(self._entity_state_adapter)

    # BUGFIX: initialize before the try block. Previously `token` (and the
    # `_current_context` import) were bound inside the try, so a failure
    # before set_current_context() — e.g. malformed JSON or a missing
    # 'key'/'method' parameter — made the finally clause raise
    # UnboundLocalError and mask the original error.
    token = None

    try:
        # Parse input data
        input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}

        # Extract entity key and method name; the rest are method kwargs.
        entity_key = input_dict.pop("key", None)
        method_name = input_dict.pop("method", None)

        if not entity_key:
            raise ValueError("Entity invocation requires 'key' parameter")
        if not method_name:
            raise ValueError("Entity invocation requires 'method' parameter")

        # Create context for logging and tracing
        ctx = Context(
            run_id=f"{self.service_name}:{entity_type.name}:{entity_key}",
            runtime_context=request.runtime_context,
        )

        # Install context so get_current_context() and error handlers can access it.
        token = set_current_context(ctx)

        # Create the entity instance from the registered class reference.
        entity_instance = entity_type.entity_class(key=entity_key)

        if not hasattr(entity_instance, method_name):
            raise ValueError(f"Entity '{entity_type.name}' has no method '{method_name}'")

        method = getattr(entity_instance, method_name)

        # Execute method (the entity method wrapper loads/saves state automatically).
        result = await method(**input_dict)

        # Serialize result
        output_data = json.dumps(result).encode("utf-8")

        # CRITICAL: propagate tenant_id and deployment_id to prevent journal corruption.
        metadata = self._extract_critical_metadata(request)

        return PyExecuteComponentResponse(
            invocation_id=request.invocation_id,
            success=True,
            output_data=output_data,
            state_update=None,  # TODO: Use structured StateUpdate object
            error_message=None,
            metadata=metadata if metadata else None,  # For the Worker Coordinator
            is_chunk=False,
            done=True,
            chunk_index=0,
            attempt=getattr(request, 'attempt', 0),
        )

    except Exception as e:
        # Include the exception type for better error messages
        error_msg = f"{type(e).__name__}: {str(e)}"

        # Capture the full stack trace for telemetry
        import traceback
        stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))

        # Log via ctx.logger when available to keep run_id correlation.
        current_ctx = get_current_context()
        error_logger = current_ctx.logger if current_ctx else logger
        error_logger.error(f"Entity execution failed: {error_msg}", exc_info=True)

        # Store error metadata for observability
        metadata = {
            "error_type": type(e).__name__,
            "stack_trace": stack_trace,
            "error": True,
        }

        # CRITICAL: propagate tenant_id so journal events correlate correctly.
        metadata.update(self._extract_critical_metadata(request))

        # Normalize to Dict[str, str] for the Rust FFI (PyO3 HashMap<String, String>).
        normalized_metadata = _normalize_metadata(metadata)

        return PyExecuteComponentResponse(
            invocation_id=request.invocation_id,
            success=False,
            output_data=b"",
            state_update=None,
            error_message=error_msg,
            metadata=normalized_metadata,
            is_chunk=False,
            done=True,
            chunk_index=0,
            attempt=getattr(request, 'attempt', 0),
        )

    finally:
        # Always reset the context to prevent leakage between executions;
        # guard against failures that happened before the context was installed.
        if token is not None:
            _current_context.reset(token)
|
|
1430
|
+
|
|
1431
|
+
async def _execute_agent(self, agent, input_data: bytes, request):
|
|
1432
|
+
"""Execute an agent with session support for multi-turn conversations."""
|
|
1433
|
+
import json
|
|
1434
|
+
import uuid
|
|
1435
|
+
from .agent import AgentContext
|
|
1436
|
+
from .entity import _entity_state_adapter_ctx
|
|
1437
|
+
from ._core import PyExecuteComponentResponse
|
|
1438
|
+
|
|
1439
|
+
# Set entity state adapter in context so AgentContext can access it
|
|
1440
|
+
_entity_state_adapter_ctx.set(self._entity_state_adapter)
|
|
1441
|
+
|
|
1442
|
+
try:
|
|
1443
|
+
# Parse input data
|
|
1444
|
+
input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
|
|
1445
|
+
|
|
1446
|
+
# Extract user message
|
|
1447
|
+
user_message = input_dict.get("message", "")
|
|
1448
|
+
if not user_message:
|
|
1449
|
+
raise ValueError("Agent invocation requires 'message' parameter")
|
|
1450
|
+
|
|
1451
|
+
# Extract or generate session_id for multi-turn conversation support
|
|
1452
|
+
# If session_id is provided, the agent will load previous conversation history
|
|
1453
|
+
# If not provided, a new session is created with auto-generated ID
|
|
1454
|
+
session_id = input_dict.get("session_id")
|
|
1455
|
+
|
|
1456
|
+
if not session_id:
|
|
1457
|
+
session_id = str(uuid.uuid4())
|
|
1458
|
+
logger.info(f"Created new agent session: {session_id}")
|
|
1459
|
+
else:
|
|
1460
|
+
logger.info(f"Using existing agent session: {session_id}")
|
|
1461
|
+
|
|
1462
|
+
# Create AgentContext with session support for conversation persistence
|
|
1463
|
+
# AgentContext automatically loads/saves conversation history based on session_id
|
|
1464
|
+
ctx = AgentContext(
|
|
1465
|
+
run_id=request.invocation_id,
|
|
1466
|
+
agent_name=agent.name,
|
|
1467
|
+
session_id=session_id,
|
|
1468
|
+
runtime_context=request.runtime_context,
|
|
1469
|
+
)
|
|
1470
|
+
|
|
1471
|
+
# Set context in contextvar so get_current_context() and error handlers can access it
|
|
1472
|
+
from .context import set_current_context, _current_context
|
|
1473
|
+
token = set_current_context(ctx)
|
|
1474
|
+
|
|
1475
|
+
# Execute agent - conversation history is automatically included
|
|
1476
|
+
agent_result = await agent.run(user_message, context=ctx)
|
|
1477
|
+
|
|
1478
|
+
# Build response with agent output and tool calls
|
|
1479
|
+
result = {
|
|
1480
|
+
"output": agent_result.output,
|
|
1481
|
+
"tool_calls": agent_result.tool_calls,
|
|
1482
|
+
}
|
|
1483
|
+
|
|
1484
|
+
# Serialize result
|
|
1485
|
+
output_data = json.dumps(result).encode("utf-8")
|
|
1486
|
+
|
|
1487
|
+
# CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
|
|
1488
|
+
metadata = self._extract_critical_metadata(request)
|
|
1489
|
+
# Also include session_id for UI to persist conversation
|
|
1490
|
+
metadata["session_id"] = session_id
|
|
1491
|
+
|
|
1492
|
+
return PyExecuteComponentResponse(
|
|
1493
|
+
invocation_id=request.invocation_id,
|
|
1494
|
+
success=True,
|
|
1495
|
+
output_data=output_data,
|
|
1496
|
+
state_update=None,
|
|
1497
|
+
error_message=None,
|
|
1498
|
+
metadata=metadata if metadata else None,
|
|
1499
|
+
is_chunk=False,
|
|
1500
|
+
done=True,
|
|
1501
|
+
chunk_index=0,
|
|
1502
|
+
attempt=getattr(request, 'attempt', 0),
|
|
1503
|
+
)
|
|
1504
|
+
|
|
1505
|
+
except Exception as e:
|
|
1506
|
+
# Include exception type for better error messages
|
|
1507
|
+
error_msg = f"{type(e).__name__}: {str(e)}"
|
|
1508
|
+
|
|
1509
|
+
# Capture full stack trace for telemetry
|
|
1510
|
+
import traceback
|
|
1511
|
+
stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
|
|
1512
|
+
|
|
1513
|
+
# Log with full traceback using ctx.logger to ensure run_id correlation
|
|
1514
|
+
from .context import get_current_context
|
|
1515
|
+
current_ctx = get_current_context()
|
|
1516
|
+
error_logger = current_ctx.logger if current_ctx else logger
|
|
1517
|
+
error_logger.error(f"Agent execution failed: {error_msg}", exc_info=True)
|
|
1518
|
+
|
|
1519
|
+
# Store error metadata for observability
|
|
1520
|
+
metadata = {
|
|
1521
|
+
"error_type": type(e).__name__,
|
|
1522
|
+
"stack_trace": stack_trace,
|
|
1523
|
+
"error": True,
|
|
1524
|
+
}
|
|
1525
|
+
|
|
1526
|
+
# Extract critical metadata for journal correlation (if available)
|
|
1527
|
+
critical_metadata = self._extract_critical_metadata(request)
|
|
1528
|
+
metadata.update(critical_metadata)
|
|
1529
|
+
|
|
1530
|
+
# Normalize metadata for Rust FFI compatibility
|
|
1531
|
+
normalized_metadata = _normalize_metadata(metadata)
|
|
1532
|
+
|
|
1533
|
+
return PyExecuteComponentResponse(
|
|
1534
|
+
invocation_id=request.invocation_id,
|
|
1535
|
+
success=False,
|
|
1536
|
+
output_data=b"",
|
|
1537
|
+
state_update=None,
|
|
1538
|
+
error_message=error_msg,
|
|
1539
|
+
metadata=normalized_metadata,
|
|
1540
|
+
is_chunk=False,
|
|
1541
|
+
done=True,
|
|
1542
|
+
chunk_index=0,
|
|
1543
|
+
attempt=getattr(request, 'attempt', 0),
|
|
1544
|
+
)
|
|
1545
|
+
|
|
1546
|
+
finally:
|
|
1547
|
+
# Always reset context to prevent leakage between executions
|
|
1548
|
+
_current_context.reset(token)
|
|
1549
|
+
|
|
1550
|
+
def _create_error_response(self, request, error_message: str):
|
|
1551
|
+
"""Create an error response."""
|
|
1552
|
+
from ._core import PyExecuteComponentResponse
|
|
1553
|
+
|
|
1554
|
+
return PyExecuteComponentResponse(
|
|
1555
|
+
invocation_id=request.invocation_id,
|
|
1556
|
+
success=False,
|
|
1557
|
+
output_data=b"",
|
|
1558
|
+
state_update=None,
|
|
1559
|
+
error_message=error_message,
|
|
1560
|
+
metadata=None,
|
|
1561
|
+
is_chunk=False,
|
|
1562
|
+
done=True,
|
|
1563
|
+
chunk_index=0,
|
|
1564
|
+
attempt=getattr(request, 'attempt', 0),
|
|
1565
|
+
)
|
|
1566
|
+
|
|
1567
|
+
async def run(self):
|
|
1568
|
+
"""Run the worker (register and start message loop).
|
|
1569
|
+
|
|
1570
|
+
This method will:
|
|
1571
|
+
1. Discover all registered @function and @workflow handlers
|
|
1572
|
+
2. Register with the coordinator
|
|
1573
|
+
3. Create a shared Python event loop for all function executions
|
|
1574
|
+
4. Enter the message processing loop
|
|
1575
|
+
5. Block until shutdown
|
|
1576
|
+
|
|
1577
|
+
This is the main entry point for your worker service.
|
|
1578
|
+
"""
|
|
1579
|
+
logger.info(f"Starting worker: {self.service_name}")
|
|
1580
|
+
|
|
1581
|
+
# Discover components
|
|
1582
|
+
components = self._discover_components()
|
|
1583
|
+
|
|
1584
|
+
# Set components on Rust worker
|
|
1585
|
+
self._rust_worker.set_components(components)
|
|
1586
|
+
|
|
1587
|
+
# Set metadata
|
|
1588
|
+
if self.metadata:
|
|
1589
|
+
self._rust_worker.set_service_metadata(self.metadata)
|
|
1590
|
+
|
|
1591
|
+
# Configure entity state manager on Rust worker for database persistence
|
|
1592
|
+
logger.info("Configuring Rust EntityStateManager for database persistence")
|
|
1593
|
+
# Access the Rust core from the adapter
|
|
1594
|
+
if hasattr(self._entity_state_adapter, '_rust_core') and self._entity_state_adapter._rust_core:
|
|
1595
|
+
self._rust_worker.set_entity_state_manager(self._entity_state_adapter._rust_core)
|
|
1596
|
+
logger.info("Successfully configured Rust EntityStateManager")
|
|
1597
|
+
|
|
1598
|
+
# Get the current event loop to pass to Rust for concurrent Python async execution
|
|
1599
|
+
# This allows Rust to execute Python async functions on the same event loop
|
|
1600
|
+
# without spawn_blocking overhead, enabling true concurrency
|
|
1601
|
+
loop = asyncio.get_running_loop()
|
|
1602
|
+
logger.info("Passing Python event loop to Rust worker for concurrent execution")
|
|
1603
|
+
|
|
1604
|
+
# Set event loop on Rust worker
|
|
1605
|
+
self._rust_worker.set_event_loop(loop)
|
|
1606
|
+
|
|
1607
|
+
# Set message handler
|
|
1608
|
+
handler = self._create_message_handler()
|
|
1609
|
+
self._rust_worker.set_message_handler(handler)
|
|
1610
|
+
|
|
1611
|
+
# Initialize worker
|
|
1612
|
+
self._rust_worker.initialize()
|
|
1613
|
+
|
|
1614
|
+
logger.info("Worker registered successfully, entering message loop...")
|
|
1615
|
+
|
|
1616
|
+
# Run worker (this will block until shutdown)
|
|
1617
|
+
await self._rust_worker.run()
|
|
1618
|
+
|
|
1619
|
+
logger.info("Worker shutdown complete")
|