agnt5-0.3.2a1-cp310-abi3-manylinux_2_34_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- agnt5/__init__.py +119 -0
- agnt5/_compat.py +16 -0
- agnt5/_core.abi3.so +0 -0
- agnt5/_retry_utils.py +196 -0
- agnt5/_schema_utils.py +312 -0
- agnt5/_sentry.py +515 -0
- agnt5/_telemetry.py +279 -0
- agnt5/agent/__init__.py +48 -0
- agnt5/agent/context.py +581 -0
- agnt5/agent/core.py +1782 -0
- agnt5/agent/decorator.py +112 -0
- agnt5/agent/handoff.py +105 -0
- agnt5/agent/registry.py +68 -0
- agnt5/agent/result.py +39 -0
- agnt5/checkpoint.py +246 -0
- agnt5/client.py +1556 -0
- agnt5/context.py +288 -0
- agnt5/emit.py +197 -0
- agnt5/entity.py +1230 -0
- agnt5/events.py +567 -0
- agnt5/exceptions.py +110 -0
- agnt5/function.py +330 -0
- agnt5/journal.py +212 -0
- agnt5/lm.py +1266 -0
- agnt5/memoization.py +379 -0
- agnt5/memory.py +521 -0
- agnt5/tool.py +721 -0
- agnt5/tracing.py +300 -0
- agnt5/types.py +111 -0
- agnt5/version.py +19 -0
- agnt5/worker.py +2094 -0
- agnt5/workflow.py +1632 -0
- agnt5-0.3.2a1.dist-info/METADATA +26 -0
- agnt5-0.3.2a1.dist-info/RECORD +35 -0
- agnt5-0.3.2a1.dist-info/WHEEL +4 -0
agnt5/worker.py
ADDED
@@ -0,0 +1,2094 @@

"""Worker implementation for AGNT5 SDK."""

from __future__ import annotations

import asyncio
import contextvars
import logging
import time
import uuid
from typing import Any, Dict, List, Optional

from .function import FunctionRegistry
from .workflow import WorkflowRegistry
from ._telemetry import setup_module_logger
from . import _sentry

logger = setup_module_logger(__name__)


import dataclasses
import json as _json


class _ResultEncoder(_json.JSONEncoder):
    """Custom JSON encoder for serializing component results.

    Handles Pydantic models, dataclasses, bytes, and sets that are commonly
    returned from functions, workflows, entities, and agents.
    """
    def default(self, obj):
        # Handle Pydantic models (v2 API)
        if hasattr(obj, 'model_dump'):
            return obj.model_dump()
        # Handle Pydantic models (v1 API)
        if hasattr(obj, 'dict') and hasattr(obj, '__fields__'):
            return obj.dict()
        # Handle dataclasses
        if dataclasses.is_dataclass(obj) and not isinstance(obj, type):
            return dataclasses.asdict(obj)
        # Handle bytes
        if isinstance(obj, bytes):
            return obj.decode('utf-8', errors='replace')
        # Handle sets
        if isinstance(obj, set):
            return list(obj)
        # Fallback to default behavior
        return super().default(obj)


def _serialize_result(result) -> bytes:
    """Serialize a component result to JSON bytes.

    Uses _ResultEncoder to handle Pydantic models, dataclasses, and other
    complex types that may be returned from functions, workflows, entities,
    tools, and agents.
    """
    return _json.dumps(result, cls=_ResultEncoder).encode("utf-8")

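A quick sketch of what the encoder above produces (`Point` is a hypothetical payload type, not part of the SDK; dict key order follows insertion order in CPython):

```python
import dataclasses

@dataclasses.dataclass
class Point:  # hypothetical payload type, not part of the SDK
    x: int
    y: int

# Dataclasses are flattened via asdict(), sets become lists:
_serialize_result({"p": Point(1, 2), "tags": {"a"}})
# => b'{"p": {"x": 1, "y": 2}, "tags": ["a"]}'
```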
def _normalize_metadata(metadata: Dict[str, Any]) -> Dict[str, str]:
    """
    Convert metadata dictionary to Dict[str, str] for Rust FFI compatibility.

    PyO3 requires HashMap<String, String>, but Python code may include booleans,
    integers, or other types. This helper ensures all values are strings.

    Args:
        metadata: Dictionary with potentially mixed types

    Returns:
        Dictionary with all string values

    Example:
        >>> _normalize_metadata({"error": True, "count": 42, "msg": "hello"})
        {"error": "true", "count": "42", "msg": "hello"}
    """
    normalized = {}
    for key, value in metadata.items():
        if isinstance(value, str):
            normalized[key] = value
        elif isinstance(value, bool):
            # Convert bool to lowercase string for JSON compatibility
            normalized[key] = str(value).lower()
        elif value is None:
            normalized[key] = ""
        else:
            # Convert any other type to string representation
            normalized[key] = str(value)
    return normalized

# Context variable to store trace metadata for propagation to LM calls
# This allows the Rust LM layer to access traceparent without explicit parameter passing
_trace_metadata: contextvars.ContextVar[Dict[str, str]] = contextvars.ContextVar(
    '_trace_metadata', default={}
)


class Worker:
    """AGNT5 Worker for registering and running functions/workflows with the coordinator.

    The Worker class manages the lifecycle of your service, including:
    - Registration with the AGNT5 coordinator
    - Automatic discovery of @function and @workflow decorated handlers
    - Message handling and execution
    - Health monitoring

    Example:
        ```python
        from agnt5 import Worker, function

        @function
        async def process_data(ctx: Context, data: str) -> dict:
            return {"result": data.upper()}

        async def main():
            worker = Worker(
                service_name="data-processor",
                service_version="1.0.0",
                coordinator_endpoint="http://localhost:34186"
            )
            await worker.run()

        if __name__ == "__main__":
            asyncio.run(main())
        ```
    """

    def __init__(
        self,
        service_name: str,
        service_version: str = "1.0.0",
        coordinator_endpoint: Optional[str] = None,
        runtime: str = "standalone",
        metadata: Optional[Dict[str, str]] = None,
        functions: Optional[List] = None,
        workflows: Optional[List] = None,
        entities: Optional[List] = None,
        agents: Optional[List] = None,
        tools: Optional[List] = None,
        auto_register: bool = False,
        auto_register_paths: Optional[List[str]] = None,
        pyproject_path: Optional[str] = None,
    ):
        """Initialize a new Worker with explicit or automatic component registration.

        The Worker supports two registration modes:

        **Explicit Mode (default, production):**
        - Register workflows/agents explicitly; their dependencies are auto-included
        - Optionally register standalone functions/tools for direct API invocation

        **Auto-Registration Mode (development):**
        - Automatically discovers all decorated components in source paths
        - Reads source paths from pyproject.toml or uses explicit paths
        - No need to maintain import lists

        Args:
            service_name: Unique name for this service
            service_version: Version string (semantic versioning recommended)
            coordinator_endpoint: Coordinator endpoint URL (default: from env AGNT5_COORDINATOR_ENDPOINT)
            runtime: Runtime type - "standalone", "docker", "kubernetes", etc.
            metadata: Optional service-level metadata
            functions: List of @function decorated handlers (explicit mode)
            workflows: List of @workflow decorated handlers (explicit mode)
            entities: List of Entity classes (explicit mode)
            agents: List of Agent instances (explicit mode)
            tools: List of Tool instances (explicit mode)
            auto_register: Enable automatic component discovery (default: False)
            auto_register_paths: Explicit source paths to scan (overrides pyproject.toml discovery)
            pyproject_path: Path to pyproject.toml (default: current directory)

        Example (explicit mode - production):
            ```python
            from agnt5 import Worker
            from my_service import greet_user, order_fulfillment, ShoppingCart, analyst_agent

            worker = Worker(
                service_name="my-service",
                workflows=[order_fulfillment],
                entities=[ShoppingCart],
                agents=[analyst_agent],
                functions=[greet_user],
            )
            await worker.run()
            ```

        Example (auto-register mode - development):
            ```python
            from agnt5 import Worker

            worker = Worker(
                service_name="my-service",
                auto_register=True,  # Discovers from pyproject.toml
            )
            await worker.run()
            ```
        """
        self.service_name = service_name
        self.service_version = service_version
        self.coordinator_endpoint = coordinator_endpoint
        self.runtime = runtime
        self.metadata = metadata or {}

        # Get tenant_id from environment (required for entity state management)
        import os
        self._tenant_id = os.getenv("AGNT5_TENANT_ID", "default-tenant")

        # Import Rust worker
        try:
            from ._core import PyWorker, PyWorkerConfig, PyComponentInfo
            self._PyWorker = PyWorker
            self._PyWorkerConfig = PyWorkerConfig
            self._PyComponentInfo = PyComponentInfo
        except ImportError as e:
            # Capture SDK-level import failure in Sentry
            _sentry.capture_exception(
                e,
                context={
                    "service_name": service_name,
                    "service_version": service_version,
                    "error_location": "Worker.__init__",
                    "error_phase": "rust_core_import",
                },
                tags={
                    "sdk_error": "true",
                    "error_type": "import_error",
                    "component": "rust_core",
                },
                level="error",
            )
            raise ImportError(
                f"Failed to import Rust core worker: {e}. "
                "Make sure agnt5 is properly installed with: pip install agnt5"
            )

        # Create Rust worker config
        self._rust_config = self._PyWorkerConfig(
            service_name=service_name,
            service_version=service_version,
            service_type=runtime,
        )

        # Create Rust worker instance
        self._rust_worker = self._PyWorker(self._rust_config)

        # Create worker-scoped entity state adapter with Rust core
        from .entity import EntityStateAdapter
        from ._core import EntityStateManager as RustEntityStateManager

        # Create Rust core for entity state management
        rust_core = RustEntityStateManager(tenant_id=self._tenant_id)

        # Create Python adapter (thin wrapper around Rust core)
        self._entity_state_adapter = EntityStateAdapter(rust_core=rust_core)

        logger.info("Created EntityStateAdapter with Rust core for state management")

        # Create CheckpointClient for step-level memoization (Phase 3)
        # This client is shared across all workflow executions and connects lazily on first use
        try:
            from .checkpoint import CheckpointClient
            self._checkpoint_client = CheckpointClient()
            logger.info("Created CheckpointClient for step-level memoization")
        except Exception as e:
            logger.warning(f"Failed to create CheckpointClient (memoization disabled): {e}")
            self._checkpoint_client = None

        # Initialize Sentry for SDK-level error tracking
        # Telemetry behavior:
        # - Alpha/Beta releases: ENABLED by default (opt-out with AGNT5_DISABLE_SDK_TELEMETRY=true)
        # - Stable releases: DISABLED by default (opt-in with AGNT5_ENABLE_SDK_TELEMETRY=true)
        # This captures SDK bugs, initialization failures, and Python-specific issues,
        # NOT user code execution errors (those should be handled by users)
        from .version import _get_version
        sdk_version = _get_version()

        sentry_enabled = _sentry.initialize_sentry(
            service_name=service_name,
            service_version=service_version,
            sdk_version=sdk_version,
        )
        if sentry_enabled:
            # Set service-level context (anonymized)
            _sentry.set_context("service", {
                "name": service_name,  # User's service name (they control this)
                "version": service_version,
                "runtime": runtime,
            })
        else:
            logger.debug("SDK telemetry not enabled")

        # Component registration: auto-discover or explicit
        if auto_register:
            # Warn if explicit components are passed with auto_register=True
            if any([functions, workflows, entities, agents, tools]):
                logger.warning(
                    "auto_register=True ignores explicit functions/workflows/entities/agents/tools parameters. "
                    "Remove explicit params or set auto_register=False to use explicit registration."
                )

            # Auto-registration mode: discover from source paths
            if auto_register_paths:
                source_paths = auto_register_paths
                logger.info(f"Auto-registration with explicit paths: {source_paths}")
            else:
                source_paths = self._discover_source_paths(pyproject_path)
                logger.info(f"Auto-registration with discovered paths: {source_paths}")

            # Auto-discover components (will populate _explicit_components)
            self._auto_discover_components(source_paths)
        else:
            # Explicit registration from constructor kwargs
            self._explicit_components = {
                'functions': list(functions or []),
                'workflows': list(workflows or []),
                'entities': list(entities or []),
                'agents': list(agents or []),
                'tools': list(tools or []),
            }

        # Count explicitly registered components
        total_explicit = sum(len(v) for v in self._explicit_components.values())
        logger.info(
            f"Worker initialized: {service_name} v{service_version} (runtime: {runtime}), "
            f"{total_explicit} components explicitly registered"
        )

    def register_components(
        self,
        functions=None,
        workflows=None,
        entities=None,
        agents=None,
        tools=None,
    ):
        """Register additional components after Worker initialization.

        This method allows incremental registration of components after the Worker
        has been created. Useful for conditional or dynamic component registration.

        Args:
            functions: List of functions decorated with @function
            workflows: List of workflows decorated with @workflow
            entities: List of entity classes
            agents: List of agent instances
            tools: List of tool instances

        Example:
            ```python
            worker = Worker(service_name="my-service")

            # Register conditionally
            if feature_enabled:
                worker.register_components(workflows=[advanced_workflow])
            ```
        """
        if functions:
            self._explicit_components['functions'].extend(functions)
            logger.debug(f"Incrementally registered {len(functions)} functions")

        if workflows:
            self._explicit_components['workflows'].extend(workflows)
            logger.debug(f"Incrementally registered {len(workflows)} workflows")

        if entities:
            self._explicit_components['entities'].extend(entities)
            logger.debug(f"Incrementally registered {len(entities)} entities")

        if agents:
            self._explicit_components['agents'].extend(agents)
            logger.debug(f"Incrementally registered {len(agents)} agents")

        if tools:
            self._explicit_components['tools'].extend(tools)
            logger.debug(f"Incrementally registered {len(tools)} tools")

        total = sum(len(v) for v in self._explicit_components.values())
        logger.info(f"Total components now registered: {total}")

    def _discover_source_paths(self, pyproject_path: Optional[str] = None) -> List[str]:
        """Discover source paths from pyproject.toml.

        Reads pyproject.toml to find package source directories using:
        - Hatch: [tool.hatch.build.targets.wheel] packages
        - Maturin: [tool.maturin] python-source
        - Fallback: ["src"] if not found

        Args:
            pyproject_path: Path to pyproject.toml (default: current directory)

        Returns:
            List of directory paths to scan (e.g., ["src/agnt5_benchmark"])
        """
        from pathlib import Path

        # Python 3.11+ has tomllib in stdlib
        try:
            import tomllib
        except ImportError:
            logger.error("tomllib not available (Python 3.11+ required for auto-registration)")
            return ["src"]

        # Determine pyproject.toml location
        if pyproject_path:
            pyproject_file = Path(pyproject_path)
        else:
            # Look in current directory
            pyproject_file = Path.cwd() / "pyproject.toml"

        if not pyproject_file.exists():
            logger.warning(
                f"pyproject.toml not found at {pyproject_file}, "
                f"defaulting to 'src/' directory"
            )
            return ["src"]

        # Parse pyproject.toml
        try:
            with open(pyproject_file, "rb") as f:
                config = tomllib.load(f)
        except Exception as e:
            logger.error(f"Failed to parse pyproject.toml: {e}")
            return ["src"]

        # Extract source paths based on build system
        source_paths = []

        # Try Hatch configuration
        if "tool" in config and "hatch" in config["tool"]:
            hatch_config = config["tool"]["hatch"]
            if "build" in hatch_config and "targets" in hatch_config["build"]:
                wheel_config = hatch_config["build"]["targets"].get("wheel", {})
                packages = wheel_config.get("packages", [])
                source_paths.extend(packages)

        # Try Maturin configuration
        if not source_paths and "tool" in config and "maturin" in config["tool"]:
            maturin_config = config["tool"]["maturin"]
            python_source = maturin_config.get("python-source")
            if python_source:
                source_paths.append(python_source)

        # Fallback to src/
        if not source_paths:
            logger.info("No source paths in pyproject.toml, defaulting to 'src/'")
            source_paths = ["src"]

        logger.info(f"Discovered source paths from pyproject.toml: {source_paths}")
        return source_paths

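For reference, a minimal sketch (hypothetical package path) of the Hatch lookup this method performs, using the same stdlib parser it requires:

```python
import tomllib  # stdlib on Python 3.11+, as checked above

sample = """
[tool.hatch.build.targets.wheel]
packages = ["src/my_service"]
"""
config = tomllib.loads(sample)
wheel = config["tool"]["hatch"]["build"]["targets"]["wheel"]
assert wheel.get("packages", []) == ["src/my_service"]
```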
    def _auto_discover_components(self, source_paths: List[str]) -> None:
        """Auto-discover components by importing all Python files in source paths.

        Args:
            source_paths: List of directory paths to scan
        """
        import importlib.util
        import sys
        from pathlib import Path

        logger.info(f"Auto-discovering components in paths: {source_paths}")

        total_modules = 0

        for source_path in source_paths:
            path = Path(source_path)

            if not path.exists():
                logger.warning(f"Source path does not exist: {source_path}")
                continue

            # Recursively find all .py files
            for py_file in path.rglob("*.py"):
                # Skip __pycache__ and test files
                if "__pycache__" in str(py_file) or py_file.name.startswith("test_"):
                    continue

                # Convert path to module name
                # e.g., src/agnt5_benchmark/functions.py -> agnt5_benchmark.functions
                relative_path = py_file.relative_to(path.parent)
                module_parts = list(relative_path.parts[:-1])  # Drop the filename component
                module_parts.append(relative_path.stem)  # Add filename without .py
                module_name = ".".join(module_parts)

                # Import module (triggers decorators)
                try:
                    if module_name in sys.modules:
                        logger.debug(f"Module already imported: {module_name}")
                    else:
                        spec = importlib.util.spec_from_file_location(module_name, py_file)
                        if spec and spec.loader:
                            module = importlib.util.module_from_spec(spec)
                            sys.modules[module_name] = module
                            spec.loader.exec_module(module)
                            logger.debug(f"Auto-imported: {module_name}")
                            total_modules += 1
                except Exception as e:
                    logger.warning(f"Failed to import {module_name}: {e}")
                    # Capture SDK auto-registration failures
                    _sentry.capture_exception(
                        e,
                        context={
                            "service_name": self.service_name,
                            "module_name": module_name,
                            "source_path": str(py_file),
                            "error_location": "_auto_discover_components",
                        },
                        tags={
                            "sdk_error": "true",
                            "error_type": "auto_registration_failure",
                        },
                        level="warning",
                    )

        logger.info(f"Auto-imported {total_modules} modules")

        # Collect components from registries
        from .agent import AgentRegistry
        from .entity import EntityRegistry
        from .tool import ToolRegistry

        # Extract actual objects from registries
        functions = [cfg.handler for cfg in FunctionRegistry.all().values()]
        workflows = [cfg.handler for cfg in WorkflowRegistry.all().values()]
        entities = [et.entity_class for et in EntityRegistry.all().values()]
        agents = list(AgentRegistry.all().values())
        tools = list(ToolRegistry.all().values())

        self._explicit_components = {
            'functions': functions,
            'workflows': workflows,
            'entities': entities,
            'agents': agents,
            'tools': tools,
        }

        logger.info(
            f"Auto-discovered components: "
            f"{len(functions)} functions, "
            f"{len(workflows)} workflows, "
            f"{len(entities)} entities, "
            f"{len(agents)} agents, "
            f"{len(tools)} tools"
        )

    def _discover_components(self):
        """Discover explicit components and auto-include their dependencies.

        Hybrid approach:
        - Explicitly registered workflows/agents are processed
        - Functions called by workflows are auto-included (TODO: implement)
        - Tools used by agents are auto-included
        - Standalone functions/tools can be explicitly registered

        Returns:
            List of PyComponentInfo instances for all components
        """
        components = []
        import json

        # Import registries and types
        from .entity import EntityRegistry
        from .tool import ToolRegistry, Tool

        # Track all components (explicit + auto-included)
        all_functions = set(self._explicit_components['functions'])
        all_tools = set(self._explicit_components['tools'])

        # Auto-include agent tool dependencies
        for agent in self._explicit_components['agents']:
            if hasattr(agent, 'tools') and agent.tools:
                # Agent.tools is a dict of {tool_name: tool_instance}
                all_tools.update(agent.tools.values())
                logger.debug(
                    f"Auto-included {len(agent.tools)} tools from agent '{agent.name}'"
                )

        # Log registration summary
        explicit_func_count = len(self._explicit_components['functions'])
        explicit_tool_count = len(self._explicit_components['tools'])
        auto_func_count = len(all_functions) - explicit_func_count
        auto_tool_count = len(all_tools) - explicit_tool_count

        logger.info(
            f"Component registration summary: "
            f"{len(all_functions)} functions ({explicit_func_count} explicit, {auto_func_count} auto-included), "
            f"{len(self._explicit_components['workflows'])} workflows, "
            f"{len(self._explicit_components['entities'])} entities, "
            f"{len(self._explicit_components['agents'])} agents, "
            f"{len(all_tools)} tools ({explicit_tool_count} explicit, {auto_tool_count} auto-included)"
        )

        # Process functions (explicit + auto-included)
        for func in all_functions:
            config = FunctionRegistry.get(func.__name__)
            if not config:
                logger.warning(f"Function '{func.__name__}' not found in FunctionRegistry")
                continue

            input_schema_str = json.dumps(config.input_schema) if config.input_schema else None
            output_schema_str = json.dumps(config.output_schema) if config.output_schema else None
            metadata = config.metadata if config.metadata else {}

            # Serialize retry and backoff policies
            config_dict = {}
            if config.retries:
                config_dict["max_attempts"] = str(config.retries.max_attempts)
                config_dict["initial_interval_ms"] = str(config.retries.initial_interval_ms)
                config_dict["max_interval_ms"] = str(config.retries.max_interval_ms)

            if config.backoff:
                config_dict["backoff_type"] = config.backoff.type.value
                config_dict["backoff_multiplier"] = str(config.backoff.multiplier)

            component_info = self._PyComponentInfo(
                name=config.name,
                component_type="function",
                metadata=metadata,
                config=config_dict,
                input_schema=input_schema_str,
                output_schema=output_schema_str,
                definition=None,
            )
            components.append(component_info)

        # Process workflows
        for workflow in self._explicit_components['workflows']:
            config = WorkflowRegistry.get(workflow.__name__)
            if not config:
                logger.warning(f"Workflow '{workflow.__name__}' not found in WorkflowRegistry")
                continue

            input_schema_str = json.dumps(config.input_schema) if config.input_schema else None
            output_schema_str = json.dumps(config.output_schema) if config.output_schema else None
            metadata = config.metadata if config.metadata else {}

            component_info = self._PyComponentInfo(
                name=config.name,
                component_type="workflow",
                metadata=metadata,
                config={},
                input_schema=input_schema_str,
                output_schema=output_schema_str,
                definition=None,
            )
            components.append(component_info)

        # Process entities
        for entity_class in self._explicit_components['entities']:
            entity_type = EntityRegistry.get(entity_class.__name__)
            if not entity_type:
                logger.warning(f"Entity '{entity_class.__name__}' not found in EntityRegistry")
                continue

            # Build complete entity definition with state schema and method schemas
            entity_definition = entity_type.build_entity_definition()
            definition_str = json.dumps(entity_definition)

            # Keep minimal metadata for backward compatibility
            metadata_dict = {
                "methods": json.dumps(list(entity_type._method_schemas.keys())),
            }

            component_info = self._PyComponentInfo(
                name=entity_type.name,
                component_type="entity",
                metadata=metadata_dict,
                config={},
                input_schema=None,  # Entities don't have single input/output schemas
                output_schema=None,
                definition=definition_str,  # Complete entity definition with state and methods
            )
            components.append(component_info)
            logger.debug(f"Registered entity '{entity_type.name}' with definition")

        # Process agents
        from .agent import AgentRegistry

        for agent in self._explicit_components['agents']:
            # Register agent in AgentRegistry for execution lookup
            AgentRegistry.register(agent)
            logger.debug(f"Registered agent '{agent.name}' in AgentRegistry for execution")

            input_schema_str = json.dumps(agent.input_schema) if hasattr(agent, 'input_schema') and agent.input_schema else None
            output_schema_str = json.dumps(agent.output_schema) if hasattr(agent, 'output_schema') and agent.output_schema else None

            metadata_dict = agent.metadata if hasattr(agent, 'metadata') else {}
            if hasattr(agent, 'tools'):
                metadata_dict["tools"] = json.dumps(list(agent.tools.keys()))

            component_info = self._PyComponentInfo(
                name=agent.name,
                component_type="agent",
                metadata=metadata_dict,
                config={},
                input_schema=input_schema_str,
                output_schema=output_schema_str,
                definition=None,
            )
            components.append(component_info)

        # Process tools (explicit + auto-included)
        for tool in all_tools:
            # Validate that item is a Tool instance
            if not isinstance(tool, Tool):
                logger.warning(
                    f"Skipping non-Tool item in tools collection: {type(tool).__name__}. "
                    f"Use @tool decorator or pass Tool instances."
                )
                continue

            input_schema_str = json.dumps(tool.input_schema) if hasattr(tool, 'input_schema') and tool.input_schema else None
            output_schema_str = json.dumps(tool.output_schema) if hasattr(tool, 'output_schema') and tool.output_schema else None

            component_info = self._PyComponentInfo(
                name=tool.name,
                component_type="tool",
                metadata={},
                config={},
                input_schema=input_schema_str,
                output_schema=output_schema_str,
                definition=None,
            )
            components.append(component_info)

        logger.info(f"Discovered {len(components)} total components")
        return components

    def _create_message_handler(self):
        """Create the message handler that will be called by the Rust worker."""

        def handle_message(request):
            """Handle incoming execution requests - returns a coroutine for Rust to await."""
            # Extract request details
            component_name = request.component_name
            component_type = request.component_type
            input_data = request.input_data

            logger.debug(
                f"Handling {component_type} request: {component_name}, input size: {len(input_data)} bytes"
            )

            # Import all registries
            from .tool import ToolRegistry
            from .entity import EntityRegistry
            from .agent import AgentRegistry

            # Route based on component type and return coroutines
            if component_type == "tool":
                tool = ToolRegistry.get(component_name)
                if tool:
                    logger.debug(f"Found tool: {component_name}")
                    # Return coroutine, don't await it
                    return self._execute_tool(tool, input_data, request)

            elif component_type == "entity":
                entity_type = EntityRegistry.get(component_name)
                if entity_type:
                    logger.debug(f"Found entity: {component_name}")
                    # Return coroutine, don't await it
                    return self._execute_entity(entity_type, input_data, request)

            elif component_type == "agent":
                agent = AgentRegistry.get(component_name)
                if agent:
                    logger.debug(f"Found agent: {component_name}")
                    # Return coroutine, don't await it
                    return self._execute_agent(agent, input_data, request)

            elif component_type == "workflow":
                workflow_config = WorkflowRegistry.get(component_name)
                if workflow_config:
                    logger.debug(f"Found workflow: {component_name}")
                    # Return coroutine, don't await it
                    return self._execute_workflow(workflow_config, input_data, request)

            elif component_type == "function":
                function_config = FunctionRegistry.get(component_name)
                if function_config:
                    # Return coroutine, don't await it
                    return self._execute_function(function_config, input_data, request)

            # Not found - need to return an async error response
            error_msg = f"Component '{component_name}' of type '{component_type}' not found"
            logger.error(error_msg)

            # Create async wrapper for error response
            async def error_response():
                return self._create_error_response(request, error_msg)

            return error_response()

        return handle_message

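The handler above deliberately returns un-awaited coroutines. A minimal sketch of that contract, with plain asyncio standing in for the Rust bridge (`worker` and `request` are assumed to exist):

```python
import asyncio

async def drive(handler, request):
    coro = handler(request)  # handle_message() returns a coroutine, never awaits it
    return await coro        # the host (the Rust worker in production) awaits it

# asyncio.run(drive(worker._create_message_handler(), request))
```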
    def _extract_critical_metadata(self, request) -> Dict[str, str]:
        """
        Extract critical metadata from the request that MUST be propagated to the response.

        This ensures journal events are written to the correct tenant partition
        and can be properly replayed. A missing tenant_id causes catastrophic
        event-sourcing corruption where events are split across partitions.

        Returns:
            Dict[str, str]: Metadata with all values normalized to strings for Rust FFI
        """
        metadata = {}
        if hasattr(request, 'metadata') and request.metadata:
            # CRITICAL: Propagate tenant_id to prevent journal corruption
            # Convert to string immediately to ensure Rust FFI compatibility
            if "tenant_id" in request.metadata:
                metadata["tenant_id"] = str(request.metadata["tenant_id"])
            if "deployment_id" in request.metadata:
                metadata["deployment_id"] = str(request.metadata["deployment_id"])

        # CRITICAL: Normalize all metadata values to strings for Rust FFI (PyO3)
        # PyO3 expects HashMap<String, String> and will fail with bool/int values
        return _normalize_metadata(metadata)

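To illustrate the contract (hypothetical request values; `worker` is an existing Worker instance): only the two critical keys are copied, and every value crosses the FFI boundary as a string.

```python
from types import SimpleNamespace

# Hypothetical request: tenant_id arrives as an int, plus an unrelated key.
request = SimpleNamespace(metadata={"tenant_id": 42, "deployment_id": "dep-1",
                                    "traceparent": "00-abc"})
assert worker._extract_critical_metadata(request) == {
    "tenant_id": "42",
    "deployment_id": "dep-1",
}
```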
    async def _execute_function(self, config, input_data: bytes, request):
        """Execute a function handler (supports both regular and streaming functions)."""
        import json
        import inspect
        import time
        from .context import Context, set_current_context, _current_context
        from ._core import PyExecuteComponentResponse

        exec_start = time.time()
        # Token for the current-context contextvar; initialized to None so the
        # finally block can safely skip the reset if setup fails before install.
        token = None

        try:
            # Parse input data
            input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}

            # Store trace metadata in contextvar for LM calls to access
            # The Rust worker injects traceparent into request.metadata for trace propagation
            if hasattr(request, 'metadata') and request.metadata:
                _trace_metadata.set(dict(request.metadata))
                logger.debug(f"Trace metadata stored: traceparent={request.metadata.get('traceparent', 'N/A')}")

            # Extract attempt number from platform request (if provided)
            platform_attempt = getattr(request, 'attempt', 0)

            # Extract streaming context for real-time SSE log delivery
            is_streaming = getattr(request, 'is_streaming', False)
            tenant_id = request.metadata.get('tenant_id') if hasattr(request, 'metadata') else None

            # Create FunctionContext with attempt number for retry tracking
            # - If platform_attempt > 0: Platform is orchestrating retries
            # - If platform_attempt == 0: First attempt (or no retry config)
            from .function import FunctionContext
            ctx = FunctionContext(
                run_id=f"{self.service_name}:{config.name}",
                attempt=platform_attempt,
                runtime_context=request.runtime_context,
                retry_policy=config.retries,
                is_streaming=is_streaming,
                tenant_id=tenant_id,
            )

            # Set context in contextvar so get_current_context() and error handlers can access it
            token = set_current_context(ctx)

            # Set up _current_span contextvar for proper trace parent-child linking
            # The Rust worker creates spans (python_component_execution) and passes trace context
            # via runtime_context. We need to set this in Python's _current_span contextvar
            # so that spans created in Python (e.g., agent.calculator) become proper children.
            from .tracing import _current_span, SpanInfo
            span_token = None
            if request.runtime_context:
                trace_id = request.runtime_context.trace_id
                span_id = request.runtime_context.span_id
                if trace_id and span_id:
                    span_info = SpanInfo(trace_id=trace_id, span_id=span_id)
                    span_token = _current_span.set(span_info)

            # Execute function directly - Rust bridge handles tracing
            # Note: Removed Python-level span creation to avoid duplicate spans.
            # The Rust worker bridge (sdk-python/rust-src/worker.rs:413-659) already
            # creates a comprehensive OpenTelemetry span with all necessary attributes.
            # See DUPLICATE_SPANS_FIX.md for details.
            #
            # Note on retry handling:
            # - If platform_attempt > 0: Platform is orchestrating retries, execute once
            # - If platform_attempt == 0: Local retry loop in decorator wrapper handles retries
            if input_dict:
                result = config.handler(ctx, **input_dict)
            else:
                result = config.handler(ctx)

            # Check if result is an async generator BEFORE awaiting
            # Async generators (streaming functions) cannot be awaited directly
            if inspect.isasyncgen(result):
                # result is already an async generator, proceed to streaming handling below
                pass
            elif inspect.iscoroutine(result):
                # Apply timeout if specified in function config
                if hasattr(config, 'timeout_ms') and config.timeout_ms is not None:
                    timeout_seconds = config.timeout_ms / 1000.0
                    try:
                        result = await asyncio.wait_for(result, timeout=timeout_seconds)
                    except asyncio.TimeoutError:
                        raise asyncio.TimeoutError(
                            f"Function '{config.name}' execution timed out after {config.timeout_ms}ms"
                        )
                else:
                    result = await result

            # Note: Removed flush_telemetry_py() call here - it was causing a 2-second blocking delay!
            # The batch span processor handles flushing automatically with a 5s timeout.
            # We only need to flush on worker shutdown, not after each function execution.

            # Check if result is an async generator (streaming function)
            if inspect.isasyncgen(result):
                # Streaming function - queue deltas immediately via Rust for real-time delivery
                # Instead of collecting into a list and returning, we send each chunk
                # as it's yielded via the delta queue with a 10ms flush interval
                from .events import Event

                sequence = 0
                has_typed_events = False  # Track if user yields Event objects
                first_chunk = True

                # Extract metadata for delta queue (must be Dict[str, str] for Rust FFI)
                metadata = _normalize_metadata(self._extract_critical_metadata(request))

                async for chunk in result:
                    # Check if chunk is a typed Event
                    if isinstance(chunk, Event):
                        has_typed_events = True
                        # Use the event's fields directly
                        event_data = chunk.to_response_fields()
                        output_data = event_data.get("output_data", b"")
                        # Convert bytes to string for queue_delta
                        output_str = output_data.decode("utf-8") if isinstance(output_data, bytes) else str(output_data or "{}")
                        self._rust_worker.queue_delta(
                            invocation_id=request.invocation_id,
                            event_type=event_data.get("event_type", ""),
                            output_data=output_str,
                            content_index=event_data.get("content_index", 0),
                            sequence=sequence,
                            metadata=metadata,
                            source_timestamp_ns=chunk.source_timestamp_ns,
                        )
                    else:
                        # Regular chunk - wrap with output events
                        if first_chunk:
                            # Emit output.start event before first chunk
                            self._rust_worker.queue_delta(
                                invocation_id=request.invocation_id,
                                event_type="output.start",
                                output_data="{}",
                                content_index=0,
                                sequence=sequence,
                                metadata=metadata,
                                source_timestamp_ns=time.time_ns(),
                            )
                            sequence += 1
                            first_chunk = False

                        # Serialize chunk for streaming
                        # Strings are passed through directly to avoid double-encoding
                        # (functions may yield pre-formatted JSON strings)
                        # Other types (dicts, Pydantic models, etc.) are JSON-serialized
                        if isinstance(chunk, str):
                            chunk_str = chunk
                        elif isinstance(chunk, bytes):
                            chunk_str = chunk.decode("utf-8")
                        else:
                            # Use _serialize_result for complex types (dicts, Pydantic models, etc.)
                            chunk_data = _serialize_result(chunk)
                            chunk_str = chunk_data.decode("utf-8") if isinstance(chunk_data, bytes) else str(chunk_data)

                        # Emit output.delta event
                        self._rust_worker.queue_delta(
                            invocation_id=request.invocation_id,
                            event_type="output.delta",
                            output_data=chunk_str,
                            content_index=0,
                            sequence=sequence,
                            metadata=metadata,
                            source_timestamp_ns=time.time_ns(),
                        )
                        sequence += 1

                # Emit closing events if we had regular chunks
                if not has_typed_events and not first_chunk:
                    # Emit output.stop event
                    self._rust_worker.queue_delta(
                        invocation_id=request.invocation_id,
                        event_type="output.stop",
                        output_data="{}",
                        content_index=0,
                        sequence=sequence,
                        metadata=metadata,
                        source_timestamp_ns=time.time_ns(),
                    )
                    sequence += 1

                # Always emit run.completed event
                self._rust_worker.queue_delta(
                    invocation_id=request.invocation_id,
                    event_type="run.completed",
                    output_data="{}",
                    content_index=0,
                    sequence=sequence,
                    metadata=metadata,
                    source_timestamp_ns=time.time_ns(),
                )

                logger.debug(f"Streaming function queued {sequence + 1} deltas for real-time delivery")
                # Return None to signal that streaming was handled via the delta queue
                return None
            else:
                # Regular function - await and return single response
                if inspect.iscoroutine(result):
                    result = await result

                # Serialize result
                output_data = _serialize_result(result)

                # Extract critical metadata for journal event correlation
                response_metadata = self._extract_critical_metadata(request)

                # Emit run.completed event with output
                return PyExecuteComponentResponse(
                    invocation_id=request.invocation_id,
                    success=True,
                    output_data=output_data,
                    state_update=None,
                    error_message=None,
                    metadata=response_metadata if response_metadata else None,
                    event_type="run.completed",
                    content_index=0,
                    sequence=0,
                    attempt=platform_attempt,
                )

        except Exception as e:
            # Include exception type for better error messages
            error_msg = f"{type(e).__name__}: {str(e)}"

            # Capture full stack trace for telemetry
            import traceback
            stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))

            # Log with full traceback using ctx.logger to ensure run_id correlation
            from .context import get_current_context
            current_ctx = get_current_context()
            error_logger = current_ctx.logger if current_ctx else logger
            error_logger.error(f"Function execution failed: {error_msg}", exc_info=True)

            # Store stack trace in metadata for observability
            metadata = {
                "error_type": type(e).__name__,
                "stack_trace": stack_trace,
                "error": True,  # Boolean flag for error detection
            }

            # CRITICAL: Extract critical metadata (including tenant_id) for journal event correlation
            # This ensures run.failed events are properly emitted by the Worker Coordinator
            critical_metadata = self._extract_critical_metadata(request)
            metadata.update(critical_metadata)

            # CRITICAL: Normalize metadata to ensure all values are strings (Rust FFI requirement)
            # PyO3 expects HashMap<String, String>, but we may have booleans or other types
            normalized_metadata = _normalize_metadata(metadata)

            # Emit run.failed event
            return PyExecuteComponentResponse(
                invocation_id=request.invocation_id,
                success=False,
                output_data=b"",
                state_update=None,
                error_message=error_msg,
                metadata=normalized_metadata,
                event_type="run.failed",
                content_index=0,
                sequence=0,
                attempt=getattr(request, 'attempt', 0),
            )

        finally:
            # Always reset context to prevent leakage between executions
            if token is not None:
                _current_context.reset(token)

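For the streaming branch above, a hypothetical handler and the event sequence it would queue; plain yields are bracketed by output.start / output.stop and always followed by run.completed:

```python
from agnt5 import function

@function
async def stream_words(ctx, text: str):
    # Each yielded string becomes one output.delta (passed through un-encoded).
    for word in text.split():
        yield word

# For text="a b c" the worker queues:
#   output.start, output.delta x3, output.stop, run.completed
```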
1086
|
+
async def _execute_workflow(self, config, input_data: bytes, request):
|
|
1087
|
+
"""Execute a workflow handler with automatic replay support."""
|
|
1088
|
+
import json
|
|
1089
|
+
from .workflow import WorkflowEntity, WorkflowContext
|
|
1090
|
+
from .entity import _get_state_adapter, _entity_state_adapter_ctx
|
|
1091
|
+
from .exceptions import WaitingForUserInputException
|
|
1092
|
+
from ._core import PyExecuteComponentResponse
|
|
1093
|
+
|
|
1094
|
+
# Set entity state adapter in context so workflows can use Entities
|
|
1095
|
+
_entity_state_adapter_ctx.set(self._entity_state_adapter)
|
|
1096
|
+
|
|
1097
|
+
try:
|
|
1098
|
+
# Parse input data
|
|
1099
|
+
input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}
|
|
1100
|
+
|
|
1101
|
+
# Extract or generate session_id for multi-turn conversation support (for chat workflows)
|
|
1102
|
+
# If session_id is provided, the workflow can maintain conversation context
|
|
1103
|
+
session_id = input_dict.get("session_id")
|
|
1104
|
+
|
|
1105
|
+
if not session_id:
|
|
1106
|
+
session_id = str(uuid.uuid4())
|
|
1107
|
+
logger.info(f"Created new workflow session: {session_id}")
|
|
1108
|
+
else:
|
|
1109
|
+
logger.info(f"Using existing workflow session: {session_id}")
|
|
1110
|
+
|
|
1111
|
+
# Parse replay data from request metadata for crash recovery
|
|
1112
|
+
completed_steps = {}
|
|
1113
|
+
step_events = [] # Raw step_events list for serialization on next pause
|
|
1114
|
+
initial_state = {}
|
|
1115
|
+
user_response = None
|
|
1116
|
+
|
|
1117
|
+
if hasattr(request, 'metadata') and request.metadata:
|
|
1118
|
+
# Parse completed steps for replay (from crash recovery or HITL resume)
|
|
1119
|
+
# Try both formats: completed_steps (dict) and step_events (list from pause)
|
|
1120
|
+
if "completed_steps" in request.metadata:
|
|
1121
|
+
completed_steps_json = request.metadata["completed_steps"]
|
|
1122
|
+
if completed_steps_json:
|
|
1123
|
+
try:
|
|
1124
|
+
completed_steps = json.loads(completed_steps_json)
|
|
1125
|
+
logger.info(f"🔄 Replaying workflow with {len(completed_steps)} cached steps")
|
|
1126
|
+
except json.JSONDecodeError:
|
|
1127
|
+
logger.warning("Failed to parse completed_steps from metadata")
|
|
1128
|
+
elif "step_events" in request.metadata:
|
|
1129
|
+
# Convert step_events list to completed_steps dict for HITL resume
|
|
1130
|
+
step_events_json = request.metadata["step_events"]
|
|
1131
|
+
if step_events_json:
|
|
1132
|
+
try:
|
|
1133
|
+
step_events_list = json.loads(step_events_json)
|
|
1134
|
+
# Convert list format to dict: {step_name: result, ...}
|
|
1135
|
+
for event in step_events_list:
|
|
1136
|
+
if "step_name" in event and "result" in event:
|
|
1137
|
+
completed_steps[event["step_name"]] = event["result"]
|
|
1138
|
+
# Also preserve raw step_events list for serialization on next pause
|
|
1139
|
+
step_events = step_events_list
|
|
1140
|
+
logger.info(f"🔄 Resuming workflow with {len(completed_steps)} completed steps from pause")
|
|
1141
|
+
except json.JSONDecodeError:
|
|
1142
|
+
logger.warning("Failed to parse step_events from metadata")
|
|
1143
|
+
|
|
1144
|
+
# Parse initial workflow state for replay
|
|
1145
|
+
if "workflow_state" in request.metadata:
|
|
1146
|
+
workflow_state_json = request.metadata["workflow_state"]
|
|
1147
|
+
if workflow_state_json:
|
|
1148
|
+
try:
|
|
1149
|
+
initial_state = json.loads(workflow_state_json)
|
|
1150
|
+
logger.info(f"🔄 Loaded workflow state: {len(initial_state)} keys")
|
|
1151
|
+
except json.JSONDecodeError:
|
|
1152
|
+
logger.warning("Failed to parse workflow_state from metadata")
|
|
1153
|
+
|
|
1154
|
+
# Check for user response (workflow resume after pause)
|
|
1155
|
+
if "user_response" in request.metadata:
|
|
1156
|
+
user_response = request.metadata["user_response"]
|
|
1157
|
+
logger.info(f"▶️ Resuming workflow with user response: {user_response}")
|
|
1158
|
+
|
|
1159
|
+
# NEW: Check for agent resume (agent-level HITL)
|
|
1160
|
+
agent_context = None
|
|
1161
|
+
if hasattr(request, 'metadata') and request.metadata:
|
|
1162
|
+
if "agent_context" in request.metadata:
|
|
1163
|
+
agent_context_json = request.metadata["agent_context"]
|
|
1164
|
+
try:
|
|
1165
|
+
agent_context = json.loads(agent_context_json)
|
|
1166
|
+
agent_name = agent_context.get("agent_name", "unknown")
|
|
1167
|
+
iteration = agent_context.get("iteration", 0)
|
|
1168
|
+
logger.info(
|
|
1169
|
+
f"▶️ Resuming agent '{agent_name}' from iteration {iteration} "
|
|
1170
|
+
f"with user response: {user_response}"
|
|
1171
|
+
)
|
|
1172
|
+
except json.JSONDecodeError:
|
|
1173
|
+
logger.warning("Failed to parse agent_context from metadata")
|
|
1174
|
+
agent_context = None
|
|
1175
|
+
|
|
1176
|
+
# Extract session_id and user_id from request for memory scoping
|
|
1177
|
+
# Do this FIRST so we can pass to WorkflowEntity constructor
|
|
1178
|
+
session_id = request.session_id if hasattr(request, 'session_id') and request.session_id else request.invocation_id
|
|
1179
|
+
user_id = request.user_id if hasattr(request, 'user_id') and request.user_id else None
|
|
1180
|
+
|
|
1181
|
+
# Extract streaming context for real-time SSE log delivery
|
|
1182
|
+
is_streaming = getattr(request, 'is_streaming', False)
|
|
1183
|
+
tenant_id = request.metadata.get('tenant_id') if hasattr(request, 'metadata') else None
|
|
1184
|
+
|
|
1185
|
+
# Create WorkflowEntity for state management with memory scoping
|
|
1186
|
+
# Entity key will be scoped based on priority: user_id > session_id > run_id
|
|
1187
|
+
# For session scope, include component_name to enable listing sessions by workflow
|
|
1188
|
+
component_name = getattr(request, 'component_name', None)
|
|
1189
|
+
workflow_entity = WorkflowEntity(
|
|
1190
|
+
run_id=request.invocation_id,
|
|
1191
|
+
session_id=session_id,
|
|
1192
|
+
user_id=user_id,
|
|
1193
|
+
component_name=component_name,
|
|
1194
|
+
)
|
|
1195
|
+
            # Load replay data into entity if provided
            if completed_steps:
                workflow_entity._completed_steps = completed_steps
                logger.debug(f"Loaded {len(completed_steps)} completed steps into workflow entity")

            # Restore raw step_events list for serialization on next pause
            # This ensures previous user responses are preserved across multiple resumes
            if step_events:
                workflow_entity._step_events = step_events
                logger.debug(f"Restored {len(step_events)} step events into workflow entity")

            # Inject user response if resuming from pause
            if user_response:
                # Restore pause_index from metadata for multi-step HITL
                # This ensures we inject at the correct position in the pause sequence
                if hasattr(request, 'metadata') and request.metadata:
                    pause_index_str = request.metadata.get("pause_index", "0")
                    try:
                        workflow_entity._pause_index = int(pause_index_str)
                        logger.debug(f"Restored pause_index={workflow_entity._pause_index} for resume")
                    except ValueError:
                        logger.warning(f"Invalid pause_index in metadata: {pause_index_str}, using 0")
                        workflow_entity._pause_index = 0

                workflow_entity.inject_user_response(user_response)
                logger.debug(f"Injected user response into workflow entity at pause {workflow_entity._pause_index}")

                # IMPORTANT: Reset pause_index to 0 for replay
                # The workflow replays from the beginning, so the first wait_for_user
                # should check at index 0, not at the stored index
                workflow_entity._pause_index = 0
                logger.debug("Reset pause_index to 0 for replay")

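            # Illustrative walkthrough (hypothetical handler code, not part of the SDK):
            # for a workflow that pauses twice, e.g.
            #     name = await ctx.wait_for_user("Your name?")   # pause 0
            #     city = await ctx.wait_for_user("Your city?")   # pause 1
            # the first resume injects the response at pause_index 0, replays from the
            # top, and pauses again at index 1; the second resume then arrives with
            # pause_index=1 in the metadata and satisfies the second wait.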
            if initial_state:
                # Load initial state into entity's state adapter AND workflow entity's state
                state_adapter = _get_state_adapter()
                if hasattr(state_adapter, '_standalone_states'):
                    # Standalone mode - set state directly in adapter
                    state_adapter._standalone_states[workflow_entity._state_key] = initial_state
                    logger.debug(f"Loaded initial state with {len(initial_state)} keys into state adapter (standalone)")

                # Also initialize the workflow entity's internal state with the loaded data
                # This ensures workflow_entity.state.get() returns the persisted values
                from .workflow import WorkflowState
                workflow_entity._state = WorkflowState(initial_state.copy(), workflow_entity)
                logger.info(f"🔄 Initialized workflow entity state with {len(initial_state)} keys from session")

            # Create checkpoint callback for real-time streaming
            def checkpoint_callback(checkpoint: dict) -> None:
                """Send checkpoint to Rust worker queue."""
                metadata: dict = {}  # Pre-bound so the error path below can always log it
                try:
                    # Extract critical metadata for checkpoint routing
                    metadata = self._extract_critical_metadata(request)

                    # DEBUG: Log metadata types for troubleshooting PyO3 conversion errors
                    logger.debug(f"Checkpoint metadata types: {[(k, type(v).__name__) for k, v in metadata.items()]}")

                    # Get source timestamp (use from checkpoint if provided, otherwise generate now)
                    source_timestamp_ns = checkpoint.get("source_timestamp_ns", time.time_ns())

                    # Queue checkpoint via Rust FFI
                    self._rust_worker.queue_workflow_checkpoint(
                        invocation_id=request.invocation_id,
                        checkpoint_type=checkpoint["checkpoint_type"],
                        checkpoint_data=_json.dumps(checkpoint["checkpoint_data"], cls=_ResultEncoder),
                        sequence_number=checkpoint["sequence_number"],
                        metadata=metadata,
                        source_timestamp_ns=source_timestamp_ns,
                    )
                    logger.debug(
                        f"Queued checkpoint: type={checkpoint['checkpoint_type']} "
                        f"seq={checkpoint['sequence_number']}"
                    )
                except Exception as e:
                    # Checkpoints are critical for durability - failing to persist them
                    # means we cannot guarantee replay/recovery. Re-raise to fail the workflow.
                    logger.error(f"Failed to queue checkpoint: {e}", exc_info=True)
                    logger.error(f"Checkpoint metadata: {metadata}")
                    logger.error(f"Checkpoint type: {checkpoint.get('checkpoint_type')}")
                    raise RuntimeError(
                        f"Failed to queue checkpoint '{checkpoint.get('checkpoint_type')}': {e}. "
                        f"Workflow cannot continue without durable checkpoints."
                    ) from e

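            # Illustrative only: the checkpoint dicts handed to this callback are
            # assumed to look like
            #     {
            #         "checkpoint_type": "workflow.started",
            #         "checkpoint_data": {"workflow.name": "...", "run_id": "..."},
            #         "sequence_number": 0,
            #         "source_timestamp_ns": 1700000000000000000,  # optional
            #     }
            # matching the keys read above; the concrete values are made up.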
            # Create delta callback for forwarding streaming events from nested agents/functions
            # This is used by WorkflowContext._consume_streaming_result to forward events
            delta_metadata = _normalize_metadata(self._extract_critical_metadata(request))

            def delta_callback(event_type: str, output_data: str, content_index: int, sequence: int, source_timestamp_ns: int = 0) -> None:
                """Forward streaming delta event from nested component."""
                try:
                    # Use provided timestamp or generate one if not provided
                    ts = source_timestamp_ns if source_timestamp_ns > 0 else time.time_ns()
                    self._rust_worker.queue_delta(
                        invocation_id=request.invocation_id,
                        event_type=event_type,
                        output_data=output_data,
                        content_index=content_index,
                        sequence=sequence,
                        metadata=delta_metadata,
                        source_timestamp_ns=ts,
                    )
                    logger.debug(f"Forwarded delta: type={event_type} seq={sequence}")
                except Exception as e:
                    # Delta forwarding is best-effort - log but don't fail the workflow
                    logger.warning(f"Failed to forward delta event: {e}")

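            # Illustrative only: a nested component streaming a token might be
            # relayed as something like
            #     delta_callback("lm.delta", '{"text": "Hel"}', 0, 17)
            # The event type and payload shape here are hypothetical; the callback
            # simply forwards whatever the nested component produced.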
            # Create WorkflowContext with entity, runtime_context, checkpoint callback, and checkpoint client
            ctx = WorkflowContext(
                workflow_entity=workflow_entity,
                run_id=request.invocation_id,  # Use unique invocation_id for this execution
                session_id=session_id,  # Session for multi-turn conversations
                user_id=user_id,  # User for long-term memory
                runtime_context=request.runtime_context,
                checkpoint_callback=checkpoint_callback,
                checkpoint_client=self._checkpoint_client,  # Phase 3: platform-side memoization
                is_streaming=is_streaming,  # For real-time SSE log delivery
                tenant_id=tenant_id,  # For multi-tenant deployments
                delta_callback=delta_callback,  # For forwarding streaming events from nested components
            )

            # NEW: Populate agent resume info if this is an agent HITL resume
            if agent_context and user_response:
                ctx._agent_resume_info = {
                    "agent_name": agent_context["agent_name"],
                    "agent_context": agent_context,
                    "user_response": user_response,
                }
                logger.debug(
                    f"Set agent resume info for '{agent_context['agent_name']}' "
                    f"in workflow context"
                )

            # Execute workflow directly - Rust bridge handles tracing.
            # Note: Python-level span creation was removed to avoid duplicate spans;
            # the Rust worker bridge creates comprehensive OpenTelemetry spans.
            # See DUPLICATE_SPANS_FIX.md for details.

            # CRITICAL: Set context in contextvar so LM/Agent/Tool calls can access it
            from .context import set_current_context
            import time as _time
            token = set_current_context(ctx)

            # Set up _current_span contextvar for proper trace parent-child linking.
            # The Rust worker creates spans and passes trace context via runtime_context.
            # We need to set this in Python's _current_span contextvar so that spans
            # created in Python (e.g., agent spans, nested function calls) become proper children.
            from .tracing import _current_span, SpanInfo
            span_token = None
            if request.runtime_context:
                trace_id = request.runtime_context.trace_id
                span_id = request.runtime_context.span_id
                if trace_id and span_id:
                    span_info = SpanInfo(trace_id=trace_id, span_id=span_id)
                    span_token = _current_span.set(span_info)

            workflow_start_time = _time.time()
            try:
                # Emit workflow.started checkpoint
                ctx._send_checkpoint("workflow.started", {
                    "workflow.name": config.name,
                    "run_id": request.invocation_id,
                    "session_id": session_id,
                    "is_replay": bool(completed_steps),
                })

                # CRITICAL: Flush immediately to ensure workflow.started arrives at the platform
                # BEFORE the handler runs. Without this, nested events (agent.started, lm.call.started),
                # which use direct journal writes, would arrive before the queued workflow.started.
                self._rust_worker.flush_workflow_checkpoints()

                if input_dict:
                    result = await config.handler(ctx, **input_dict)
                else:
                    result = await config.handler(ctx)

                # Serialize result BEFORE emitting workflow.completed
                # This ensures serialization errors trigger workflow.failed, not run.failed
                output_data = _serialize_result(result)

                # Emit workflow.completed checkpoint
                workflow_duration_ms = int((_time.time() - workflow_start_time) * 1000)
                ctx._send_checkpoint("workflow.completed", {
                    "workflow.name": config.name,
                    "run_id": request.invocation_id,
                    "duration_ms": workflow_duration_ms,
                    "steps_count": len(ctx._workflow_entity._step_events),
                })

                # Note: Workflow entity persistence is handled by the @workflow decorator wrapper,
                # which persists before returning. No need to persist here.
            except Exception as workflow_error:
                # Emit workflow.failed checkpoint
                workflow_duration_ms = int((_time.time() - workflow_start_time) * 1000)
                ctx._send_checkpoint("workflow.failed", {
                    "workflow.name": config.name,
                    "run_id": request.invocation_id,
                    "duration_ms": workflow_duration_ms,
                    "error": str(workflow_error),
                    "error_type": type(workflow_error).__name__,
                })
                raise
            finally:
                # Always reset the context (and the span contextvar, if set) to prevent leakage
                from .context import _current_context
                _current_context.reset(token)
                if span_token is not None:
                    _current_span.reset(span_token)

                # Note: No flush_telemetry_py() call here - it was causing a 2-second blocking delay!
                # The batch span processor handles flushing automatically with a 5s timeout.

            # Collect workflow execution metadata for durability
            metadata = {}

            # CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
            # Missing tenant_id causes events to be written to the wrong partition
            critical_metadata = self._extract_critical_metadata(request)
            metadata.update(critical_metadata)

            # Add step events to metadata (for workflow durability)
            # Access _step_events from the workflow entity, not the context
            step_events = ctx._workflow_entity._step_events
            if step_events:
                metadata["step_events"] = json.dumps(step_events)
                logger.debug(f"Workflow has {len(step_events)} recorded steps")

            # Add final state snapshot to metadata (if state was used)
            # Check if _state was initialized without triggering the property getter
            if hasattr(ctx, '_workflow_entity') and ctx._workflow_entity._state is not None:
                if ctx._workflow_entity._state.has_changes():
                    state_snapshot = ctx._workflow_entity._state.get_state_snapshot()
                    metadata["workflow_state"] = json.dumps(state_snapshot)
                    logger.debug(f"Workflow state snapshot: {state_snapshot}")

            # AUDIT TRAIL: Serialize complete state change history for replay and debugging
            # This captures all intermediate state mutations, not just the final snapshot
            state_changes = ctx._workflow_entity._state_changes
            logger.info(f"🔍 DEBUG: _state_changes list has {len(state_changes)} entries")
            if state_changes:
                metadata["state_changes"] = json.dumps(state_changes)
                logger.info(f"✅ Serialized {len(state_changes)} state changes to metadata")
            else:
                logger.warning("⚠️ _state_changes list is empty - no state change history captured")

            # CRITICAL: Persist workflow entity state to platform
            # This stores the WorkflowEntity as a first-class entity with proper versioning
            try:
                logger.info(f"🔍 DEBUG: About to call _persist_state() for run {request.invocation_id}")
                await ctx._workflow_entity._persist_state()
                logger.info(f"✅ Successfully persisted WorkflowEntity state for run {request.invocation_id}")
            except Exception as persist_error:
                logger.error(f"❌ Failed to persist WorkflowEntity state (non-fatal): {persist_error}", exc_info=True)
                # Continue anyway - persistence failure shouldn't fail the workflow

            logger.info(f"Workflow completed successfully with {len(step_events)} steps")

            # Add session_id to metadata for multi-turn conversation support
            metadata["session_id"] = session_id

            # CRITICAL: Flush all buffered checkpoints before returning the response
            # This ensures checkpoints arrive at the platform BEFORE the run.completed event
            try:
                flushed_count = self._rust_worker.flush_workflow_checkpoints()
                if flushed_count > 0:
                    logger.info(f"✅ Flushed {flushed_count} checkpoints before completion")
            except Exception as flush_error:
                logger.error(f"Failed to flush checkpoints: {flush_error}", exc_info=True)
                # Continue anyway - checkpoint flushing is best-effort

            return PyExecuteComponentResponse(
                invocation_id=request.invocation_id,
                success=True,
                output_data=output_data,
                state_update=None,  # Not used for workflows (use metadata instead)
                error_message=None,
                metadata=metadata if metadata else None,  # Include step events + state + session_id
                event_type="run.completed",
                content_index=0,
                sequence=0,
                attempt=getattr(request, 'attempt', 0),
            )

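        # Illustrative only: for a run that used state, the metadata returned above
        # might carry (values hypothetical)
        #     {"tenant_id": "...", "deployment_id": "...",
        #      "step_events": "[...]", "workflow_state": "{\"count\": 3}",
        #      "state_changes": "[...]", "session_id": "..."}
        # with every complex value already JSON-encoded as a string for the Rust FFI.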
        except WaitingForUserInputException as e:
            # Workflow or agent paused for user input
            pause_type = "agent" if e.agent_context else "workflow"
            logger.info(f"⏸️ {pause_type.capitalize()} paused waiting for user input: {e.question}")

            # Collect metadata for the pause state
            # Note: All metadata values must be strings for the Rust FFI
            pause_metadata = {
                "status": "awaiting_user_input",
                "question": e.question,
                "input_type": e.input_type,
                "pause_type": pause_type,  # NEW: Indicates workflow vs agent pause
                "pause_index": str(e.pause_index),  # Store pause index for multi-step HITL
            }

            # CRITICAL: Propagate tenant_id even when pausing
            critical_metadata = self._extract_critical_metadata(request)
            pause_metadata.update(critical_metadata)

            # Add optional fields only if they exist
            if e.options:
                pause_metadata["options"] = json.dumps(e.options)
            if e.checkpoint_state:
                pause_metadata["checkpoint_state"] = json.dumps(e.checkpoint_state)
            if session_id:
                pause_metadata["session_id"] = session_id

            # NEW: Store agent execution state if present
            if e.agent_context:
                pause_metadata["agent_context"] = json.dumps(e.agent_context)
                logger.debug(
                    f"Agent '{e.agent_context['agent_name']}' paused at "
                    f"iteration {e.agent_context['iteration']}"
                )

            # Add step events to pause metadata for durability
            step_events = ctx._workflow_entity._step_events
            if step_events:
                pause_metadata["step_events"] = json.dumps(step_events)
                logger.debug(f"Paused workflow has {len(step_events)} recorded steps")

            # Add current workflow state to pause metadata
            if hasattr(ctx, '_workflow_entity') and ctx._workflow_entity._state is not None:
                if ctx._workflow_entity._state.has_changes():
                    state_snapshot = ctx._workflow_entity._state.get_state_snapshot()
                    pause_metadata["workflow_state"] = json.dumps(state_snapshot)
                    logger.debug(f"Paused workflow state snapshot: {state_snapshot}")

            # AUDIT TRAIL: Also include state change history for paused workflows
            state_changes = ctx._workflow_entity._state_changes
            if state_changes:
                pause_metadata["state_changes"] = json.dumps(state_changes)
                logger.debug(f"Paused workflow has {len(state_changes)} state changes in history")

            # Return "success" with awaiting_user_input metadata
            # The output contains the question details for the client
            output = {
                "question": e.question,
                "input_type": e.input_type,
                "options": e.options,
            }
            output_data = _serialize_result(output)

            # Emit run.paused event for HITL (human-in-the-loop)
            return PyExecuteComponentResponse(
                invocation_id=request.invocation_id,
                success=True,  # This is a valid pause state, not an error
                output_data=output_data,
                state_update=None,
                error_message=None,
                metadata=pause_metadata,
                event_type="run.paused",
                content_index=0,
                sequence=0,
                attempt=getattr(request, 'attempt', 0),
            )

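        # Illustrative only: on resume, the platform is assumed to re-invoke this
        # workflow with the pause metadata echoed back, so the code at the top of
        # this method can read request.metadata["pause_index"] (and, for an agent
        # pause, request.metadata["agent_context"]) alongside the user's response
        # and replay the workflow up to the pending wait.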
        except Exception as e:
            # Include exception type for better error messages
            error_msg = f"{type(e).__name__}: {str(e)}"

            # Capture full stack trace for telemetry
            import traceback
            stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))

            # Log with full traceback
            logger.error(f"Workflow execution failed: {error_msg}", exc_info=True)

            # CRITICAL: Flush all buffered checkpoints before returning the error response.
            # This ensures the workflow.failed checkpoint arrives at the platform BEFORE the
            # run.failed event; without this, SSE clients may not receive workflow.failed events.
            try:
                flushed_count = self._rust_worker.flush_workflow_checkpoints()
                if flushed_count > 0:
                    logger.info(f"✅ Flushed {flushed_count} checkpoints before error response")
            except Exception as flush_error:
                logger.error(f"Failed to flush checkpoints in error path: {flush_error}", exc_info=True)
                # Continue anyway - checkpoint flushing is best-effort

            # Store error metadata for observability
            metadata = {
                "error_type": type(e).__name__,
                "stack_trace": stack_trace,
                "error": True,
            }

            # Extract critical metadata for journal correlation (if available)
            critical_metadata = self._extract_critical_metadata(request)
            metadata.update(critical_metadata)

            # Normalize metadata for Rust FFI compatibility
            normalized_metadata = _normalize_metadata(metadata)

            # Emit run.failed event
            return PyExecuteComponentResponse(
                invocation_id=request.invocation_id,
                success=False,
                output_data=b"",
                state_update=None,
                error_message=error_msg,
                metadata=normalized_metadata,
                event_type="run.failed",
                content_index=0,
                sequence=0,
                attempt=getattr(request, 'attempt', 0),
            )

    async def _execute_tool(self, tool, input_data: bytes, request):
        """Execute a tool handler."""
        import json
        from .context import Context, set_current_context, _current_context
        from ._core import PyExecuteComponentResponse

        # Pre-bind so the finally block below can safely reset even when an
        # exception occurs before the context is installed
        token = None
        try:
            # Parse input data
            input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}

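            # Illustrative only: for a tool defined as, say,
            #     @tool
            #     async def get_weather(ctx, city: str): ...
            # an input payload of b'{"city": "Paris"}' is decoded here and splatted
            # into tool.invoke(ctx, city="Paris") below; the tool name, decorator
            # form, and parameter are hypothetical.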
            # Create context with runtime_context for trace correlation
            ctx = Context(
                run_id=f"{self.service_name}:{tool.name}",
                runtime_context=request.runtime_context,
            )

            # Set context in the contextvar so get_current_context() and error handlers can access it
            token = set_current_context(ctx)

            # Execute tool
            result = await tool.invoke(ctx, **input_dict)

            # Serialize result
            output_data = _serialize_result(result)

            return PyExecuteComponentResponse(
                invocation_id=request.invocation_id,
                success=True,
                output_data=output_data,
                state_update=None,
                error_message=None,
                metadata=None,
                event_type="run.completed",
                content_index=0,
                sequence=0,
                attempt=getattr(request, 'attempt', 0),
            )

        except Exception as e:
            # Include exception type for better error messages
            error_msg = f"{type(e).__name__}: {str(e)}"

            # Capture full stack trace for telemetry
            import traceback
            stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))

            # Log with full traceback using ctx.logger to ensure run_id correlation
            from .context import get_current_context
            current_ctx = get_current_context()
            error_logger = current_ctx.logger if current_ctx else logger
            error_logger.error(f"Tool execution failed: {error_msg}", exc_info=True)

            # Store error metadata for observability
            metadata = {
                "error_type": type(e).__name__,
                "stack_trace": stack_trace,
                "error": True,
            }

            # CRITICAL: Extract critical metadata (including tenant_id) for journal event correlation
            critical_metadata = self._extract_critical_metadata(request)
            metadata.update(critical_metadata)

            # Normalize metadata for Rust FFI compatibility
            normalized_metadata = _normalize_metadata(metadata)

            # Emit run.failed event
            return PyExecuteComponentResponse(
                invocation_id=request.invocation_id,
                success=False,
                output_data=b"",
                state_update=None,
                error_message=error_msg,
                metadata=normalized_metadata,
                event_type="run.failed",
                content_index=0,
                sequence=0,
                attempt=getattr(request, 'attempt', 0),
            )

        finally:
            # Always reset context to prevent leakage between executions
            # (token is None when the failure happened before the context was set)
            if token is not None:
                _current_context.reset(token)

    async def _execute_entity(self, entity_type, input_data: bytes, request):
        """Execute an entity method."""
        import json
        from .context import Context, set_current_context, _current_context
        from .entity import EntityType, Entity, _entity_state_adapter_ctx
        from ._core import PyExecuteComponentResponse

        # Set entity state adapter in context for Entity instances to access
        _entity_state_adapter_ctx.set(self._entity_state_adapter)

        # Pre-bind so the finally block below can safely reset even when an
        # exception occurs before the context is installed
        token = None
        try:
            # Parse input data
            input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}

            # Extract entity key and method name from input
            entity_key = input_dict.pop("key", None)
            method_name = input_dict.pop("method", None)

            if not entity_key:
                raise ValueError("Entity invocation requires 'key' parameter")
            if not method_name:
                raise ValueError("Entity invocation requires 'method' parameter")

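            # Illustrative only: an invocation payload such as
            #     {"key": "account-42", "method": "deposit", "amount": 25}
            # is routed to entity_instance.deposit(amount=25) below; the entity
            # method and field names here are hypothetical.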
            # Create context for logging and tracing
            ctx = Context(
                run_id=f"{self.service_name}:{entity_type.name}:{entity_key}",
                runtime_context=request.runtime_context,
            )

            # Set context in the contextvar so get_current_context() and error handlers can access it
            token = set_current_context(ctx)

            # Note: State loading is now handled automatically by the entity method wrapper
            # via EntityStateAdapter, which uses the Rust core for cache + platform persistence

            # Create entity instance using the stored class reference
            entity_instance = entity_type.entity_class(key=entity_key)

            # Get method
            if not hasattr(entity_instance, method_name):
                raise ValueError(f"Entity '{entity_type.name}' has no method '{method_name}'")

            method = getattr(entity_instance, method_name)

            # Execute method (entity method wrapper handles state load/save automatically)
            result = await method(**input_dict)

            # Serialize result
            output_data = _serialize_result(result)

            # Note: State persistence is now handled automatically by the entity method wrapper
            # via EntityStateAdapter, which uses the Rust core for optimistic locking + version tracking

            # CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
            metadata = self._extract_critical_metadata(request)

            return PyExecuteComponentResponse(
                invocation_id=request.invocation_id,
                success=True,
                output_data=output_data,
                state_update=None,  # TODO: Use structured StateUpdate object
                error_message=None,
                metadata=metadata if metadata else None,  # Include state in metadata for Worker Coordinator
                event_type="run.completed",
                content_index=0,
                sequence=0,
                attempt=getattr(request, 'attempt', 0),
            )

        except Exception as e:
            # Include exception type for better error messages
            error_msg = f"{type(e).__name__}: {str(e)}"

            # Capture full stack trace for telemetry
            import traceback
            stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))

            # Log with full traceback using ctx.logger to ensure run_id correlation
            from .context import get_current_context
            current_ctx = get_current_context()
            error_logger = current_ctx.logger if current_ctx else logger
            error_logger.error(f"Entity execution failed: {error_msg}", exc_info=True)

            # Store error metadata for observability
            metadata = {
                "error_type": type(e).__name__,
                "stack_trace": stack_trace,
                "error": True,
            }

            # Extract critical metadata for journal correlation (if available)
            critical_metadata = self._extract_critical_metadata(request)
            metadata.update(critical_metadata)

            # Normalize metadata for Rust FFI compatibility
            normalized_metadata = _normalize_metadata(metadata)

            # Emit run.failed event
            return PyExecuteComponentResponse(
                invocation_id=request.invocation_id,
                success=False,
                output_data=b"",
                state_update=None,
                error_message=error_msg,
                metadata=normalized_metadata,
                event_type="run.failed",
                content_index=0,
                sequence=0,
                attempt=getattr(request, 'attempt', 0),
            )

        finally:
            # Always reset context to prevent leakage between executions
            # (token is None when the failure happened before the context was set)
            if token is not None:
                _current_context.reset(token)

    async def _execute_agent(self, agent, input_data: bytes, request):
        """Execute an agent with session support for multi-turn conversations."""
        import json
        import uuid
        from .agent import AgentContext
        from .context import set_current_context, _current_context
        from .entity import _entity_state_adapter_ctx
        from ._core import PyExecuteComponentResponse

        # Set entity state adapter in context so AgentContext can access it
        _entity_state_adapter_ctx.set(self._entity_state_adapter)

        # Pre-bind so the finally block below can safely reset even when an
        # exception occurs before the context is installed
        token = None
        try:
            # Parse input data
            input_dict = json.loads(input_data.decode("utf-8")) if input_data else {}

            # Extract user message
            user_message = input_dict.get("message", "")
            if not user_message:
                raise ValueError("Agent invocation requires 'message' parameter")

            # Extract or generate session_id for multi-turn conversation support.
            # If session_id is provided, the agent will load previous conversation history;
            # if not, a new session is created with an auto-generated ID.
            session_id = input_dict.get("session_id")

            if not session_id:
                session_id = str(uuid.uuid4())
                logger.info(f"Created new agent session: {session_id}")
            else:
                logger.info(f"Using existing agent session: {session_id}")

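            # Illustrative only: a multi-turn invocation payload is assumed to look like
            #     {"message": "What did I ask you earlier?",
            #      "session_id": "8f14e45f-..."}
            # where reusing the session_id from a previous response continues that
            # conversation; the values shown are made up.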
            # Extract streaming context for real-time SSE log delivery
            is_streaming = getattr(request, 'is_streaming', False)
            tenant_id = request.metadata.get('tenant_id') if hasattr(request, 'metadata') else None

            # Create AgentContext with session support for conversation persistence.
            # AgentContext automatically loads/saves conversation history based on session_id.
            ctx = AgentContext(
                run_id=request.invocation_id,
                agent_name=agent.name,
                session_id=session_id,
                runtime_context=request.runtime_context,
                is_streaming=is_streaming,
                tenant_id=tenant_id,
            )

            # Set context in the contextvar so get_current_context() and error handlers can access it
            token = set_current_context(ctx)

            # Execute agent - now returns an async generator for streaming
            result = agent.run(user_message, context=ctx)

            # Agent.run() always returns an async generator;
            # queue each event via the delta queue for real-time delivery
            import inspect
            if inspect.isasyncgen(result):
                from .events import Event, EventType

                sequence = 0
                final_output = None
                final_tool_calls = []
                handoff_to = None

                # Extract metadata for the delta queue (must be Dict[str, str] for the Rust FFI)
                metadata = _normalize_metadata(self._extract_critical_metadata(request))
                metadata["session_id"] = session_id  # Include session for UI

                async for event in result:
                    if isinstance(event, Event):
                        # Queue the event via the delta queue
                        event_data = event.to_response_fields()
                        output_data = event_data.get("output_data", b"")
                        output_str = output_data.decode("utf-8") if isinstance(output_data, bytes) else str(output_data or "{}")

                        self._rust_worker.queue_delta(
                            invocation_id=request.invocation_id,
                            event_type=event_data.get("event_type", ""),
                            output_data=output_str,
                            content_index=event_data.get("content_index", 0),
                            sequence=sequence,
                            metadata=metadata,
                            source_timestamp_ns=event.source_timestamp_ns,
                        )
                        sequence += 1

                        # Capture the final result from the agent.completed event
                        if event.event_type == EventType.AGENT_COMPLETED:
                            final_output = event.data.get("output", "")
                            final_tool_calls = event.data.get("tool_calls", [])
                            handoff_to = event.data.get("handoff_to")

                # Emit run.completed event with the final agent result
                final_result = {
                    "output": final_output,
                    "tool_calls": final_tool_calls,
                }
                if handoff_to:
                    final_result["handoff_to"] = handoff_to

                self._rust_worker.queue_delta(
                    invocation_id=request.invocation_id,
                    event_type="run.completed",
                    output_data=json.dumps(final_result),
                    content_index=0,
                    sequence=sequence,
                    metadata=metadata,
                    source_timestamp_ns=time.time_ns(),
                )

                logger.debug(f"Agent streaming queued {sequence + 1} deltas for real-time delivery")
                # Return None to signal that streaming was handled via the delta queue
                return None
            else:
                # Fallback for non-generator (shouldn't happen, but handle gracefully)
                if inspect.iscoroutine(result):
                    agent_result = await result
                else:
                    agent_result = result

                # Build response with agent output and tool calls
                result = {
                    "output": agent_result.output,
                    "tool_calls": agent_result.tool_calls,
                }

                # Serialize result
                output_data = _serialize_result(result)

                # CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
                metadata = self._extract_critical_metadata(request)
                # Also include session_id for the UI to persist the conversation
                metadata["session_id"] = session_id

                return PyExecuteComponentResponse(
                    invocation_id=request.invocation_id,
                    success=True,
                    output_data=output_data,
                    state_update=None,
                    error_message=None,
                    metadata=metadata if metadata else None,
                    event_type="run.completed",
                    content_index=0,
                    sequence=0,
                    attempt=getattr(request, 'attempt', 0),
                )

        except Exception as e:
            # Include exception type for better error messages
            error_msg = f"{type(e).__name__}: {str(e)}"

            # Capture full stack trace for telemetry
            import traceback
            stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))

            # Log with full traceback using ctx.logger to ensure run_id correlation
            from .context import get_current_context
            current_ctx = get_current_context()
            error_logger = current_ctx.logger if current_ctx else logger
            error_logger.error(f"Agent execution failed: {error_msg}", exc_info=True)

            # Store error metadata for observability
            metadata = {
                "error_type": type(e).__name__,
                "stack_trace": stack_trace,
                "error": True,
            }

            # Extract critical metadata for journal correlation (if available)
            critical_metadata = self._extract_critical_metadata(request)
            metadata.update(critical_metadata)

            # Normalize metadata for Rust FFI compatibility
            normalized_metadata = _normalize_metadata(metadata)

            # Emit run.failed event
            return PyExecuteComponentResponse(
                invocation_id=request.invocation_id,
                success=False,
                output_data=b"",
                state_update=None,
                error_message=error_msg,
                metadata=normalized_metadata,
                event_type="run.failed",
                content_index=0,
                sequence=0,
                attempt=getattr(request, 'attempt', 0),
            )

        finally:
            # Always reset context to prevent leakage between executions
            # (token is None when the failure happened before the context was set)
            if token is not None:
                _current_context.reset(token)

    def _create_error_response(self, request, error_message: str):
        """Create an error response."""
        from ._core import PyExecuteComponentResponse

        # Emit run.failed event
        return PyExecuteComponentResponse(
            invocation_id=request.invocation_id,
            success=False,
            output_data=b"",
            state_update=None,
            error_message=error_message,
            metadata=None,
            event_type="run.failed",
            content_index=0,
            sequence=0,
            attempt=getattr(request, 'attempt', 0),
        )

    async def run(self):
        """Run the worker (register and start message loop).

        This method will:
        1. Discover all registered @function and @workflow handlers
        2. Register with the coordinator
        3. Create a shared Python event loop for all function executions
        4. Enter the message processing loop
        5. Block until shutdown

        This is the main entry point for your worker service.
        """
        try:
            logger.info(f"Starting worker: {self.service_name}")

            # Discover components
            components = self._discover_components()

            # Set components on Rust worker
            self._rust_worker.set_components(components)

            # Set metadata
            if self.metadata:
                self._rust_worker.set_service_metadata(self.metadata)

            # Configure entity state manager on Rust worker for database persistence
            logger.info("Configuring Rust EntityStateManager for database persistence")
            # Access the Rust core from the adapter
            if hasattr(self._entity_state_adapter, '_rust_core') and self._entity_state_adapter._rust_core:
                self._rust_worker.set_entity_state_manager(self._entity_state_adapter._rust_core)
                logger.info("Successfully configured Rust EntityStateManager")

            # Get the current event loop to pass to Rust for concurrent Python async execution.
            # This allows Rust to execute Python async functions on the same event loop
            # without spawn_blocking overhead, enabling true concurrency.
            loop = asyncio.get_running_loop()
            logger.info("Passing Python event loop to Rust worker for concurrent execution")

            # Set event loop on Rust worker
            self._rust_worker.set_event_loop(loop)

            # Set message handler
            handler = self._create_message_handler()
            self._rust_worker.set_message_handler(handler)

            # Initialize worker
            self._rust_worker.initialize()

            logger.info("Worker registered successfully, entering message loop...")

            # Run worker (this will block until shutdown)
            await self._rust_worker.run()

        except Exception as e:
            # Capture SDK-level startup/runtime failures
            logger.error(f"Worker failed to start or encountered critical error: {e}", exc_info=True)
            _sentry.capture_exception(
                e,
                context={
                    "service_name": self.service_name,
                    "service_version": self.service_version,
                    "error_location": "Worker.run",
                    "error_phase": "worker_lifecycle",
                },
                tags={
                    "sdk_error": "true",
                    "error_type": "worker_failure",
                    "severity": "critical",
                },
                level="error",
            )
            raise

        finally:
            # Flush Sentry events before shutdown
            logger.info("Flushing Sentry events before shutdown...")
            _sentry.flush(timeout=5.0)

            logger.info("Worker shutdown complete")
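
# Illustrative usage only - the worker class name and constructor arguments below
# are assumptions, not the SDK's documented API:
#
#     import asyncio
#     from agnt5.worker import Worker
#
#     async def main():
#         worker = Worker(service_name="my-service")
#         await worker.run()  # registers handlers, then blocks until shutdown
#
#     asyncio.run(main())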