agnt5 0.2.8a7__cp310-abi3-macosx_11_0_arm64.whl → 0.2.8a9__cp310-abi3-macosx_11_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of agnt5 might be problematic. Click here for more details.
- agnt5/_core.abi3.so +0 -0
- agnt5/_telemetry.py +7 -2
- agnt5/agent.py +744 -171
- agnt5/client.py +18 -1
- agnt5/context.py +94 -0
- agnt5/exceptions.py +13 -0
- agnt5/function.py +18 -11
- agnt5/lm.py +124 -16
- agnt5/tool.py +110 -29
- agnt5/worker.py +421 -28
- agnt5/workflow.py +367 -72
- {agnt5-0.2.8a7.dist-info → agnt5-0.2.8a9.dist-info}/METADATA +1 -1
- agnt5-0.2.8a9.dist-info/RECORD +22 -0
- agnt5-0.2.8a7.dist-info/RECORD +0 -22
- {agnt5-0.2.8a7.dist-info → agnt5-0.2.8a9.dist-info}/WHEEL +0 -0
agnt5/worker.py
CHANGED
|
@@ -14,6 +14,38 @@ from ._telemetry import setup_module_logger
|
|
|
14
14
|
|
|
15
15
|
logger = setup_module_logger(__name__)
|
|
16
16
|
|
|
17
|
+
|
|
18
|
+
def _normalize_metadata(metadata: Dict[str, Any]) -> Dict[str, str]:
|
|
19
|
+
"""
|
|
20
|
+
Convert metadata dictionary to Dict[str, str] for Rust FFI compatibility.
|
|
21
|
+
|
|
22
|
+
PyO3 requires HashMap<String, String>, but Python code may include booleans,
|
|
23
|
+
integers, or other types. This helper ensures all values are strings.
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
metadata: Dictionary with potentially mixed types
|
|
27
|
+
|
|
28
|
+
Returns:
|
|
29
|
+
Dictionary with all string values
|
|
30
|
+
|
|
31
|
+
Example:
|
|
32
|
+
>>> _normalize_metadata({"error": True, "count": 42, "msg": "hello"})
|
|
33
|
+
{"error": "true", "count": "42", "msg": "hello"}
|
|
34
|
+
"""
|
|
35
|
+
normalized = {}
|
|
36
|
+
for key, value in metadata.items():
|
|
37
|
+
if isinstance(value, str):
|
|
38
|
+
normalized[key] = value
|
|
39
|
+
elif isinstance(value, bool):
|
|
40
|
+
# Convert bool to lowercase string for JSON compatibility
|
|
41
|
+
normalized[key] = str(value).lower()
|
|
42
|
+
elif value is None:
|
|
43
|
+
normalized[key] = ""
|
|
44
|
+
else:
|
|
45
|
+
# Convert any other type to string representation
|
|
46
|
+
normalized[key] = str(value)
|
|
47
|
+
return normalized
|
|
48
|
+
|
|
17
49
|
# Context variable to store trace metadata for propagation to LM calls
|
|
18
50
|
# This allows Rust LM layer to access traceparent without explicit parameter passing
|
|
19
51
|
_trace_metadata: contextvars.ContextVar[Dict[str, str]] = contextvars.ContextVar(
|
|
@@ -455,11 +487,22 @@ class Worker:
|
|
|
455
487
|
output_schema_str = json.dumps(config.output_schema) if config.output_schema else None
|
|
456
488
|
metadata = config.metadata if config.metadata else {}
|
|
457
489
|
|
|
490
|
+
# Serialize retry and backoff policies
|
|
491
|
+
config_dict = {}
|
|
492
|
+
if config.retries:
|
|
493
|
+
config_dict["max_attempts"] = str(config.retries.max_attempts)
|
|
494
|
+
config_dict["initial_interval_ms"] = str(config.retries.initial_interval_ms)
|
|
495
|
+
config_dict["max_interval_ms"] = str(config.retries.max_interval_ms)
|
|
496
|
+
|
|
497
|
+
if config.backoff:
|
|
498
|
+
config_dict["backoff_type"] = config.backoff.type.value
|
|
499
|
+
config_dict["backoff_multiplier"] = str(config.backoff.multiplier)
|
|
500
|
+
|
|
458
501
|
component_info = self._PyComponentInfo(
|
|
459
502
|
name=config.name,
|
|
460
503
|
component_type="function",
|
|
461
504
|
metadata=metadata,
|
|
462
|
-
config=
|
|
505
|
+
config=config_dict,
|
|
463
506
|
input_schema=input_schema_str,
|
|
464
507
|
output_schema=output_schema_str,
|
|
465
508
|
definition=None,
|
|
@@ -627,6 +670,30 @@ class Worker:
|
|
|
627
670
|
|
|
628
671
|
return handle_message
|
|
629
672
|
|
|
673
|
+
def _extract_critical_metadata(self, request) -> Dict[str, str]:
    """
    Collect the request metadata entries that MUST be echoed back on the response.

    Only "tenant_id" and "deployment_id" are copied, and only when they are
    present in a non-empty request.metadata. Per the original authors' notes,
    a response without tenant_id leads to journal events being written to the
    wrong tenant partition, which breaks replay.

    Returns:
        Dict[str, str]: The propagated entries, with every value converted to
        a string for the Rust FFI. Empty when the request carries no metadata
        or neither key is present.
    """
    propagated = {}
    request_metadata = getattr(request, 'metadata', None)

    if request_metadata:
        # Stringify right away so the values are already FFI-safe
        for field in ("tenant_id", "deployment_id"):
            if field in request_metadata:
                propagated[field] = str(request_metadata[field])

    # Final pass through the shared normalizer keeps the str -> str guarantee
    # in one place (PyO3 expects HashMap<String, String>)
    return _normalize_metadata(propagated)
|
|
696
|
+
|
|
630
697
|
async def _execute_function(self, config, input_data: bytes, request):
|
|
631
698
|
"""Execute a function handler (supports both regular and streaming functions)."""
|
|
632
699
|
import json
|
|
@@ -647,17 +714,33 @@ class Worker:
|
|
|
647
714
|
_trace_metadata.set(dict(request.metadata))
|
|
648
715
|
logger.debug(f"Trace metadata stored: traceparent={request.metadata.get('traceparent', 'N/A')}")
|
|
649
716
|
|
|
650
|
-
#
|
|
651
|
-
|
|
717
|
+
# Extract attempt number from platform request (if provided)
|
|
718
|
+
platform_attempt = getattr(request, 'attempt', 0)
|
|
719
|
+
|
|
720
|
+
# Create FunctionContext with attempt number for retry tracking
|
|
721
|
+
# - If platform_attempt > 0: Platform is orchestrating retries
|
|
722
|
+
# - If platform_attempt == 0: First attempt (or no retry config)
|
|
723
|
+
from .function import FunctionContext
|
|
724
|
+
ctx = FunctionContext(
|
|
652
725
|
run_id=f"{self.service_name}:{config.name}",
|
|
726
|
+
attempt=platform_attempt,
|
|
653
727
|
runtime_context=request.runtime_context,
|
|
728
|
+
retry_policy=config.retries,
|
|
654
729
|
)
|
|
655
730
|
|
|
731
|
+
# Set context in contextvar so get_current_context() and error handlers can access it
|
|
732
|
+
from .context import set_current_context, _current_context
|
|
733
|
+
token = set_current_context(ctx)
|
|
734
|
+
|
|
656
735
|
# Execute function directly - Rust bridge handles tracing
|
|
657
736
|
# Note: Removed Python-level span creation to avoid duplicate spans.
|
|
658
737
|
# The Rust worker bridge (sdk-python/rust-src/worker.rs:413-659) already
|
|
659
738
|
# creates a comprehensive OpenTelemetry span with all necessary attributes.
|
|
660
739
|
# See DUPLICATE_SPANS_FIX.md for details.
|
|
740
|
+
#
|
|
741
|
+
# Note on retry handling:
|
|
742
|
+
# - If platform_attempt > 0: Platform is orchestrating retries, execute once
|
|
743
|
+
# - If platform_attempt == 0: Local retry loop in decorator wrapper handles retries
|
|
661
744
|
if input_dict:
|
|
662
745
|
result = config.handler(ctx, **input_dict)
|
|
663
746
|
else:
|
|
@@ -688,6 +771,7 @@ class Worker:
|
|
|
688
771
|
is_chunk=True,
|
|
689
772
|
done=False,
|
|
690
773
|
chunk_index=chunk_index,
|
|
774
|
+
attempt=platform_attempt,
|
|
691
775
|
))
|
|
692
776
|
chunk_index += 1
|
|
693
777
|
|
|
@@ -702,6 +786,7 @@ class Worker:
|
|
|
702
786
|
is_chunk=True,
|
|
703
787
|
done=True,
|
|
704
788
|
chunk_index=chunk_index,
|
|
789
|
+
attempt=platform_attempt,
|
|
705
790
|
))
|
|
706
791
|
|
|
707
792
|
logger.debug(f"Streaming function produced {len(responses)} chunks")
|
|
@@ -714,34 +799,69 @@ class Worker:
|
|
|
714
799
|
# Serialize result
|
|
715
800
|
output_data = json.dumps(result).encode("utf-8")
|
|
716
801
|
|
|
802
|
+
# Extract critical metadata for journal event correlation
|
|
803
|
+
response_metadata = self._extract_critical_metadata(request)
|
|
804
|
+
|
|
717
805
|
return PyExecuteComponentResponse(
|
|
718
806
|
invocation_id=request.invocation_id,
|
|
719
807
|
success=True,
|
|
720
808
|
output_data=output_data,
|
|
721
809
|
state_update=None,
|
|
722
810
|
error_message=None,
|
|
723
|
-
metadata=None,
|
|
811
|
+
metadata=response_metadata if response_metadata else None,
|
|
724
812
|
is_chunk=False,
|
|
725
813
|
done=True,
|
|
726
814
|
chunk_index=0,
|
|
815
|
+
attempt=platform_attempt,
|
|
727
816
|
)
|
|
728
817
|
|
|
729
818
|
except Exception as e:
|
|
730
819
|
# Include exception type for better error messages
|
|
731
820
|
error_msg = f"{type(e).__name__}: {str(e)}"
|
|
732
|
-
|
|
821
|
+
|
|
822
|
+
# Capture full stack trace for telemetry
|
|
823
|
+
import traceback
|
|
824
|
+
stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
|
|
825
|
+
|
|
826
|
+
# Log with full traceback using ctx.logger to ensure run_id correlation
|
|
827
|
+
from .context import get_current_context
|
|
828
|
+
current_ctx = get_current_context()
|
|
829
|
+
error_logger = current_ctx.logger if current_ctx else logger
|
|
830
|
+
error_logger.error(f"Function execution failed: {error_msg}", exc_info=True)
|
|
831
|
+
|
|
832
|
+
# Store stack trace in metadata for observability
|
|
833
|
+
metadata = {
|
|
834
|
+
"error_type": type(e).__name__,
|
|
835
|
+
"stack_trace": stack_trace,
|
|
836
|
+
"error": True, # Boolean flag for error detection
|
|
837
|
+
}
|
|
838
|
+
|
|
839
|
+
# CRITICAL: Extract critical metadata (including tenant_id) for journal event correlation
|
|
840
|
+
# This ensures run.failed events are properly emitted by Worker Coordinator
|
|
841
|
+
critical_metadata = self._extract_critical_metadata(request)
|
|
842
|
+
metadata.update(critical_metadata)
|
|
843
|
+
|
|
844
|
+
# CRITICAL: Normalize metadata to ensure all values are strings (Rust FFI requirement)
|
|
845
|
+
# PyO3 expects HashMap<String, String>, but we may have booleans or other types
|
|
846
|
+
normalized_metadata = _normalize_metadata(metadata)
|
|
847
|
+
|
|
733
848
|
return PyExecuteComponentResponse(
|
|
734
849
|
invocation_id=request.invocation_id,
|
|
735
850
|
success=False,
|
|
736
851
|
output_data=b"",
|
|
737
852
|
state_update=None,
|
|
738
853
|
error_message=error_msg,
|
|
739
|
-
metadata=
|
|
854
|
+
metadata=normalized_metadata,
|
|
740
855
|
is_chunk=False,
|
|
741
856
|
done=True,
|
|
742
857
|
chunk_index=0,
|
|
858
|
+
attempt=getattr(request, 'attempt', 0),
|
|
743
859
|
)
|
|
744
860
|
|
|
861
|
+
finally:
|
|
862
|
+
# Always reset context to prevent leakage between executions
|
|
863
|
+
_current_context.reset(token)
|
|
864
|
+
|
|
745
865
|
async def _execute_workflow(self, config, input_data: bytes, request):
|
|
746
866
|
"""Execute a workflow handler with automatic replay support."""
|
|
747
867
|
import json
|
|
@@ -798,8 +918,35 @@ class Worker:
|
|
|
798
918
|
user_response = request.metadata["user_response"]
|
|
799
919
|
logger.info(f"▶️ Resuming workflow with user response: {user_response}")
|
|
800
920
|
|
|
801
|
-
#
|
|
802
|
-
|
|
921
|
+
# NEW: Check for agent resume (agent-level HITL)
|
|
922
|
+
agent_context = None
|
|
923
|
+
if hasattr(request, 'metadata') and request.metadata:
|
|
924
|
+
if "agent_context" in request.metadata:
|
|
925
|
+
agent_context_json = request.metadata["agent_context"]
|
|
926
|
+
try:
|
|
927
|
+
agent_context = json.loads(agent_context_json)
|
|
928
|
+
agent_name = agent_context.get("agent_name", "unknown")
|
|
929
|
+
iteration = agent_context.get("iteration", 0)
|
|
930
|
+
logger.info(
|
|
931
|
+
f"▶️ Resuming agent '{agent_name}' from iteration {iteration} "
|
|
932
|
+
f"with user response: {user_response}"
|
|
933
|
+
)
|
|
934
|
+
except json.JSONDecodeError:
|
|
935
|
+
logger.warning("Failed to parse agent_context from metadata")
|
|
936
|
+
agent_context = None
|
|
937
|
+
|
|
938
|
+
# Extract session_id and user_id from request for memory scoping
|
|
939
|
+
# Do this FIRST so we can pass to WorkflowEntity constructor
|
|
940
|
+
session_id = request.session_id if hasattr(request, 'session_id') and request.session_id else request.invocation_id
|
|
941
|
+
user_id = request.user_id if hasattr(request, 'user_id') and request.user_id else None
|
|
942
|
+
|
|
943
|
+
# Create WorkflowEntity for state management with memory scoping
|
|
944
|
+
# Entity key will be scoped based on priority: user_id > session_id > run_id
|
|
945
|
+
workflow_entity = WorkflowEntity(
|
|
946
|
+
run_id=request.invocation_id,
|
|
947
|
+
session_id=session_id,
|
|
948
|
+
user_id=user_id,
|
|
949
|
+
)
|
|
803
950
|
|
|
804
951
|
# Load replay data into entity if provided
|
|
805
952
|
if completed_steps:
|
|
@@ -822,21 +969,75 @@ class Worker:
|
|
|
822
969
|
# Production mode - state is managed by Rust core
|
|
823
970
|
logger.debug(f"Initial state will be loaded from platform (production mode)")
|
|
824
971
|
|
|
825
|
-
# Create
|
|
972
|
+
# Create checkpoint callback for real-time streaming
|
|
973
|
+
def checkpoint_callback(checkpoint: dict) -> None:
    """
    Send a workflow checkpoint to the Rust worker queue (best-effort).

    Args:
        checkpoint: Mapping that must provide "checkpoint_type",
            "checkpoint_data" (serialized with json.dumps before queuing)
            and "sequence_number".

    Any failure (missing key, serialization error, FFI error) is logged and
    swallowed so that a checkpoint problem never interrupts the workflow.
    """
    # Bind before the try block: if metadata extraction itself raises, the
    # except handler below still needs a defined value to log. Without this
    # the handler would raise UnboundLocalError and escape the callback.
    metadata = {}
    try:
        # Extract critical metadata for checkpoint routing
        metadata = self._extract_critical_metadata(request)

        # DEBUG: Log metadata types for troubleshooting PyO3 conversion errors
        logger.debug(f"Checkpoint metadata types: {[(k, type(v).__name__) for k, v in metadata.items()]}")

        # Queue checkpoint via Rust FFI
        self._rust_worker.queue_workflow_checkpoint(
            invocation_id=request.invocation_id,
            checkpoint_type=checkpoint["checkpoint_type"],
            checkpoint_data=json.dumps(checkpoint["checkpoint_data"]),
            sequence_number=checkpoint["sequence_number"],
            metadata=metadata,
        )
        logger.debug(
            f"Queued checkpoint: type={checkpoint['checkpoint_type']} "
            f"seq={checkpoint['sequence_number']}"
        )
    except Exception as e:
        # Best-effort: report everything needed to diagnose, never re-raise
        logger.error(f"Failed to queue checkpoint: {e}", exc_info=True)
        logger.error(f"Checkpoint metadata causing error: {metadata}")
        logger.error(f"Checkpoint data: {checkpoint}")
|
|
998
|
+
|
|
999
|
+
# Create WorkflowContext with entity, runtime_context, and checkpoint callback
|
|
826
1000
|
ctx = WorkflowContext(
|
|
827
1001
|
workflow_entity=workflow_entity,
|
|
828
|
-
run_id=
|
|
1002
|
+
run_id=request.invocation_id, # Use unique invocation_id for this execution
|
|
1003
|
+
session_id=session_id, # Session for multi-turn conversations
|
|
1004
|
+
user_id=user_id, # User for long-term memory
|
|
829
1005
|
runtime_context=request.runtime_context,
|
|
1006
|
+
checkpoint_callback=checkpoint_callback,
|
|
830
1007
|
)
|
|
831
1008
|
|
|
1009
|
+
# NEW: Populate agent resume info if this is an agent HITL resume
|
|
1010
|
+
if agent_context and user_response:
|
|
1011
|
+
ctx._agent_resume_info = {
|
|
1012
|
+
"agent_name": agent_context["agent_name"],
|
|
1013
|
+
"agent_context": agent_context,
|
|
1014
|
+
"user_response": user_response,
|
|
1015
|
+
}
|
|
1016
|
+
logger.debug(
|
|
1017
|
+
f"Set agent resume info for '{agent_context['agent_name']}' "
|
|
1018
|
+
f"in workflow context"
|
|
1019
|
+
)
|
|
1020
|
+
|
|
832
1021
|
# Execute workflow directly - Rust bridge handles tracing
|
|
833
1022
|
# Note: Removed Python-level span creation to avoid duplicate spans.
|
|
834
1023
|
# The Rust worker bridge creates comprehensive OpenTelemetry spans.
|
|
835
1024
|
# See DUPLICATE_SPANS_FIX.md for details.
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
1025
|
+
|
|
1026
|
+
# CRITICAL: Set context in contextvar so LM/Agent/Tool calls can access it
|
|
1027
|
+
from .context import set_current_context
|
|
1028
|
+
token = set_current_context(ctx)
|
|
1029
|
+
try:
|
|
1030
|
+
if input_dict:
|
|
1031
|
+
result = await config.handler(ctx, **input_dict)
|
|
1032
|
+
else:
|
|
1033
|
+
result = await config.handler(ctx)
|
|
1034
|
+
|
|
1035
|
+
# Note: Workflow entity persistence is handled by the @workflow decorator wrapper
|
|
1036
|
+
# which persists before returning. No need to persist here.
|
|
1037
|
+
finally:
|
|
1038
|
+
# Always reset context to prevent leakage
|
|
1039
|
+
from .context import _current_context
|
|
1040
|
+
_current_context.reset(token)
|
|
840
1041
|
|
|
841
1042
|
# Note: Removed flush_telemetry_py() call here - it was causing 2-second blocking delay!
|
|
842
1043
|
# The batch span processor handles flushing automatically with 5s timeout
|
|
@@ -847,6 +1048,11 @@ class Worker:
|
|
|
847
1048
|
# Collect workflow execution metadata for durability
|
|
848
1049
|
metadata = {}
|
|
849
1050
|
|
|
1051
|
+
# CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
|
|
1052
|
+
# Missing tenant_id causes events to be written to wrong partition
|
|
1053
|
+
critical_metadata = self._extract_critical_metadata(request)
|
|
1054
|
+
metadata.update(critical_metadata)
|
|
1055
|
+
|
|
850
1056
|
# Add step events to metadata (for workflow durability)
|
|
851
1057
|
# Access _step_events from the workflow entity, not the context
|
|
852
1058
|
step_events = ctx._workflow_entity._step_events
|
|
@@ -862,11 +1068,41 @@ class Worker:
|
|
|
862
1068
|
metadata["workflow_state"] = json.dumps(state_snapshot)
|
|
863
1069
|
logger.debug(f"Workflow state snapshot: {state_snapshot}")
|
|
864
1070
|
|
|
1071
|
+
# AUDIT TRAIL: Serialize complete state change history for replay and debugging
|
|
1072
|
+
# This captures all intermediate state mutations, not just final snapshot
|
|
1073
|
+
state_changes = ctx._workflow_entity._state_changes
|
|
1074
|
+
logger.info(f"🔍 DEBUG: _state_changes list has {len(state_changes)} entries")
|
|
1075
|
+
if state_changes:
|
|
1076
|
+
metadata["state_changes"] = json.dumps(state_changes)
|
|
1077
|
+
logger.info(f"✅ Serialized {len(state_changes)} state changes to metadata")
|
|
1078
|
+
else:
|
|
1079
|
+
logger.warning("⚠️ _state_changes list is empty - no state change history captured")
|
|
1080
|
+
|
|
1081
|
+
# CRITICAL: Persist workflow entity state to platform
|
|
1082
|
+
# This stores the WorkflowEntity as a first-class entity with proper versioning
|
|
1083
|
+
try:
|
|
1084
|
+
logger.info(f"🔍 DEBUG: About to call _persist_state() for run {request.invocation_id}")
|
|
1085
|
+
await ctx._workflow_entity._persist_state()
|
|
1086
|
+
logger.info(f"✅ Successfully persisted WorkflowEntity state for run {request.invocation_id}")
|
|
1087
|
+
except Exception as persist_error:
|
|
1088
|
+
logger.error(f"❌ Failed to persist WorkflowEntity state (non-fatal): {persist_error}", exc_info=True)
|
|
1089
|
+
# Continue anyway - persistence failure shouldn't fail the workflow
|
|
1090
|
+
|
|
865
1091
|
logger.info(f"Workflow completed successfully with {len(step_events)} steps")
|
|
866
1092
|
|
|
867
1093
|
# Add session_id to metadata for multi-turn conversation support
|
|
868
1094
|
metadata["session_id"] = session_id
|
|
869
1095
|
|
|
1096
|
+
# CRITICAL: Flush all buffered checkpoints before returning response
|
|
1097
|
+
# This ensures checkpoints arrive at platform BEFORE run.completed event
|
|
1098
|
+
try:
|
|
1099
|
+
flushed_count = self._rust_worker.flush_workflow_checkpoints()
|
|
1100
|
+
if flushed_count > 0:
|
|
1101
|
+
logger.info(f"✅ Flushed {flushed_count} checkpoints before completion")
|
|
1102
|
+
except Exception as flush_error:
|
|
1103
|
+
logger.error(f"Failed to flush checkpoints: {flush_error}", exc_info=True)
|
|
1104
|
+
# Continue anyway - checkpoint flushing is best-effort
|
|
1105
|
+
|
|
870
1106
|
return PyExecuteComponentResponse(
|
|
871
1107
|
invocation_id=request.invocation_id,
|
|
872
1108
|
success=True,
|
|
@@ -877,11 +1113,13 @@ class Worker:
|
|
|
877
1113
|
is_chunk=False,
|
|
878
1114
|
done=True,
|
|
879
1115
|
chunk_index=0,
|
|
1116
|
+
attempt=getattr(request, 'attempt', 0),
|
|
880
1117
|
)
|
|
881
1118
|
|
|
882
1119
|
except WaitingForUserInputException as e:
|
|
883
|
-
# Workflow paused for user input
|
|
884
|
-
|
|
1120
|
+
# Workflow or agent paused for user input
|
|
1121
|
+
pause_type = "agent" if e.agent_context else "workflow"
|
|
1122
|
+
logger.info(f"⏸️ {pause_type.capitalize()} paused waiting for user input: {e.question}")
|
|
885
1123
|
|
|
886
1124
|
# Collect metadata for pause state
|
|
887
1125
|
# Note: All metadata values must be strings for Rust FFI
|
|
@@ -889,8 +1127,13 @@ class Worker:
|
|
|
889
1127
|
"status": "awaiting_user_input",
|
|
890
1128
|
"question": e.question,
|
|
891
1129
|
"input_type": e.input_type,
|
|
1130
|
+
"pause_type": pause_type, # NEW: Indicates workflow vs agent pause
|
|
892
1131
|
}
|
|
893
1132
|
|
|
1133
|
+
# CRITICAL: Propagate tenant_id even when pausing
|
|
1134
|
+
critical_metadata = self._extract_critical_metadata(request)
|
|
1135
|
+
pause_metadata.update(critical_metadata)
|
|
1136
|
+
|
|
894
1137
|
# Add optional fields only if they exist
|
|
895
1138
|
if e.options:
|
|
896
1139
|
pause_metadata["options"] = json.dumps(e.options)
|
|
@@ -899,6 +1142,14 @@ class Worker:
|
|
|
899
1142
|
if session_id:
|
|
900
1143
|
pause_metadata["session_id"] = session_id
|
|
901
1144
|
|
|
1145
|
+
# NEW: Store agent execution state if present
|
|
1146
|
+
if e.agent_context:
|
|
1147
|
+
pause_metadata["agent_context"] = json.dumps(e.agent_context)
|
|
1148
|
+
logger.debug(
|
|
1149
|
+
f"Agent '{e.agent_context['agent_name']}' paused at "
|
|
1150
|
+
f"iteration {e.agent_context['iteration']}"
|
|
1151
|
+
)
|
|
1152
|
+
|
|
902
1153
|
# Add step events to pause metadata for durability
|
|
903
1154
|
step_events = ctx._workflow_entity._step_events
|
|
904
1155
|
if step_events:
|
|
@@ -912,6 +1163,12 @@ class Worker:
|
|
|
912
1163
|
pause_metadata["workflow_state"] = json.dumps(state_snapshot)
|
|
913
1164
|
logger.debug(f"Paused workflow state snapshot: {state_snapshot}")
|
|
914
1165
|
|
|
1166
|
+
# AUDIT TRAIL: Also include state change history for paused workflows
|
|
1167
|
+
state_changes = ctx._workflow_entity._state_changes
|
|
1168
|
+
if state_changes:
|
|
1169
|
+
pause_metadata["state_changes"] = json.dumps(state_changes)
|
|
1170
|
+
logger.debug(f"Paused workflow has {len(state_changes)} state changes in history")
|
|
1171
|
+
|
|
915
1172
|
# Return "success" with awaiting_user_input metadata
|
|
916
1173
|
# The output contains the question details for the client
|
|
917
1174
|
output = {
|
|
@@ -931,22 +1188,45 @@ class Worker:
|
|
|
931
1188
|
is_chunk=False,
|
|
932
1189
|
done=True,
|
|
933
1190
|
chunk_index=0,
|
|
1191
|
+
attempt=getattr(request, 'attempt', 0),
|
|
934
1192
|
)
|
|
935
1193
|
|
|
936
1194
|
except Exception as e:
|
|
937
1195
|
# Include exception type for better error messages
|
|
938
1196
|
error_msg = f"{type(e).__name__}: {str(e)}"
|
|
1197
|
+
|
|
1198
|
+
# Capture full stack trace for telemetry
|
|
1199
|
+
import traceback
|
|
1200
|
+
stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
|
|
1201
|
+
|
|
1202
|
+
# Log with full traceback
|
|
939
1203
|
logger.error(f"Workflow execution failed: {error_msg}", exc_info=True)
|
|
1204
|
+
|
|
1205
|
+
# Store error metadata for observability
|
|
1206
|
+
metadata = {
|
|
1207
|
+
"error_type": type(e).__name__,
|
|
1208
|
+
"stack_trace": stack_trace,
|
|
1209
|
+
"error": True,
|
|
1210
|
+
}
|
|
1211
|
+
|
|
1212
|
+
# Extract critical metadata for journal correlation (if available)
|
|
1213
|
+
critical_metadata = self._extract_critical_metadata(request)
|
|
1214
|
+
metadata.update(critical_metadata)
|
|
1215
|
+
|
|
1216
|
+
# Normalize metadata for Rust FFI compatibility
|
|
1217
|
+
normalized_metadata = _normalize_metadata(metadata)
|
|
1218
|
+
|
|
940
1219
|
return PyExecuteComponentResponse(
|
|
941
1220
|
invocation_id=request.invocation_id,
|
|
942
1221
|
success=False,
|
|
943
1222
|
output_data=b"",
|
|
944
1223
|
state_update=None,
|
|
945
1224
|
error_message=error_msg,
|
|
946
|
-
metadata=
|
|
1225
|
+
metadata=normalized_metadata,
|
|
947
1226
|
is_chunk=False,
|
|
948
1227
|
done=True,
|
|
949
1228
|
chunk_index=0,
|
|
1229
|
+
attempt=getattr(request, 'attempt', 0),
|
|
950
1230
|
)
|
|
951
1231
|
|
|
952
1232
|
async def _execute_tool(self, tool, input_data: bytes, request):
|
|
@@ -965,6 +1245,10 @@ class Worker:
|
|
|
965
1245
|
runtime_context=request.runtime_context,
|
|
966
1246
|
)
|
|
967
1247
|
|
|
1248
|
+
# Set context in contextvar so get_current_context() and error handlers can access it
|
|
1249
|
+
from .context import set_current_context, _current_context
|
|
1250
|
+
token = set_current_context(ctx)
|
|
1251
|
+
|
|
968
1252
|
# Execute tool
|
|
969
1253
|
result = await tool.invoke(ctx, **input_dict)
|
|
970
1254
|
|
|
@@ -981,24 +1265,54 @@ class Worker:
|
|
|
981
1265
|
is_chunk=False,
|
|
982
1266
|
done=True,
|
|
983
1267
|
chunk_index=0,
|
|
1268
|
+
attempt=getattr(request, 'attempt', 0),
|
|
984
1269
|
)
|
|
985
1270
|
|
|
986
1271
|
except Exception as e:
|
|
987
1272
|
# Include exception type for better error messages
|
|
988
1273
|
error_msg = f"{type(e).__name__}: {str(e)}"
|
|
989
|
-
|
|
1274
|
+
|
|
1275
|
+
# Capture full stack trace for telemetry
|
|
1276
|
+
import traceback
|
|
1277
|
+
stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
|
|
1278
|
+
|
|
1279
|
+
# Log with full traceback using ctx.logger to ensure run_id correlation
|
|
1280
|
+
from .context import get_current_context
|
|
1281
|
+
current_ctx = get_current_context()
|
|
1282
|
+
error_logger = current_ctx.logger if current_ctx else logger
|
|
1283
|
+
error_logger.error(f"Tool execution failed: {error_msg}", exc_info=True)
|
|
1284
|
+
|
|
1285
|
+
# Store error metadata for observability
|
|
1286
|
+
metadata = {
|
|
1287
|
+
"error_type": type(e).__name__,
|
|
1288
|
+
"stack_trace": stack_trace,
|
|
1289
|
+
"error": True,
|
|
1290
|
+
}
|
|
1291
|
+
|
|
1292
|
+
# CRITICAL: Extract critical metadata (including tenant_id) for journal event correlation
|
|
1293
|
+
critical_metadata = self._extract_critical_metadata(request)
|
|
1294
|
+
metadata.update(critical_metadata)
|
|
1295
|
+
|
|
1296
|
+
# Normalize metadata for Rust FFI compatibility
|
|
1297
|
+
normalized_metadata = _normalize_metadata(metadata)
|
|
1298
|
+
|
|
990
1299
|
return PyExecuteComponentResponse(
|
|
991
1300
|
invocation_id=request.invocation_id,
|
|
992
1301
|
success=False,
|
|
993
1302
|
output_data=b"",
|
|
994
1303
|
state_update=None,
|
|
995
1304
|
error_message=error_msg,
|
|
996
|
-
metadata=
|
|
1305
|
+
metadata=normalized_metadata,
|
|
997
1306
|
is_chunk=False,
|
|
998
1307
|
done=True,
|
|
999
1308
|
chunk_index=0,
|
|
1309
|
+
attempt=getattr(request, 'attempt', 0),
|
|
1000
1310
|
)
|
|
1001
1311
|
|
|
1312
|
+
finally:
|
|
1313
|
+
# Always reset context to prevent leakage between executions
|
|
1314
|
+
_current_context.reset(token)
|
|
1315
|
+
|
|
1002
1316
|
async def _execute_entity(self, entity_type, input_data: bytes, request):
|
|
1003
1317
|
"""Execute an entity method."""
|
|
1004
1318
|
import json
|
|
@@ -1022,6 +1336,16 @@ class Worker:
|
|
|
1022
1336
|
if not method_name:
|
|
1023
1337
|
raise ValueError("Entity invocation requires 'method' parameter")
|
|
1024
1338
|
|
|
1339
|
+
# Create context for logging and tracing
|
|
1340
|
+
ctx = Context(
|
|
1341
|
+
run_id=f"{self.service_name}:{entity_type.name}:{entity_key}",
|
|
1342
|
+
runtime_context=request.runtime_context,
|
|
1343
|
+
)
|
|
1344
|
+
|
|
1345
|
+
# Set context in contextvar so get_current_context() and error handlers can access it
|
|
1346
|
+
from .context import set_current_context, _current_context
|
|
1347
|
+
token = set_current_context(ctx)
|
|
1348
|
+
|
|
1025
1349
|
# Note: State loading is now handled automatically by the entity method wrapper
|
|
1026
1350
|
# via EntityStateAdapter which uses the Rust core for cache + platform persistence
|
|
1027
1351
|
|
|
@@ -1042,7 +1366,9 @@ class Worker:
|
|
|
1042
1366
|
|
|
1043
1367
|
# Note: State persistence is now handled automatically by the entity method wrapper
|
|
1044
1368
|
# via EntityStateAdapter which uses Rust core for optimistic locking + version tracking
|
|
1045
|
-
|
|
1369
|
+
|
|
1370
|
+
# CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
|
|
1371
|
+
metadata = self._extract_critical_metadata(request)
|
|
1046
1372
|
|
|
1047
1373
|
return PyExecuteComponentResponse(
|
|
1048
1374
|
invocation_id=request.invocation_id,
|
|
@@ -1050,28 +1376,58 @@ class Worker:
|
|
|
1050
1376
|
output_data=output_data,
|
|
1051
1377
|
state_update=None, # TODO: Use structured StateUpdate object
|
|
1052
1378
|
error_message=None,
|
|
1053
|
-
metadata=metadata, # Include state in metadata for Worker Coordinator
|
|
1379
|
+
metadata=metadata if metadata else None, # Include state in metadata for Worker Coordinator
|
|
1054
1380
|
is_chunk=False,
|
|
1055
1381
|
done=True,
|
|
1056
1382
|
chunk_index=0,
|
|
1383
|
+
attempt=getattr(request, 'attempt', 0),
|
|
1057
1384
|
)
|
|
1058
1385
|
|
|
1059
1386
|
except Exception as e:
|
|
1060
1387
|
# Include exception type for better error messages
|
|
1061
1388
|
error_msg = f"{type(e).__name__}: {str(e)}"
|
|
1062
|
-
|
|
1389
|
+
|
|
1390
|
+
# Capture full stack trace for telemetry
|
|
1391
|
+
import traceback
|
|
1392
|
+
stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
|
|
1393
|
+
|
|
1394
|
+
# Log with full traceback using ctx.logger to ensure run_id correlation
|
|
1395
|
+
from .context import get_current_context
|
|
1396
|
+
current_ctx = get_current_context()
|
|
1397
|
+
error_logger = current_ctx.logger if current_ctx else logger
|
|
1398
|
+
error_logger.error(f"Entity execution failed: {error_msg}", exc_info=True)
|
|
1399
|
+
|
|
1400
|
+
# Store error metadata for observability
|
|
1401
|
+
metadata = {
|
|
1402
|
+
"error_type": type(e).__name__,
|
|
1403
|
+
"stack_trace": stack_trace,
|
|
1404
|
+
"error": True,
|
|
1405
|
+
}
|
|
1406
|
+
|
|
1407
|
+
# Extract critical metadata for journal correlation (if available)
|
|
1408
|
+
critical_metadata = self._extract_critical_metadata(request)
|
|
1409
|
+
metadata.update(critical_metadata)
|
|
1410
|
+
|
|
1411
|
+
# Normalize metadata for Rust FFI compatibility
|
|
1412
|
+
normalized_metadata = _normalize_metadata(metadata)
|
|
1413
|
+
|
|
1063
1414
|
return PyExecuteComponentResponse(
|
|
1064
1415
|
invocation_id=request.invocation_id,
|
|
1065
1416
|
success=False,
|
|
1066
1417
|
output_data=b"",
|
|
1067
1418
|
state_update=None,
|
|
1068
1419
|
error_message=error_msg,
|
|
1069
|
-
metadata=
|
|
1420
|
+
metadata=normalized_metadata,
|
|
1070
1421
|
is_chunk=False,
|
|
1071
1422
|
done=True,
|
|
1072
1423
|
chunk_index=0,
|
|
1424
|
+
attempt=getattr(request, 'attempt', 0),
|
|
1073
1425
|
)
|
|
1074
1426
|
|
|
1427
|
+
finally:
|
|
1428
|
+
# Always reset context to prevent leakage between executions
|
|
1429
|
+
_current_context.reset(token)
|
|
1430
|
+
|
|
1075
1431
|
async def _execute_agent(self, agent, input_data: bytes, request):
|
|
1076
1432
|
"""Execute an agent with session support for multi-turn conversations."""
|
|
1077
1433
|
import json
|
|
@@ -1112,6 +1468,10 @@ class Worker:
|
|
|
1112
1468
|
runtime_context=request.runtime_context,
|
|
1113
1469
|
)
|
|
1114
1470
|
|
|
1471
|
+
# Set context in contextvar so get_current_context() and error handlers can access it
|
|
1472
|
+
from .context import set_current_context, _current_context
|
|
1473
|
+
token = set_current_context(ctx)
|
|
1474
|
+
|
|
1115
1475
|
# Execute agent - conversation history is automatically included
|
|
1116
1476
|
agent_result = await agent.run(user_message, context=ctx)
|
|
1117
1477
|
|
|
@@ -1124,8 +1484,10 @@ class Worker:
|
|
|
1124
1484
|
# Serialize result
|
|
1125
1485
|
output_data = json.dumps(result).encode("utf-8")
|
|
1126
1486
|
|
|
1127
|
-
#
|
|
1128
|
-
metadata =
|
|
1487
|
+
# CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
|
|
1488
|
+
metadata = self._extract_critical_metadata(request)
|
|
1489
|
+
# Also include session_id for UI to persist conversation
|
|
1490
|
+
metadata["session_id"] = session_id
|
|
1129
1491
|
|
|
1130
1492
|
return PyExecuteComponentResponse(
|
|
1131
1493
|
invocation_id=request.invocation_id,
|
|
@@ -1133,28 +1495,58 @@ class Worker:
|
|
|
1133
1495
|
output_data=output_data,
|
|
1134
1496
|
state_update=None,
|
|
1135
1497
|
error_message=None,
|
|
1136
|
-
metadata=metadata,
|
|
1498
|
+
metadata=metadata if metadata else None,
|
|
1137
1499
|
is_chunk=False,
|
|
1138
1500
|
done=True,
|
|
1139
1501
|
chunk_index=0,
|
|
1502
|
+
attempt=getattr(request, 'attempt', 0),
|
|
1140
1503
|
)
|
|
1141
1504
|
|
|
1142
1505
|
except Exception as e:
|
|
1143
1506
|
# Include exception type for better error messages
|
|
1144
1507
|
error_msg = f"{type(e).__name__}: {str(e)}"
|
|
1145
|
-
|
|
1508
|
+
|
|
1509
|
+
# Capture full stack trace for telemetry
|
|
1510
|
+
import traceback
|
|
1511
|
+
stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
|
|
1512
|
+
|
|
1513
|
+
# Log with full traceback using ctx.logger to ensure run_id correlation
|
|
1514
|
+
from .context import get_current_context
|
|
1515
|
+
current_ctx = get_current_context()
|
|
1516
|
+
error_logger = current_ctx.logger if current_ctx else logger
|
|
1517
|
+
error_logger.error(f"Agent execution failed: {error_msg}", exc_info=True)
|
|
1518
|
+
|
|
1519
|
+
# Store error metadata for observability
|
|
1520
|
+
metadata = {
|
|
1521
|
+
"error_type": type(e).__name__,
|
|
1522
|
+
"stack_trace": stack_trace,
|
|
1523
|
+
"error": True,
|
|
1524
|
+
}
|
|
1525
|
+
|
|
1526
|
+
# Extract critical metadata for journal correlation (if available)
|
|
1527
|
+
critical_metadata = self._extract_critical_metadata(request)
|
|
1528
|
+
metadata.update(critical_metadata)
|
|
1529
|
+
|
|
1530
|
+
# Normalize metadata for Rust FFI compatibility
|
|
1531
|
+
normalized_metadata = _normalize_metadata(metadata)
|
|
1532
|
+
|
|
1146
1533
|
return PyExecuteComponentResponse(
|
|
1147
1534
|
invocation_id=request.invocation_id,
|
|
1148
1535
|
success=False,
|
|
1149
1536
|
output_data=b"",
|
|
1150
1537
|
state_update=None,
|
|
1151
1538
|
error_message=error_msg,
|
|
1152
|
-
metadata=
|
|
1539
|
+
metadata=normalized_metadata,
|
|
1153
1540
|
is_chunk=False,
|
|
1154
1541
|
done=True,
|
|
1155
1542
|
chunk_index=0,
|
|
1543
|
+
attempt=getattr(request, 'attempt', 0),
|
|
1156
1544
|
)
|
|
1157
1545
|
|
|
1546
|
+
finally:
|
|
1547
|
+
# Always reset context to prevent leakage between executions
|
|
1548
|
+
_current_context.reset(token)
|
|
1549
|
+
|
|
1158
1550
|
def _create_error_response(self, request, error_message: str):
|
|
1159
1551
|
"""Create an error response."""
|
|
1160
1552
|
from ._core import PyExecuteComponentResponse
|
|
@@ -1169,6 +1561,7 @@ class Worker:
|
|
|
1169
1561
|
is_chunk=False,
|
|
1170
1562
|
done=True,
|
|
1171
1563
|
chunk_index=0,
|
|
1564
|
+
attempt=getattr(request, 'attempt', 0),
|
|
1172
1565
|
)
|
|
1173
1566
|
|
|
1174
1567
|
async def run(self):
|