agnt5-0.2.8a6-cp310-abi3-macosx_11_0_arm64.whl → agnt5-0.2.8a8-cp310-abi3-macosx_11_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.



agnt5/worker.py CHANGED
@@ -14,6 +14,38 @@ from ._telemetry import setup_module_logger

  logger = setup_module_logger(__name__)

+
+ def _normalize_metadata(metadata: Dict[str, Any]) -> Dict[str, str]:
+ """
+ Convert metadata dictionary to Dict[str, str] for Rust FFI compatibility.
+
+ PyO3 requires HashMap<String, String>, but Python code may include booleans,
+ integers, or other types. This helper ensures all values are strings.
+
+ Args:
+ metadata: Dictionary with potentially mixed types
+
+ Returns:
+ Dictionary with all string values
+
+ Example:
+ >>> _normalize_metadata({"error": True, "count": 42, "msg": "hello"})
+ {"error": "true", "count": "42", "msg": "hello"}
+ """
+ normalized = {}
+ for key, value in metadata.items():
+ if isinstance(value, str):
+ normalized[key] = value
+ elif isinstance(value, bool):
+ # Convert bool to lowercase string for JSON compatibility
+ normalized[key] = str(value).lower()
+ elif value is None:
+ normalized[key] = ""
+ else:
+ # Convert any other type to string representation
+ normalized[key] = str(value)
+ return normalized
+
  # Context variable to store trace metadata for propagation to LM calls
  # This allows Rust LM layer to access traceparent without explicit parameter passing
  _trace_metadata: contextvars.ContextVar[Dict[str, str]] = contextvars.ContextVar(
@@ -455,11 +487,22 @@ class Worker:
  output_schema_str = json.dumps(config.output_schema) if config.output_schema else None
  metadata = config.metadata if config.metadata else {}

+ # Serialize retry and backoff policies
+ config_dict = {}
+ if config.retries:
+ config_dict["max_attempts"] = str(config.retries.max_attempts)
+ config_dict["initial_interval_ms"] = str(config.retries.initial_interval_ms)
+ config_dict["max_interval_ms"] = str(config.retries.max_interval_ms)
+
+ if config.backoff:
+ config_dict["backoff_type"] = config.backoff.type.value
+ config_dict["backoff_multiplier"] = str(config.backoff.multiplier)
+
  component_info = self._PyComponentInfo(
  name=config.name,
  component_type="function",
  metadata=metadata,
- config={},
+ config=config_dict,
  input_schema=input_schema_str,
  output_schema=output_schema_str,
  definition=None,
@@ -627,6 +670,30 @@ class Worker:

  return handle_message

+ def _extract_critical_metadata(self, request) -> Dict[str, str]:
+ """
+ Extract critical metadata from request that MUST be propagated to response.
+
+ This ensures journal events are written to the correct tenant partition
+ and can be properly replayed. Missing tenant_id causes catastrophic
+ event sourcing corruption where events are split across partitions.
+
+ Returns:
+ Dict[str, str]: Metadata with all values normalized to strings for Rust FFI
+ """
+ metadata = {}
+ if hasattr(request, 'metadata') and request.metadata:
+ # CRITICAL: Propagate tenant_id to prevent journal corruption
+ # Convert to string immediately to ensure Rust FFI compatibility
+ if "tenant_id" in request.metadata:
+ metadata["tenant_id"] = str(request.metadata["tenant_id"])
+ if "deployment_id" in request.metadata:
+ metadata["deployment_id"] = str(request.metadata["deployment_id"])
+
+ # CRITICAL: Normalize all metadata values to strings for Rust FFI (PyO3)
+ # PyO3 expects HashMap<String, String> and will fail with bool/int values
+ return _normalize_metadata(metadata)
+
  async def _execute_function(self, config, input_data: bytes, request):
  """Execute a function handler (supports both regular and streaming functions)."""
  import json
@@ -647,17 +714,33 @@ class Worker:
  _trace_metadata.set(dict(request.metadata))
  logger.debug(f"Trace metadata stored: traceparent={request.metadata.get('traceparent', 'N/A')}")

- # Create context with runtime_context for trace correlation
- ctx = Context(
+ # Extract attempt number from platform request (if provided)
+ platform_attempt = getattr(request, 'attempt', 0)
+
+ # Create FunctionContext with attempt number for retry tracking
+ # - If platform_attempt > 0: Platform is orchestrating retries
+ # - If platform_attempt == 0: First attempt (or no retry config)
+ from .function import FunctionContext
+ ctx = FunctionContext(
  run_id=f"{self.service_name}:{config.name}",
+ attempt=platform_attempt,
  runtime_context=request.runtime_context,
+ retry_policy=config.retries,
  )

+ # Set context in contextvar so get_current_context() and error handlers can access it
+ from .context import set_current_context, _current_context
+ token = set_current_context(ctx)
+
  # Execute function directly - Rust bridge handles tracing
  # Note: Removed Python-level span creation to avoid duplicate spans.
  # The Rust worker bridge (sdk-python/rust-src/worker.rs:413-659) already
  # creates a comprehensive OpenTelemetry span with all necessary attributes.
  # See DUPLICATE_SPANS_FIX.md for details.
+ #
+ # Note on retry handling:
+ # - If platform_attempt > 0: Platform is orchestrating retries, execute once
+ # - If platform_attempt == 0: Local retry loop in decorator wrapper handles retries
  if input_dict:
  result = config.handler(ctx, **input_dict)
  else:
@@ -688,6 +771,7 @@ class Worker:
  is_chunk=True,
  done=False,
  chunk_index=chunk_index,
+ attempt=platform_attempt,
  ))
  chunk_index += 1

@@ -702,6 +786,7 @@ class Worker:
  is_chunk=True,
  done=True,
  chunk_index=chunk_index,
+ attempt=platform_attempt,
  ))

  logger.debug(f"Streaming function produced {len(responses)} chunks")
@@ -714,34 +799,69 @@ class Worker:
  # Serialize result
  output_data = json.dumps(result).encode("utf-8")

+ # Extract critical metadata for journal event correlation
+ response_metadata = self._extract_critical_metadata(request)
+
  return PyExecuteComponentResponse(
  invocation_id=request.invocation_id,
  success=True,
  output_data=output_data,
  state_update=None,
  error_message=None,
- metadata=None,
+ metadata=response_metadata if response_metadata else None,
  is_chunk=False,
  done=True,
  chunk_index=0,
+ attempt=platform_attempt,
  )

  except Exception as e:
  # Include exception type for better error messages
  error_msg = f"{type(e).__name__}: {str(e)}"
- logger.error(f"Function execution failed: {error_msg}", exc_info=True)
+
+ # Capture full stack trace for telemetry
+ import traceback
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+
+ # Log with full traceback using ctx.logger to ensure run_id correlation
+ from .context import get_current_context
+ current_ctx = get_current_context()
+ error_logger = current_ctx.logger if current_ctx else logger
+ error_logger.error(f"Function execution failed: {error_msg}", exc_info=True)
+
+ # Store stack trace in metadata for observability
+ metadata = {
+ "error_type": type(e).__name__,
+ "stack_trace": stack_trace,
+ "error": True, # Boolean flag for error detection
+ }
+
+ # CRITICAL: Extract critical metadata (including tenant_id) for journal event correlation
+ # This ensures run.failed events are properly emitted by Worker Coordinator
+ critical_metadata = self._extract_critical_metadata(request)
+ metadata.update(critical_metadata)
+
+ # CRITICAL: Normalize metadata to ensure all values are strings (Rust FFI requirement)
+ # PyO3 expects HashMap<String, String>, but we may have booleans or other types
+ normalized_metadata = _normalize_metadata(metadata)
+
  return PyExecuteComponentResponse(
  invocation_id=request.invocation_id,
  success=False,
  output_data=b"",
  state_update=None,
  error_message=error_msg,
- metadata=None,
+ metadata=normalized_metadata,
  is_chunk=False,
  done=True,
  chunk_index=0,
+ attempt=getattr(request, 'attempt', 0),
  )

+ finally:
+ # Always reset context to prevent leakage between executions
+ _current_context.reset(token)
+
  async def _execute_workflow(self, config, input_data: bytes, request):
  """Execute a workflow handler with automatic replay support."""
  import json
@@ -798,8 +918,35 @@ class Worker:
  user_response = request.metadata["user_response"]
  logger.info(f"▶️ Resuming workflow with user response: {user_response}")

- # Create WorkflowEntity for state management
- workflow_entity = WorkflowEntity(run_id=f"{self.service_name}:{config.name}")
+ # NEW: Check for agent resume (agent-level HITL)
+ agent_context = None
+ if hasattr(request, 'metadata') and request.metadata:
+ if "agent_context" in request.metadata:
+ agent_context_json = request.metadata["agent_context"]
+ try:
+ agent_context = json.loads(agent_context_json)
+ agent_name = agent_context.get("agent_name", "unknown")
+ iteration = agent_context.get("iteration", 0)
+ logger.info(
+ f"▶️ Resuming agent '{agent_name}' from iteration {iteration} "
+ f"with user response: {user_response}"
+ )
+ except json.JSONDecodeError:
+ logger.warning("Failed to parse agent_context from metadata")
+ agent_context = None
+
+ # Extract session_id and user_id from request for memory scoping
+ # Do this FIRST so we can pass to WorkflowEntity constructor
+ session_id = request.session_id if hasattr(request, 'session_id') and request.session_id else request.invocation_id
+ user_id = request.user_id if hasattr(request, 'user_id') and request.user_id else None
+
+ # Create WorkflowEntity for state management with memory scoping
+ # Entity key will be scoped based on priority: user_id > session_id > run_id
+ workflow_entity = WorkflowEntity(
+ run_id=request.invocation_id,
+ session_id=session_id,
+ user_id=user_id,
+ )

  # Load replay data into entity if provided
  if completed_steps:
@@ -822,21 +969,75 @@ class Worker:
  # Production mode - state is managed by Rust core
  logger.debug(f"Initial state will be loaded from platform (production mode)")

- # Create WorkflowContext with entity and runtime_context for trace correlation
+ # Create checkpoint callback for real-time streaming
+ def checkpoint_callback(checkpoint: dict) -> None:
+ """Send checkpoint to Rust worker queue."""
+ try:
+ # Extract critical metadata for checkpoint routing
+ metadata = self._extract_critical_metadata(request)
+
+ # DEBUG: Log metadata types for troubleshooting PyO3 conversion errors
+ logger.debug(f"Checkpoint metadata types: {[(k, type(v).__name__) for k, v in metadata.items()]}")
+
+ # Queue checkpoint via Rust FFI
+ self._rust_worker.queue_workflow_checkpoint(
+ invocation_id=request.invocation_id,
+ checkpoint_type=checkpoint["checkpoint_type"],
+ checkpoint_data=json.dumps(checkpoint["checkpoint_data"]),
+ sequence_number=checkpoint["sequence_number"],
+ metadata=metadata,
+ )
+ logger.debug(
+ f"Queued checkpoint: type={checkpoint['checkpoint_type']} "
+ f"seq={checkpoint['sequence_number']}"
+ )
+ except Exception as e:
+ logger.error(f"Failed to queue checkpoint: {e}", exc_info=True)
+ logger.error(f"Checkpoint metadata causing error: {metadata}")
+ logger.error(f"Checkpoint data: {checkpoint}")
+
+ # Create WorkflowContext with entity, runtime_context, and checkpoint callback
  ctx = WorkflowContext(
  workflow_entity=workflow_entity,
- run_id=f"{self.service_name}:{config.name}",
+ run_id=request.invocation_id, # Use unique invocation_id for this execution
+ session_id=session_id, # Session for multi-turn conversations
+ user_id=user_id, # User for long-term memory
  runtime_context=request.runtime_context,
+ checkpoint_callback=checkpoint_callback,
  )

+ # NEW: Populate agent resume info if this is an agent HITL resume
+ if agent_context and user_response:
+ ctx._agent_resume_info = {
+ "agent_name": agent_context["agent_name"],
+ "agent_context": agent_context,
+ "user_response": user_response,
+ }
+ logger.debug(
+ f"Set agent resume info for '{agent_context['agent_name']}' "
+ f"in workflow context"
+ )
+
  # Execute workflow directly - Rust bridge handles tracing
  # Note: Removed Python-level span creation to avoid duplicate spans.
  # The Rust worker bridge creates comprehensive OpenTelemetry spans.
  # See DUPLICATE_SPANS_FIX.md for details.
- if input_dict:
- result = await config.handler(ctx, **input_dict)
- else:
- result = await config.handler(ctx)
+
+ # CRITICAL: Set context in contextvar so LM/Agent/Tool calls can access it
+ from .context import set_current_context
+ token = set_current_context(ctx)
+ try:
+ if input_dict:
+ result = await config.handler(ctx, **input_dict)
+ else:
+ result = await config.handler(ctx)
+
+ # Note: Workflow entity persistence is handled by the @workflow decorator wrapper
+ # which persists before returning. No need to persist here.
+ finally:
+ # Always reset context to prevent leakage
+ from .context import _current_context
+ _current_context.reset(token)

  # Note: Removed flush_telemetry_py() call here - it was causing 2-second blocking delay!
  # The batch span processor handles flushing automatically with 5s timeout
@@ -847,6 +1048,11 @@ class Worker:
  # Collect workflow execution metadata for durability
  metadata = {}

+ # CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
+ # Missing tenant_id causes events to be written to wrong partition
+ critical_metadata = self._extract_critical_metadata(request)
+ metadata.update(critical_metadata)
+
  # Add step events to metadata (for workflow durability)
  # Access _step_events from the workflow entity, not the context
  step_events = ctx._workflow_entity._step_events
@@ -862,11 +1068,41 @@ class Worker:
  metadata["workflow_state"] = json.dumps(state_snapshot)
  logger.debug(f"Workflow state snapshot: {state_snapshot}")

+ # AUDIT TRAIL: Serialize complete state change history for replay and debugging
+ # This captures all intermediate state mutations, not just final snapshot
+ state_changes = ctx._workflow_entity._state_changes
+ logger.info(f"🔍 DEBUG: _state_changes list has {len(state_changes)} entries")
+ if state_changes:
+ metadata["state_changes"] = json.dumps(state_changes)
+ logger.info(f"✅ Serialized {len(state_changes)} state changes to metadata")
+ else:
+ logger.warning("⚠️ _state_changes list is empty - no state change history captured")
+
+ # CRITICAL: Persist workflow entity state to platform
+ # This stores the WorkflowEntity as a first-class entity with proper versioning
+ try:
+ logger.info(f"🔍 DEBUG: About to call _persist_state() for run {request.invocation_id}")
+ await ctx._workflow_entity._persist_state()
+ logger.info(f"✅ Successfully persisted WorkflowEntity state for run {request.invocation_id}")
+ except Exception as persist_error:
+ logger.error(f"❌ Failed to persist WorkflowEntity state (non-fatal): {persist_error}", exc_info=True)
+ # Continue anyway - persistence failure shouldn't fail the workflow
+
  logger.info(f"Workflow completed successfully with {len(step_events)} steps")

  # Add session_id to metadata for multi-turn conversation support
  metadata["session_id"] = session_id

+ # CRITICAL: Flush all buffered checkpoints before returning response
+ # This ensures checkpoints arrive at platform BEFORE run.completed event
+ try:
+ flushed_count = self._rust_worker.flush_workflow_checkpoints()
+ if flushed_count > 0:
+ logger.info(f"✅ Flushed {flushed_count} checkpoints before completion")
+ except Exception as flush_error:
+ logger.error(f"Failed to flush checkpoints: {flush_error}", exc_info=True)
+ # Continue anyway - checkpoint flushing is best-effort
+
  return PyExecuteComponentResponse(
  invocation_id=request.invocation_id,
  success=True,
@@ -877,11 +1113,13 @@ class Worker:
  is_chunk=False,
  done=True,
  chunk_index=0,
+ attempt=getattr(request, 'attempt', 0),
  )

  except WaitingForUserInputException as e:
- # Workflow paused for user input
- logger.info(f"⏸️ Workflow paused waiting for user input: {e.question}")
+ # Workflow or agent paused for user input
+ pause_type = "agent" if e.agent_context else "workflow"
+ logger.info(f"⏸️ {pause_type.capitalize()} paused waiting for user input: {e.question}")

  # Collect metadata for pause state
  # Note: All metadata values must be strings for Rust FFI
@@ -889,8 +1127,13 @@ class Worker:
  "status": "awaiting_user_input",
  "question": e.question,
  "input_type": e.input_type,
+ "pause_type": pause_type, # NEW: Indicates workflow vs agent pause
  }

+ # CRITICAL: Propagate tenant_id even when pausing
+ critical_metadata = self._extract_critical_metadata(request)
+ pause_metadata.update(critical_metadata)
+
  # Add optional fields only if they exist
  if e.options:
  pause_metadata["options"] = json.dumps(e.options)
@@ -899,6 +1142,14 @@ class Worker:
  if session_id:
  pause_metadata["session_id"] = session_id

+ # NEW: Store agent execution state if present
+ if e.agent_context:
+ pause_metadata["agent_context"] = json.dumps(e.agent_context)
+ logger.debug(
+ f"Agent '{e.agent_context['agent_name']}' paused at "
+ f"iteration {e.agent_context['iteration']}"
+ )
+
  # Add step events to pause metadata for durability
  step_events = ctx._workflow_entity._step_events
  if step_events:
@@ -912,6 +1163,12 @@ class Worker:
  pause_metadata["workflow_state"] = json.dumps(state_snapshot)
  logger.debug(f"Paused workflow state snapshot: {state_snapshot}")

+ # AUDIT TRAIL: Also include state change history for paused workflows
+ state_changes = ctx._workflow_entity._state_changes
+ if state_changes:
+ pause_metadata["state_changes"] = json.dumps(state_changes)
+ logger.debug(f"Paused workflow has {len(state_changes)} state changes in history")
+
  # Return "success" with awaiting_user_input metadata
  # The output contains the question details for the client
  output = {
@@ -931,22 +1188,45 @@ class Worker:
  is_chunk=False,
  done=True,
  chunk_index=0,
+ attempt=getattr(request, 'attempt', 0),
  )

  except Exception as e:
  # Include exception type for better error messages
  error_msg = f"{type(e).__name__}: {str(e)}"
+
+ # Capture full stack trace for telemetry
+ import traceback
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+
+ # Log with full traceback
  logger.error(f"Workflow execution failed: {error_msg}", exc_info=True)
+
+ # Store error metadata for observability
+ metadata = {
+ "error_type": type(e).__name__,
+ "stack_trace": stack_trace,
+ "error": True,
+ }
+
+ # Extract critical metadata for journal correlation (if available)
+ critical_metadata = self._extract_critical_metadata(request)
+ metadata.update(critical_metadata)
+
+ # Normalize metadata for Rust FFI compatibility
+ normalized_metadata = _normalize_metadata(metadata)
+
  return PyExecuteComponentResponse(
  invocation_id=request.invocation_id,
  success=False,
  output_data=b"",
  state_update=None,
  error_message=error_msg,
- metadata=None,
+ metadata=normalized_metadata,
  is_chunk=False,
  done=True,
  chunk_index=0,
+ attempt=getattr(request, 'attempt', 0),
  )

  async def _execute_tool(self, tool, input_data: bytes, request):
@@ -965,6 +1245,10 @@ class Worker:
  runtime_context=request.runtime_context,
  )

+ # Set context in contextvar so get_current_context() and error handlers can access it
+ from .context import set_current_context, _current_context
+ token = set_current_context(ctx)
+
  # Execute tool
  result = await tool.invoke(ctx, **input_dict)

@@ -981,24 +1265,54 @@ class Worker:
  is_chunk=False,
  done=True,
  chunk_index=0,
+ attempt=getattr(request, 'attempt', 0),
  )

  except Exception as e:
  # Include exception type for better error messages
  error_msg = f"{type(e).__name__}: {str(e)}"
- logger.error(f"Tool execution failed: {error_msg}", exc_info=True)
+
+ # Capture full stack trace for telemetry
+ import traceback
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+
+ # Log with full traceback using ctx.logger to ensure run_id correlation
+ from .context import get_current_context
+ current_ctx = get_current_context()
+ error_logger = current_ctx.logger if current_ctx else logger
+ error_logger.error(f"Tool execution failed: {error_msg}", exc_info=True)
+
+ # Store error metadata for observability
+ metadata = {
+ "error_type": type(e).__name__,
+ "stack_trace": stack_trace,
+ "error": True,
+ }
+
+ # CRITICAL: Extract critical metadata (including tenant_id) for journal event correlation
+ critical_metadata = self._extract_critical_metadata(request)
+ metadata.update(critical_metadata)
+
+ # Normalize metadata for Rust FFI compatibility
+ normalized_metadata = _normalize_metadata(metadata)
+
  return PyExecuteComponentResponse(
  invocation_id=request.invocation_id,
  success=False,
  output_data=b"",
  state_update=None,
  error_message=error_msg,
- metadata=None,
+ metadata=normalized_metadata,
  is_chunk=False,
  done=True,
  chunk_index=0,
+ attempt=getattr(request, 'attempt', 0),
  )

+ finally:
+ # Always reset context to prevent leakage between executions
+ _current_context.reset(token)
+
  async def _execute_entity(self, entity_type, input_data: bytes, request):
  """Execute an entity method."""
  import json
@@ -1022,6 +1336,16 @@ class Worker:
  if not method_name:
  raise ValueError("Entity invocation requires 'method' parameter")

+ # Create context for logging and tracing
+ ctx = Context(
+ run_id=f"{self.service_name}:{entity_type.name}:{entity_key}",
+ runtime_context=request.runtime_context,
+ )
+
+ # Set context in contextvar so get_current_context() and error handlers can access it
+ from .context import set_current_context, _current_context
+ token = set_current_context(ctx)
+
  # Note: State loading is now handled automatically by the entity method wrapper
  # via EntityStateAdapter which uses the Rust core for cache + platform persistence

@@ -1042,7 +1366,9 @@ class Worker:

  # Note: State persistence is now handled automatically by the entity method wrapper
  # via EntityStateAdapter which uses Rust core for optimistic locking + version tracking
- metadata = {}
+
+ # CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
+ metadata = self._extract_critical_metadata(request)

  return PyExecuteComponentResponse(
  invocation_id=request.invocation_id,
@@ -1050,28 +1376,58 @@ class Worker:
  output_data=output_data,
  state_update=None, # TODO: Use structured StateUpdate object
  error_message=None,
- metadata=metadata, # Include state in metadata for Worker Coordinator
+ metadata=metadata if metadata else None, # Include state in metadata for Worker Coordinator
  is_chunk=False,
  done=True,
  chunk_index=0,
+ attempt=getattr(request, 'attempt', 0),
  )

  except Exception as e:
  # Include exception type for better error messages
  error_msg = f"{type(e).__name__}: {str(e)}"
- logger.error(f"Entity execution failed: {error_msg}", exc_info=True)
+
+ # Capture full stack trace for telemetry
+ import traceback
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+
+ # Log with full traceback using ctx.logger to ensure run_id correlation
+ from .context import get_current_context
+ current_ctx = get_current_context()
+ error_logger = current_ctx.logger if current_ctx else logger
+ error_logger.error(f"Entity execution failed: {error_msg}", exc_info=True)
+
+ # Store error metadata for observability
+ metadata = {
+ "error_type": type(e).__name__,
+ "stack_trace": stack_trace,
+ "error": True,
+ }
+
+ # Extract critical metadata for journal correlation (if available)
+ critical_metadata = self._extract_critical_metadata(request)
+ metadata.update(critical_metadata)
+
+ # Normalize metadata for Rust FFI compatibility
+ normalized_metadata = _normalize_metadata(metadata)
+
  return PyExecuteComponentResponse(
  invocation_id=request.invocation_id,
  success=False,
  output_data=b"",
  state_update=None,
  error_message=error_msg,
- metadata=None,
+ metadata=normalized_metadata,
  is_chunk=False,
  done=True,
  chunk_index=0,
+ attempt=getattr(request, 'attempt', 0),
  )

+ finally:
+ # Always reset context to prevent leakage between executions
+ _current_context.reset(token)
+
  async def _execute_agent(self, agent, input_data: bytes, request):
  """Execute an agent with session support for multi-turn conversations."""
  import json
@@ -1112,6 +1468,10 @@ class Worker:
  runtime_context=request.runtime_context,
  )

+ # Set context in contextvar so get_current_context() and error handlers can access it
+ from .context import set_current_context, _current_context
+ token = set_current_context(ctx)
+
  # Execute agent - conversation history is automatically included
  agent_result = await agent.run(user_message, context=ctx)

@@ -1124,8 +1484,10 @@ class Worker:
  # Serialize result
  output_data = json.dumps(result).encode("utf-8")

- # Return session_id in metadata so UI can persist it
- metadata = {"session_id": session_id}
+ # CRITICAL: Propagate tenant_id and deployment_id to prevent journal corruption
+ metadata = self._extract_critical_metadata(request)
+ # Also include session_id for UI to persist conversation
+ metadata["session_id"] = session_id

  return PyExecuteComponentResponse(
  invocation_id=request.invocation_id,
@@ -1133,28 +1495,58 @@ class Worker:
  output_data=output_data,
  state_update=None,
  error_message=None,
- metadata=metadata,
+ metadata=metadata if metadata else None,
  is_chunk=False,
  done=True,
  chunk_index=0,
+ attempt=getattr(request, 'attempt', 0),
  )

  except Exception as e:
  # Include exception type for better error messages
  error_msg = f"{type(e).__name__}: {str(e)}"
- logger.error(f"Agent execution failed: {error_msg}", exc_info=True)
+
+ # Capture full stack trace for telemetry
+ import traceback
+ stack_trace = ''.join(traceback.format_exception(type(e), e, e.__traceback__))
+
+ # Log with full traceback using ctx.logger to ensure run_id correlation
+ from .context import get_current_context
+ current_ctx = get_current_context()
+ error_logger = current_ctx.logger if current_ctx else logger
+ error_logger.error(f"Agent execution failed: {error_msg}", exc_info=True)
+
+ # Store error metadata for observability
+ metadata = {
+ "error_type": type(e).__name__,
+ "stack_trace": stack_trace,
+ "error": True,
+ }
+
+ # Extract critical metadata for journal correlation (if available)
+ critical_metadata = self._extract_critical_metadata(request)
+ metadata.update(critical_metadata)
+
+ # Normalize metadata for Rust FFI compatibility
+ normalized_metadata = _normalize_metadata(metadata)
+
  return PyExecuteComponentResponse(
  invocation_id=request.invocation_id,
  success=False,
  output_data=b"",
  state_update=None,
  error_message=error_msg,
- metadata=None,
+ metadata=normalized_metadata,
  is_chunk=False,
  done=True,
  chunk_index=0,
+ attempt=getattr(request, 'attempt', 0),
  )

+ finally:
+ # Always reset context to prevent leakage between executions
+ _current_context.reset(token)
+
  def _create_error_response(self, request, error_message: str):
  """Create an error response."""
  from ._core import PyExecuteComponentResponse
@@ -1169,6 +1561,7 @@ class Worker:
  is_chunk=False,
  done=True,
  chunk_index=0,
+ attempt=getattr(request, 'attempt', 0),
  )

  async def run(self):
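
For reference, a minimal sketch of how the newly added _normalize_metadata helper behaves before metadata crosses the PyO3 boundary, based on the docstring and implementation shown above. Illustrative only; it assumes the private helper remains importable from agnt5.worker in 0.2.8a8 and is not a supported public API.

    # Assumes agnt5 0.2.8a8 is installed; _normalize_metadata is a private module-level helper.
    from agnt5.worker import _normalize_metadata

    mixed = {"error": True, "count": 42, "note": None, "msg": "hello"}
    normalized = _normalize_metadata(mixed)

    # Booleans become lowercase strings, None becomes "", other values go through str():
    # {"error": "true", "count": "42", "note": "", "msg": "hello"}
    assert all(isinstance(v, str) for v in normalized.values())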