by-framework 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. by_framework/__init__.py +2 -0
  2. by_framework/client/__init__.py +2 -1
  3. by_framework/client/client.py +410 -5
  4. by_framework/common/constants.py +30 -0
  5. by_framework/common/logger.py +71 -10
  6. by_framework/observability/__init__.py +62 -0
  7. by_framework/observability/dashboard.py +1145 -0
  8. by_framework/observability/external_trace.py +148 -0
  9. by_framework/observability/frontend/index.html +12 -0
  10. by_framework/observability/frontend/package-lock.json +1696 -0
  11. by_framework/observability/frontend/package.json +18 -0
  12. by_framework/observability/frontend/src/main.jsx +1351 -0
  13. by_framework/observability/frontend/src/styles.css +1214 -0
  14. by_framework/observability/frontend/vite.config.js +18 -0
  15. by_framework/observability/metrics.py +195 -0
  16. by_framework/observability/snapshot.py +2294 -0
  17. by_framework/observability/span_recorder.py +840 -0
  18. by_framework/observability/static/app.js +115 -0
  19. by_framework/observability/static/index.html +13 -0
  20. by_framework/observability/static/styles.css +1 -0
  21. by_framework/worker/_control_handling.py +30 -0
  22. by_framework/worker/context.py +347 -10
  23. by_framework/worker/runner.py +238 -10
  24. by_framework/worker/worker.py +23 -6
  25. {by_framework-0.2.2.dev0.dist-info → by_framework-0.2.2.dev1.dist-info}/METADATA +33 -2
  26. by_framework-0.2.2.dev1.dist-info/RECORD +56 -0
  27. {by_framework-0.2.2.dev0.dist-info → by_framework-0.2.2.dev1.dist-info}/WHEEL +1 -1
  28. by_framework/core/__init__.py +0 -95
  29. by_framework/core/availability.py +0 -495
  30. by_framework/core/delivery_gate.py +0 -60
  31. by_framework/core/discovery.py +0 -359
  32. by_framework/core/extensions/__init__.py +0 -35
  33. by_framework/core/extensions/agent_config.py +0 -64
  34. by_framework/core/extensions/plugin.py +0 -282
  35. by_framework/core/extensions/registry.py +0 -653
  36. by_framework/core/extensions/trace_provider.py +0 -20
  37. by_framework/core/protocol/__init__.py +0 -133
  38. by_framework/core/protocol/action_type.py +0 -33
  39. by_framework/core/protocol/agent_state.py +0 -78
  40. by_framework/core/protocol/byai_codec.py +0 -101
  41. by_framework/core/protocol/byai_command.py +0 -53
  42. by_framework/core/protocol/byai_types.py +0 -7
  43. by_framework/core/protocol/commands.py +0 -285
  44. by_framework/core/protocol/content_codec.py +0 -17
  45. by_framework/core/protocol/content_type.py +0 -38
  46. by_framework/core/protocol/data_message.py +0 -45
  47. by_framework/core/protocol/data_shapes.py +0 -83
  48. by_framework/core/protocol/event_type.py +0 -34
  49. by_framework/core/protocol/events.py +0 -69
  50. by_framework/core/protocol/message.py +0 -99
  51. by_framework/core/protocol/message_header.py +0 -68
  52. by_framework/core/protocol/responses.py +0 -94
  53. by_framework/core/protocol/results.py +0 -149
  54. by_framework/core/registry.py +0 -1025
  55. by_framework/core/runtime/__init__.py +0 -29
  56. by_framework/core/runtime/agent_config_manager.py +0 -283
  57. by_framework/core/runtime/agent_runtime_state.py +0 -75
  58. by_framework/core/runtime/file_manager.py +0 -437
  59. by_framework/core/runtime/file_paths.py +0 -76
  60. by_framework/core/runtime/file_permissions.py +0 -71
  61. by_framework/core/runtime/filestore/__init__.py +0 -15
  62. by_framework/core/runtime/filestore/base.py +0 -140
  63. by_framework/core/runtime/filestore/local.py +0 -321
  64. by_framework/core/runtime/history/__init__.py +0 -10
  65. by_framework/core/runtime/history/base.py +0 -57
  66. by_framework/core/runtime/history/history_manager.py +0 -55
  67. by_framework/core/runtime/history/in_memory.py +0 -58
  68. by_framework/core/runtime/session_manager.py +0 -118
  69. by_framework/core/wakeup_controller.py +0 -151
  70. by_framework/core/workspace.py +0 -126
  71. by_framework-0.2.2.dev0.dist-info/RECORD +0 -84
  72. {by_framework-0.2.2.dev0.dist-info → by_framework-0.2.2.dev1.dist-info}/licenses/LICENSE +0 -0
by_framework/__init__.py CHANGED
@@ -8,6 +8,7 @@ from `GatewayWorker` and running `run_worker`.
8
8
  from .client.byai_client import ByaiGatewayClient
9
9
  from .client.client import (
10
10
  CancelTaskResponse,
11
+ DataStreamEntry,
11
12
  GatewayClient,
12
13
  GatewayInterceptor,
13
14
  SendMessageResponse,
@@ -140,6 +141,7 @@ __all__ = [
140
141
  "GatewayClient",
141
142
  "ByaiGatewayClient",
142
143
  "GatewayInterceptor",
144
+ "DataStreamEntry",
143
145
  "SendMessageResponse",
144
146
  "CancelTaskResponse",
145
147
  "run_worker",
by_framework/client/__init__.py CHANGED
@@ -1,10 +1,11 @@
1
1
  """Client module for Gateway communication."""
2
2
 
3
3
  from .byai_client import ByaiGatewayClient
4
- from .client import GatewayClient, GatewayInterceptor
4
+ from .client import DataStreamEntry, GatewayClient, GatewayInterceptor
5
5
 
6
6
  __all__ = [
7
7
  "GatewayClient",
8
8
  "ByaiGatewayClient",
9
9
  "GatewayInterceptor",
10
+ "DataStreamEntry",
10
11
  ]
by_framework/client/client.py CHANGED
@@ -9,7 +9,8 @@ import json
9
9
  import time
10
10
  import uuid
11
11
  from dataclasses import dataclass
12
- from typing import TYPE_CHECKING, Any, Dict, List, Optional, Protocol
12
+ from dataclasses import fields as dataclass_fields
13
+ from typing import (TYPE_CHECKING, Any, AsyncIterator, Dict, List, Optional, Protocol)
13
14
 
14
15
  from by_framework.common.constants import (
15
16
  CANCEL_MESSAGE_ID_PREFIX,
@@ -17,6 +18,7 @@ from by_framework.common.constants import (
17
18
  MESSAGE_ID_PREFIX,
18
19
  RedisKeys,
19
20
  )
21
+ from by_framework.common.logger import logger
20
22
  from by_framework.common.redis_client import Redis, get_redis
21
23
  from by_framework.core.availability import (
22
24
  AvailabilityRouter,
@@ -32,6 +34,7 @@ from by_framework.core.protocol.commands import (
32
34
  ReloadPluginsCommand,
33
35
  ResumeCommand,
34
36
  )
37
+ from by_framework.core.protocol.data_message import DataMessage
35
38
  from by_framework.core.protocol.message_header import MessageHeader
36
39
  from by_framework.core.protocol.responses import (
37
40
  CancelTaskResponse,
@@ -40,6 +43,11 @@ from by_framework.core.protocol.responses import (
40
43
  )
41
44
  from by_framework.core.registry import WorkerRegistry
42
45
  from by_framework.errors import WorkerRegistryNotSetError
46
+ from by_framework.observability.span_recorder import (
47
+ SpanRecorder,
48
+ TraceSpan,
49
+ str_to_uint64,
50
+ )
43
51
 
44
52
  if TYPE_CHECKING:
45
53
  pass
@@ -59,6 +67,14 @@ class RouteResolution:
59
67
  target_worker_id: str = ""
60
68
 
61
69
 
70
+ @dataclass(frozen=True)
71
+ class DataStreamEntry:
72
+ """A decoded entry from a session data stream."""
73
+
74
+ stream_id: str
75
+ message: DataMessage
76
+
77
+
62
78
  class GatewayClient:
63
79
  """Gateway client for sending messages and cancel requests to Gateway workers.
64
80
 
@@ -76,16 +92,183 @@ class GatewayClient:
76
92
  registry: Optional[WorkerRegistry] = None,
77
93
  redis_client: Optional[Redis] = None,
78
94
  interceptors: Optional[List[GatewayInterceptor]] = None,
95
+ span_recorder: Optional[SpanRecorder] = None,
79
96
  ):
80
97
  self.registry = registry
81
98
  self.redis = (
82
99
  redis_client or (registry.redis if registry else None) or get_redis()
83
100
  )
84
101
  self.interceptors = interceptors or []
102
+ self.span_recorder = span_recorder or SpanRecorder(self.redis)
85
103
 
86
104
  def add_interceptor(self, interceptor: GatewayInterceptor):
87
105
  self.interceptors.append(interceptor)
88
106
 
107
+ @staticmethod
108
+ def _decode_redis_value(value: Any) -> Any:
109
+ """Decode Redis bytes values while preserving already-decoded clients."""
110
+ if isinstance(value, bytes):
111
+ return value.decode("utf-8")
112
+ return value
113
+
114
+ @classmethod
115
+ def _decode_data_stream_entry(
116
+ cls, stream_id: Any, fields: Dict[Any, Any]
117
+ ) -> DataStreamEntry:
118
+ raw = fields.get(b"data")
119
+ if raw is None:
120
+ raw = fields.get("data")
121
+ if raw is None:
122
+ raise ValueError("data stream entry missing 'data' field")
123
+
124
+ payload = json.loads(cls._decode_redis_value(raw))
125
+ data_message_fields = {field.name for field in dataclass_fields(DataMessage)}
126
+ return DataStreamEntry(
127
+ stream_id=cls._decode_redis_value(stream_id),
128
+ message=DataMessage(
129
+ **{
130
+ key: value
131
+ for key, value in payload.items()
132
+ if key in data_message_fields
133
+ }
134
+ ),
135
+ )
136
+
137
+ async def read_data_messages(
138
+ self,
139
+ session_id: str,
140
+ last_id: str = "0-0",
141
+ block_ms: int = 0,
142
+ count: int = 100,
143
+ ) -> List[DataStreamEntry]:
144
+ """Read decoded messages from the session data stream.
145
+
146
+ Pass the last returned ``stream_id`` as ``last_id`` to continue from
147
+ the next entry. ``block_ms`` is passed to Redis XREAD; ``0`` means
148
+ block indefinitely on standard Redis clients.
149
+ """
150
+ stream_name = RedisKeys.session_data_stream(session_id)
151
+ messages = await self.redis.xread(
152
+ streams={stream_name: last_id},
153
+ count=count,
154
+ block=block_ms,
155
+ )
156
+
157
+ results: List[DataStreamEntry] = []
158
+ for _, msg_list in messages or []:
159
+ for stream_id, fields in msg_list:
160
+ results.append(self._decode_data_stream_entry(stream_id, fields))
161
+ return results
162
+
163
+ async def get_data_message_checkpoint(
164
+ self,
165
+ session_id: str,
166
+ consumer_name: str,
167
+ ) -> str:
168
+ """Return the last committed data stream ID for a named consumer."""
169
+ checkpoint = await self.redis.get(
170
+ RedisKeys.session_data_checkpoint(session_id, consumer_name)
171
+ )
172
+ if checkpoint is None:
173
+ return "0-0"
174
+ return self._decode_redis_value(checkpoint)
175
+
176
+ async def commit_data_message(
177
+ self,
178
+ session_id: str,
179
+ stream_id: str,
180
+ consumer_name: str,
181
+ ) -> None:
182
+ """Commit a data stream ID as processed for a named consumer."""
183
+ await self.redis.set(
184
+ RedisKeys.session_data_checkpoint(session_id, consumer_name),
185
+ stream_id,
186
+ ex=RedisKeys.DEFAULT_SESSION_TTL,
187
+ )
188
+
189
+ async def read_data_messages_from_checkpoint(
190
+ self,
191
+ session_id: str,
192
+ consumer_name: str,
193
+ block_ms: int = 0,
194
+ count: int = 100,
195
+ auto_commit: bool = False,
196
+ ) -> List[DataStreamEntry]:
197
+ """Read messages starting after a named consumer's committed checkpoint."""
198
+ last_id = await self.get_data_message_checkpoint(session_id, consumer_name)
199
+ entries = await self.read_data_messages(
200
+ session_id=session_id,
201
+ last_id=last_id,
202
+ block_ms=block_ms,
203
+ count=count,
204
+ )
205
+ if auto_commit and entries:
206
+ await self.commit_data_message(
207
+ session_id=session_id,
208
+ stream_id=entries[-1].stream_id,
209
+ consumer_name=consumer_name,
210
+ )
211
+ return entries
212
+
213
+ async def iter_data_messages(
214
+ self,
215
+ session_id: str,
216
+ last_id: str = "$",
217
+ block_ms: int = 5000,
218
+ count: int = 100,
219
+ ) -> AsyncIterator[DataStreamEntry]:
220
+ """Continuously consume decoded messages from the session data stream.
221
+
222
+ The iterator does not stop on its own. Callers should break when their
223
+ business-level terminal event is observed.
224
+ """
225
+ current_id = last_id
226
+ while True:
227
+ entries = await self.read_data_messages(
228
+ session_id=session_id,
229
+ last_id=current_id,
230
+ block_ms=block_ms,
231
+ count=count,
232
+ )
233
+ for entry in entries:
234
+ current_id = entry.stream_id
235
+ yield entry
236
+
237
+ async def consume_data_messages(
238
+ self,
239
+ session_id: str,
240
+ consumer_name: str,
241
+ block_ms: int = 5000,
242
+ count: int = 100,
243
+ ) -> AsyncIterator[DataStreamEntry]:
244
+ """Continuously consume data stream messages with checkpoint commits.
245
+
246
+ Each entry is committed after the caller's loop body completes and asks
247
+ for the next item. If processing fails or the iterator is closed before
248
+ the next item, the current entry is not committed and will be retried
249
+ from the checkpoint on the next consumer run. The iterator does not stop
250
+ on its own; callers should break on their terminal event.
251
+ """
252
+ current_id = await self.get_data_message_checkpoint(
253
+ session_id=session_id,
254
+ consumer_name=consumer_name,
255
+ )
256
+ while True:
257
+ entries = await self.read_data_messages(
258
+ session_id=session_id,
259
+ last_id=current_id,
260
+ block_ms=block_ms,
261
+ count=count,
262
+ )
263
+ for entry in entries:
264
+ yield entry
265
+ await self.commit_data_message(
266
+ session_id=session_id,
267
+ stream_id=entry.stream_id,
268
+ consumer_name=consumer_name,
269
+ )
270
+ current_id = entry.stream_id
271
+
89
272
  async def reload_plugins_for_agent_type(
90
273
  self,
91
274
  agent_type: str,
@@ -341,11 +524,16 @@ class GatewayClient:
341
524
  )
342
525
 
343
526
  if node_worker_id:
527
+ node_trace_id = (
528
+ node.get("trace_id")
529
+ or execution.get("trace_id")
530
+ or uuid.uuid4().hex
531
+ )
344
532
  cancel_command = CancelTaskCommand(
345
533
  header=MessageHeader(
346
534
  message_id=f"{CANCEL_MESSAGE_ID_PREFIX}{uuid.uuid4().hex[:8]}",
347
535
  session_id=session_id,
348
- trace_id=uuid.uuid4().hex,
536
+ trace_id=node_trace_id,
349
537
  target_agent_type=node.get("target_agent_type", ""),
350
538
  parent_message_id=node_message_id,
351
539
  ),
@@ -424,6 +612,32 @@ class GatewayClient:
424
612
  if not trace_id:
425
613
  trace_id = uuid.uuid4().hex
426
614
 
615
+ metadata = dict(params.get("metadata", {}) or {})
616
+ trace_parent_span_id = metadata.pop("trace_parent_span_id", "")
617
+ langfuse_parent_observation_id = metadata.pop(
618
+ "langfuse_parent_observation_id", ""
619
+ )
620
+ if not trace_parent_span_id:
621
+ trace_parent_span_id = (
622
+ f"{str_to_uint64(f'{message_id}:client.dispatch'):016x}"
623
+ )
624
+
625
+ langfuse_client_dispatch = None
626
+ if not params["parent_message_id"]:
627
+ langfuse_client_dispatch = self._start_langfuse_client_dispatch_observation(
628
+ trace_id=trace_id,
629
+ message_id=message_id,
630
+ target_agent_type=params["target_agent_type"],
631
+ session_id=params["session_id"],
632
+ user_code=params["user_code"],
633
+ user_name=params["user_name"],
634
+ content=params["content"],
635
+ metadata=metadata,
636
+ )
637
+ observation_id = getattr(langfuse_client_dispatch, "id", "")
638
+ if observation_id:
639
+ langfuse_parent_observation_id = observation_id
640
+
427
641
  header = MessageHeader(
428
642
  message_id=message_id,
429
643
  session_id=params["session_id"],
@@ -432,7 +646,9 @@ class GatewayClient:
432
646
  parent_message_id=params["parent_message_id"],
433
647
  user_code=params["user_code"],
434
648
  user_name=params["user_name"],
435
- metadata=params["metadata"],
649
+ metadata=metadata,
650
+ trace_parent_span_id=trace_parent_span_id,
651
+ langfuse_parent_observation_id=langfuse_parent_observation_id,
436
652
  )
437
653
  command = self._build_gateway_command(
438
654
  action_type=params["action_type"],
@@ -451,6 +667,7 @@ class GatewayClient:
451
667
  route_policy != RoutePolicy.SEND_ANYWAY,
452
668
  )
453
669
  else:
670
+ avail_start_ms = int(time.time() * 1000)
454
671
  availability = await AvailabilityRouter(
455
672
  self.redis, self.registry
456
673
  ).prepare_delivery(
@@ -470,13 +687,42 @@ class GatewayClient:
470
687
  metadata=params["metadata"],
471
688
  )
472
689
  )
690
+ try:
691
+ from by_framework.observability.metrics import record_availability_metrics
692
+
693
+ record_availability_metrics(
694
+ agent_type=params["target_agent_type"],
695
+ policy=route_policy,
696
+ status=availability.status,
697
+ routing_ms=float(int(time.time() * 1000) - avail_start_ms),
698
+ )
699
+ except Exception: # pylint: disable=broad-exception-caught
700
+ pass
473
701
  if availability.status not in (
474
702
  AvailabilityStatus.DELIVER_NOW,
475
703
  AvailabilityStatus.WAIT_AND_DELIVER,
476
704
  AvailabilityStatus.FALLBACK_TO_OTHER_AGENT_TYPE,
477
705
  AvailabilityStatus.QUEUE_PENDING,
478
706
  ):
479
- return SendMessageResponse(
707
+ if self.registry and hasattr(
708
+ self.registry, "record_failed_route_decision"
709
+ ):
710
+ await self.registry.record_failed_route_decision(
711
+ execution_id=execution_id,
712
+ message_id=message_id,
713
+ session_id=params["session_id"],
714
+ trace_id=trace_id,
715
+ target_agent_type=params["target_agent_type"],
716
+ parent_message_id=params["parent_message_id"] or "",
717
+ source_agent_type="client",
718
+ route_policy=route_policy,
719
+ route_status=availability.status,
720
+ stream_name=availability.stream_name or "",
721
+ selected_agent_type=availability.selected_agent_type or "",
722
+ availability_error_code=availability.error_code or "",
723
+ availability_error=availability.error or "",
724
+ )
725
+ response = SendMessageResponse(
480
726
  success=False,
481
727
  status=ExecutionStatus.FAILED,
482
728
  message_id="",
@@ -487,6 +733,12 @@ class GatewayClient:
487
733
  error_code=availability.error_code
488
734
  or ExecutionStatus.ERR_AGENT_TYPE_UNAVAILABLE,
489
735
  )
736
+ self._end_langfuse_client_dispatch_observation(
737
+ langfuse_client_dispatch,
738
+ output={"success": False, "error": availability.error},
739
+ error=availability.error,
740
+ )
741
+ return response
490
742
  if availability.status == AvailabilityStatus.QUEUE_PENDING:
491
743
  should_dispatch_control = False
492
744
  route = RouteResolution(
@@ -497,6 +749,11 @@ class GatewayClient:
497
749
  params["target_agent_type"] = availability.selected_agent_type
498
750
  command.header.target_agent_type = availability.selected_agent_type
499
751
  except LookupError as err:
752
+ self._end_langfuse_client_dispatch_observation(
753
+ langfuse_client_dispatch,
754
+ output={"success": False, "error": str(err)},
755
+ error=str(err),
756
+ )
500
757
  return SendMessageResponse(
501
758
  success=False,
502
759
  status=ExecutionStatus.FAILED,
@@ -508,6 +765,11 @@ class GatewayClient:
508
765
  error_code=ExecutionStatus.ERR_WORKER_NOT_ONLINE,
509
766
  )
510
767
  except ValueError as err:
768
+ self._end_langfuse_client_dispatch_observation(
769
+ langfuse_client_dispatch,
770
+ output={"success": False, "error": str(err)},
771
+ error=str(err),
772
+ )
511
773
  return SendMessageResponse(
512
774
  success=False,
513
775
  status=ExecutionStatus.FAILED,
@@ -533,16 +795,44 @@ class GatewayClient:
533
795
  "target_agent_type": params["target_agent_type"],
534
796
  "stream_name": route.stream_name,
535
797
  "status": "QUEUED",
798
+ "route_policy": route_policy,
799
+ "route_status": availability.status
800
+ if not target_worker_id
801
+ else "DIRECT_WORKER",
802
+ "selected_agent_type": availability.selected_agent_type
803
+ if not target_worker_id
804
+ else "",
805
+ "availability_error_code": availability.error_code
806
+ if not target_worker_id
807
+ else "",
808
+ "availability_error": availability.error
809
+ if not target_worker_id
810
+ else "",
536
811
  }
537
812
  )
538
813
  except Exception: # pylint: disable=broad-exception-caught
539
814
  pass # Fallback if registry fails
540
815
 
541
816
  # 4. Route to the appropriate stream
817
+ dispatch_started_at = int(time.time() * 1000)
542
818
  if should_dispatch_control:
543
819
  await self.redis.xadd(route.stream_name, command.to_redis_payload())
820
+ await self._record_client_dispatch_span(
821
+ trace_id=trace_id,
822
+ message_id=message_id,
823
+ session_id=params["session_id"],
824
+ parent_message_id=params["parent_message_id"] or "",
825
+ target_agent_type=params["target_agent_type"],
826
+ target_worker_id=route.target_worker_id,
827
+ route_policy=route_policy,
828
+ route_status=availability.status
829
+ if not target_worker_id
830
+ else "DIRECT_WORKER",
831
+ start_ts=dispatch_started_at,
832
+ end_ts=int(time.time() * 1000),
833
+ )
544
834
 
545
- return SendMessageResponse(
835
+ response = SendMessageResponse(
546
836
  success=True,
547
837
  message_id=message_id,
548
838
  trace_id=trace_id,
@@ -550,3 +840,118 @@ class GatewayClient:
550
840
  timestamp=int(time.time() * 1000),
551
841
  status=ExecutionStatus.QUEUED,
552
842
  )
843
+ self._end_langfuse_client_dispatch_observation(
844
+ langfuse_client_dispatch,
845
+ output={
846
+ "success": True,
847
+ "message_id": message_id,
848
+ "trace_id": trace_id,
849
+ "target_worker_id": route.target_worker_id,
850
+ "status": response.status,
851
+ },
852
+ )
853
+ return response
854
+
855
+ def _start_langfuse_client_dispatch_observation(
856
+ self,
857
+ *,
858
+ trace_id: str,
859
+ message_id: str,
860
+ target_agent_type: str,
861
+ session_id: str,
862
+ user_code: str,
863
+ user_name: str,
864
+ content: Any,
865
+ metadata: Dict[str, Any],
866
+ ) -> Any:
867
+ try:
868
+ from by_framework_trace_langfuse import start_client_dispatch_observation
869
+
870
+ return start_client_dispatch_observation(
871
+ trace_id=trace_id,
872
+ message_id=message_id,
873
+ target_agent_type=target_agent_type,
874
+ session_id=session_id,
875
+ user_code=user_code,
876
+ user_name=user_name,
877
+ content=content,
878
+ metadata=metadata,
879
+ )
880
+ except Exception as err: # pylint: disable=broad-exception-caught
881
+ logger.warning(
882
+ "Langfuse client.dispatch observation skipped: %s",
883
+ err,
884
+ exc_info=True,
885
+ )
886
+ return None
887
+
888
+ @staticmethod
889
+ def _end_langfuse_client_dispatch_observation(
890
+ observation: Any,
891
+ *,
892
+ output: Any,
893
+ error: str = "",
894
+ ) -> None:
895
+ if observation is None:
896
+ return
897
+ try:
898
+ if error and hasattr(observation, "update"):
899
+ observation.update(level="ERROR", status_message=error)
900
+ observation.end(output=output)
901
+ except TypeError:
902
+ try:
903
+ observation.update(output=output)
904
+ observation.end()
905
+ except Exception: # pylint: disable=broad-exception-caught
906
+ pass
907
+ except Exception: # pylint: disable=broad-exception-caught
908
+ pass
909
+
910
+ async def _record_client_dispatch_span(
911
+ self,
912
+ *,
913
+ trace_id: str,
914
+ message_id: str,
915
+ session_id: str,
916
+ parent_message_id: str,
917
+ target_agent_type: str,
918
+ target_worker_id: str,
919
+ route_policy: str,
920
+ route_status: str,
921
+ start_ts: int,
922
+ end_ts: int,
923
+ ) -> None:
924
+ try:
925
+ logger.info(
926
+ "Recording client dispatch span: message_id=%s, trace_id=%s",
927
+ message_id,
928
+ trace_id,
929
+ )
930
+ await self.span_recorder.record_span(
931
+ TraceSpan(
932
+ trace_id=trace_id,
933
+ span_id=f"{message_id}:client.dispatch",
934
+ parent_span_id="",
935
+ operation="client.dispatch",
936
+ component="client",
937
+ start_ts=start_ts,
938
+ end_ts=end_ts,
939
+ status="COMPLETED",
940
+ session_id=session_id,
941
+ message_id=message_id,
942
+ parent_message_id=parent_message_id,
943
+ worker_id=target_worker_id,
944
+ source_agent_type="client",
945
+ target_agent_type=target_agent_type,
946
+ route_policy=route_policy,
947
+ route_status=route_status,
948
+ )
949
+ )
950
+ logger.info(
951
+ "Client dispatch span recorded successfully for message_id=%s",
952
+ message_id,
953
+ )
954
+ except Exception as err: # pylint: disable=broad-exception-caught
955
+ logger.warning(
956
+ "Failed to record client dispatch span: %s", err, exc_info=True
957
+ )
by_framework/common/constants.py CHANGED
@@ -92,6 +92,36 @@ class RedisKeys:
92
92
  """Session-level data stream. Workers push streaming content here."""
93
93
  return f"byai_gateway:session:{session_id}:data_stream"
94
94
 
95
+ @staticmethod
96
+ def session_data_checkpoint(session_id: str, consumer_name: str) -> str:
97
+ """Checkpoint key storing a consumer's last processed data stream ID."""
98
+ return f"byai_gateway:session:{session_id}:consumer:{consumer_name}:checkpoint"
99
+
100
+ @staticmethod
101
+ def trace_meta(trace_id: str) -> str:
102
+ """Hash storing trace-level metadata for observability."""
103
+ return f"by_framework:trace:{trace_id}"
104
+
105
+ @staticmethod
106
+ def trace_spans(trace_id: str) -> str:
107
+ """List storing trace span JSON payloads ordered by write time."""
108
+ return f"by_framework:trace:spans:{trace_id}"
109
+
110
+ @staticmethod
111
+ def trace_index_session(session_id: str) -> str:
112
+ """Sorted Set index from session_id to trace IDs."""
113
+ return f"by_framework:trace:idx:session:{session_id}"
114
+
115
+ @staticmethod
116
+ def trace_index_worker(worker_id: str) -> str:
117
+ """Sorted Set index from worker_id to trace IDs."""
118
+ return f"by_framework:trace:idx:worker:{worker_id}"
119
+
120
+ @staticmethod
121
+ def trace_index_agent(agent_type: str) -> str:
122
+ """Sorted Set index from agent type to trace IDs."""
123
+ return f"by_framework:trace:idx:agent:{agent_type}"
124
+
95
125
  @staticmethod
96
126
  def task_group(group_id: str) -> str:
97
127
  """Task group progress tracking Hash Key."""