by-framework 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- by_framework/__init__.py +2 -0
- by_framework/client/__init__.py +2 -1
- by_framework/client/client.py +410 -5
- by_framework/common/constants.py +30 -0
- by_framework/common/logger.py +71 -10
- by_framework/observability/__init__.py +62 -0
- by_framework/observability/dashboard.py +1145 -0
- by_framework/observability/external_trace.py +148 -0
- by_framework/observability/frontend/index.html +12 -0
- by_framework/observability/frontend/package-lock.json +1696 -0
- by_framework/observability/frontend/package.json +18 -0
- by_framework/observability/frontend/src/main.jsx +1351 -0
- by_framework/observability/frontend/src/styles.css +1214 -0
- by_framework/observability/frontend/vite.config.js +18 -0
- by_framework/observability/metrics.py +195 -0
- by_framework/observability/snapshot.py +2294 -0
- by_framework/observability/span_recorder.py +840 -0
- by_framework/observability/static/app.js +115 -0
- by_framework/observability/static/index.html +13 -0
- by_framework/observability/static/styles.css +1 -0
- by_framework/worker/_control_handling.py +30 -0
- by_framework/worker/context.py +347 -10
- by_framework/worker/runner.py +238 -10
- by_framework/worker/worker.py +23 -6
- {by_framework-0.2.2.dev0.dist-info → by_framework-0.2.2.dev1.dist-info}/METADATA +33 -2
- by_framework-0.2.2.dev1.dist-info/RECORD +56 -0
- {by_framework-0.2.2.dev0.dist-info → by_framework-0.2.2.dev1.dist-info}/WHEEL +1 -1
- by_framework/core/__init__.py +0 -95
- by_framework/core/availability.py +0 -495
- by_framework/core/delivery_gate.py +0 -60
- by_framework/core/discovery.py +0 -359
- by_framework/core/extensions/__init__.py +0 -35
- by_framework/core/extensions/agent_config.py +0 -64
- by_framework/core/extensions/plugin.py +0 -282
- by_framework/core/extensions/registry.py +0 -653
- by_framework/core/extensions/trace_provider.py +0 -20
- by_framework/core/protocol/__init__.py +0 -133
- by_framework/core/protocol/action_type.py +0 -33
- by_framework/core/protocol/agent_state.py +0 -78
- by_framework/core/protocol/byai_codec.py +0 -101
- by_framework/core/protocol/byai_command.py +0 -53
- by_framework/core/protocol/byai_types.py +0 -7
- by_framework/core/protocol/commands.py +0 -285
- by_framework/core/protocol/content_codec.py +0 -17
- by_framework/core/protocol/content_type.py +0 -38
- by_framework/core/protocol/data_message.py +0 -45
- by_framework/core/protocol/data_shapes.py +0 -83
- by_framework/core/protocol/event_type.py +0 -34
- by_framework/core/protocol/events.py +0 -69
- by_framework/core/protocol/message.py +0 -99
- by_framework/core/protocol/message_header.py +0 -68
- by_framework/core/protocol/responses.py +0 -94
- by_framework/core/protocol/results.py +0 -149
- by_framework/core/registry.py +0 -1025
- by_framework/core/runtime/__init__.py +0 -29
- by_framework/core/runtime/agent_config_manager.py +0 -283
- by_framework/core/runtime/agent_runtime_state.py +0 -75
- by_framework/core/runtime/file_manager.py +0 -437
- by_framework/core/runtime/file_paths.py +0 -76
- by_framework/core/runtime/file_permissions.py +0 -71
- by_framework/core/runtime/filestore/__init__.py +0 -15
- by_framework/core/runtime/filestore/base.py +0 -140
- by_framework/core/runtime/filestore/local.py +0 -321
- by_framework/core/runtime/history/__init__.py +0 -10
- by_framework/core/runtime/history/base.py +0 -57
- by_framework/core/runtime/history/history_manager.py +0 -55
- by_framework/core/runtime/history/in_memory.py +0 -58
- by_framework/core/runtime/session_manager.py +0 -118
- by_framework/core/wakeup_controller.py +0 -151
- by_framework/core/workspace.py +0 -126
- by_framework-0.2.2.dev0.dist-info/RECORD +0 -84
- {by_framework-0.2.2.dev0.dist-info → by_framework-0.2.2.dev1.dist-info}/licenses/LICENSE +0 -0
by_framework/__init__.py
CHANGED
|
@@ -8,6 +8,7 @@ from `GatewayWorker` and running `run_worker`.
|
|
|
8
8
|
from .client.byai_client import ByaiGatewayClient
|
|
9
9
|
from .client.client import (
|
|
10
10
|
CancelTaskResponse,
|
|
11
|
+
DataStreamEntry,
|
|
11
12
|
GatewayClient,
|
|
12
13
|
GatewayInterceptor,
|
|
13
14
|
SendMessageResponse,
|
|
@@ -140,6 +141,7 @@ __all__ = [
|
|
|
140
141
|
"GatewayClient",
|
|
141
142
|
"ByaiGatewayClient",
|
|
142
143
|
"GatewayInterceptor",
|
|
144
|
+
"DataStreamEntry",
|
|
143
145
|
"SendMessageResponse",
|
|
144
146
|
"CancelTaskResponse",
|
|
145
147
|
"run_worker",
|
by_framework/client/__init__.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
"""Client module for Gateway communication."""
|
|
2
2
|
|
|
3
3
|
from .byai_client import ByaiGatewayClient
|
|
4
|
-
from .client import GatewayClient, GatewayInterceptor
|
|
4
|
+
from .client import DataStreamEntry, GatewayClient, GatewayInterceptor
|
|
5
5
|
|
|
6
6
|
__all__ = [
|
|
7
7
|
"GatewayClient",
|
|
8
8
|
"ByaiGatewayClient",
|
|
9
9
|
"GatewayInterceptor",
|
|
10
|
+
"DataStreamEntry",
|
|
10
11
|
]
|
by_framework/client/client.py
CHANGED
|
@@ -9,7 +9,8 @@ import json
|
|
|
9
9
|
import time
|
|
10
10
|
import uuid
|
|
11
11
|
from dataclasses import dataclass
|
|
12
|
-
from
|
|
12
|
+
from dataclasses import fields as dataclass_fields
|
|
13
|
+
from typing import (TYPE_CHECKING, Any, AsyncIterator, Dict, List, Optional, Protocol)
|
|
13
14
|
|
|
14
15
|
from by_framework.common.constants import (
|
|
15
16
|
CANCEL_MESSAGE_ID_PREFIX,
|
|
@@ -17,6 +18,7 @@ from by_framework.common.constants import (
|
|
|
17
18
|
MESSAGE_ID_PREFIX,
|
|
18
19
|
RedisKeys,
|
|
19
20
|
)
|
|
21
|
+
from by_framework.common.logger import logger
|
|
20
22
|
from by_framework.common.redis_client import Redis, get_redis
|
|
21
23
|
from by_framework.core.availability import (
|
|
22
24
|
AvailabilityRouter,
|
|
@@ -32,6 +34,7 @@ from by_framework.core.protocol.commands import (
|
|
|
32
34
|
ReloadPluginsCommand,
|
|
33
35
|
ResumeCommand,
|
|
34
36
|
)
|
|
37
|
+
from by_framework.core.protocol.data_message import DataMessage
|
|
35
38
|
from by_framework.core.protocol.message_header import MessageHeader
|
|
36
39
|
from by_framework.core.protocol.responses import (
|
|
37
40
|
CancelTaskResponse,
|
|
@@ -40,6 +43,11 @@ from by_framework.core.protocol.responses import (
|
|
|
40
43
|
)
|
|
41
44
|
from by_framework.core.registry import WorkerRegistry
|
|
42
45
|
from by_framework.errors import WorkerRegistryNotSetError
|
|
46
|
+
from by_framework.observability.span_recorder import (
|
|
47
|
+
SpanRecorder,
|
|
48
|
+
TraceSpan,
|
|
49
|
+
str_to_uint64,
|
|
50
|
+
)
|
|
43
51
|
|
|
44
52
|
if TYPE_CHECKING:
|
|
45
53
|
pass
|
|
@@ -59,6 +67,14 @@ class RouteResolution:
|
|
|
59
67
|
target_worker_id: str = ""
|
|
60
68
|
|
|
61
69
|
|
|
70
|
+
@dataclass(frozen=True)
|
|
71
|
+
class DataStreamEntry:
|
|
72
|
+
"""A decoded entry from a session data stream."""
|
|
73
|
+
|
|
74
|
+
stream_id: str
|
|
75
|
+
message: DataMessage
|
|
76
|
+
|
|
77
|
+
|
|
62
78
|
class GatewayClient:
|
|
63
79
|
"""Gateway client for sending messages and cancel requests to Gateway workers.
|
|
64
80
|
|
|
@@ -76,16 +92,183 @@ class GatewayClient:
|
|
|
76
92
|
registry: Optional[WorkerRegistry] = None,
|
|
77
93
|
redis_client: Optional[Redis] = None,
|
|
78
94
|
interceptors: Optional[List[GatewayInterceptor]] = None,
|
|
95
|
+
span_recorder: Optional[SpanRecorder] = None,
|
|
79
96
|
):
|
|
80
97
|
self.registry = registry
|
|
81
98
|
self.redis = (
|
|
82
99
|
redis_client or (registry.redis if registry else None) or get_redis()
|
|
83
100
|
)
|
|
84
101
|
self.interceptors = interceptors or []
|
|
102
|
+
self.span_recorder = span_recorder or SpanRecorder(self.redis)
|
|
85
103
|
|
|
86
104
|
def add_interceptor(self, interceptor: GatewayInterceptor):
|
|
87
105
|
self.interceptors.append(interceptor)
|
|
88
106
|
|
|
107
|
+
@staticmethod
|
|
108
|
+
def _decode_redis_value(value: Any) -> Any:
|
|
109
|
+
"""Decode Redis bytes values while preserving already-decoded clients."""
|
|
110
|
+
if isinstance(value, bytes):
|
|
111
|
+
return value.decode("utf-8")
|
|
112
|
+
return value
|
|
113
|
+
|
|
114
|
+
@classmethod
|
|
115
|
+
def _decode_data_stream_entry(
|
|
116
|
+
cls, stream_id: Any, fields: Dict[Any, Any]
|
|
117
|
+
) -> DataStreamEntry:
|
|
118
|
+
raw = fields.get(b"data")
|
|
119
|
+
if raw is None:
|
|
120
|
+
raw = fields.get("data")
|
|
121
|
+
if raw is None:
|
|
122
|
+
raise ValueError("data stream entry missing 'data' field")
|
|
123
|
+
|
|
124
|
+
payload = json.loads(cls._decode_redis_value(raw))
|
|
125
|
+
data_message_fields = {field.name for field in dataclass_fields(DataMessage)}
|
|
126
|
+
return DataStreamEntry(
|
|
127
|
+
stream_id=cls._decode_redis_value(stream_id),
|
|
128
|
+
message=DataMessage(
|
|
129
|
+
**{
|
|
130
|
+
key: value
|
|
131
|
+
for key, value in payload.items()
|
|
132
|
+
if key in data_message_fields
|
|
133
|
+
}
|
|
134
|
+
),
|
|
135
|
+
)
|
|
136
|
+
|
|
137
|
+
async def read_data_messages(
|
|
138
|
+
self,
|
|
139
|
+
session_id: str,
|
|
140
|
+
last_id: str = "0-0",
|
|
141
|
+
block_ms: int = 0,
|
|
142
|
+
count: int = 100,
|
|
143
|
+
) -> List[DataStreamEntry]:
|
|
144
|
+
"""Read decoded messages from the session data stream.
|
|
145
|
+
|
|
146
|
+
Pass the last returned ``stream_id`` as ``last_id`` to continue from
|
|
147
|
+
the next entry. ``block_ms`` is passed to Redis XREAD; ``0`` means
|
|
148
|
+
block indefinitely on standard Redis clients.
|
|
149
|
+
"""
|
|
150
|
+
stream_name = RedisKeys.session_data_stream(session_id)
|
|
151
|
+
messages = await self.redis.xread(
|
|
152
|
+
streams={stream_name: last_id},
|
|
153
|
+
count=count,
|
|
154
|
+
block=block_ms,
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
results: List[DataStreamEntry] = []
|
|
158
|
+
for _, msg_list in messages or []:
|
|
159
|
+
for stream_id, fields in msg_list:
|
|
160
|
+
results.append(self._decode_data_stream_entry(stream_id, fields))
|
|
161
|
+
return results
|
|
162
|
+
|
|
163
|
+
async def get_data_message_checkpoint(
|
|
164
|
+
self,
|
|
165
|
+
session_id: str,
|
|
166
|
+
consumer_name: str,
|
|
167
|
+
) -> str:
|
|
168
|
+
"""Return the last committed data stream ID for a named consumer."""
|
|
169
|
+
checkpoint = await self.redis.get(
|
|
170
|
+
RedisKeys.session_data_checkpoint(session_id, consumer_name)
|
|
171
|
+
)
|
|
172
|
+
if checkpoint is None:
|
|
173
|
+
return "0-0"
|
|
174
|
+
return self._decode_redis_value(checkpoint)
|
|
175
|
+
|
|
176
|
+
async def commit_data_message(
|
|
177
|
+
self,
|
|
178
|
+
session_id: str,
|
|
179
|
+
stream_id: str,
|
|
180
|
+
consumer_name: str,
|
|
181
|
+
) -> None:
|
|
182
|
+
"""Commit a data stream ID as processed for a named consumer."""
|
|
183
|
+
await self.redis.set(
|
|
184
|
+
RedisKeys.session_data_checkpoint(session_id, consumer_name),
|
|
185
|
+
stream_id,
|
|
186
|
+
ex=RedisKeys.DEFAULT_SESSION_TTL,
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
async def read_data_messages_from_checkpoint(
|
|
190
|
+
self,
|
|
191
|
+
session_id: str,
|
|
192
|
+
consumer_name: str,
|
|
193
|
+
block_ms: int = 0,
|
|
194
|
+
count: int = 100,
|
|
195
|
+
auto_commit: bool = False,
|
|
196
|
+
) -> List[DataStreamEntry]:
|
|
197
|
+
"""Read messages starting after a named consumer's committed checkpoint."""
|
|
198
|
+
last_id = await self.get_data_message_checkpoint(session_id, consumer_name)
|
|
199
|
+
entries = await self.read_data_messages(
|
|
200
|
+
session_id=session_id,
|
|
201
|
+
last_id=last_id,
|
|
202
|
+
block_ms=block_ms,
|
|
203
|
+
count=count,
|
|
204
|
+
)
|
|
205
|
+
if auto_commit and entries:
|
|
206
|
+
await self.commit_data_message(
|
|
207
|
+
session_id=session_id,
|
|
208
|
+
stream_id=entries[-1].stream_id,
|
|
209
|
+
consumer_name=consumer_name,
|
|
210
|
+
)
|
|
211
|
+
return entries
|
|
212
|
+
|
|
213
|
+
async def iter_data_messages(
|
|
214
|
+
self,
|
|
215
|
+
session_id: str,
|
|
216
|
+
last_id: str = "$",
|
|
217
|
+
block_ms: int = 5000,
|
|
218
|
+
count: int = 100,
|
|
219
|
+
) -> AsyncIterator[DataStreamEntry]:
|
|
220
|
+
"""Continuously consume decoded messages from the session data stream.
|
|
221
|
+
|
|
222
|
+
The iterator does not stop on its own. Callers should break when their
|
|
223
|
+
business-level terminal event is observed.
|
|
224
|
+
"""
|
|
225
|
+
current_id = last_id
|
|
226
|
+
while True:
|
|
227
|
+
entries = await self.read_data_messages(
|
|
228
|
+
session_id=session_id,
|
|
229
|
+
last_id=current_id,
|
|
230
|
+
block_ms=block_ms,
|
|
231
|
+
count=count,
|
|
232
|
+
)
|
|
233
|
+
for entry in entries:
|
|
234
|
+
current_id = entry.stream_id
|
|
235
|
+
yield entry
|
|
236
|
+
|
|
237
|
+
async def consume_data_messages(
|
|
238
|
+
self,
|
|
239
|
+
session_id: str,
|
|
240
|
+
consumer_name: str,
|
|
241
|
+
block_ms: int = 5000,
|
|
242
|
+
count: int = 100,
|
|
243
|
+
) -> AsyncIterator[DataStreamEntry]:
|
|
244
|
+
"""Continuously consume data stream messages with checkpoint commits.
|
|
245
|
+
|
|
246
|
+
Each entry is committed after the caller's loop body completes and asks
|
|
247
|
+
for the next item. If processing fails or the iterator is closed before
|
|
248
|
+
the next item, the current entry is not committed and will be retried
|
|
249
|
+
from the checkpoint on the next consumer run. The iterator does not stop
|
|
250
|
+
on its own; callers should break on their terminal event.
|
|
251
|
+
"""
|
|
252
|
+
current_id = await self.get_data_message_checkpoint(
|
|
253
|
+
session_id=session_id,
|
|
254
|
+
consumer_name=consumer_name,
|
|
255
|
+
)
|
|
256
|
+
while True:
|
|
257
|
+
entries = await self.read_data_messages(
|
|
258
|
+
session_id=session_id,
|
|
259
|
+
last_id=current_id,
|
|
260
|
+
block_ms=block_ms,
|
|
261
|
+
count=count,
|
|
262
|
+
)
|
|
263
|
+
for entry in entries:
|
|
264
|
+
yield entry
|
|
265
|
+
await self.commit_data_message(
|
|
266
|
+
session_id=session_id,
|
|
267
|
+
stream_id=entry.stream_id,
|
|
268
|
+
consumer_name=consumer_name,
|
|
269
|
+
)
|
|
270
|
+
current_id = entry.stream_id
|
|
271
|
+
|
|
89
272
|
async def reload_plugins_for_agent_type(
|
|
90
273
|
self,
|
|
91
274
|
agent_type: str,
|
|
@@ -341,11 +524,16 @@ class GatewayClient:
|
|
|
341
524
|
)
|
|
342
525
|
|
|
343
526
|
if node_worker_id:
|
|
527
|
+
node_trace_id = (
|
|
528
|
+
node.get("trace_id")
|
|
529
|
+
or execution.get("trace_id")
|
|
530
|
+
or uuid.uuid4().hex
|
|
531
|
+
)
|
|
344
532
|
cancel_command = CancelTaskCommand(
|
|
345
533
|
header=MessageHeader(
|
|
346
534
|
message_id=f"{CANCEL_MESSAGE_ID_PREFIX}{uuid.uuid4().hex[:8]}",
|
|
347
535
|
session_id=session_id,
|
|
348
|
-
trace_id=
|
|
536
|
+
trace_id=node_trace_id,
|
|
349
537
|
target_agent_type=node.get("target_agent_type", ""),
|
|
350
538
|
parent_message_id=node_message_id,
|
|
351
539
|
),
|
|
@@ -424,6 +612,32 @@ class GatewayClient:
|
|
|
424
612
|
if not trace_id:
|
|
425
613
|
trace_id = uuid.uuid4().hex
|
|
426
614
|
|
|
615
|
+
metadata = dict(params.get("metadata", {}) or {})
|
|
616
|
+
trace_parent_span_id = metadata.pop("trace_parent_span_id", "")
|
|
617
|
+
langfuse_parent_observation_id = metadata.pop(
|
|
618
|
+
"langfuse_parent_observation_id", ""
|
|
619
|
+
)
|
|
620
|
+
if not trace_parent_span_id:
|
|
621
|
+
trace_parent_span_id = (
|
|
622
|
+
f"{str_to_uint64(f'{message_id}:client.dispatch'):016x}"
|
|
623
|
+
)
|
|
624
|
+
|
|
625
|
+
langfuse_client_dispatch = None
|
|
626
|
+
if not params["parent_message_id"]:
|
|
627
|
+
langfuse_client_dispatch = self._start_langfuse_client_dispatch_observation(
|
|
628
|
+
trace_id=trace_id,
|
|
629
|
+
message_id=message_id,
|
|
630
|
+
target_agent_type=params["target_agent_type"],
|
|
631
|
+
session_id=params["session_id"],
|
|
632
|
+
user_code=params["user_code"],
|
|
633
|
+
user_name=params["user_name"],
|
|
634
|
+
content=params["content"],
|
|
635
|
+
metadata=metadata,
|
|
636
|
+
)
|
|
637
|
+
observation_id = getattr(langfuse_client_dispatch, "id", "")
|
|
638
|
+
if observation_id:
|
|
639
|
+
langfuse_parent_observation_id = observation_id
|
|
640
|
+
|
|
427
641
|
header = MessageHeader(
|
|
428
642
|
message_id=message_id,
|
|
429
643
|
session_id=params["session_id"],
|
|
@@ -432,7 +646,9 @@ class GatewayClient:
|
|
|
432
646
|
parent_message_id=params["parent_message_id"],
|
|
433
647
|
user_code=params["user_code"],
|
|
434
648
|
user_name=params["user_name"],
|
|
435
|
-
metadata=
|
|
649
|
+
metadata=metadata,
|
|
650
|
+
trace_parent_span_id=trace_parent_span_id,
|
|
651
|
+
langfuse_parent_observation_id=langfuse_parent_observation_id,
|
|
436
652
|
)
|
|
437
653
|
command = self._build_gateway_command(
|
|
438
654
|
action_type=params["action_type"],
|
|
@@ -451,6 +667,7 @@ class GatewayClient:
|
|
|
451
667
|
route_policy != RoutePolicy.SEND_ANYWAY,
|
|
452
668
|
)
|
|
453
669
|
else:
|
|
670
|
+
avail_start_ms = int(time.time() * 1000)
|
|
454
671
|
availability = await AvailabilityRouter(
|
|
455
672
|
self.redis, self.registry
|
|
456
673
|
).prepare_delivery(
|
|
@@ -470,13 +687,42 @@ class GatewayClient:
|
|
|
470
687
|
metadata=params["metadata"],
|
|
471
688
|
)
|
|
472
689
|
)
|
|
690
|
+
try:
|
|
691
|
+
from by_framework.observability.metrics import record_availability_metrics
|
|
692
|
+
|
|
693
|
+
record_availability_metrics(
|
|
694
|
+
agent_type=params["target_agent_type"],
|
|
695
|
+
policy=route_policy,
|
|
696
|
+
status=availability.status,
|
|
697
|
+
routing_ms=float(int(time.time() * 1000) - avail_start_ms),
|
|
698
|
+
)
|
|
699
|
+
except Exception: # pylint: disable=broad-exception-caught
|
|
700
|
+
pass
|
|
473
701
|
if availability.status not in (
|
|
474
702
|
AvailabilityStatus.DELIVER_NOW,
|
|
475
703
|
AvailabilityStatus.WAIT_AND_DELIVER,
|
|
476
704
|
AvailabilityStatus.FALLBACK_TO_OTHER_AGENT_TYPE,
|
|
477
705
|
AvailabilityStatus.QUEUE_PENDING,
|
|
478
706
|
):
|
|
479
|
-
|
|
707
|
+
if self.registry and hasattr(
|
|
708
|
+
self.registry, "record_failed_route_decision"
|
|
709
|
+
):
|
|
710
|
+
await self.registry.record_failed_route_decision(
|
|
711
|
+
execution_id=execution_id,
|
|
712
|
+
message_id=message_id,
|
|
713
|
+
session_id=params["session_id"],
|
|
714
|
+
trace_id=trace_id,
|
|
715
|
+
target_agent_type=params["target_agent_type"],
|
|
716
|
+
parent_message_id=params["parent_message_id"] or "",
|
|
717
|
+
source_agent_type="client",
|
|
718
|
+
route_policy=route_policy,
|
|
719
|
+
route_status=availability.status,
|
|
720
|
+
stream_name=availability.stream_name or "",
|
|
721
|
+
selected_agent_type=availability.selected_agent_type or "",
|
|
722
|
+
availability_error_code=availability.error_code or "",
|
|
723
|
+
availability_error=availability.error or "",
|
|
724
|
+
)
|
|
725
|
+
response = SendMessageResponse(
|
|
480
726
|
success=False,
|
|
481
727
|
status=ExecutionStatus.FAILED,
|
|
482
728
|
message_id="",
|
|
@@ -487,6 +733,12 @@ class GatewayClient:
|
|
|
487
733
|
error_code=availability.error_code
|
|
488
734
|
or ExecutionStatus.ERR_AGENT_TYPE_UNAVAILABLE,
|
|
489
735
|
)
|
|
736
|
+
self._end_langfuse_client_dispatch_observation(
|
|
737
|
+
langfuse_client_dispatch,
|
|
738
|
+
output={"success": False, "error": availability.error},
|
|
739
|
+
error=availability.error,
|
|
740
|
+
)
|
|
741
|
+
return response
|
|
490
742
|
if availability.status == AvailabilityStatus.QUEUE_PENDING:
|
|
491
743
|
should_dispatch_control = False
|
|
492
744
|
route = RouteResolution(
|
|
@@ -497,6 +749,11 @@ class GatewayClient:
|
|
|
497
749
|
params["target_agent_type"] = availability.selected_agent_type
|
|
498
750
|
command.header.target_agent_type = availability.selected_agent_type
|
|
499
751
|
except LookupError as err:
|
|
752
|
+
self._end_langfuse_client_dispatch_observation(
|
|
753
|
+
langfuse_client_dispatch,
|
|
754
|
+
output={"success": False, "error": str(err)},
|
|
755
|
+
error=str(err),
|
|
756
|
+
)
|
|
500
757
|
return SendMessageResponse(
|
|
501
758
|
success=False,
|
|
502
759
|
status=ExecutionStatus.FAILED,
|
|
@@ -508,6 +765,11 @@ class GatewayClient:
|
|
|
508
765
|
error_code=ExecutionStatus.ERR_WORKER_NOT_ONLINE,
|
|
509
766
|
)
|
|
510
767
|
except ValueError as err:
|
|
768
|
+
self._end_langfuse_client_dispatch_observation(
|
|
769
|
+
langfuse_client_dispatch,
|
|
770
|
+
output={"success": False, "error": str(err)},
|
|
771
|
+
error=str(err),
|
|
772
|
+
)
|
|
511
773
|
return SendMessageResponse(
|
|
512
774
|
success=False,
|
|
513
775
|
status=ExecutionStatus.FAILED,
|
|
@@ -533,16 +795,44 @@ class GatewayClient:
|
|
|
533
795
|
"target_agent_type": params["target_agent_type"],
|
|
534
796
|
"stream_name": route.stream_name,
|
|
535
797
|
"status": "QUEUED",
|
|
798
|
+
"route_policy": route_policy,
|
|
799
|
+
"route_status": availability.status
|
|
800
|
+
if not target_worker_id
|
|
801
|
+
else "DIRECT_WORKER",
|
|
802
|
+
"selected_agent_type": availability.selected_agent_type
|
|
803
|
+
if not target_worker_id
|
|
804
|
+
else "",
|
|
805
|
+
"availability_error_code": availability.error_code
|
|
806
|
+
if not target_worker_id
|
|
807
|
+
else "",
|
|
808
|
+
"availability_error": availability.error
|
|
809
|
+
if not target_worker_id
|
|
810
|
+
else "",
|
|
536
811
|
}
|
|
537
812
|
)
|
|
538
813
|
except Exception: # pylint: disable=broad-exception-caught
|
|
539
814
|
pass # Fallback if registry fails
|
|
540
815
|
|
|
541
816
|
# 4. Route to the appropriate stream
|
|
817
|
+
dispatch_started_at = int(time.time() * 1000)
|
|
542
818
|
if should_dispatch_control:
|
|
543
819
|
await self.redis.xadd(route.stream_name, command.to_redis_payload())
|
|
820
|
+
await self._record_client_dispatch_span(
|
|
821
|
+
trace_id=trace_id,
|
|
822
|
+
message_id=message_id,
|
|
823
|
+
session_id=params["session_id"],
|
|
824
|
+
parent_message_id=params["parent_message_id"] or "",
|
|
825
|
+
target_agent_type=params["target_agent_type"],
|
|
826
|
+
target_worker_id=route.target_worker_id,
|
|
827
|
+
route_policy=route_policy,
|
|
828
|
+
route_status=availability.status
|
|
829
|
+
if not target_worker_id
|
|
830
|
+
else "DIRECT_WORKER",
|
|
831
|
+
start_ts=dispatch_started_at,
|
|
832
|
+
end_ts=int(time.time() * 1000),
|
|
833
|
+
)
|
|
544
834
|
|
|
545
|
-
|
|
835
|
+
response = SendMessageResponse(
|
|
546
836
|
success=True,
|
|
547
837
|
message_id=message_id,
|
|
548
838
|
trace_id=trace_id,
|
|
@@ -550,3 +840,118 @@ class GatewayClient:
|
|
|
550
840
|
timestamp=int(time.time() * 1000),
|
|
551
841
|
status=ExecutionStatus.QUEUED,
|
|
552
842
|
)
|
|
843
|
+
self._end_langfuse_client_dispatch_observation(
|
|
844
|
+
langfuse_client_dispatch,
|
|
845
|
+
output={
|
|
846
|
+
"success": True,
|
|
847
|
+
"message_id": message_id,
|
|
848
|
+
"trace_id": trace_id,
|
|
849
|
+
"target_worker_id": route.target_worker_id,
|
|
850
|
+
"status": response.status,
|
|
851
|
+
},
|
|
852
|
+
)
|
|
853
|
+
return response
|
|
854
|
+
|
|
855
|
+
def _start_langfuse_client_dispatch_observation(
|
|
856
|
+
self,
|
|
857
|
+
*,
|
|
858
|
+
trace_id: str,
|
|
859
|
+
message_id: str,
|
|
860
|
+
target_agent_type: str,
|
|
861
|
+
session_id: str,
|
|
862
|
+
user_code: str,
|
|
863
|
+
user_name: str,
|
|
864
|
+
content: Any,
|
|
865
|
+
metadata: Dict[str, Any],
|
|
866
|
+
) -> Any:
|
|
867
|
+
try:
|
|
868
|
+
from by_framework_trace_langfuse import start_client_dispatch_observation
|
|
869
|
+
|
|
870
|
+
return start_client_dispatch_observation(
|
|
871
|
+
trace_id=trace_id,
|
|
872
|
+
message_id=message_id,
|
|
873
|
+
target_agent_type=target_agent_type,
|
|
874
|
+
session_id=session_id,
|
|
875
|
+
user_code=user_code,
|
|
876
|
+
user_name=user_name,
|
|
877
|
+
content=content,
|
|
878
|
+
metadata=metadata,
|
|
879
|
+
)
|
|
880
|
+
except Exception as err: # pylint: disable=broad-exception-caught
|
|
881
|
+
logger.warning(
|
|
882
|
+
"Langfuse client.dispatch observation skipped: %s",
|
|
883
|
+
err,
|
|
884
|
+
exc_info=True,
|
|
885
|
+
)
|
|
886
|
+
return None
|
|
887
|
+
|
|
888
|
+
@staticmethod
|
|
889
|
+
def _end_langfuse_client_dispatch_observation(
|
|
890
|
+
observation: Any,
|
|
891
|
+
*,
|
|
892
|
+
output: Any,
|
|
893
|
+
error: str = "",
|
|
894
|
+
) -> None:
|
|
895
|
+
if observation is None:
|
|
896
|
+
return
|
|
897
|
+
try:
|
|
898
|
+
if error and hasattr(observation, "update"):
|
|
899
|
+
observation.update(level="ERROR", status_message=error)
|
|
900
|
+
observation.end(output=output)
|
|
901
|
+
except TypeError:
|
|
902
|
+
try:
|
|
903
|
+
observation.update(output=output)
|
|
904
|
+
observation.end()
|
|
905
|
+
except Exception: # pylint: disable=broad-exception-caught
|
|
906
|
+
pass
|
|
907
|
+
except Exception: # pylint: disable=broad-exception-caught
|
|
908
|
+
pass
|
|
909
|
+
|
|
910
|
+
async def _record_client_dispatch_span(
|
|
911
|
+
self,
|
|
912
|
+
*,
|
|
913
|
+
trace_id: str,
|
|
914
|
+
message_id: str,
|
|
915
|
+
session_id: str,
|
|
916
|
+
parent_message_id: str,
|
|
917
|
+
target_agent_type: str,
|
|
918
|
+
target_worker_id: str,
|
|
919
|
+
route_policy: str,
|
|
920
|
+
route_status: str,
|
|
921
|
+
start_ts: int,
|
|
922
|
+
end_ts: int,
|
|
923
|
+
) -> None:
|
|
924
|
+
try:
|
|
925
|
+
logger.info(
|
|
926
|
+
"Recording client dispatch span: message_id=%s, trace_id=%s",
|
|
927
|
+
message_id,
|
|
928
|
+
trace_id,
|
|
929
|
+
)
|
|
930
|
+
await self.span_recorder.record_span(
|
|
931
|
+
TraceSpan(
|
|
932
|
+
trace_id=trace_id,
|
|
933
|
+
span_id=f"{message_id}:client.dispatch",
|
|
934
|
+
parent_span_id="",
|
|
935
|
+
operation="client.dispatch",
|
|
936
|
+
component="client",
|
|
937
|
+
start_ts=start_ts,
|
|
938
|
+
end_ts=end_ts,
|
|
939
|
+
status="COMPLETED",
|
|
940
|
+
session_id=session_id,
|
|
941
|
+
message_id=message_id,
|
|
942
|
+
parent_message_id=parent_message_id,
|
|
943
|
+
worker_id=target_worker_id,
|
|
944
|
+
source_agent_type="client",
|
|
945
|
+
target_agent_type=target_agent_type,
|
|
946
|
+
route_policy=route_policy,
|
|
947
|
+
route_status=route_status,
|
|
948
|
+
)
|
|
949
|
+
)
|
|
950
|
+
logger.info(
|
|
951
|
+
"Client dispatch span recorded successfully for message_id=%s",
|
|
952
|
+
message_id,
|
|
953
|
+
)
|
|
954
|
+
except Exception as err: # pylint: disable=broad-exception-caught
|
|
955
|
+
logger.warning(
|
|
956
|
+
"Failed to record client dispatch span: %s", err, exc_info=True
|
|
957
|
+
)
|
by_framework/common/constants.py
CHANGED
|
@@ -92,6 +92,36 @@ class RedisKeys:
|
|
|
92
92
|
"""Session-level data stream. Workers push streaming content here."""
|
|
93
93
|
return f"byai_gateway:session:{session_id}:data_stream"
|
|
94
94
|
|
|
95
|
+
@staticmethod
|
|
96
|
+
def session_data_checkpoint(session_id: str, consumer_name: str) -> str:
|
|
97
|
+
"""Checkpoint key storing a consumer's last processed data stream ID."""
|
|
98
|
+
return f"byai_gateway:session:{session_id}:consumer:{consumer_name}:checkpoint"
|
|
99
|
+
|
|
100
|
+
@staticmethod
|
|
101
|
+
def trace_meta(trace_id: str) -> str:
|
|
102
|
+
"""Hash storing trace-level metadata for observability."""
|
|
103
|
+
return f"by_framework:trace:{trace_id}"
|
|
104
|
+
|
|
105
|
+
@staticmethod
|
|
106
|
+
def trace_spans(trace_id: str) -> str:
|
|
107
|
+
"""List storing trace span JSON payloads ordered by write time."""
|
|
108
|
+
return f"by_framework:trace:spans:{trace_id}"
|
|
109
|
+
|
|
110
|
+
@staticmethod
|
|
111
|
+
def trace_index_session(session_id: str) -> str:
|
|
112
|
+
"""Sorted Set index from session_id to trace IDs."""
|
|
113
|
+
return f"by_framework:trace:idx:session:{session_id}"
|
|
114
|
+
|
|
115
|
+
@staticmethod
|
|
116
|
+
def trace_index_worker(worker_id: str) -> str:
|
|
117
|
+
"""Sorted Set index from worker_id to trace IDs."""
|
|
118
|
+
return f"by_framework:trace:idx:worker:{worker_id}"
|
|
119
|
+
|
|
120
|
+
@staticmethod
|
|
121
|
+
def trace_index_agent(agent_type: str) -> str:
|
|
122
|
+
"""Sorted Set index from agent type to trace IDs."""
|
|
123
|
+
return f"by_framework:trace:idx:agent:{agent_type}"
|
|
124
|
+
|
|
95
125
|
@staticmethod
|
|
96
126
|
def task_group(group_id: str) -> str:
|
|
97
127
|
"""Task group progress tracking Hash Key."""
|