by-framework 0.2.2.dev0__py3-none-any.whl → 0.2.2.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- by_framework/__init__.py +2 -0
- by_framework/client/__init__.py +2 -1
- by_framework/client/client.py +406 -5
- by_framework/common/constants.py +30 -0
- by_framework/common/logger.py +71 -10
- by_framework/metrics/__init__.py +218 -0
- by_framework/metrics/read_client.py +257 -0
- by_framework/metrics/snapshot.py +2294 -0
- by_framework/trace/__init__.py +55 -0
- by_framework/trace/external_trace.py +148 -0
- by_framework/trace/span_recorder.py +901 -0
- by_framework/trace/trace_schema.py +329 -0
- by_framework/trace/trace_writer.py +157 -0
- by_framework/worker/_control_handling.py +30 -0
- by_framework/worker/context.py +433 -17
- by_framework/worker/runner.py +258 -10
- by_framework/worker/worker.py +135 -11
- {by_framework-0.2.2.dev0.dist-info → by_framework-0.2.2.dev2.dist-info}/METADATA +34 -2
- by_framework-0.2.2.dev2.dist-info/RECORD +49 -0
- {by_framework-0.2.2.dev0.dist-info → by_framework-0.2.2.dev2.dist-info}/WHEEL +1 -1
- by_framework/core/__init__.py +0 -95
- by_framework/core/availability.py +0 -495
- by_framework/core/delivery_gate.py +0 -60
- by_framework/core/discovery.py +0 -359
- by_framework/core/extensions/__init__.py +0 -35
- by_framework/core/extensions/agent_config.py +0 -64
- by_framework/core/extensions/plugin.py +0 -282
- by_framework/core/extensions/registry.py +0 -653
- by_framework/core/extensions/trace_provider.py +0 -20
- by_framework/core/protocol/__init__.py +0 -133
- by_framework/core/protocol/action_type.py +0 -33
- by_framework/core/protocol/agent_state.py +0 -78
- by_framework/core/protocol/byai_codec.py +0 -101
- by_framework/core/protocol/byai_command.py +0 -53
- by_framework/core/protocol/byai_types.py +0 -7
- by_framework/core/protocol/commands.py +0 -285
- by_framework/core/protocol/content_codec.py +0 -17
- by_framework/core/protocol/content_type.py +0 -38
- by_framework/core/protocol/data_message.py +0 -45
- by_framework/core/protocol/data_shapes.py +0 -83
- by_framework/core/protocol/event_type.py +0 -34
- by_framework/core/protocol/events.py +0 -69
- by_framework/core/protocol/message.py +0 -99
- by_framework/core/protocol/message_header.py +0 -68
- by_framework/core/protocol/responses.py +0 -94
- by_framework/core/protocol/results.py +0 -149
- by_framework/core/registry.py +0 -1025
- by_framework/core/runtime/__init__.py +0 -29
- by_framework/core/runtime/agent_config_manager.py +0 -283
- by_framework/core/runtime/agent_runtime_state.py +0 -75
- by_framework/core/runtime/file_manager.py +0 -437
- by_framework/core/runtime/file_paths.py +0 -76
- by_framework/core/runtime/file_permissions.py +0 -71
- by_framework/core/runtime/filestore/__init__.py +0 -15
- by_framework/core/runtime/filestore/base.py +0 -140
- by_framework/core/runtime/filestore/local.py +0 -321
- by_framework/core/runtime/history/__init__.py +0 -10
- by_framework/core/runtime/history/base.py +0 -57
- by_framework/core/runtime/history/history_manager.py +0 -55
- by_framework/core/runtime/history/in_memory.py +0 -58
- by_framework/core/runtime/session_manager.py +0 -118
- by_framework/core/wakeup_controller.py +0 -151
- by_framework/core/workspace.py +0 -126
- by_framework-0.2.2.dev0.dist-info/RECORD +0 -84
- {by_framework-0.2.2.dev0.dist-info → by_framework-0.2.2.dev2.dist-info}/licenses/LICENSE +0 -0
by_framework/__init__.py
CHANGED
|
@@ -8,6 +8,7 @@ from `GatewayWorker` and running `run_worker`.
|
|
|
8
8
|
from .client.byai_client import ByaiGatewayClient
|
|
9
9
|
from .client.client import (
|
|
10
10
|
CancelTaskResponse,
|
|
11
|
+
DataStreamEntry,
|
|
11
12
|
GatewayClient,
|
|
12
13
|
GatewayInterceptor,
|
|
13
14
|
SendMessageResponse,
|
|
@@ -140,6 +141,7 @@ __all__ = [
|
|
|
140
141
|
"GatewayClient",
|
|
141
142
|
"ByaiGatewayClient",
|
|
142
143
|
"GatewayInterceptor",
|
|
144
|
+
"DataStreamEntry",
|
|
143
145
|
"SendMessageResponse",
|
|
144
146
|
"CancelTaskResponse",
|
|
145
147
|
"run_worker",
|
by_framework/client/__init__.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
"""Client module for Gateway communication."""
|
|
2
2
|
|
|
3
3
|
from .byai_client import ByaiGatewayClient
|
|
4
|
-
from .client import GatewayClient, GatewayInterceptor
|
|
4
|
+
from .client import DataStreamEntry, GatewayClient, GatewayInterceptor
|
|
5
5
|
|
|
6
6
|
__all__ = [
|
|
7
7
|
"GatewayClient",
|
|
8
8
|
"ByaiGatewayClient",
|
|
9
9
|
"GatewayInterceptor",
|
|
10
|
+
"DataStreamEntry",
|
|
10
11
|
]
|
by_framework/client/client.py
CHANGED
|
@@ -9,7 +9,8 @@ import json
|
|
|
9
9
|
import time
|
|
10
10
|
import uuid
|
|
11
11
|
from dataclasses import dataclass
|
|
12
|
-
from
|
|
12
|
+
from dataclasses import fields as dataclass_fields
|
|
13
|
+
from typing import (TYPE_CHECKING, Any, AsyncIterator, Dict, List, Optional, Protocol)
|
|
13
14
|
|
|
14
15
|
from by_framework.common.constants import (
|
|
15
16
|
CANCEL_MESSAGE_ID_PREFIX,
|
|
@@ -17,6 +18,7 @@ from by_framework.common.constants import (
|
|
|
17
18
|
MESSAGE_ID_PREFIX,
|
|
18
19
|
RedisKeys,
|
|
19
20
|
)
|
|
21
|
+
from by_framework.common.logger import logger
|
|
20
22
|
from by_framework.common.redis_client import Redis, get_redis
|
|
21
23
|
from by_framework.core.availability import (
|
|
22
24
|
AvailabilityRouter,
|
|
@@ -32,6 +34,7 @@ from by_framework.core.protocol.commands import (
|
|
|
32
34
|
ReloadPluginsCommand,
|
|
33
35
|
ResumeCommand,
|
|
34
36
|
)
|
|
37
|
+
from by_framework.core.protocol.data_message import DataMessage
|
|
35
38
|
from by_framework.core.protocol.message_header import MessageHeader
|
|
36
39
|
from by_framework.core.protocol.responses import (
|
|
37
40
|
CancelTaskResponse,
|
|
@@ -40,6 +43,7 @@ from by_framework.core.protocol.responses import (
|
|
|
40
43
|
)
|
|
41
44
|
from by_framework.core.registry import WorkerRegistry
|
|
42
45
|
from by_framework.errors import WorkerRegistryNotSetError
|
|
46
|
+
from by_framework.trace.span_recorder import (SpanRecorder, TraceSpan, str_to_uint64)
|
|
43
47
|
|
|
44
48
|
if TYPE_CHECKING:
|
|
45
49
|
pass
|
|
@@ -59,6 +63,14 @@ class RouteResolution:
|
|
|
59
63
|
target_worker_id: str = ""
|
|
60
64
|
|
|
61
65
|
|
|
66
|
+
@dataclass(frozen=True)
|
|
67
|
+
class DataStreamEntry:
|
|
68
|
+
"""A decoded entry from a session data stream."""
|
|
69
|
+
|
|
70
|
+
stream_id: str
|
|
71
|
+
message: DataMessage
|
|
72
|
+
|
|
73
|
+
|
|
62
74
|
class GatewayClient:
|
|
63
75
|
"""Gateway client for sending messages and cancel requests to Gateway workers.
|
|
64
76
|
|
|
@@ -76,16 +88,183 @@ class GatewayClient:
|
|
|
76
88
|
registry: Optional[WorkerRegistry] = None,
|
|
77
89
|
redis_client: Optional[Redis] = None,
|
|
78
90
|
interceptors: Optional[List[GatewayInterceptor]] = None,
|
|
91
|
+
span_recorder: Optional[SpanRecorder] = None,
|
|
79
92
|
):
|
|
80
93
|
self.registry = registry
|
|
81
94
|
self.redis = (
|
|
82
95
|
redis_client or (registry.redis if registry else None) or get_redis()
|
|
83
96
|
)
|
|
84
97
|
self.interceptors = interceptors or []
|
|
98
|
+
self.span_recorder = span_recorder or SpanRecorder(self.redis)
|
|
85
99
|
|
|
86
100
|
def add_interceptor(self, interceptor: GatewayInterceptor):
|
|
87
101
|
self.interceptors.append(interceptor)
|
|
88
102
|
|
|
103
|
+
@staticmethod
|
|
104
|
+
def _decode_redis_value(value: Any) -> Any:
|
|
105
|
+
"""Decode Redis bytes values while preserving already-decoded clients."""
|
|
106
|
+
if isinstance(value, bytes):
|
|
107
|
+
return value.decode("utf-8")
|
|
108
|
+
return value
|
|
109
|
+
|
|
110
|
+
@classmethod
|
|
111
|
+
def _decode_data_stream_entry(
|
|
112
|
+
cls, stream_id: Any, fields: Dict[Any, Any]
|
|
113
|
+
) -> DataStreamEntry:
|
|
114
|
+
raw = fields.get(b"data")
|
|
115
|
+
if raw is None:
|
|
116
|
+
raw = fields.get("data")
|
|
117
|
+
if raw is None:
|
|
118
|
+
raise ValueError("data stream entry missing 'data' field")
|
|
119
|
+
|
|
120
|
+
payload = json.loads(cls._decode_redis_value(raw))
|
|
121
|
+
data_message_fields = {field.name for field in dataclass_fields(DataMessage)}
|
|
122
|
+
return DataStreamEntry(
|
|
123
|
+
stream_id=cls._decode_redis_value(stream_id),
|
|
124
|
+
message=DataMessage(
|
|
125
|
+
**{
|
|
126
|
+
key: value
|
|
127
|
+
for key, value in payload.items()
|
|
128
|
+
if key in data_message_fields
|
|
129
|
+
}
|
|
130
|
+
),
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
async def read_data_messages(
|
|
134
|
+
self,
|
|
135
|
+
session_id: str,
|
|
136
|
+
last_id: str = "0-0",
|
|
137
|
+
block_ms: int = 0,
|
|
138
|
+
count: int = 100,
|
|
139
|
+
) -> List[DataStreamEntry]:
|
|
140
|
+
"""Read decoded messages from the session data stream.
|
|
141
|
+
|
|
142
|
+
Pass the last returned ``stream_id`` as ``last_id`` to continue from
|
|
143
|
+
the next entry. ``block_ms`` is passed to Redis XREAD; ``0`` means
|
|
144
|
+
block indefinitely on standard Redis clients.
|
|
145
|
+
"""
|
|
146
|
+
stream_name = RedisKeys.session_data_stream(session_id)
|
|
147
|
+
messages = await self.redis.xread(
|
|
148
|
+
streams={stream_name: last_id},
|
|
149
|
+
count=count,
|
|
150
|
+
block=block_ms,
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
results: List[DataStreamEntry] = []
|
|
154
|
+
for _, msg_list in messages or []:
|
|
155
|
+
for stream_id, fields in msg_list:
|
|
156
|
+
results.append(self._decode_data_stream_entry(stream_id, fields))
|
|
157
|
+
return results
|
|
158
|
+
|
|
159
|
+
async def get_data_message_checkpoint(
|
|
160
|
+
self,
|
|
161
|
+
session_id: str,
|
|
162
|
+
consumer_name: str,
|
|
163
|
+
) -> str:
|
|
164
|
+
"""Return the last committed data stream ID for a named consumer."""
|
|
165
|
+
checkpoint = await self.redis.get(
|
|
166
|
+
RedisKeys.session_data_checkpoint(session_id, consumer_name)
|
|
167
|
+
)
|
|
168
|
+
if checkpoint is None:
|
|
169
|
+
return "0-0"
|
|
170
|
+
return self._decode_redis_value(checkpoint)
|
|
171
|
+
|
|
172
|
+
async def commit_data_message(
|
|
173
|
+
self,
|
|
174
|
+
session_id: str,
|
|
175
|
+
stream_id: str,
|
|
176
|
+
consumer_name: str,
|
|
177
|
+
) -> None:
|
|
178
|
+
"""Commit a data stream ID as processed for a named consumer."""
|
|
179
|
+
await self.redis.set(
|
|
180
|
+
RedisKeys.session_data_checkpoint(session_id, consumer_name),
|
|
181
|
+
stream_id,
|
|
182
|
+
ex=RedisKeys.DEFAULT_SESSION_TTL,
|
|
183
|
+
)
|
|
184
|
+
|
|
185
|
+
async def read_data_messages_from_checkpoint(
|
|
186
|
+
self,
|
|
187
|
+
session_id: str,
|
|
188
|
+
consumer_name: str,
|
|
189
|
+
block_ms: int = 0,
|
|
190
|
+
count: int = 100,
|
|
191
|
+
auto_commit: bool = False,
|
|
192
|
+
) -> List[DataStreamEntry]:
|
|
193
|
+
"""Read messages starting after a named consumer's committed checkpoint."""
|
|
194
|
+
last_id = await self.get_data_message_checkpoint(session_id, consumer_name)
|
|
195
|
+
entries = await self.read_data_messages(
|
|
196
|
+
session_id=session_id,
|
|
197
|
+
last_id=last_id,
|
|
198
|
+
block_ms=block_ms,
|
|
199
|
+
count=count,
|
|
200
|
+
)
|
|
201
|
+
if auto_commit and entries:
|
|
202
|
+
await self.commit_data_message(
|
|
203
|
+
session_id=session_id,
|
|
204
|
+
stream_id=entries[-1].stream_id,
|
|
205
|
+
consumer_name=consumer_name,
|
|
206
|
+
)
|
|
207
|
+
return entries
|
|
208
|
+
|
|
209
|
+
async def iter_data_messages(
|
|
210
|
+
self,
|
|
211
|
+
session_id: str,
|
|
212
|
+
last_id: str = "$",
|
|
213
|
+
block_ms: int = 5000,
|
|
214
|
+
count: int = 100,
|
|
215
|
+
) -> AsyncIterator[DataStreamEntry]:
|
|
216
|
+
"""Continuously consume decoded messages from the session data stream.
|
|
217
|
+
|
|
218
|
+
The iterator does not stop on its own. Callers should break when their
|
|
219
|
+
business-level terminal event is observed.
|
|
220
|
+
"""
|
|
221
|
+
current_id = last_id
|
|
222
|
+
while True:
|
|
223
|
+
entries = await self.read_data_messages(
|
|
224
|
+
session_id=session_id,
|
|
225
|
+
last_id=current_id,
|
|
226
|
+
block_ms=block_ms,
|
|
227
|
+
count=count,
|
|
228
|
+
)
|
|
229
|
+
for entry in entries:
|
|
230
|
+
current_id = entry.stream_id
|
|
231
|
+
yield entry
|
|
232
|
+
|
|
233
|
+
async def consume_data_messages(
|
|
234
|
+
self,
|
|
235
|
+
session_id: str,
|
|
236
|
+
consumer_name: str,
|
|
237
|
+
block_ms: int = 5000,
|
|
238
|
+
count: int = 100,
|
|
239
|
+
) -> AsyncIterator[DataStreamEntry]:
|
|
240
|
+
"""Continuously consume data stream messages with checkpoint commits.
|
|
241
|
+
|
|
242
|
+
Each entry is committed after the caller's loop body completes and asks
|
|
243
|
+
for the next item. If processing fails or the iterator is closed before
|
|
244
|
+
the next item, the current entry is not committed and will be retried
|
|
245
|
+
from the checkpoint on the next consumer run. The iterator does not stop
|
|
246
|
+
on its own; callers should break on their terminal event.
|
|
247
|
+
"""
|
|
248
|
+
current_id = await self.get_data_message_checkpoint(
|
|
249
|
+
session_id=session_id,
|
|
250
|
+
consumer_name=consumer_name,
|
|
251
|
+
)
|
|
252
|
+
while True:
|
|
253
|
+
entries = await self.read_data_messages(
|
|
254
|
+
session_id=session_id,
|
|
255
|
+
last_id=current_id,
|
|
256
|
+
block_ms=block_ms,
|
|
257
|
+
count=count,
|
|
258
|
+
)
|
|
259
|
+
for entry in entries:
|
|
260
|
+
yield entry
|
|
261
|
+
await self.commit_data_message(
|
|
262
|
+
session_id=session_id,
|
|
263
|
+
stream_id=entry.stream_id,
|
|
264
|
+
consumer_name=consumer_name,
|
|
265
|
+
)
|
|
266
|
+
current_id = entry.stream_id
|
|
267
|
+
|
|
89
268
|
async def reload_plugins_for_agent_type(
|
|
90
269
|
self,
|
|
91
270
|
agent_type: str,
|
|
@@ -341,11 +520,16 @@ class GatewayClient:
|
|
|
341
520
|
)
|
|
342
521
|
|
|
343
522
|
if node_worker_id:
|
|
523
|
+
node_trace_id = (
|
|
524
|
+
node.get("trace_id")
|
|
525
|
+
or execution.get("trace_id")
|
|
526
|
+
or uuid.uuid4().hex
|
|
527
|
+
)
|
|
344
528
|
cancel_command = CancelTaskCommand(
|
|
345
529
|
header=MessageHeader(
|
|
346
530
|
message_id=f"{CANCEL_MESSAGE_ID_PREFIX}{uuid.uuid4().hex[:8]}",
|
|
347
531
|
session_id=session_id,
|
|
348
|
-
trace_id=
|
|
532
|
+
trace_id=node_trace_id,
|
|
349
533
|
target_agent_type=node.get("target_agent_type", ""),
|
|
350
534
|
parent_message_id=node_message_id,
|
|
351
535
|
),
|
|
@@ -424,6 +608,32 @@ class GatewayClient:
|
|
|
424
608
|
if not trace_id:
|
|
425
609
|
trace_id = uuid.uuid4().hex
|
|
426
610
|
|
|
611
|
+
metadata = dict(params.get("metadata", {}) or {})
|
|
612
|
+
trace_parent_span_id = metadata.pop("trace_parent_span_id", "")
|
|
613
|
+
langfuse_parent_observation_id = metadata.pop(
|
|
614
|
+
"langfuse_parent_observation_id", ""
|
|
615
|
+
)
|
|
616
|
+
if not trace_parent_span_id:
|
|
617
|
+
trace_parent_span_id = (
|
|
618
|
+
f"{str_to_uint64(f'{message_id}:client.dispatch'):016x}"
|
|
619
|
+
)
|
|
620
|
+
|
|
621
|
+
langfuse_client_dispatch = None
|
|
622
|
+
if not params["parent_message_id"]:
|
|
623
|
+
langfuse_client_dispatch = self._start_langfuse_client_dispatch_observation(
|
|
624
|
+
trace_id=trace_id,
|
|
625
|
+
message_id=message_id,
|
|
626
|
+
target_agent_type=params["target_agent_type"],
|
|
627
|
+
session_id=params["session_id"],
|
|
628
|
+
user_code=params["user_code"],
|
|
629
|
+
user_name=params["user_name"],
|
|
630
|
+
content=params["content"],
|
|
631
|
+
metadata=metadata,
|
|
632
|
+
)
|
|
633
|
+
observation_id = getattr(langfuse_client_dispatch, "id", "")
|
|
634
|
+
if observation_id:
|
|
635
|
+
langfuse_parent_observation_id = observation_id
|
|
636
|
+
|
|
427
637
|
header = MessageHeader(
|
|
428
638
|
message_id=message_id,
|
|
429
639
|
session_id=params["session_id"],
|
|
@@ -432,7 +642,9 @@ class GatewayClient:
|
|
|
432
642
|
parent_message_id=params["parent_message_id"],
|
|
433
643
|
user_code=params["user_code"],
|
|
434
644
|
user_name=params["user_name"],
|
|
435
|
-
metadata=
|
|
645
|
+
metadata=metadata,
|
|
646
|
+
trace_parent_span_id=trace_parent_span_id,
|
|
647
|
+
langfuse_parent_observation_id=langfuse_parent_observation_id,
|
|
436
648
|
)
|
|
437
649
|
command = self._build_gateway_command(
|
|
438
650
|
action_type=params["action_type"],
|
|
@@ -451,6 +663,7 @@ class GatewayClient:
|
|
|
451
663
|
route_policy != RoutePolicy.SEND_ANYWAY,
|
|
452
664
|
)
|
|
453
665
|
else:
|
|
666
|
+
avail_start_ms = int(time.time() * 1000)
|
|
454
667
|
availability = await AvailabilityRouter(
|
|
455
668
|
self.redis, self.registry
|
|
456
669
|
).prepare_delivery(
|
|
@@ -470,13 +683,42 @@ class GatewayClient:
|
|
|
470
683
|
metadata=params["metadata"],
|
|
471
684
|
)
|
|
472
685
|
)
|
|
686
|
+
try:
|
|
687
|
+
from by_framework.metrics import record_availability_metrics
|
|
688
|
+
|
|
689
|
+
record_availability_metrics(
|
|
690
|
+
agent_type=params["target_agent_type"],
|
|
691
|
+
policy=route_policy,
|
|
692
|
+
status=availability.status,
|
|
693
|
+
routing_ms=float(int(time.time() * 1000) - avail_start_ms),
|
|
694
|
+
)
|
|
695
|
+
except Exception: # pylint: disable=broad-exception-caught
|
|
696
|
+
pass
|
|
473
697
|
if availability.status not in (
|
|
474
698
|
AvailabilityStatus.DELIVER_NOW,
|
|
475
699
|
AvailabilityStatus.WAIT_AND_DELIVER,
|
|
476
700
|
AvailabilityStatus.FALLBACK_TO_OTHER_AGENT_TYPE,
|
|
477
701
|
AvailabilityStatus.QUEUE_PENDING,
|
|
478
702
|
):
|
|
479
|
-
|
|
703
|
+
if self.registry and hasattr(
|
|
704
|
+
self.registry, "record_failed_route_decision"
|
|
705
|
+
):
|
|
706
|
+
await self.registry.record_failed_route_decision(
|
|
707
|
+
execution_id=execution_id,
|
|
708
|
+
message_id=message_id,
|
|
709
|
+
session_id=params["session_id"],
|
|
710
|
+
trace_id=trace_id,
|
|
711
|
+
target_agent_type=params["target_agent_type"],
|
|
712
|
+
parent_message_id=params["parent_message_id"] or "",
|
|
713
|
+
source_agent_type="client",
|
|
714
|
+
route_policy=route_policy,
|
|
715
|
+
route_status=availability.status,
|
|
716
|
+
stream_name=availability.stream_name or "",
|
|
717
|
+
selected_agent_type=availability.selected_agent_type or "",
|
|
718
|
+
availability_error_code=availability.error_code or "",
|
|
719
|
+
availability_error=availability.error or "",
|
|
720
|
+
)
|
|
721
|
+
response = SendMessageResponse(
|
|
480
722
|
success=False,
|
|
481
723
|
status=ExecutionStatus.FAILED,
|
|
482
724
|
message_id="",
|
|
@@ -487,6 +729,12 @@ class GatewayClient:
|
|
|
487
729
|
error_code=availability.error_code
|
|
488
730
|
or ExecutionStatus.ERR_AGENT_TYPE_UNAVAILABLE,
|
|
489
731
|
)
|
|
732
|
+
self._end_langfuse_client_dispatch_observation(
|
|
733
|
+
langfuse_client_dispatch,
|
|
734
|
+
output={"success": False, "error": availability.error},
|
|
735
|
+
error=availability.error,
|
|
736
|
+
)
|
|
737
|
+
return response
|
|
490
738
|
if availability.status == AvailabilityStatus.QUEUE_PENDING:
|
|
491
739
|
should_dispatch_control = False
|
|
492
740
|
route = RouteResolution(
|
|
@@ -497,6 +745,11 @@ class GatewayClient:
|
|
|
497
745
|
params["target_agent_type"] = availability.selected_agent_type
|
|
498
746
|
command.header.target_agent_type = availability.selected_agent_type
|
|
499
747
|
except LookupError as err:
|
|
748
|
+
self._end_langfuse_client_dispatch_observation(
|
|
749
|
+
langfuse_client_dispatch,
|
|
750
|
+
output={"success": False, "error": str(err)},
|
|
751
|
+
error=str(err),
|
|
752
|
+
)
|
|
500
753
|
return SendMessageResponse(
|
|
501
754
|
success=False,
|
|
502
755
|
status=ExecutionStatus.FAILED,
|
|
@@ -508,6 +761,11 @@ class GatewayClient:
|
|
|
508
761
|
error_code=ExecutionStatus.ERR_WORKER_NOT_ONLINE,
|
|
509
762
|
)
|
|
510
763
|
except ValueError as err:
|
|
764
|
+
self._end_langfuse_client_dispatch_observation(
|
|
765
|
+
langfuse_client_dispatch,
|
|
766
|
+
output={"success": False, "error": str(err)},
|
|
767
|
+
error=str(err),
|
|
768
|
+
)
|
|
511
769
|
return SendMessageResponse(
|
|
512
770
|
success=False,
|
|
513
771
|
status=ExecutionStatus.FAILED,
|
|
@@ -533,16 +791,44 @@ class GatewayClient:
|
|
|
533
791
|
"target_agent_type": params["target_agent_type"],
|
|
534
792
|
"stream_name": route.stream_name,
|
|
535
793
|
"status": "QUEUED",
|
|
794
|
+
"route_policy": route_policy,
|
|
795
|
+
"route_status": availability.status
|
|
796
|
+
if not target_worker_id
|
|
797
|
+
else "DIRECT_WORKER",
|
|
798
|
+
"selected_agent_type": availability.selected_agent_type
|
|
799
|
+
if not target_worker_id
|
|
800
|
+
else "",
|
|
801
|
+
"availability_error_code": availability.error_code
|
|
802
|
+
if not target_worker_id
|
|
803
|
+
else "",
|
|
804
|
+
"availability_error": availability.error
|
|
805
|
+
if not target_worker_id
|
|
806
|
+
else "",
|
|
536
807
|
}
|
|
537
808
|
)
|
|
538
809
|
except Exception: # pylint: disable=broad-exception-caught
|
|
539
810
|
pass # Fallback if registry fails
|
|
540
811
|
|
|
541
812
|
# 4. Route to the appropriate stream
|
|
813
|
+
dispatch_started_at = int(time.time() * 1000)
|
|
542
814
|
if should_dispatch_control:
|
|
543
815
|
await self.redis.xadd(route.stream_name, command.to_redis_payload())
|
|
816
|
+
await self._record_client_dispatch_span(
|
|
817
|
+
trace_id=trace_id,
|
|
818
|
+
message_id=message_id,
|
|
819
|
+
session_id=params["session_id"],
|
|
820
|
+
parent_message_id=params["parent_message_id"] or "",
|
|
821
|
+
target_agent_type=params["target_agent_type"],
|
|
822
|
+
target_worker_id=route.target_worker_id,
|
|
823
|
+
route_policy=route_policy,
|
|
824
|
+
route_status=availability.status
|
|
825
|
+
if not target_worker_id
|
|
826
|
+
else "DIRECT_WORKER",
|
|
827
|
+
start_ts=dispatch_started_at,
|
|
828
|
+
end_ts=int(time.time() * 1000),
|
|
829
|
+
)
|
|
544
830
|
|
|
545
|
-
|
|
831
|
+
response = SendMessageResponse(
|
|
546
832
|
success=True,
|
|
547
833
|
message_id=message_id,
|
|
548
834
|
trace_id=trace_id,
|
|
@@ -550,3 +836,118 @@ class GatewayClient:
|
|
|
550
836
|
timestamp=int(time.time() * 1000),
|
|
551
837
|
status=ExecutionStatus.QUEUED,
|
|
552
838
|
)
|
|
839
|
+
self._end_langfuse_client_dispatch_observation(
|
|
840
|
+
langfuse_client_dispatch,
|
|
841
|
+
output={
|
|
842
|
+
"success": True,
|
|
843
|
+
"message_id": message_id,
|
|
844
|
+
"trace_id": trace_id,
|
|
845
|
+
"target_worker_id": route.target_worker_id,
|
|
846
|
+
"status": response.status,
|
|
847
|
+
},
|
|
848
|
+
)
|
|
849
|
+
return response
|
|
850
|
+
|
|
851
|
+
def _start_langfuse_client_dispatch_observation(
|
|
852
|
+
self,
|
|
853
|
+
*,
|
|
854
|
+
trace_id: str,
|
|
855
|
+
message_id: str,
|
|
856
|
+
target_agent_type: str,
|
|
857
|
+
session_id: str,
|
|
858
|
+
user_code: str,
|
|
859
|
+
user_name: str,
|
|
860
|
+
content: Any,
|
|
861
|
+
metadata: Dict[str, Any],
|
|
862
|
+
) -> Any:
|
|
863
|
+
try:
|
|
864
|
+
from by_framework_trace_langfuse import start_client_dispatch_observation
|
|
865
|
+
|
|
866
|
+
return start_client_dispatch_observation(
|
|
867
|
+
trace_id=trace_id,
|
|
868
|
+
message_id=message_id,
|
|
869
|
+
target_agent_type=target_agent_type,
|
|
870
|
+
session_id=session_id,
|
|
871
|
+
user_code=user_code,
|
|
872
|
+
user_name=user_name,
|
|
873
|
+
content=content,
|
|
874
|
+
metadata=metadata,
|
|
875
|
+
)
|
|
876
|
+
except Exception as err: # pylint: disable=broad-exception-caught
|
|
877
|
+
logger.warning(
|
|
878
|
+
"Langfuse client.dispatch observation skipped: %s",
|
|
879
|
+
err,
|
|
880
|
+
exc_info=True,
|
|
881
|
+
)
|
|
882
|
+
return None
|
|
883
|
+
|
|
884
|
+
@staticmethod
|
|
885
|
+
def _end_langfuse_client_dispatch_observation(
|
|
886
|
+
observation: Any,
|
|
887
|
+
*,
|
|
888
|
+
output: Any,
|
|
889
|
+
error: str = "",
|
|
890
|
+
) -> None:
|
|
891
|
+
if observation is None:
|
|
892
|
+
return
|
|
893
|
+
try:
|
|
894
|
+
if error and hasattr(observation, "update"):
|
|
895
|
+
observation.update(level="ERROR", status_message=error)
|
|
896
|
+
observation.end(output=output)
|
|
897
|
+
except TypeError:
|
|
898
|
+
try:
|
|
899
|
+
observation.update(output=output)
|
|
900
|
+
observation.end()
|
|
901
|
+
except Exception: # pylint: disable=broad-exception-caught
|
|
902
|
+
pass
|
|
903
|
+
except Exception: # pylint: disable=broad-exception-caught
|
|
904
|
+
pass
|
|
905
|
+
|
|
906
|
+
async def _record_client_dispatch_span(
|
|
907
|
+
self,
|
|
908
|
+
*,
|
|
909
|
+
trace_id: str,
|
|
910
|
+
message_id: str,
|
|
911
|
+
session_id: str,
|
|
912
|
+
parent_message_id: str,
|
|
913
|
+
target_agent_type: str,
|
|
914
|
+
target_worker_id: str,
|
|
915
|
+
route_policy: str,
|
|
916
|
+
route_status: str,
|
|
917
|
+
start_ts: int,
|
|
918
|
+
end_ts: int,
|
|
919
|
+
) -> None:
|
|
920
|
+
try:
|
|
921
|
+
logger.info(
|
|
922
|
+
"Recording client dispatch span: message_id=%s, trace_id=%s",
|
|
923
|
+
message_id,
|
|
924
|
+
trace_id,
|
|
925
|
+
)
|
|
926
|
+
await self.span_recorder.record_span(
|
|
927
|
+
TraceSpan(
|
|
928
|
+
trace_id=trace_id,
|
|
929
|
+
span_id=f"{message_id}:client.dispatch",
|
|
930
|
+
parent_span_id="",
|
|
931
|
+
operation="client.dispatch",
|
|
932
|
+
component="client",
|
|
933
|
+
start_ts=start_ts,
|
|
934
|
+
end_ts=end_ts,
|
|
935
|
+
status="COMPLETED",
|
|
936
|
+
session_id=session_id,
|
|
937
|
+
message_id=message_id,
|
|
938
|
+
parent_message_id=parent_message_id,
|
|
939
|
+
worker_id=target_worker_id,
|
|
940
|
+
source_agent_type="client",
|
|
941
|
+
target_agent_type=target_agent_type,
|
|
942
|
+
route_policy=route_policy,
|
|
943
|
+
route_status=route_status,
|
|
944
|
+
)
|
|
945
|
+
)
|
|
946
|
+
logger.info(
|
|
947
|
+
"Client dispatch span recorded successfully for message_id=%s",
|
|
948
|
+
message_id,
|
|
949
|
+
)
|
|
950
|
+
except Exception as err: # pylint: disable=broad-exception-caught
|
|
951
|
+
logger.warning(
|
|
952
|
+
"Failed to record client dispatch span: %s", err, exc_info=True
|
|
953
|
+
)
|
by_framework/common/constants.py
CHANGED
|
@@ -92,6 +92,36 @@ class RedisKeys:
|
|
|
92
92
|
"""Session-level data stream. Workers push streaming content here."""
|
|
93
93
|
return f"byai_gateway:session:{session_id}:data_stream"
|
|
94
94
|
|
|
95
|
+
@staticmethod
|
|
96
|
+
def session_data_checkpoint(session_id: str, consumer_name: str) -> str:
|
|
97
|
+
"""Checkpoint key storing a consumer's last processed data stream ID."""
|
|
98
|
+
return f"byai_gateway:session:{session_id}:consumer:{consumer_name}:checkpoint"
|
|
99
|
+
|
|
100
|
+
@staticmethod
|
|
101
|
+
def trace_meta(trace_id: str) -> str:
|
|
102
|
+
"""Hash storing trace-level metadata for observability."""
|
|
103
|
+
return f"by_framework:trace:{trace_id}"
|
|
104
|
+
|
|
105
|
+
@staticmethod
|
|
106
|
+
def trace_spans(trace_id: str) -> str:
|
|
107
|
+
"""List storing trace span JSON payloads ordered by write time."""
|
|
108
|
+
return f"by_framework:trace:spans:{trace_id}"
|
|
109
|
+
|
|
110
|
+
@staticmethod
|
|
111
|
+
def trace_index_session(session_id: str) -> str:
|
|
112
|
+
"""Sorted Set index from session_id to trace IDs."""
|
|
113
|
+
return f"by_framework:trace:idx:session:{session_id}"
|
|
114
|
+
|
|
115
|
+
@staticmethod
|
|
116
|
+
def trace_index_worker(worker_id: str) -> str:
|
|
117
|
+
"""Sorted Set index from worker_id to trace IDs."""
|
|
118
|
+
return f"by_framework:trace:idx:worker:{worker_id}"
|
|
119
|
+
|
|
120
|
+
@staticmethod
|
|
121
|
+
def trace_index_agent(agent_type: str) -> str:
|
|
122
|
+
"""Sorted Set index from agent type to trace IDs."""
|
|
123
|
+
return f"by_framework:trace:idx:agent:{agent_type}"
|
|
124
|
+
|
|
95
125
|
@staticmethod
|
|
96
126
|
def task_group(group_id: str) -> str:
|
|
97
127
|
"""Task group progress tracking Hash Key."""
|