penguiflow 2.0.0-py3-none-any.whl → 2.1.0-py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

penguiflow/remote.py ADDED
@@ -0,0 +1,486 @@
+ """Remote transport protocol and helper node for PenguiFlow."""
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import logging
+ import time
+ from collections.abc import AsyncIterator, Mapping
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, Any, Protocol
+
+ from pydantic import BaseModel
+
+ from .core import TraceCancelled
+ from .node import Node, NodePolicy
+ from .state import RemoteBinding
+ from .types import Message
+
+ if TYPE_CHECKING:  # pragma: no cover - import for typing only
+     from .core import Context, PenguiFlow
+
+
+ @dataclass(slots=True)
+ class RemoteCallRequest:
+     """Input to :class:`RemoteTransport` implementations."""
+
+     message: Message
+     skill: str
+     agent_url: str
+     agent_card: Mapping[str, Any] | None = None
+     metadata: Mapping[str, Any] | None = None
+     timeout_s: float | None = None
+
+
+ @dataclass(slots=True)
+ class RemoteCallResult:
+     """Return value for :meth:`RemoteTransport.send`."""
+
+     result: Any
+     context_id: str | None = None
+     task_id: str | None = None
+     agent_url: str | None = None
+     meta: Mapping[str, Any] | None = None
+
+
+ @dataclass(slots=True)
+ class RemoteStreamEvent:
+     """Streaming event yielded by :meth:`RemoteTransport.stream`."""
+
+     text: str | None = None
+     done: bool = False
+     meta: Mapping[str, Any] | None = None
+     context_id: str | None = None
+     task_id: str | None = None
+     agent_url: str | None = None
+     result: Any | None = None
+
+
+ class RemoteTransport(Protocol):
+     """Protocol describing the minimal remote invocation surface."""
+
+     async def send(self, request: RemoteCallRequest) -> RemoteCallResult:
+         """Perform a unary remote call."""
+
+     def stream(self, request: RemoteCallRequest) -> AsyncIterator[RemoteStreamEvent]:
+         """Perform a remote call that yields streaming events."""
+
+     async def cancel(self, *, agent_url: str, task_id: str) -> None:
+         """Cancel a remote task identified by ``task_id`` at ``agent_url``."""
+
+
+ def _json_default(value: Any) -> Any:
+     if isinstance(value, BaseModel):
+         return value.model_dump(mode="json")
+     if isinstance(value, bytes):
+         return value.decode("utf-8", errors="replace")
+     return repr(value)
+
+
+ def _estimate_bytes(value: Any) -> int | None:
+     """Best-effort size estimation for observability metrics."""
+
+     if value is None:
+         return None
+     try:
+         if isinstance(value, BaseModel):
+             payload = value.model_dump(mode="json")
+         else:
+             payload = value
+         encoded = json.dumps(payload, default=_json_default).encode("utf-8")
+     except Exception:
+         try:
+             encoded = str(value).encode("utf-8")
+         except Exception:
+             return None
+     return len(encoded)
+
+
+ def _text_bytes(text: str | None) -> int:
+     if text is None:
+         return 0
+     return len(text.encode("utf-8"))
+
+
+ def _merge_remote_extra(
+     base: Mapping[str, Any],
+     *,
+     agent_url: str | None,
+     context_id: str | None,
+     task_id: str | None,
+     additional: Mapping[str, Any] | None = None,
+ ) -> dict[str, Any]:
+     extra = dict(base)
+     if agent_url is not None:
+         extra["remote_agent_url"] = agent_url
+     if context_id is not None:
+         extra["remote_context_id"] = context_id
+     if task_id is not None:
+         extra["remote_task_id"] = task_id
+     if additional:
+         for key, value in additional.items():
+             if value is not None:
+                 extra[key] = value
+     return extra
+
+
+ def RemoteNode(
+     *,
+     transport: RemoteTransport,
+     skill: str,
+     agent_url: str,
+     name: str,
+     agent_card: Mapping[str, Any] | None = None,
+     policy: NodePolicy | None = None,
+     streaming: bool = False,
+     record_binding: bool = True,
+ ) -> Node:
+     """Create a node that proxies work to a remote agent via ``transport``."""
+
+     node_policy = policy or NodePolicy()
+
+     async def _record_binding(
+         *,
+         runtime: PenguiFlow,
+         context: Context,
+         node_owner: Node,
+         trace_id: str,
+         context_id: str | None,
+         task_id: str | None,
+         agent_url_override: str | None,
+         base_extra: Mapping[str, Any],
+     ) -> tuple[asyncio.Task[None], asyncio.Event] | None:
+         if context_id is None or task_id is None:
+             return None
+
+         agent_ref = agent_url_override or agent_url
+
+         if record_binding:
+             binding = RemoteBinding(
+                 trace_id=trace_id,
+                 context_id=context_id,
+                 task_id=task_id,
+                 agent_url=agent_ref,
+             )
+             await runtime.save_remote_binding(binding)
+
+         cancel_event = runtime.ensure_trace_event(trace_id)
+
+         async def _issue_cancel(reason: str) -> None:
+             start_cancel = time.perf_counter()
+             try:
+                 await transport.cancel(agent_url=agent_ref, task_id=task_id)
+             except Exception as exc:  # pragma: no cover - defensive logging
+                 latency = (time.perf_counter() - start_cancel) * 1000
+                 extra = _merge_remote_extra(
+                     base_extra,
+                     agent_url=agent_ref,
+                     context_id=context_id,
+                     task_id=task_id,
+                     additional={
+                         "remote_cancel_reason": reason,
+                         "remote_error": repr(exc),
+                         "remote_status": "cancel_error",
+                     },
+                 )
+                 await runtime.record_remote_event(
+                     event="remote_cancel_error",
+                     node=node_owner,
+                     context=context,
+                     trace_id=trace_id,
+                     latency_ms=latency,
+                     level=logging.ERROR,
+                     extra=extra,
+                 )
+                 return
+
+             latency = (time.perf_counter() - start_cancel) * 1000
+             extra = _merge_remote_extra(
+                 base_extra,
+                 agent_url=agent_ref,
+                 context_id=context_id,
+                 task_id=task_id,
+                 additional={
+                     "remote_cancel_reason": reason,
+                     "remote_status": "cancelled",
+                 },
+             )
+             await runtime.record_remote_event(
+                 event="remote_call_cancelled",
+                 node=node_owner,
+                 context=context,
+                 trace_id=trace_id,
+                 latency_ms=latency,
+                 level=logging.INFO,
+                 extra=extra,
+             )
+
+         if cancel_event.is_set():
+             await _issue_cancel("pre_cancelled")
+             raise TraceCancelled(trace_id)
+
+         async def _mirror_cancel() -> None:
+             try:
+                 await cancel_event.wait()
+             except asyncio.CancelledError:
+                 return
+             await _issue_cancel("trace_cancel")
+
+         cancel_task = asyncio.create_task(_mirror_cancel())
+         runtime.register_external_task(trace_id, cancel_task)
+         return cancel_task, cancel_event
+
+     async def _remote_impl(message: Message, ctx: Context) -> Any:
+         if not isinstance(message, Message):
+             raise TypeError("Remote nodes require penguiflow.types.Message inputs")
+
+         runtime = ctx.runtime
+         if runtime is None:
+             raise RuntimeError("Context is not bound to a running PenguiFlow")
+
+         owner = ctx.owner
+         if not isinstance(owner, Node):  # pragma: no cover - defensive safety
+             raise RuntimeError("Remote context owner must be a Node")
+
+         trace_id = message.trace_id
+         cancel_task: asyncio.Task[None] | None = None
+         cancel_event: asyncio.Event | None = None
+         binding_registered = False
+
+         remote_context_id: str | None = None
+         remote_task_id: str | None = None
+         remote_agent_url_final = agent_url
+         response_bytes = 0
+         stream_events = 0
+
+         base_extra: dict[str, Any] = {
+             "remote_skill": skill,
+             "remote_transport": type(transport).__name__,
+             "remote_streaming": streaming,
+         }
+         request_bytes = _estimate_bytes(message)
+         if request_bytes is not None:
+             base_extra["remote_request_bytes"] = request_bytes
+
+         request = RemoteCallRequest(
+             message=message,
+             skill=skill,
+             agent_url=agent_url,
+             agent_card=agent_card,
+             metadata=message.meta,
+             timeout_s=node_policy.timeout_s,
+         )
+
+         async def _ensure_binding(
+             *,
+             context_id: str | None,
+             task_id: str | None,
+             agent_url_override: str | None,
+         ) -> None:
+             nonlocal cancel_task, cancel_event, binding_registered
+             nonlocal remote_context_id, remote_task_id, remote_agent_url_final
+             if context_id is not None:
+                 remote_context_id = context_id
+             if task_id is not None:
+                 remote_task_id = task_id
+             if agent_url_override is not None:
+                 remote_agent_url_final = agent_url_override
+             if binding_registered:
+                 return
+             if context_id is None or task_id is None:
+                 return
+             record = await _record_binding(
+                 runtime=runtime,
+                 context=ctx,
+                 node_owner=owner,
+                 trace_id=trace_id,
+                 context_id=context_id,
+                 task_id=task_id,
+                 agent_url_override=agent_url_override,
+                 base_extra=base_extra,
+             )
+             if record is None:
+                 return
+             cancel_task, cancel_event = record
+             binding_registered = True
+
+         async def _cleanup_cancel_task() -> None:
+             if cancel_task is not None:
+                 try:
+                     if cancel_event is not None and cancel_event.is_set():
+                         await cancel_task
+                         return
+                     if not cancel_task.done():
+                         cancel_task.cancel()
+                     await cancel_task
+                 except BaseException:  # pragma: no cover - cleanup guard
+                     pass
+
+         async def _run_stream() -> Any | None:
+             nonlocal response_bytes, stream_events, remote_agent_url_final
+             final_result: Any | None = None
+             stream_idx = 0
+             async for event in transport.stream(request):
+                 stream_events = stream_idx + 1
+                 await _ensure_binding(
+                     context_id=event.context_id,
+                     task_id=event.task_id,
+                     agent_url_override=event.agent_url,
+                 )
+                 if event.agent_url is not None:
+                     remote_agent_url_final = event.agent_url
+
+                 chunk_bytes = 0
+                 if event.text is not None:
+                     meta = dict(event.meta) if event.meta is not None else None
+                     chunk_bytes += _text_bytes(event.text)
+                     meta_bytes = _estimate_bytes(event.meta)
+                     if meta_bytes is not None:
+                         chunk_bytes += meta_bytes
+                     await ctx.emit_chunk(
+                         parent=message,
+                         text=event.text,
+                         done=event.done,
+                         meta=meta,
+                     )
+
+                 if runtime is not None:
+                     meta_keys = None
+                     if event.meta:
+                         meta_keys = sorted(event.meta.keys())
+                     extra = _merge_remote_extra(
+                         base_extra,
+                         agent_url=remote_agent_url_final,
+                         context_id=remote_context_id,
+                         task_id=remote_task_id,
+                         additional={
+                             "remote_stream_seq": stream_idx,
+                             "remote_chunk_bytes": chunk_bytes if chunk_bytes else None,
+                             "remote_chunk_done": event.done,
+                             "remote_chunk_meta_keys": meta_keys,
+                         },
+                     )
+                     await runtime.record_remote_event(
+                         event="remote_stream_event",
+                         node=owner,
+                         context=ctx,
+                         trace_id=trace_id,
+                         latency_ms=(time.perf_counter() - call_start) * 1000,
+                         level=logging.DEBUG,
+                         extra=extra,
+                     )
+
+                 if chunk_bytes:
+                     response_bytes += chunk_bytes
+
+                 if event.result is not None:
+                     result_bytes = _estimate_bytes(event.result)
+                     if result_bytes is not None:
+                         response_bytes += result_bytes
+                     final_result = event.result
+
+                 stream_idx += 1
+
+             return final_result
+
+         call_start = time.perf_counter()
+
+         await runtime.record_remote_event(
+             event="remote_call_start",
+             node=owner,
+             context=ctx,
+             trace_id=trace_id,
+             latency_ms=0.0,
+             level=logging.DEBUG,
+             extra=_merge_remote_extra(
+                 base_extra,
+                 agent_url=remote_agent_url_final,
+                 context_id=None,
+                 task_id=None,
+             ),
+         )
+
+         try:
+             if streaming:
+                 final_result = await _run_stream()
+                 result_payload = final_result
+             else:
+                 result = await transport.send(request)
+                 await _ensure_binding(
+                     context_id=result.context_id,
+                     task_id=result.task_id,
+                     agent_url_override=result.agent_url,
+                 )
+                 if result.context_id is not None:
+                     remote_context_id = result.context_id
+                 if result.task_id is not None:
+                     remote_task_id = result.task_id
+                 if result.agent_url is not None:
+                     remote_agent_url_final = result.agent_url
+                 result_payload = result.result
+                 response_size = _estimate_bytes(result_payload)
+                 if response_size is not None:
+                     response_bytes += response_size
+         except TraceCancelled:
+             raise
+         except asyncio.CancelledError:
+             raise
+         except Exception as exc:
+             latency = (time.perf_counter() - call_start) * 1000
+             extra = _merge_remote_extra(
+                 base_extra,
+                 agent_url=remote_agent_url_final,
+                 context_id=remote_context_id,
+                 task_id=remote_task_id,
+                 additional={
+                     "remote_error": repr(exc),
+                     "remote_status": "error",
+                 },
+             )
+             await runtime.record_remote_event(
+                 event="remote_call_error",
+                 node=owner,
+                 context=ctx,
+                 trace_id=trace_id,
+                 latency_ms=latency,
+                 level=logging.ERROR,
+                 extra=extra,
+             )
+             raise
+         else:
+             latency = (time.perf_counter() - call_start) * 1000
+             extra = _merge_remote_extra(
+                 base_extra,
+                 agent_url=remote_agent_url_final,
+                 context_id=remote_context_id,
+                 task_id=remote_task_id,
+                 additional={
+                     "remote_response_bytes": response_bytes,
+                     "remote_stream_events": stream_events,
+                     "remote_status": "success",
+                 },
+             )
+             await runtime.record_remote_event(
+                 event="remote_call_success",
+                 node=owner,
+                 context=ctx,
+                 trace_id=trace_id,
+                 latency_ms=latency,
+                 level=logging.INFO,
+                 extra=extra,
+             )
+             return result_payload
+         finally:
+             await _cleanup_cancel_task()
+
+     return Node(_remote_impl, name=name, policy=node_policy)
+
+
+ __all__ = [
+     "RemoteCallRequest",
+     "RemoteCallResult",
+     "RemoteStreamEvent",
+     "RemoteTransport",
+     "RemoteNode",
+ ]
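For orientation, here is what a minimal implementation of the three-method `RemoteTransport` protocol above could look like, wired into a `RemoteNode`. This is an illustrative sketch, not code from the package: `EchoTransport`, its canned identifiers, and the echo payload are invented, and the `Message.payload` attribute is assumed from the README examples further down this diff.

```python
# Illustrative sketch (not part of the wheel): a loopback transport that
# satisfies RemoteTransport without any network I/O.
from collections.abc import AsyncIterator

from penguiflow.remote import (
    RemoteCallRequest,
    RemoteCallResult,
    RemoteNode,
    RemoteStreamEvent,
)


class EchoTransport:
    async def send(self, request: RemoteCallRequest) -> RemoteCallResult:
        # A real transport would POST to request.agent_url here.
        return RemoteCallResult(
            result={"echo": request.message.payload},  # payload attr assumed
            context_id="ctx-demo",
            task_id="task-demo",
            agent_url=request.agent_url,
        )

    async def stream(
        self, request: RemoteCallRequest
    ) -> AsyncIterator[RemoteStreamEvent]:
        # An async generator satisfies the protocol's AsyncIterator return type.
        yield RemoteStreamEvent(
            text="partial...", context_id="ctx-demo", task_id="task-demo"
        )
        yield RemoteStreamEvent(done=True, result={"echo": request.message.payload})

    async def cancel(self, *, agent_url: str, task_id: str) -> None:
        # Nothing to cancel in-process; a real transport would hit tasks/cancel.
        return None


remote = RemoteNode(
    transport=EchoTransport(),
    skill="echo",
    agent_url="https://remote-agent.example",  # hypothetical endpoint
    name="remote_echo",
    streaming=True,
)
```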
penguiflow/state.py ADDED
@@ -0,0 +1,64 @@
+ """State store protocol and helpers for PenguiFlow."""
+
+ from __future__ import annotations
+
+ from collections.abc import Mapping, Sequence
+ from dataclasses import dataclass
+ from typing import Any, Protocol
+
+ from .metrics import FlowEvent
+
+
+ @dataclass(slots=True)
+ class StoredEvent:
+     """Representation of a runtime event persisted by a state store."""
+
+     trace_id: str | None
+     ts: float
+     kind: str
+     node_name: str | None
+     node_id: str | None
+     payload: Mapping[str, Any]
+
+     @classmethod
+     def from_flow_event(cls, event: FlowEvent) -> StoredEvent:
+         """Create a stored representation from a :class:`FlowEvent`."""
+
+         return cls(
+             trace_id=event.trace_id,
+             ts=event.ts,
+             kind=event.event_type,
+             node_name=event.node_name,
+             node_id=event.node_id,
+             payload=event.to_payload(),
+         )
+
+
+ @dataclass(slots=True)
+ class RemoteBinding:
+     """Association between a trace and a remote worker/agent."""
+
+     trace_id: str
+     context_id: str
+     task_id: str
+     agent_url: str
+
+
+ class StateStore(Protocol):
+     """Protocol for durable state adapters used by PenguiFlow."""
+
+     async def save_event(self, event: StoredEvent) -> None:
+         """Persist a runtime event.
+
+         Implementations may choose any storage backend (Postgres, Redis, etc.).
+         The method must be idempotent since retries can emit duplicate events.
+         """
+
+     async def load_history(self, trace_id: str) -> Sequence[StoredEvent]:
+         """Return the ordered history for a trace id."""
+
+     async def save_remote_binding(self, binding: RemoteBinding) -> None:
+         """Persist the mapping between a trace and an external worker."""
+
+
+ __all__ = ["StateStore", "StoredEvent", "RemoteBinding"]
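Because the `StateStore` protocol above is only three coroutines, a test double is small. A minimal in-memory sketch (illustrative, not shipped in the wheel; note that `save_event` is required to be idempotent per the docstring above, which a production store would honour by deduplicating):

```python
# Illustrative sketch (not part of the wheel): an in-memory StateStore for tests.
from collections import defaultdict

from penguiflow.state import RemoteBinding, StoredEvent


class MemoryStateStore:
    def __init__(self) -> None:
        self._events: dict[str | None, list[StoredEvent]] = defaultdict(list)
        self._bindings: list[RemoteBinding] = []

    async def save_event(self, event: StoredEvent) -> None:
        # NOTE: a durable store must deduplicate here; retries can replay events.
        self._events[event.trace_id].append(event)

    async def load_history(self, trace_id: str) -> list[StoredEvent]:
        # Events are appended in arrival order, which preserves trace ordering.
        return list(self._events.get(trace_id, []))

    async def save_remote_binding(self, binding: RemoteBinding) -> None:
        self._bindings.append(binding)
```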
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: penguiflow
- Version: 2.0.0
+ Version: 2.1.0
  Summary: Async agent orchestration primitives.
  Author: PenguiFlow Team
  License: MIT License
@@ -37,6 +37,10 @@ Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
  Requires-Dist: pytest-cov>=4.0; extra == "dev"
  Requires-Dist: coverage[toml]>=7.0; extra == "dev"
  Requires-Dist: ruff>=0.2; extra == "dev"
+ Requires-Dist: fastapi>=0.110; extra == "dev"
+ Requires-Dist: httpx>=0.27; extra == "dev"
+ Provides-Extra: a2a-server
+ Requires-Dist: fastapi>=0.110; extra == "a2a-server"
  Dynamic: license-file
 
  # PenguiFlow 🐧❄️
@@ -77,6 +81,18 @@ It provides:
  * **Observability hooks** (`FlowEvent` callbacks for logging, MLflow, or custom metrics sinks)
  * **Policy-driven routing** (optional policies steer routers without breaking existing flows)
  * **Traceable exceptions** (`FlowError` captures node/trace metadata and optionally emits to Rookery)
+ * **Distribution hooks (opt-in)** — plug a `StateStore` to persist trace history and a
+   `MessageBus` to publish floe traffic for remote workers without changing existing flows.
+ * **Remote calls (opt-in)** — `RemoteNode` bridges the runtime to external agents through a
+   pluggable `RemoteTransport` interface (A2A-ready) while propagating streaming chunks and
+   cancellation.
+ * **A2A server adapter (opt-in)** — wrap a PenguiFlow graph in a FastAPI surface using
+   `penguiflow_a2a.A2AServerAdapter` so other agents can call `message/send`,
+   `message/stream`, and `tasks/cancel` while reusing the runtime's backpressure and
+   cancellation semantics.
+ * **Observability & ops polish** — remote calls emit structured metrics (latency, payload
+   sizes, cancel reasons) and the `penguiflow-admin` CLI replays trace history from any
+   configured `StateStore` for debugging.
 
  Built on pure `asyncio` (no threads), PenguiFlow is small, predictable, and repo-agnostic.
  Product repos only define **their models + node functions** — the core stays dependency-light.
@@ -168,6 +184,10 @@ print(out.payload) # PackOut(...)
  await flow.stop()
  ```
 
+ > **Opt-in distribution:** pass `state_store=` and/or `message_bus=` when calling
+ > `penguiflow.core.create(...)` to persist trace history and publish floe traffic
+ > without changing node logic.
+
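Concretely, the opt-in note above amounts to one extra keyword at flow construction time. A hedged sketch, reusing the illustrative `MemoryStateStore` from the state.py section earlier (the pass-through node is invented; the `state_store=` keyword comes straight from the note):

```python
# Sketch: enabling the opt-in StateStore hook described in the README note.
from penguiflow import Message, Node, create


async def step(msg: Message, ctx):
    # Hypothetical pass-through node; real flows do work here.
    return msg


flow = create(
    Node(step, name="step").to(),
    state_store=MemoryStateStore(),  # illustrative store from the state.py sketch
)
```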
  ---
 
  ## 🧭 Design Principles
@@ -222,6 +242,60 @@ sacrificing backpressure or ordering guarantees. The helper wraps the payload i
  increments per-stream sequence numbers. See `tests/test_streaming.py` and
  `examples/streaming_llm/` for an end-to-end walk-through.
 
+ ### Remote orchestration
+
+ Phase 2 introduces `RemoteNode` and the `RemoteTransport` protocol so flows can delegate
+ work to remote agents (e.g., the A2A JSON-RPC/SSE ecosystem) without changing existing
+ nodes. The helper records remote bindings via the `StateStore`, mirrors streaming
+ partials back into the graph, and propagates per-trace cancellation to remote tasks via
+ `RemoteTransport.cancel`. See `tests/test_remote.py` for reference in-memory transports.
+
+ ### Exposing a flow over A2A
+
+ Install the optional extra to expose PenguiFlow as an A2A-compatible FastAPI service:
+
+ ```bash
+ pip install "penguiflow[a2a-server]"
+ ```
+
+ Create the adapter and mount the routes:
+
+ ```python
+ from penguiflow import Message, Node, create
+ from penguiflow_a2a import A2AAgentCard, A2AServerAdapter, A2ASkill, create_a2a_app
+
+ async def orchestrate(message: Message, ctx):
+     await ctx.emit_chunk(parent=message, text="thinking...")
+     return {"result": "done"}
+
+ node = Node(orchestrate, name="main")
+ flow = create(node.to())
+
+ card = A2AAgentCard(
+     name="Main Agent",
+     description="Primary entrypoint for orchestration",
+     version="2.1.0",
+     skills=[A2ASkill(name="orchestrate", description="Handles orchestration")],
+ )
+
+ adapter = A2AServerAdapter(
+     flow,
+     agent_card=card,
+     agent_url="https://agent.example",
+ )
+ app = create_a2a_app(adapter)
+ ```
+
+ The generated FastAPI app implements:
+
+ * `GET /agent` for discovery (Agent Card)
+ * `POST /message/send` for unary execution
+ * `POST /message/stream` for SSE streaming
+ * `POST /tasks/cancel` to mirror cancellation into PenguiFlow traces
+
+ `A2AServerAdapter` reuses the runtime's `StateStore` hooks, so bindings between trace IDs
+ and external `taskId`/`contextId` pairs are persisted automatically.
+
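Once the adapter's app is served (for example with `uvicorn`), the routes listed above can be exercised with any HTTP client. A hedged sketch using `httpx`, which this release adds to the dev extras; the local URL is an assumption, and since the JSON-RPC body shapes for the POST routes are not shown in this diff, only discovery is exercised:

```python
# Sketch: poking the A2A surface with httpx against an assumed local deployment.
import asyncio

import httpx


async def main() -> None:
    async with httpx.AsyncClient(base_url="http://localhost:8000") as client:
        card = await client.get("/agent")  # GET /agent returns the Agent Card
        card.raise_for_status()
        print(card.json())


asyncio.run(main())
```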
  ### Reliability & guardrails
 
  PenguiFlow enforces reliability boundaries out of the box:
@@ -478,9 +552,15 @@ docs or diagramming pipelines.
  * **Structured `FlowEvent`s**: every node event carries `{ts, trace_id, node_name, event,
    latency_ms, q_depth_in, q_depth_out, attempt}` plus a mutable `extra` map for custom
    annotations.
+ * **Remote call telemetry**: `RemoteNode` executions emit extra metrics (latency, request
+   and response bytes, context/task identifiers, cancel reasons) so remote hops can be
+   traced end-to-end.
  * **Middleware hooks**: subscribe observers (e.g., MLflow) to the structured `FlowEvent`
    stream. See `examples/mlflow_metrics/` for an MLflow integration and
    `examples/reliability_middleware/` for a concrete timeout + retry walkthrough.
+ * **`penguiflow-admin` CLI**: inspect or replay stored trace history from any configured
+   `StateStore` (`penguiflow-admin history <trace>` or `penguiflow-admin replay <trace>`)
+   when debugging distributed runs.
 
  ---
 
@@ -488,9 +568,9 @@ docs or diagramming pipelines.
 
  - **In-process runtime**: there is no built-in distribution layer yet. Long-running CPU work should be delegated to your own pools or services.
  - **Registry-driven typing**: nodes default to validation. Provide a `ModelRegistry` when calling `flow.run(...)` or set `validate="none"` explicitly for untyped hops.
- - **Observability**: structured `FlowEvent` callbacks power logs/metrics; integrations with
-   third-party stacks (OTel, Prometheus, Datadog) remain DIY. See the MLflow middleware
-   example for a lightweight pattern.
+ - **Observability**: structured `FlowEvent` callbacks and the `penguiflow-admin` CLI power
+   local debugging; integrations with third-party stacks (OTel, Prometheus, Datadog) remain
+   DIY. See the MLflow middleware example for a lightweight pattern.
  - **Roadmap**: follow-up releases focus on optional distributed backends, deeper observability integrations, and additional playbook patterns. Contributions and proposals are welcome!
 
  ---