lodestar-autogen 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lodestar_autogen-0.3.0/.gitignore +28 -0
- lodestar_autogen-0.3.0/PKG-INFO +118 -0
- lodestar_autogen-0.3.0/README.md +94 -0
- lodestar_autogen-0.3.0/lodestar_autogen/__init__.py +40 -0
- lodestar_autogen-0.3.0/lodestar_autogen/adapter.py +332 -0
- lodestar_autogen-0.3.0/pyproject.toml +38 -0
- lodestar_autogen-0.3.0/tests/e2e_autogen.py +420 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
node_modules/
|
|
2
|
+
.lodestar/
|
|
3
|
+
dist/
|
|
4
|
+
build/
|
|
5
|
+
_site/
|
|
6
|
+
*.tsbuildinfo
|
|
7
|
+
|
|
8
|
+
# Claude Code / agent-tool per-machine settings — keep `.claude/` tracked
|
|
9
|
+
# for the agent guidance and slash commands, but never commit the
|
|
10
|
+
# per-machine bash-permission allowlists.
|
|
11
|
+
.claude/settings.local.json
|
|
12
|
+
|
|
13
|
+
# Local-only working notes and scratch files — never committed.
|
|
14
|
+
.claude/local/
|
|
15
|
+
|
|
16
|
+
# Internal planning docs — kept local, not committed to this repo. The
|
|
17
|
+
# cast-build tooling under walkthrough/ stays tracked.
|
|
18
|
+
/docs/strategy/
|
|
19
|
+
/docs/internal/*
|
|
20
|
+
!/docs/internal/walkthrough/
|
|
21
|
+
/docs/internal/walkthrough/*
|
|
22
|
+
!/docs/internal/walkthrough/build-poison-cast.ts
|
|
23
|
+
|
|
24
|
+
# Python bytecode caches + build artifacts (runtimes/ — the LangGraph/CrewAI hooks)
|
|
25
|
+
__pycache__/
|
|
26
|
+
*.pyc
|
|
27
|
+
*.pyo
|
|
28
|
+
*.egg-info/
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: lodestar-autogen
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: Govern an AutoGen agent's native tool calls with Lodestar — the thin native hook that remotes each tool call through the Lodestar Action Kernel over NDJSON-RPC (ADR-0027).
|
|
5
|
+
Project-URL: Homepage, https://qmilab.com/lodestar
|
|
6
|
+
Project-URL: Repository, https://github.com/qmilab/lodestar
|
|
7
|
+
Project-URL: Issues, https://github.com/qmilab/lodestar/issues
|
|
8
|
+
Author-email: QMI Lab <hello@qmilab.com>
|
|
9
|
+
License: Apache-2.0
|
|
10
|
+
Keywords: agents,ai-agents,autogen,governance,lodestar,trust
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Requires-Python: >=3.10
|
|
15
|
+
Requires-Dist: lodestar-runtime-client==0.3.0
|
|
16
|
+
Provides-Extra: autogen
|
|
17
|
+
Requires-Dist: autogen-agentchat>=0.4; extra == 'autogen'
|
|
18
|
+
Requires-Dist: autogen-core>=0.4; extra == 'autogen'
|
|
19
|
+
Provides-Extra: dev
|
|
20
|
+
Requires-Dist: autogen-agentchat>=0.4; extra == 'dev'
|
|
21
|
+
Requires-Dist: autogen-core>=0.4; extra == 'dev'
|
|
22
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# lodestar-autogen
|
|
26
|
+
|
|
27
|
+
Govern an **AutoGen** agent's native tool calls with
|
|
28
|
+
[Lodestar](https://qmilab.com/lodestar) — the open epistemic-governance framework
|
|
29
|
+
for AI agents.
|
|
30
|
+
|
|
31
|
+
AutoGen (the `autogen-agentchat` / `autogen-core` actor framework) runs
|
|
32
|
+
multi-agent conversations whose tools are native in-process Python objects and
|
|
33
|
+
does not speak MCP, so the MCP proxy cannot wrap it. This package is the **thin
|
|
34
|
+
native hook** (ADR-0027) — the third framework on the same shared gate, after
|
|
35
|
+
LangGraph and CrewAI: it spawns the TypeScript **governance-gate sidecar**
|
|
36
|
+
(`lodestar runtime gate`) and remotes each native tool call through the Lodestar
|
|
37
|
+
Action Kernel over newline-delimited JSON-RPC. The same machinery the MCP proxy
|
|
38
|
+
and the LangGraph / CrewAI adapters run — two-phase `propose → arbitrate →
|
|
39
|
+
execute`, the signed policy gate, cognitive-core ingestion (external-document
|
|
40
|
+
content can't auto-promote), sentinel arbitration, and the signed-approval L4 hold
|
|
41
|
+
path — now applies to AutoGen, with no change to the engine. The gate sidecar is
|
|
42
|
+
shared, unchanged; only this hook is new.
|
|
43
|
+
|
|
44
|
+
The tool body runs **only** inside the gate's execute phase, reached only after
|
|
45
|
+
the gate (and any approval hold) clears: "tools that do work before approval are
|
|
46
|
+
bugs" — across the Python↔TS boundary.
|
|
47
|
+
|
|
48
|
+
## Install
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pip install "lodestar-autogen[autogen]"
|
|
52
|
+
# and the Lodestar CLI (Bun/npm), which provides `lodestar runtime gate`:
|
|
53
|
+
npm install -g @qmilab/lodestar-cli # or: bun add -g @qmilab/lodestar-cli
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Use
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from autogen_agentchat.agents import AssistantAgent
|
|
60
|
+
from lodestar_autogen import GateClient, govern_tools, governed_call
|
|
61
|
+
|
|
62
|
+
with GateClient("runtime-gate.config.json") as gate:
|
|
63
|
+
governed = govern_tools(gate, my_tools) # register + wrap the toolset
|
|
64
|
+
agent = AssistantAgent("assistant", model_client=model_client, tools=governed)
|
|
65
|
+
# ... run your agent / team as usual; every tool call is governed.
|
|
66
|
+
|
|
67
|
+
# A custom step invokes a governed tool through the helper, never raw.
|
|
68
|
+
# It blocks, so off the event loop call it via a worker thread:
|
|
69
|
+
import asyncio
|
|
70
|
+
result = await asyncio.to_thread(governed_call, gate, "search_web", {"q": "lodestar"})
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
`govern_tools` accepts the same toolset shapes `AssistantAgent` does — a
|
|
74
|
+
`BaseTool` as-is, or a **bare callable** (normalised to a `FunctionTool` using its
|
|
75
|
+
docstring), so you can pass the literal `tools=[my_func, ...]` list you'd otherwise
|
|
76
|
+
hand the agent.
|
|
77
|
+
|
|
78
|
+
The gate's config (`runtime-gate.config.json`) is a `RuntimeGateConfig` — the
|
|
79
|
+
signed policy document, approver keys, sentinel ids, persistence, and durable log
|
|
80
|
+
root all live there. The hook never holds credentials or policy.
|
|
81
|
+
|
|
82
|
+
## Scope (honest, ADR-0004 lineage)
|
|
83
|
+
|
|
84
|
+
This is **governance over declared actions, not OS containment of the process.**
|
|
85
|
+
Raw I/O performed *outside* the tool abstraction — a custom step that calls
|
|
86
|
+
`requests.get()` directly instead of a registered tool — is outside the governed
|
|
87
|
+
surface, exactly as `guard.wrap()` and the MCP proxy only govern the tools they
|
|
88
|
+
are given. A call for an unregistered tool is **denied** (fail closed). Pair the
|
|
89
|
+
adapter with network/filesystem controls for defense in depth.
|
|
90
|
+
|
|
91
|
+
## Holds & denials
|
|
92
|
+
|
|
93
|
+
An L4 action the trust-ladder floor parks for approval is resolved by
|
|
94
|
+
block-polling the gate up to the deadline for a *signed* approval (`hold_wait_ms`)
|
|
95
|
+
— the headless default. A denied / held-then-timed-out call raises
|
|
96
|
+
`LodestarDenied` by default; AutoGen's `StaticWorkbench.call_tool` catches it and
|
|
97
|
+
surfaces the reason to the agent as a re-plannable error `ToolResult`. Pass
|
|
98
|
+
`on_denied` to `govern_tools` to map a denial to a return value instead.
|
|
99
|
+
|
|
100
|
+
## Async, loops & cancellation
|
|
101
|
+
|
|
102
|
+
AutoGen's tool surface is fully async (`BaseTool.run_json` is a coroutine). The
|
|
103
|
+
governed wrapper offloads the blocking gate RPC onto a worker thread so it never
|
|
104
|
+
stalls the agent's event loop, and the gate's remoted body runs the original
|
|
105
|
+
tool's coroutine on a single **persistent** event loop — so loop-scoped state a
|
|
106
|
+
tool caches (an `aiohttp.ClientSession`, an `asyncio.Event`) stays valid across
|
|
107
|
+
calls rather than being bound to a loop that is torn down after each call.
|
|
108
|
+
|
|
109
|
+
The wrapper honours AutoGen's `CancellationToken`: an already-cancelled token
|
|
110
|
+
short-circuits (no governed work starts), and an in-flight call is unblocked when
|
|
111
|
+
the run is cancelled. Note that once the gate is executing the tool body it runs
|
|
112
|
+
server-side and can't be force-cancelled mid-flight — a property of the
|
|
113
|
+
remoted-execute model it shares with the LangGraph / CrewAI hooks.
|
|
114
|
+
|
|
115
|
+
Apache-2.0. Part of the Lodestar monorepo (`runtimes/autogen/`). The pure-stdlib
|
|
116
|
+
gate client (`GateClient`) is not duplicated in this package; it is imported
|
|
117
|
+
from the shared `lodestar-runtime-client` package, which this distribution pins
|
|
118
|
+
as a dependency (issue #128).
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# lodestar-autogen
|
|
2
|
+
|
|
3
|
+
Govern an **AutoGen** agent's native tool calls with
|
|
4
|
+
[Lodestar](https://qmilab.com/lodestar) — the open epistemic-governance framework
|
|
5
|
+
for AI agents.
|
|
6
|
+
|
|
7
|
+
AutoGen (the `autogen-agentchat` / `autogen-core` actor framework) runs
|
|
8
|
+
multi-agent conversations whose tools are native in-process Python objects and
|
|
9
|
+
does not speak MCP, so the MCP proxy cannot wrap it. This package is the **thin
|
|
10
|
+
native hook** (ADR-0027) — the third framework on the same shared gate, after
|
|
11
|
+
LangGraph and CrewAI: it spawns the TypeScript **governance-gate sidecar**
|
|
12
|
+
(`lodestar runtime gate`) and remotes each native tool call through the Lodestar
|
|
13
|
+
Action Kernel over newline-delimited JSON-RPC. The same machinery the MCP proxy
|
|
14
|
+
and the LangGraph / CrewAI adapters run — two-phase `propose → arbitrate →
|
|
15
|
+
execute`, the signed policy gate, cognitive-core ingestion (external-document
|
|
16
|
+
content can't auto-promote), sentinel arbitration, and the signed-approval L4 hold
|
|
17
|
+
path — now applies to AutoGen, with no change to the engine. The gate sidecar is
|
|
18
|
+
shared, unchanged; only this hook is new.
|
|
19
|
+
|
|
20
|
+
The tool body runs **only** inside the gate's execute phase, reached only after
|
|
21
|
+
the gate (and any approval hold) clears: "tools that do work before approval are
|
|
22
|
+
bugs" — across the Python↔TS boundary.
|
|
23
|
+
|
|
24
|
+
## Install
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install "lodestar-autogen[autogen]"
|
|
28
|
+
# and the Lodestar CLI (Bun/npm), which provides `lodestar runtime gate`:
|
|
29
|
+
npm install -g @qmilab/lodestar-cli # or: bun add -g @qmilab/lodestar-cli
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Use
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
from autogen_agentchat.agents import AssistantAgent
|
|
36
|
+
from lodestar_autogen import GateClient, govern_tools, governed_call
|
|
37
|
+
|
|
38
|
+
with GateClient("runtime-gate.config.json") as gate:
|
|
39
|
+
governed = govern_tools(gate, my_tools) # register + wrap the toolset
|
|
40
|
+
agent = AssistantAgent("assistant", model_client=model_client, tools=governed)
|
|
41
|
+
# ... run your agent / team as usual; every tool call is governed.
|
|
42
|
+
|
|
43
|
+
# A custom step invokes a governed tool through the helper, never raw.
|
|
44
|
+
# It blocks, so off the event loop call it via a worker thread:
|
|
45
|
+
import asyncio
|
|
46
|
+
result = await asyncio.to_thread(governed_call, gate, "search_web", {"q": "lodestar"})
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
`govern_tools` accepts the same toolset shapes `AssistantAgent` does — a
|
|
50
|
+
`BaseTool` as-is, or a **bare callable** (normalised to a `FunctionTool` using its
|
|
51
|
+
docstring), so you can pass the literal `tools=[my_func, ...]` list you'd otherwise
|
|
52
|
+
hand the agent.
|
|
53
|
+
|
|
54
|
+
The gate's config (`runtime-gate.config.json`) is a `RuntimeGateConfig` — the
|
|
55
|
+
signed policy document, approver keys, sentinel ids, persistence, and durable log
|
|
56
|
+
root all live there. The hook never holds credentials or policy.
|
|
57
|
+
|
|
58
|
+
## Scope (honest, ADR-0004 lineage)
|
|
59
|
+
|
|
60
|
+
This is **governance over declared actions, not OS containment of the process.**
|
|
61
|
+
Raw I/O performed *outside* the tool abstraction — a custom step that calls
|
|
62
|
+
`requests.get()` directly instead of a registered tool — is outside the governed
|
|
63
|
+
surface, exactly as `guard.wrap()` and the MCP proxy only govern the tools they
|
|
64
|
+
are given. A call for an unregistered tool is **denied** (fail closed). Pair the
|
|
65
|
+
adapter with network/filesystem controls for defense in depth.
|
|
66
|
+
|
|
67
|
+
## Holds & denials
|
|
68
|
+
|
|
69
|
+
An L4 action the trust-ladder floor parks for approval is resolved by
|
|
70
|
+
block-polling the gate up to the deadline for a *signed* approval (`hold_wait_ms`)
|
|
71
|
+
— the headless default. A denied / held-then-timed-out call raises
|
|
72
|
+
`LodestarDenied` by default; AutoGen's `StaticWorkbench.call_tool` catches it and
|
|
73
|
+
surfaces the reason to the agent as a re-plannable error `ToolResult`. Pass
|
|
74
|
+
`on_denied` to `govern_tools` to map a denial to a return value instead.
|
|
75
|
+
|
|
76
|
+
## Async, loops & cancellation
|
|
77
|
+
|
|
78
|
+
AutoGen's tool surface is fully async (`BaseTool.run_json` is a coroutine). The
|
|
79
|
+
governed wrapper offloads the blocking gate RPC onto a worker thread so it never
|
|
80
|
+
stalls the agent's event loop, and the gate's remoted body runs the original
|
|
81
|
+
tool's coroutine on a single **persistent** event loop — so loop-scoped state a
|
|
82
|
+
tool caches (an `aiohttp.ClientSession`, an `asyncio.Event`) stays valid across
|
|
83
|
+
calls rather than being bound to a loop that is torn down after each call.
|
|
84
|
+
|
|
85
|
+
The wrapper honours AutoGen's `CancellationToken`: an already-cancelled token
|
|
86
|
+
short-circuits (no governed work starts), and an in-flight call is unblocked when
|
|
87
|
+
the run is cancelled. Note that once the gate is executing the tool body it runs
|
|
88
|
+
server-side and can't be force-cancelled mid-flight — a property of the
|
|
89
|
+
remoted-execute model it shares with the LangGraph / CrewAI hooks.
|
|
90
|
+
|
|
91
|
+
Apache-2.0. Part of the Lodestar monorepo (`runtimes/autogen/`). The pure-stdlib
|
|
92
|
+
gate client (`GateClient`) is not duplicated in this package; it is imported
|
|
93
|
+
from the shared `lodestar-runtime-client` package, which this distribution pins
|
|
94
|
+
as a dependency (issue #128).
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""lodestar-autogen — govern an AutoGen agent's native tool calls with Lodestar.
|
|
2
|
+
|
|
3
|
+
The thin native hook of the runtime-adapter epic (ADR-0024 / ADR-0027), and the
|
|
4
|
+
third framework on the same shared gate (after LangGraph and CrewAI). It spawns
|
|
5
|
+
the TypeScript governance-gate sidecar (``lodestar runtime gate``) and remotes
|
|
6
|
+
each native AutoGen tool call through the Action Kernel over NDJSON-RPC — so the
|
|
7
|
+
same two-phase execution, policy gate, cognitive-core ingestion, sentinel
|
|
8
|
+
arbitration, and signed-approval hold path the MCP proxy and the LangGraph /
|
|
9
|
+
CrewAI adapters run now apply to AutoGen, a framework that does not speak MCP. The
|
|
10
|
+
gate sidecar is shared, unchanged; only this hook is new.
|
|
11
|
+
|
|
12
|
+
Quick start::
|
|
13
|
+
|
|
14
|
+
from autogen_agentchat.agents import AssistantAgent
|
|
15
|
+
from lodestar_autogen import GateClient, govern_tools, governed_call
|
|
16
|
+
|
|
17
|
+
with GateClient("runtime-gate.config.json") as gate:
|
|
18
|
+
governed = govern_tools(gate, my_tools) # register + wrap the toolset
|
|
19
|
+
agent = AssistantAgent("assistant", model_client=..., tools=governed)
|
|
20
|
+
# ... run your agent/team as usual; every tool call is now governed.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from .adapter import (
|
|
24
|
+
DEFAULT_HOLD_WAIT_MS,
|
|
25
|
+
LodestarDenied,
|
|
26
|
+
govern_tools,
|
|
27
|
+
governed_call,
|
|
28
|
+
)
|
|
29
|
+
from lodestar_runtime_client import GateClient, GateError
|
|
30
|
+
|
|
31
|
+
__all__ = [
|
|
32
|
+
"GateClient",
|
|
33
|
+
"GateError",
|
|
34
|
+
"govern_tools",
|
|
35
|
+
"governed_call",
|
|
36
|
+
"LodestarDenied",
|
|
37
|
+
"DEFAULT_HOLD_WAIT_MS",
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
__version__ = "0.3.0"
|
|
@@ -0,0 +1,332 @@
|
|
|
1
|
+
"""AutoGen integration for the Lodestar governance gate (ADR-0024 / ADR-0027).
|
|
2
|
+
|
|
3
|
+
AutoGen (the ``autogen-agentchat`` / ``autogen-core`` actor framework) runs
|
|
4
|
+
multi-agent conversations whose tools are native in-process
|
|
5
|
+
``autogen_core.tools.BaseTool`` objects; it does not speak MCP, so the MCP proxy
|
|
6
|
+
cannot wrap it. This adapter is the thin native hook (ADR-0027): it reuses the
|
|
7
|
+
**same** language-agnostic governance-gate sidecar the LangGraph and CrewAI
|
|
8
|
+
adapters do and governs AutoGen's **tool-invocation surface** and nothing
|
|
9
|
+
implicitly (ADR-0024 §3, one closed fail-closed surface):
|
|
10
|
+
|
|
11
|
+
* :func:`govern_tools` registers every tool with the gate and returns *wrapped*
|
|
12
|
+
``BaseTool``s to hand to the ``AssistantAgent`` (and any ``Workbench``) — a
|
|
13
|
+
governed wrapper is the only object an agent ever holds for a governed
|
|
14
|
+
capability. The wrapper routes each call through the gate (``propose →
|
|
15
|
+
arbitrate``); only on an ``allow`` does the gate remote the body back to run.
|
|
16
|
+
The wrapper overrides ``run_json`` — the exact point AutoGen executes a tool
|
|
17
|
+
through (``AssistantAgent`` dispatches via ``StaticWorkbench.call_tool``, which
|
|
18
|
+
calls ``tool.run_json``; a direct caller hits the same method).
|
|
19
|
+
* :func:`governed_call` is the helper a **custom step** (a callback / handler that
|
|
20
|
+
invokes a tool directly) uses to call a governed tool — never a raw tool body.
|
|
21
|
+
* A call for a tool that was never registered is **denied by the gate** (fail
|
|
22
|
+
closed). Raw I/O performed outside the tool abstraction is outside the governed
|
|
23
|
+
surface, exactly as ``guard.wrap()`` and the MCP proxy only govern the tools
|
|
24
|
+
they are given — pair with network/filesystem controls for defense in depth.
|
|
25
|
+
|
|
26
|
+
When a governed call is denied/held-then-timed-out, the default re-raises
|
|
27
|
+
:class:`LodestarDenied`; AutoGen's ``StaticWorkbench.call_tool`` catches it and
|
|
28
|
+
surfaces the reason to the agent as a re-plannable error ``ToolResult``. Pass
|
|
29
|
+
``on_denied`` to map a denial to a return value instead.
|
|
30
|
+
|
|
31
|
+
Holds (an L4 action the trust-ladder floor parks for approval) are resolved by
|
|
32
|
+
**block-polling** the gate up to the deadline for a *signed* approval
|
|
33
|
+
(``hold_wait_ms``) — the headless default the ADR sanctions.
|
|
34
|
+
|
|
35
|
+
AutoGen's tool surface is **fully async** (``run`` / ``run_json`` are coroutines).
|
|
36
|
+
The governed wrapper therefore offloads the blocking gate RPC onto a worker thread
|
|
37
|
+
(:func:`asyncio.to_thread`) so it never stalls the agent's event loop, and the
|
|
38
|
+
gate's remoted body runs the original tool's coroutine on a shared persistent event loop, blocking only the
|
|
39
|
+
client's (loop-less) worker thread.
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
import asyncio
|
|
45
|
+
import json
|
|
46
|
+
import threading
|
|
47
|
+
from typing import Any, Callable, Iterable, Optional
|
|
48
|
+
|
|
49
|
+
from lodestar_runtime_client import GateClient
|
|
50
|
+
|
|
51
|
+
# Default block-poll budget for a held action, in ms. Keep comfortably under the
|
|
52
|
+
# gate/client timeout; 0 means "don't wait" (surface the hold immediately).
|
|
53
|
+
DEFAULT_HOLD_WAIT_MS = 60_000
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class LodestarDenied(Exception):
    """Raised when a governed tool call does not complete.

    Covers a call that was denied, held and then timed out, or failed.

    Attributes:
        reason: Human-readable explanation; also the exception message.
        kind: Machine tag from the gate (``policy_denied``,
            ``approval_denied``, ``approval_timeout``, ``unregistered_tool``,
            ``precondition_failed``, ``execution_failed``).
        action_id: Identifier of the governed action, when one is available.
    """

    def __init__(self, reason: str, kind: str, action_id: Optional[str] = None) -> None:
        # Record the structured fields, then let Exception carry the reason as
        # its message so ``str(exc)`` yields the reason text.
        self.reason, self.kind, self.action_id = reason, kind, action_id
        super().__init__(reason)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def governed_call(
    client: GateClient,
    tool: str,
    args: dict,
    *,
    hold_wait_ms: int = DEFAULT_HOLD_WAIT_MS,
) -> Any:
    """Run one governed tool call through the gate and return its output.

    The call is first submitted with ``client.govern``. If the gate answers
    with phase ``pending_approval``, ``client.resume`` is called once with the
    returned action/request ids and ``wait_ms=hold_wait_ms`` so the gate can
    wait for a signed approval. A final phase of ``completed`` returns the
    result's ``output``; anything else raises :class:`LodestarDenied` (this is
    also how an unregistered tool surfaces — fail closed).

    This is the helper a custom AutoGen step uses; never invoke a raw tool
    body from a custom step.

    This call **blocks**. From inside an AutoGen coroutine, run it with
    ``await asyncio.to_thread(governed_call, ...)`` so the event loop is not
    stalled (the governed-tool wrapper already does this for the agent's own
    tool calls).

    Args:
        client: Gate client used for ``govern`` / ``resume``.
        tool: Name of the registered tool to invoke.
        args: Arguments for the tool call.
        hold_wait_ms: Block-poll budget, in ms, passed to ``resume`` on a hold.

    Returns:
        The ``output`` field of the completed gate result.

    Raises:
        LodestarDenied: The final phase was anything other than ``completed``.
    """
    outcome = client.govern(tool, args)

    held = outcome.get("phase") == "pending_approval"
    if held:
        action_id = str(outcome.get("action_id"))
        request_id = str(outcome.get("request_id"))
        outcome = client.resume(action_id, request_id, wait_ms=hold_wait_ms)

    phase = outcome.get("phase")
    if phase != "completed":
        # Fall back through reason/kind/phase so the exception always carries
        # a message and a machine tag, even for a sparse gate response.
        reason = str(outcome.get("reason") or "governed tool call was not allowed")
        kind = str(outcome.get("kind") or phase or "denied")
        raise LodestarDenied(reason, kind, outcome.get("action_id"))
    return outcome.get("output")
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def govern_tools(
    client: GateClient,
    tools: Iterable[Any],
    *,
    hold_wait_ms: int = DEFAULT_HOLD_WAIT_MS,
    on_denied: Optional[Callable[[LodestarDenied], Any]] = None,
) -> list[Any]:
    """Register an AutoGen toolset with the gate and return governed wrappers.

    Each entry of ``tools`` is normalised (a ``BaseTool`` is kept, a bare
    callable becomes a ``FunctionTool``), its *original* body is registered
    with the gate under the tool's name, and a governed wrapper is produced
    for it. Hand the returned list to your ``AssistantAgent`` (and any
    ``Workbench``) so the agent never holds an ungoverned handle.

    Args:
        client: Gate client the tools are registered with and called through.
        tools: The toolset, in the same shapes ``AssistantAgent`` accepts.
        hold_wait_ms: Block-poll budget, in ms, used when a call is held.
        on_denied: Optional mapper from a :class:`LodestarDenied` to a tool
            return value. When omitted the wrapper re-raises, which AutoGen's
            ``StaticWorkbench.call_tool`` turns into a re-plannable error
            ``ToolResult`` for the agent.

    Returns:
        The governed tools, in the order the inputs were given.
    """
    # Lazy imports: `from lodestar_autogen import GateClient` must keep working
    # without autogen installed. pydantic and CancellationToken arrive with
    # autogen-core.
    from autogen_core import CancellationToken
    from autogen_core.tools import BaseTool, FunctionTool
    from pydantic import BaseModel

    wrapper_cls = _governed_tool_cls()

    def _register_and_wrap(candidate: Any) -> Any:
        # Accept the exact `tools=[...]` shapes the agent would take.
        original = _normalize_tool(candidate, BaseTool, FunctionTool)
        # Register the ORIGINAL body for the gate's remoted execute; binding
        # the original (not the wrapper) is what prevents recursion.
        remote_body = _body_for(original, CancellationToken, BaseModel)
        client.register_tool(original.name, remote_body)
        return _wrap_tool(wrapper_cls, client, original, hold_wait_ms, on_denied)

    return [_register_and_wrap(candidate) for candidate in tools]
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _normalize_tool(tool: Any, base_tool_cls: Any, function_tool_cls: Any) -> Any:
    """Coerce one toolset entry into a tool object, as ``AssistantAgent`` does.

    An instance of ``base_tool_cls`` is returned unchanged. Any other callable
    is wrapped in ``function_tool_cls`` with its ``__doc__`` as the description
    (an empty string when there is no docstring), so ``govern_tools`` accepts
    plain functions rather than failing on a missing ``.name``.

    Raises:
        TypeError: ``tool`` is neither a ``base_tool_cls`` instance nor callable.
    """
    if not isinstance(tool, base_tool_cls):
        if not callable(tool):
            raise TypeError(f"govern_tools: unsupported tool type {type(tool)!r}")
        docstring = getattr(tool, "__doc__", None)
        return function_tool_cls(tool, description=docstring or "")
    return tool
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# A single persistent event loop on a dedicated daemon thread runs every remoted
|
|
162
|
+
# tool coroutine. AutoGen's tool surface is *fully* async, so a fresh `asyncio.run`
|
|
163
|
+
# per call would bind any loop-scoped state a tool caches (an `aiohttp.ClientSession`,
|
|
164
|
+
# an `asyncio.Event`/`Queue`, a pooled DB connection) to a loop that is then torn
|
|
165
|
+
# down — the next call on a new loop would fail cross-loop. One stable loop keeps
|
|
166
|
+
# that state valid across calls. Lazily created; never touched on the import path.
|
|
167
|
+
# Guards lazy creation of the shared loop; the holder keeps the loop once built.
_tool_loop_lock = threading.Lock()
_tool_loop_holder: dict[str, Any] = {"loop": None}


def _tool_loop() -> Any:
    """Return the shared tool event loop, starting it on first use.

    The first call creates a new event loop and starts a daemon thread named
    ``lodestar-autogen-tool-loop`` that runs it forever; later calls return the
    same loop. Creation happens under ``_tool_loop_lock`` so concurrent first
    callers end up with a single loop.
    """
    with _tool_loop_lock:
        if _tool_loop_holder["loop"] is None:
            fresh_loop = asyncio.new_event_loop()
            runner = threading.Thread(
                target=fresh_loop.run_forever,
                name="lodestar-autogen-tool-loop",
                daemon=True,
            )
            runner.start()
            _tool_loop_holder["loop"] = fresh_loop
        return _tool_loop_holder["loop"]
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _body_for(tool: Any, cancellation_token_cls: Any, base_model_cls: Any) -> Callable[[dict], dict]:
    """Build the run_tool body the gate remotes for ``tool``.

    The returned callable takes the call's args dict, runs the real AutoGen
    tool via ``tool.run_json`` with a fresh ``cancellation_token_cls()``, and
    returns the gate's tool-result shape ``{"output": ..., "documents": [...]}``.

    The coroutine is submitted to the shared persistent tool loop and the
    calling thread blocks on its result, so loop-scoped state a tool caches
    survives across calls (unlike a fresh ``asyncio.run`` per call). A result
    that is a ``base_model_cls`` instance is dumped in JSON mode for the wire.
    A dict result carrying ``_lodestar_documents`` is split: that list becomes
    ``documents`` and the dict's ``output`` entry becomes the output.
    """

    def body(args: dict) -> dict:
        loop = _tool_loop()
        coroutine = tool.run_json(args, cancellation_token_cls())
        result = asyncio.run_coroutine_threadsafe(coroutine, loop).result()

        # A Pydantic-model return is not JSON-safe as-is; dump it for the wire.
        if isinstance(result, base_model_cls):
            result = result.model_dump(mode="json")

        # A tool may surface untrusted document content for external_document
        # evidence by returning {"output": ..., "_lodestar_documents": [...]}.
        carries_documents = isinstance(result, dict) and "_lodestar_documents" in result
        if not carries_documents:
            return {"output": result, "documents": []}
        documents: list[dict] = list(result.get("_lodestar_documents") or [])
        unwrapped = result.get("output")
        return {"output": unwrapped, "documents": documents}

    return body
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
# The governed BaseTool subclass is built once, lazily — so importing
|
|
216
|
+
# `lodestar_autogen` (e.g. `from lodestar_autogen import GateClient`) does not require
|
|
217
|
+
# autogen installed (the client is pure stdlib). Cached module-wide.
|
|
218
|
+
_GOVERNED_TOOL_CLS: Optional[type] = None
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _governed_tool_cls() -> type:
|
|
222
|
+
global _GOVERNED_TOOL_CLS
|
|
223
|
+
if _GOVERNED_TOOL_CLS is not None:
|
|
224
|
+
return _GOVERNED_TOOL_CLS
|
|
225
|
+
# Imported lazily; see above.
|
|
226
|
+
from autogen_core import CancellationToken
|
|
227
|
+
from autogen_core.tools import BaseTool
|
|
228
|
+
|
|
229
|
+
class _GovernedAutoGenTool(BaseTool): # type: ignore[type-arg]
|
|
230
|
+
"""A governed ``BaseTool`` that presents the original's schema but routes
|
|
231
|
+
every call through the gate. Not a Pydantic model (AutoGen's ``BaseTool``
|
|
232
|
+
is a plain class), so the gate reference + per-tool config live in plain
|
|
233
|
+
instance attributes — they are never part of the tool's serialised schema
|
|
234
|
+
(the description the LLM sees) and are not validated as tool inputs.
|
|
235
|
+
"""
|
|
236
|
+
|
|
237
|
+
def __init__(self, original: Any, gov: dict) -> None:
|
|
238
|
+
super().__init__(
|
|
239
|
+
args_type=original.args_type(),
|
|
240
|
+
return_type=original.return_type(),
|
|
241
|
+
name=original.name,
|
|
242
|
+
description=original.description,
|
|
243
|
+
strict=bool(getattr(original, "_strict", False)),
|
|
244
|
+
)
|
|
245
|
+
self._original = original
|
|
246
|
+
self._gov = gov
|
|
247
|
+
|
|
248
|
+
# Delegate the schema surface to the original so a custom override (or a
|
|
249
|
+
# FunctionTool's generated schema) is preserved verbatim — the model sees
|
|
250
|
+
# exactly the original's parameters, not one regenerated from the wrapper.
|
|
251
|
+
@property
|
|
252
|
+
def schema(self) -> Any:
|
|
253
|
+
return self._original.schema
|
|
254
|
+
|
|
255
|
+
def args_type(self) -> Any:
|
|
256
|
+
return self._original.args_type()
|
|
257
|
+
|
|
258
|
+
def return_type(self) -> Any:
|
|
259
|
+
return self._original.return_type()
|
|
260
|
+
|
|
261
|
+
def state_type(self) -> Any:
|
|
262
|
+
return self._original.state_type()
|
|
263
|
+
|
|
264
|
+
def return_value_as_string(self, value: Any) -> str:
|
|
265
|
+
# The value has crossed the JSON wire, so it is a str / dict / list /
|
|
266
|
+
# primitive (not the original Pydantic model). Stringify deterministically.
|
|
267
|
+
if isinstance(value, str):
|
|
268
|
+
return value
|
|
269
|
+
try:
|
|
270
|
+
return json.dumps(value)
|
|
271
|
+
except (TypeError, ValueError):
|
|
272
|
+
return str(value)
|
|
273
|
+
|
|
274
|
+
async def run_json(self, args: Any, cancellation_token: Any = None, call_id: Optional[str] = None) -> Any:
|
|
275
|
+
# The choke point the workbench / agent dispatches through.
|
|
276
|
+
return await self._governed(dict(args or {}), cancellation_token)
|
|
277
|
+
|
|
278
|
+
async def run(self, args: Any, cancellation_token: Any = None) -> Any:
|
|
279
|
+
# The abstract method; also governs a direct programmatic caller that
|
|
280
|
+
# already validated its args into the model. run_json does NOT delegate
|
|
281
|
+
# here, so there is no double-governing.
|
|
282
|
+
payload = args.model_dump() if hasattr(args, "model_dump") else dict(args or {})
|
|
283
|
+
return await self._governed(payload, cancellation_token)
|
|
284
|
+
|
|
285
|
+
async def _governed(self, payload: dict, cancellation_token: Any = None) -> Any:
    """Send one tool call through the gate and return its result.

    Args:
        payload: The tool arguments as a plain dict.
        cancellation_token: Optional token; when given it must offer
            ``is_cancelled()`` and ``link_future()``.

    Returns:
        The value produced by ``governed_call``, or the return value of the
        configured ``on_denied`` handler when the call is denied.

    Raises:
        asyncio.CancelledError: The token was already cancelled on entry.
        LodestarDenied: The call was denied and no ``on_denied`` handler is set.
    """
    settings = self._gov
    has_token = cancellation_token is not None

    # An already-cancelled run must not start new governed work: bail out
    # before anything is proposed (no body, no event).
    if has_token and cancellation_token.is_cancelled():
        raise asyncio.CancelledError()

    # The gate RPC blocks, so it runs on a worker thread to keep the agent's
    # event loop responsive during govern/resume. Linking the token lets a
    # cancelled run unblock the await promptly. NOTE: once the gate reaches its
    # execute phase the remoted body runs server-side and cannot be
    # force-cancelled across the RPC boundary — a documented boundary of the
    # remoted-execute model (ADR-0027 §2). The early check above is what stops
    # a *new* call from starting on an already-cancelled run.
    worker = asyncio.to_thread(
        governed_call,
        settings["client"],
        settings["name"],
        payload,
        hold_wait_ms=settings["hold_wait_ms"],
    )
    pending = asyncio.ensure_future(worker)
    if has_token:
        cancellation_token.link_future(pending)

    try:
        outcome = await pending
    except LodestarDenied as refusal:
        handler = settings["on_denied"]
        if handler is None:
            # Re-raise: AutoGen's StaticWorkbench.call_tool catches it and
            # surfaces the reason as a re-plannable error ToolResult.
            raise
        return handler(refusal)
    return outcome
|
|
312
|
+
|
|
313
|
+
_GOVERNED_TOOL_CLS = _GovernedAutoGenTool
|
|
314
|
+
return _GOVERNED_TOOL_CLS
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def _wrap_tool(
    cls: type,
    client: GateClient,
    tool: Any,
    hold_wait_ms: int,
    on_denied: Optional[Callable[[LodestarDenied], Any]],
) -> Any:
    """Build one governed wrapper around ``tool``.

    ``cls`` is called with the original tool and a governance dict carrying the
    gate client, the tool's name, the hold wait (ms) and the optional denial
    handler; whatever ``cls`` returns is returned unchanged.
    """
    governance = dict(
        client=client,
        name=tool.name,
        hold_wait_ms=hold_wait_ms,
        on_denied=on_denied,
    )
    return cls(tool, governance)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "lodestar-autogen"
|
|
7
|
+
version = "0.3.0"
|
|
8
|
+
description = "Govern an AutoGen agent's native tool calls with Lodestar — the thin native hook that remotes each tool call through the Lodestar Action Kernel over NDJSON-RPC (ADR-0027)."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "Apache-2.0" }
|
|
12
|
+
authors = [{ name = "QMI Lab", email = "hello@qmilab.com" }]
|
|
13
|
+
keywords = ["ai-agents", "autogen", "governance", "lodestar", "trust", "agents"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"License :: OSI Approved :: Apache Software License",
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"Intended Audience :: Developers",
|
|
18
|
+
]
|
|
19
|
+
# The Lodestar RPC client (spawns the TS gate, speaks NDJSON over stdio) is the
|
|
20
|
+
# pure-stdlib `lodestar-runtime-client`, shared across the runtime hooks (#128,
|
|
21
|
+
# ADR-0028) and pinned in lockstep with this package. The AutoGen integration in
|
|
22
|
+
# `adapter` imports autogen lazily; install the `autogen` extra.
|
|
23
|
+
dependencies = ["lodestar-runtime-client==0.3.0"]
|
|
24
|
+
|
|
25
|
+
[project.optional-dependencies]
|
|
26
|
+
# autogen-agentchat pulls autogen-core (where BaseTool / CancellationToken live).
|
|
27
|
+
# Verified against autogen-core / autogen-agentchat 0.7.5; the seam (BaseTool.run_json
|
|
28
|
+
# via StaticWorkbench.call_tool) has been stable across the 0.4+ actor line.
|
|
29
|
+
autogen = ["autogen-agentchat>=0.4", "autogen-core>=0.4"]
|
|
30
|
+
dev = ["autogen-agentchat>=0.4", "autogen-core>=0.4", "pytest>=8.0"]
|
|
31
|
+
|
|
32
|
+
[project.urls]
|
|
33
|
+
Homepage = "https://qmilab.com/lodestar"
|
|
34
|
+
Repository = "https://github.com/qmilab/lodestar"
|
|
35
|
+
Issues = "https://github.com/qmilab/lodestar/issues"
|
|
36
|
+
|
|
37
|
+
[tool.hatch.build.targets.wheel]
|
|
38
|
+
packages = ["lodestar_autogen"]
|
|
@@ -0,0 +1,420 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""End-to-end driver for the AutoGen runtime adapter (ADR-0027 / ADR-0024 §8).
|
|
3
|
+
|
|
4
|
+
Drives REAL AutoGen tools through the `lodestar-autogen` hook and the TypeScript
|
|
5
|
+
governance-gate sidecar, exercising the real-runtime cases the in-TS
|
|
6
|
+
`runtime-gate-enforces-two-phase` probe cannot:
|
|
7
|
+
|
|
8
|
+
0. a BARE CALLABLE in the toolset is normalised to a governed FunctionTool (the
|
|
9
|
+
same `tools=[my_func]` shape `AssistantAgent` accepts);
|
|
10
|
+
1. govern_tools returns governed wrappers only; a governed L1 call executes
|
|
11
|
+
through AutoGen's own execution path (`StaticWorkbench.call_tool`, the exact
|
|
12
|
+
dispatch an `AssistantAgent` uses) — the body runs exactly once, remoted back;
|
|
13
|
+
2. a custom step invokes a governed tool via ``governed_call`` (incl. the
|
|
14
|
+
normalised bare-callable tool);
|
|
15
|
+
3. an async-implemented AutoGen tool (an async ``FunctionTool``) runs through the
|
|
16
|
+
gate's remoted execute, and a custom ``BaseTool`` subclass works too; the
|
|
17
|
+
remoted body runs on ONE stable persistent loop, so a tool's loop-scoped state
|
|
18
|
+
survives across calls (3c);
|
|
19
|
+
4. concurrent in-flight calls are correlated to the right result;
|
|
20
|
+
5. an L4 tool is HELD (two-phase across the boundary): with no approver it
|
|
21
|
+
times out and the body NEVER runs — both through ``governed_call`` (raises)
|
|
22
|
+
and through the framework path (``call_tool`` surfaces an error ``ToolResult``);
|
|
23
|
+
6. a tool that was never registered is DENIED — fail closed;
|
|
24
|
+
7. the governed wrappers are valid AutoGen ``BaseTool``s the framework accepts —
|
|
25
|
+
they attach to a real ``AssistantAgent`` (a stub model client, no LLM/key);
|
|
26
|
+
8. a non-finite float in an argument or a result fails the call rather than
|
|
27
|
+
hanging it (Python's json would otherwise emit invalid ``NaN``);
|
|
28
|
+
9. an already-cancelled ``CancellationToken`` short-circuits the wrapper — no
|
|
29
|
+
action is proposed and the body never runs.
|
|
30
|
+
|
|
31
|
+
Spawns the gate via ``bun run <repo>/packages/cli/src/index.ts runtime gate``.
|
|
32
|
+
Invoked by the runtime-gated ``autogen-tool-calls-are-governed`` probe, which skips
|
|
33
|
+
loudly when Python / AutoGen is absent. Exit 0 = pass, 1 = fail.
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
from __future__ import annotations
|
|
37
|
+
|
|
38
|
+
import asyncio
|
|
39
|
+
import json
|
|
40
|
+
import math
|
|
41
|
+
import sys
|
|
42
|
+
import tempfile
|
|
43
|
+
import threading
|
|
44
|
+
import time
|
|
45
|
+
from pathlib import Path
|
|
46
|
+
from typing import Any
|
|
47
|
+
|
|
48
|
+
# Repository root, taken as the fourth ancestor of this file (``parents[0]`` is
# the directory that contains it). NOTE(review): presumably this script lives at
# <repo>/runtimes/autogen/tests/ — re-check the index if the file is moved.
REPO_ROOT = Path(__file__).resolve().parents[3]
# Entry point of the TypeScript CLI; handed to ``bun run`` when the gate is launched.
CLI_INDEX = REPO_ROOT / "packages" / "cli" / "src" / "index.ts"
|
|
50
|
+
|
|
51
|
+
# Prefer the INSTALLED hook so CI (which pip-installs runtimes/autogen) actually
|
|
52
|
+
# exercises the packaged artifact and its pyproject exports. Only fall back to the
|
|
53
|
+
# source tree for a local run where the hook isn't installed.
|
|
54
|
+
try:
|
|
55
|
+
from lodestar_autogen import ( # noqa: E402
|
|
56
|
+
GateClient,
|
|
57
|
+
GateError,
|
|
58
|
+
LodestarDenied,
|
|
59
|
+
govern_tools,
|
|
60
|
+
governed_call,
|
|
61
|
+
)
|
|
62
|
+
except ImportError:
|
|
63
|
+
# The hook's source __init__ imports lodestar_runtime_client (#128); put the
|
|
64
|
+
# shared client's source on the path too so the no-install fallback resolves it.
|
|
65
|
+
sys.path.insert(0, str(REPO_ROOT / "runtimes" / "runtime-client"))
|
|
66
|
+
sys.path.insert(0, str(REPO_ROOT / "runtimes" / "autogen"))
|
|
67
|
+
from lodestar_autogen import ( # noqa: E402
|
|
68
|
+
GateClient,
|
|
69
|
+
GateError,
|
|
70
|
+
LodestarDenied,
|
|
71
|
+
govern_tools,
|
|
72
|
+
governed_call,
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
try:
|
|
76
|
+
from autogen_core import CancellationToken
|
|
77
|
+
from autogen_core.models import ChatCompletionClient, RequestUsage
|
|
78
|
+
from autogen_core.tools import BaseTool, FunctionTool, StaticWorkbench
|
|
79
|
+
from autogen_agentchat.agents import AssistantAgent
|
|
80
|
+
from pydantic import BaseModel, Field
|
|
81
|
+
except Exception as exc: # pragma: no cover - the probe gates on import availability
|
|
82
|
+
print(f"SKIP: AutoGen not importable: {exc}")
|
|
83
|
+
sys.exit(0)
|
|
84
|
+
|
|
85
|
+
# ── tool bodies (the REAL functions the gate remotes back to run) ─────────────
|
|
86
|
+
runs: dict[str, int] = {"echo": 0, "read_doc": 0, "deploy": 0, "fetch": 0, "search": 0, "loop_check": 0}
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def echo(msg: str) -> dict:
    """echo a message back"""
    # Count the invocation so the driver can assert how often the body ran.
    runs["echo"] = runs["echo"] + 1
    reply = {"echo": msg}
    return reply
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def search(q: str) -> dict:
    """search the web (a BARE CALLABLE — not a BaseTool; govern_tools must
    normalise it to a FunctionTool the way AssistantAgent does)"""
    # Count the invocation so the driver can assert how often the body ran.
    runs["search"] = runs["search"] + 1
    hits = [q]
    return {"hits": hits}
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
# Records the running loop across calls to prove the remoted body runs on ONE
|
|
103
|
+
# stable persistent loop (a fresh asyncio.run per call would give a new, torn-down
|
|
104
|
+
# loop each time → cross-loop breakage for loop-scoped state).
|
|
105
|
+
_loop_state: dict[str, object] = {"loop": None}
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
async def loop_check(x: int) -> dict:
    """async tool that reports whether it is running on the same loop as last call"""
    runs["loop_check"] = runs["loop_check"] + 1
    active_loop = asyncio.get_running_loop()
    # Compare against the loop remembered by the previous call, then remember
    # this one for the next call.
    previous_loop = _loop_state["loop"]
    _loop_state["loop"] = active_loop
    return {"same_as_prev": previous_loop is active_loop}
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def deploy(target: str) -> dict:
    """deploy to a target (irreversible, L4)"""
    # This counter must stay 0 for a held L4 with no approver: the driver uses
    # it to prove the body never ran.
    runs["deploy"] = runs["deploy"] + 1
    outcome = {"deployed": target}
    return outcome
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
async def fetch(url: str) -> dict:
    """fetch a url (async-implemented tool)"""
    # Count the invocation so the driver can assert how often the body ran.
    runs["fetch"] = runs["fetch"] + 1
    outcome = {"fetched": url}
    return outcome
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def nan_out(x: int) -> dict:
    """returns a non-finite float (invalid JSON for the gate)"""
    # The hook must reject this (→ tool_error → failed action), never emit `NaN`.
    not_a_number = float("nan")
    inner = {"value": not_a_number}
    return {"output": inner}
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class ReadDocArgs(BaseModel):
    # Argument model for the custom ``ReadDoc`` tool: a single string field.
    # Documented with comments rather than a class docstring on purpose —
    # NOTE(review): pydantic is expected to surface a model docstring as the
    # schema description, which would change what the tool advertises; confirm
    # before converting this into a docstring.
    path: str = Field(..., description="path to read")  # ``...`` = no default, the field is required
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
class ReadDoc(BaseTool):  # type: ignore[type-arg]
    """Custom ``BaseTool`` subclass with an async ``run`` that returns untrusted content."""

    def __init__(self) -> None:
        super().__init__(
            args_type=ReadDocArgs,
            return_type=dict,
            name="read_doc",
            description="read an (untrusted) document",
        )

    async def run(self, args: ReadDocArgs, cancellation_token: CancellationToken) -> dict:
        """Count the call and return the read result plus the untrusted document."""
        runs["read_doc"] = runs["read_doc"] + 1
        requested = args.path
        # Surface untrusted document content for external_document evidence.
        untrusted = {"text": "untrusted file body", "source": requested}
        return {"output": {"read": requested}, "_lodestar_documents": [untrusted]}
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
# A no-network stub model client so a real AssistantAgent can be constructed
|
|
152
|
+
# (construction validates the toolset); we never run inference.
|
|
153
|
+
class _StubModelClient(ChatCompletionClient):  # type: ignore[misc]
    """Model client that never performs inference.

    It exists only so an agent can be constructed without a network, an LLM or
    an API key: the metadata methods return fixed values and both inference
    entry points raise ``NotImplementedError``.
    """

    @property
    def model_info(self) -> dict:
        """Fixed capability flags; only function calling is advertised as supported."""
        return {
            "vision": False,
            "function_calling": True,
            "json_output": False,
            "family": "unknown",
            "structured_output": False,
            "multiple_system_messages": True,
        }

    @property
    def capabilities(self) -> dict:
        """Alias returning the same mapping as ``model_info``."""
        return self.model_info

    async def create(self, *a: Any, **k: Any) -> Any:
        """Always raises: the e2e driver never runs inference."""
        raise NotImplementedError("stub model client: no inference in the e2e")

    def create_stream(self, *a: Any, **k: Any) -> Any:
        """Always raises: the e2e driver never runs inference."""
        raise NotImplementedError("stub model client: no inference in the e2e")

    async def close(self) -> None:
        """Nothing to release."""
        return None

    def actual_usage(self) -> RequestUsage:
        """Report zero usage (nothing is ever sent)."""
        return RequestUsage(prompt_tokens=0, completion_tokens=0)

    def total_usage(self) -> RequestUsage:
        """Report zero usage (nothing is ever sent)."""
        return RequestUsage(prompt_tokens=0, completion_tokens=0)

    # ``tools`` defaults to an immutable empty tuple rather than ``[]``: a list
    # default is created once and shared by every call, so any mutation would
    # leak between calls. Callers that pass ``tools=`` explicitly are unaffected.
    def count_tokens(self, messages: Any, *, tools: Any = ()) -> int:
        """Always 0; the stub does no tokenisation."""
        return 0

    def remaining_tokens(self, messages: Any, *, tools: Any = ()) -> int:
        """Always 0; the stub does no tokenisation."""
        return 0
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
failures: list[str] = []
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def check(label: str, cond: bool, extra: str = "") -> None:
    """Print one PASS/FAIL line and remember the label of a failed check.

    Args:
        label: Human-readable name of the check.
        cond: Truthy when the check passed.
        extra: Optional detail appended after an em dash when non-empty.
    """
    if cond:
        status = "PASS"
    else:
        status = "FAIL"
    suffix = f" — {extra}" if extra else ""
    print(f" [{status}] {label}" + suffix)
    if cond:
        return
    failures.append(label)
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def call_tool_sync(workbench: Any, name: str, args: dict) -> Any:
    """Synchronously run ``workbench.call_tool`` — the dispatch path the agent itself uses.

    A fresh ``CancellationToken`` is created for the call and the coroutine is
    driven to completion with ``asyncio.run``; its result is returned as is.
    """
    token = CancellationToken()
    pending = workbench.call_tool(name, args, token)
    return asyncio.run(pending)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def main() -> int:
    """Run every end-to-end check against a freshly spawned gate sidecar.

    Writes a gate config (and a deliberately invalid one) into a temporary
    directory, launches the gate through ``bun run``, drives the governed
    AutoGen tools through it and records each assertion with ``check``.

    Returns:
        0 when every check passed, 1 when at least one failed.
    """
    with tempfile.TemporaryDirectory() as tmp:
        log_root = str(Path(tmp) / "events")
        config = {
            "project_id": "autogen-e2e",
            "actor_id": "autogen-agent",
            "session_id": "auto",
            "log_root": log_root,
            "default_scope": {"level": "session", "identifier": "autogen-e2e"},
            "default_sensitivity": "internal",
            "auto_approve_ceiling": 3,
            # An L4 hold parks; with no approver it must time out fast here.
            "approval_timeout_ms": 300,
            "approvals": {"allow_unsigned": True},
            "tool_defaults": {
                "echo": {"required_trust_level": 1, "reversibility": "reversible", "sandbox": "read", "permissions": [], "blast_radius": "session"},
                "read_doc": {"required_trust_level": 1, "reversibility": "reversible", "sandbox": "read", "permissions": [], "blast_radius": "session"},
                "deploy": {"required_trust_level": 4, "reversibility": "irreversible", "sandbox": "controlled-shell", "permissions": [], "blast_radius": "external"},
                "fetch": {"required_trust_level": 1, "reversibility": "reversible", "sandbox": "read", "permissions": [], "blast_radius": "session"},
                "nan_out": {"required_trust_level": 1, "reversibility": "reversible", "sandbox": "read", "permissions": [], "blast_radius": "session"},
                "search": {"required_trust_level": 1, "reversibility": "reversible", "sandbox": "read", "permissions": [], "blast_radius": "session"},
                "loop_check": {"required_trust_level": 1, "reversibility": "reversible", "sandbox": "read", "permissions": [], "blast_radius": "session"},
            },
        }
        config_path = Path(tmp) / "runtime-gate.config.json"
        config_path.write_text(json.dumps(config))

        # P3: a sidecar that exits before `ready` (here: an invalid config the CLI
        # rejects) must fail construction FAST with a useful message, not block the
        # full ready timeout.
        bad_config = Path(tmp) / "bad.config.json"
        bad_config.write_text("{}")  # missing required fields → the CLI exits 1
        t0 = time.monotonic()
        startup_err = None
        try:
            # Entered as a context manager so that, should construction
            # unexpectedly succeed, the stray sidecar is shut down on exit
            # instead of being left running for the rest of the test.
            with GateClient(str(bad_config), launcher=["bun", "run", str(CLI_INDEX)], ready_timeout_s=20):
                pass
        except GateError as exc:
            startup_err = str(exc)
        elapsed = time.monotonic() - t0
        check("P3: bad-config startup fails fast (not after the ready timeout)", startup_err is not None and elapsed < 10, f"{elapsed:.1f}s")
        check("P3: the failure reports the gate exited before ready", startup_err is not None and "before signalling ready" in startup_err, str(startup_err))

        with GateClient(str(config_path), launcher=["bun", "run", str(CLI_INDEX)]) as gate:
            print("─" * 72)
            print("autogen-tool-calls-are-governed (real AutoGen tools + hook + gate)")
            print("─" * 72)

            echo_t = FunctionTool(echo, description="echo a message back", name="echo")
            deploy_t = FunctionTool(deploy, description="deploy to a target (irreversible, L4)", name="deploy")
            fetch_t = FunctionTool(fetch, description="fetch a url (async-implemented tool)", name="fetch")
            nan_t = FunctionTool(nan_out, description="returns a non-finite float", name="nan_out")
            read_doc_t = ReadDoc()
            loop_check_t = FunctionTool(loop_check, description="reports its running loop", name="loop_check")
            # `search` is a BARE CALLABLE (not pre-wrapped) — govern_tools must
            # normalise it, exactly as AssistantAgent would.
            tools = [echo_t, read_doc_t, deploy_t, fetch_t, nan_t, loop_check_t, search]

            governed = govern_tools(gate, tools, hold_wait_ms=2_000)
            governed_by_name = {t.name: t for t in governed}
            expected_names = {"echo", "read_doc", "deploy", "fetch", "nan_out", "loop_check", "search"}
            check("0: only governed wrappers are exposed", set(governed_by_name) == expected_names, str(set(governed_by_name)))
            # The wrappers are real AutoGen BaseTools (not the originals).
            check("0: wrappers are BaseTool instances, distinct from the originals", all(isinstance(t, BaseTool) for t in governed) and governed_by_name["echo"] is not echo_t, "")
            # The original schema is preserved (the model sees the right parameters).
            check("0: governed echo preserves the original schema", list(governed_by_name["echo"].schema["parameters"]["properties"]) == ["msg"], str(governed_by_name["echo"].schema["parameters"]["properties"]))
            # A bare callable was accepted and normalised to a governed FunctionTool.
            check("0: bare callable was normalised + governed (search)", isinstance(governed_by_name.get("search"), BaseTool) and list(governed_by_name["search"].schema["parameters"]["properties"]) == ["q"], str(governed_by_name.get("search") and governed_by_name["search"].schema["parameters"]["properties"]))

            # The governed wrappers dispatch through AutoGen's REAL execution path:
            # StaticWorkbench.call_tool → tool.run_json → the gate.
            wb = StaticWorkbench(governed)

            # 1. a governed L1 tool runs through the workbench; the body runs once,
            #    remoted back. call_tool returns a (stringified) ToolResult.
            r = call_tool_sync(wb, "echo", {"msg": "hi"})
            check("1: StaticWorkbench.call_tool ran the governed echo", (not r.is_error) and json.loads(r.result[0].content) == {"echo": "hi"}, str(r.result[0].content))
            check("1: echo body ran exactly once", runs["echo"] == 1, str(runs["echo"]))

            # 2. custom step via governed_call (returns the structured output).
            res = governed_call(gate, "echo", {"msg": "from-step"})
            check("2: governed_call returned the tool output", res == {"echo": "from-step"}, str(res))
            check("2: echo body ran again exactly once", runs["echo"] == 2, str(runs["echo"]))

            # 2b. a custom BaseTool subclass with an async run, surfacing untrusted
            #     document content, runs through the gate's remoted execute.
            doc = governed_call(gate, "read_doc", {"path": "/notes.md"})
            check("2b: custom BaseTool subclass ran via the gate", doc == {"read": "/notes.md"}, str(doc))
            check("2b: read_doc body ran once", runs["read_doc"] == 1, str(runs["read_doc"]))

            # 2c. the normalised bare-callable tool runs through the gate.
            hits = governed_call(gate, "search", {"q": "lodestar"})
            check("2c: normalised bare-callable tool ran via the gate", hits == {"hits": ["lodestar"]}, str(hits))
            check("2c: search body ran once", runs["search"] == 1, str(runs["search"]))

            # 3. an async-implemented tool runs via the gate's remoted execute (the
            #    hook drives the coroutine for us), through BOTH the framework path
            #    and a direct governed_call. NOTE(review): this comment used to say
            #    the hook drives the coroutine with a per-call asyncio.run, which
            #    cannot hold together with the persistent-loop assertion in 3c —
            #    confirm the exact mechanism against the hook.
            afetch = call_tool_sync(wb, "fetch", {"url": "https://x"})
            check("3: async tool ran via StaticWorkbench.call_tool", (not afetch.is_error) and json.loads(afetch.result[0].content) == {"fetched": "https://x"}, str(afetch.result[0].content))
            res_async = governed_call(gate, "fetch", {"url": "https://y"})
            check("3: async tool ran via governed_call", res_async == {"fetched": "https://y"}, str(res_async))
            check("3: async tool body ran exactly twice", runs["fetch"] == 2, str(runs["fetch"]))

            # 3c. the remoted body runs on ONE stable persistent loop, so a tool's
            #     loop-scoped state survives across calls (a fresh asyncio.run per
            #     call would give a new, torn-down loop each time → cross-loop break).
            first = governed_call(gate, "loop_check", {"x": 1})
            second = governed_call(gate, "loop_check", {"x": 2})
            check("3c: first loop_check sees no previous loop", first == {"same_as_prev": False}, str(first))
            check("3c: second loop_check runs on the SAME persistent loop", second == {"same_as_prev": True}, str(second))

            # 4. concurrent in-flight calls are each correlated to their own result.
            before = runs["echo"]
            results: dict[str, object] = {}
            errors: list[str] = []

            def call(tag: str) -> None:
                try:
                    results[tag] = governed_call(gate, "echo", {"msg": tag})
                except Exception as exc:  # noqa: BLE001
                    errors.append(f"{tag}: {exc}")

            threads = [threading.Thread(target=call, args=(f"C{i}",)) for i in range(4)]
            for t in threads:
                t.start()
            for t in threads:
                t.join()
            check("4: concurrent calls all returned", len(results) == 4 and not errors, f"{results} errs={errors}")
            check("4: each concurrent call correlated to its own arg", all(results.get(f"C{i}") == {"echo": f"C{i}"} for i in range(4)), str(results))
            check("4: all concurrent bodies ran", runs["echo"] - before == 4, str(runs["echo"] - before))

            # 5. L4 tool is HELD (two-phase across the boundary): with no approver
            #    it times out and the body NEVER runs.
            deploy_before = runs["deploy"]
            denied_kind = None
            try:
                governed_call(gate, "deploy", {"target": "prod"}, hold_wait_ms=2_000)
            except LodestarDenied as denied:
                denied_kind = denied.kind
            check("5: L4 deploy was held then denied", denied_kind == "approval_timeout", str(denied_kind))
            check("5: deploy body NEVER ran (no work before approval)", runs["deploy"] - deploy_before == 0, str(runs["deploy"] - deploy_before))
            # Through the framework path, the workbench catches the denial and
            # surfaces it as an error ToolResult (re-plannable for the agent).
            framework = call_tool_sync(wb, "deploy", {"target": "prod2"})
            check("5: held L4 surfaces as an error ToolResult via call_tool", framework.is_error and "approval" in framework.result[0].content.lower(), str(framework.result[0].content))
            check("5: deploy body STILL never ran", runs["deploy"] - deploy_before == 0, str(runs["deploy"] - deploy_before))

            # 6. a tool that was never registered is denied — fail closed.
            ghost_kind = None
            try:
                governed_call(gate, "never_registered", {})
            except LodestarDenied as denied:
                ghost_kind = denied.kind
            check("6: unregistered tool denied (fail closed)", ghost_kind == "unregistered_tool", str(ghost_kind))

            # 9. an already-cancelled CancellationToken short-circuits the wrapper:
            #    no action is proposed and the body never runs, so a cancelled agent
            #    run starts no new governed work. (The remoted body, once in the
            #    gate's execute phase, runs server-side and isn't force-cancellable
            #    across the RPC boundary — a documented boundary, ADR-0027 §2.)
            echo_before = runs["echo"]
            cancelled = False
            ct = CancellationToken()
            ct.cancel()
            try:
                asyncio.run(governed_by_name["echo"].run_json({"msg": "nope"}, ct))
            except asyncio.CancelledError:
                cancelled = True
            check("9: a pre-cancelled token short-circuits (CancelledError)", cancelled, str(cancelled))
            check("9: echo body did NOT run on a cancelled token", runs["echo"] == echo_before, str(runs["echo"] - echo_before))

            # 7. the governed wrappers are valid AutoGen tools the framework accepts:
            #    they attach to a real AssistantAgent (construction validates the
            #    toolset; a stub model client means no LLM call / API key needed).
            agent_ok = None
            try:
                agent = AssistantAgent("ops", model_client=_StubModelClient(), tools=governed)
                agent_ok = {t.name for t in agent._tools} == expected_names
            except Exception as exc:  # noqa: BLE001
                agent_ok = False
                print(f" (AssistantAgent construction raised: {exc})")
            check("7: governed wrappers attach to a real AssistantAgent", agent_ok is True, str(agent_ok))

            # 8. Non-finite floats are rejected before they corrupt the JSON wire,
            #    so a NaN in args or a tool result fails the call rather than
            #    hanging it.
            arg_nan_err = None
            try:
                governed_call(gate, "echo", {"msg": math.nan})
            except GateError:
                arg_nan_err = "gate_error"
            except LodestarDenied as denied:
                arg_nan_err = denied.kind
            check("8: a NaN argument is rejected, not silently hung", arg_nan_err is not None, str(arg_nan_err))

            out_nan_err = None
            try:
                governed_call(gate, "nan_out", {"x": 1})
            except LodestarDenied as denied:
                out_nan_err = denied.kind
            except GateError:
                out_nan_err = "gate_error"
            check("8: a NaN tool result fails the action, not silently hung", out_nan_err is not None, str(out_nan_err))

        print("─" * 72)
        if failures:
            print(f"RESULT: FAIL ({len(failures)} check(s) failed)")
        else:
            print("RESULT: PASS — AutoGen native tool calls are governed end-to-end")
        print("─" * 72)
        return 1 if failures else 0
|
|
417
|
+
|
|
418
|
+
|
|
419
|
+
if __name__ == "__main__":
|
|
420
|
+
sys.exit(main())
|