axor-langchain 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- axor_langchain-0.2.0/.github/workflows/ci.yml +88 -0
- axor_langchain-0.2.0/.gitignore +6 -0
- axor_langchain-0.2.0/LICENSE +21 -0
- axor_langchain-0.2.0/PKG-INFO +357 -0
- axor_langchain-0.2.0/README.md +333 -0
- axor_langchain-0.2.0/axor_langchain/__init__.py +6 -0
- axor_langchain-0.2.0/axor_langchain/middleware.py +967 -0
- axor_langchain-0.2.0/benchmark/graph.py +297 -0
- axor_langchain-0.2.0/benchmark/live_graph.py +695 -0
- axor_langchain-0.2.0/benchmark/run.py +593 -0
- axor_langchain-0.2.0/pyproject.toml +32 -0
- axor_langchain-0.2.0/tests/test_smoke.py +95 -0
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
name: CI/CD
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
tags: ["v*.*.*"]
|
|
7
|
+
pull_request:
|
|
8
|
+
branches: [main]
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
test:
|
|
12
|
+
name: Test (Python ${{ matrix.python-version }})
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
strategy:
|
|
15
|
+
matrix:
|
|
16
|
+
python-version: ["3.11", "3.12"]
|
|
17
|
+
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v4
|
|
20
|
+
|
|
21
|
+
- name: Checkout axor-core
|
|
22
|
+
uses: actions/checkout@v4
|
|
23
|
+
with:
|
|
24
|
+
repository: ${{ github.repository_owner }}/axor-core
|
|
25
|
+
path: axor-core
|
|
26
|
+
|
|
27
|
+
- uses: actions/setup-python@v5
|
|
28
|
+
with:
|
|
29
|
+
python-version: ${{ matrix.python-version }}
|
|
30
|
+
cache: pip
|
|
31
|
+
|
|
32
|
+
- name: Install
|
|
33
|
+
run: |
|
|
34
|
+
pip install -e axor-core/
|
|
35
|
+
pip install -e ".[dev]"
|
|
36
|
+
|
|
37
|
+
- name: Run tests
|
|
38
|
+
run: pytest tests/ -v --tb=short
|
|
39
|
+
|
|
40
|
+
publish:
|
|
41
|
+
name: Publish to PyPI
|
|
42
|
+
needs: test
|
|
43
|
+
runs-on: ubuntu-latest
|
|
44
|
+
if: startsWith(github.ref, 'refs/tags/v')
|
|
45
|
+
environment: pypi
|
|
46
|
+
|
|
47
|
+
permissions:
|
|
48
|
+
id-token: write
|
|
49
|
+
|
|
50
|
+
steps:
|
|
51
|
+
- uses: actions/checkout@v4
|
|
52
|
+
|
|
53
|
+
- uses: actions/setup-python@v5
|
|
54
|
+
with:
|
|
55
|
+
python-version: "3.12"
|
|
56
|
+
|
|
57
|
+
- name: Verify tag matches package version
|
|
58
|
+
run: |
|
|
59
|
+
python - << 'EOF'
|
|
60
|
+
import pathlib
|
|
61
|
+
import re
|
|
62
|
+
import sys
|
|
63
|
+
import tomllib
|
|
64
|
+
|
|
65
|
+
ref = "${{ github.ref_name }}"
|
|
66
|
+
m = re.fullmatch(r"v(\d+\.\d+\.\d+)", ref)
|
|
67
|
+
if not m:
|
|
68
|
+
print(f"Tag {ref!r} must match vX.Y.Z")
|
|
69
|
+
sys.exit(1)
|
|
70
|
+
|
|
71
|
+
tag_version = m.group(1)
|
|
72
|
+
data = tomllib.loads(pathlib.Path("pyproject.toml").read_text(encoding="utf-8"))
|
|
73
|
+
pkg_version = data["project"]["version"]
|
|
74
|
+
|
|
75
|
+
if tag_version != pkg_version:
|
|
76
|
+
print(f"Version mismatch: tag={tag_version}, pyproject={pkg_version}")
|
|
77
|
+
sys.exit(1)
|
|
78
|
+
|
|
79
|
+
print(f"Version check passed: {pkg_version}")
|
|
80
|
+
EOF
|
|
81
|
+
|
|
82
|
+
- name: Build
|
|
83
|
+
run: |
|
|
84
|
+
pip install hatchling build
|
|
85
|
+
python -m build
|
|
86
|
+
|
|
87
|
+
- name: Publish to PyPI
|
|
88
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Axor Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,357 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: axor-langchain
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Governance middleware for LangChain 1.0 agents — powered by axor-core compression engines
|
|
5
|
+
License: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Keywords: agents,axor,governance,langchain,langgraph,token-optimization
|
|
8
|
+
Classifier: Development Status :: 3 - Alpha
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
13
|
+
Requires-Python: >=3.11
|
|
14
|
+
Requires-Dist: langchain>=1.0.0
|
|
15
|
+
Requires-Dist: langgraph>=1.0.0
|
|
16
|
+
Provides-Extra: core
|
|
17
|
+
Requires-Dist: axor-core>=0.2.0; extra == 'core'
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest-asyncio; extra == 'dev'
|
|
20
|
+
Requires-Dist: pytest>=7.0; extra == 'dev'
|
|
21
|
+
Provides-Extra: memory
|
|
22
|
+
Requires-Dist: axor-memory-sqlite>=0.1.0; extra == 'memory'
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
|
|
25
|
+
# axor-langchain
|
|
26
|
+
|
|
27
|
+
[![CI](https://github.com/Bucha11/axor-langchain/actions/workflows/ci.yml/badge.svg)](https://github.com/Bucha11/axor-langchain/actions/workflows/ci.yml)
|
|
28
|
+
[![PyPI version](https://img.shields.io/pypi/v/axor-langchain.svg)](https://pypi.org/project/axor-langchain/)
|
|
29
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/axor-langchain.svg)](https://pypi.org/project/axor-langchain/)
|
|
30
|
+
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
|
|
31
|
+
|
|
32
|
+
**Cut token costs 40–80% in LangChain multi-agent pipelines.**
|
|
33
|
+
|
|
34
|
+
One middleware. No graph changes. Works with any `create_agent()` agent.
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## The problem
|
|
39
|
+
|
|
40
|
+
LangChain agents accumulate messages. By turn 10 you're paying for:
|
|
41
|
+
- Tool outputs from 8 turns ago that nobody needs
|
|
42
|
+
- Repeated context that hasn't changed
|
|
43
|
+
- Intermediate reasoning that's already been acted on
|
|
44
|
+
|
|
45
|
+
A 10-node research pipeline can balloon from 5k to 80k tokens by the last node — and you're billed for all of it on every API call.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install axor-langchain
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## Quick start
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
from langchain.agents import create_agent
|
|
61
|
+
from axor_langchain import AxorMiddleware
|
|
62
|
+
|
|
63
|
+
# before: bare agent
|
|
64
|
+
agent = create_agent("anthropic:claude-sonnet-4-5", tools=tools)
|
|
65
|
+
|
|
66
|
+
# after: governed agent — one line change
|
|
67
|
+
axor = AxorMiddleware(soft_token_limit=100_000, verbose=True)
|
|
68
|
+
agent = create_agent(
|
|
69
|
+
"anthropic:claude-sonnet-4-5",
|
|
70
|
+
tools=tools,
|
|
71
|
+
middleware=[axor],
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
result = await agent.ainvoke({"messages": [("user", "research transformers")]})
|
|
75
|
+
print(f"Tokens spent: {axor.total_tokens_spent}")
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
---
|
|
79
|
+
|
|
80
|
+
## What it does
|
|
81
|
+
|
|
82
|
+
### Context compression
|
|
83
|
+
|
|
84
|
+
Before each model call, `AxorMiddleware` compresses the message history based on session length:
|
|
85
|
+
|
|
86
|
+
| Session length | Mode | Window | Tool output cap |
|
|
87
|
+
|---------------|------|--------|----------------|
|
|
88
|
+
| ≤ 6 messages | broad | all | 8,000 chars |
|
|
89
|
+
| 7–20 messages | moderate | last 16 | 2,000 chars |
|
|
90
|
+
| 21+ messages | minimal | last 6 | 800 chars |
|
|
91
|
+
|
|
92
|
+
The longer the session, the more aggressively old context is compressed. Recent messages are always kept. System messages are never dropped.
|
|
93
|
+
|
|
94
|
+
**Typical savings:**
|
|
95
|
+
|
|
96
|
+
```
|
|
97
|
+
Turn 1: 1,200 tokens (no compression yet)
|
|
98
|
+
Turn 5: 1,800 tokens (moderate: old tools truncated)
|
|
99
|
+
Turn 10: 2,100 tokens (minimal: only recent 6 messages)
|
|
100
|
+
Turn 20: 2,300 tokens (stable — doesn't keep growing)
|
|
101
|
+
|
|
102
|
+
Without axor:
|
|
103
|
+
Turn 20: 45,000 tokens (full history accumulated)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Tool governance
|
|
107
|
+
|
|
108
|
+
Filter which tools each agent can call — without changing the graph:
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
# research agent: read + search only, no write/bash
|
|
112
|
+
axor = AxorMiddleware(
|
|
113
|
+
allowed_tools=["search", "read", "web_search"],
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
# audit agent: read only
|
|
117
|
+
axor = AxorMiddleware(
|
|
118
|
+
denied_tools=["write", "bash", "delete"],
|
|
119
|
+
)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Budget tracking
|
|
123
|
+
|
|
124
|
+
Hard stop when token limit is reached — no surprise bills:
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
axor = AxorMiddleware(
|
|
128
|
+
soft_token_limit=80_000, # log warning
|
|
129
|
+
hard_token_limit=100_000, # stop agent, return partial result
|
|
130
|
+
)
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Pinned personality
|
|
134
|
+
|
|
135
|
+
Personality is always the first system message — survives compression:
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
axor = AxorMiddleware(
|
|
139
|
+
personality="You are a security-focused code reviewer. "
|
|
140
|
+
"Always check for injection risks and hardcoded secrets.",
|
|
141
|
+
)
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Cross-session memory (optional)
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
pip install "axor-langchain[memory]"
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
from axor_memory_sqlite import SQLiteMemoryProvider
|
|
152
|
+
|
|
153
|
+
provider = SQLiteMemoryProvider("~/.axor/memory.db")
|
|
154
|
+
axor = AxorMiddleware(
|
|
155
|
+
memory_provider=provider,
|
|
156
|
+
memory_namespace="research-agent",
|
|
157
|
+
)
|
|
158
|
+
# after each session: last assistant message saved to SQLite
|
|
159
|
+
# next session: load with provider.load(MemoryQuery(...))
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Small context bypass
|
|
163
|
+
|
|
164
|
+
By default, contexts under 4,000 tokens skip the compression pipeline entirely. This avoids overhead on small/early turns where compression can't save more than it costs:
|
|
165
|
+
|
|
166
|
+
```python
|
|
167
|
+
# default: auto-bypass for small contexts (recommended)
|
|
168
|
+
axor = AxorMiddleware(soft_token_limit=100_000)
|
|
169
|
+
|
|
170
|
+
# disable bypass — always compress (aggressive savings, may add overhead on small turns)
|
|
171
|
+
axor = AxorMiddleware(bypass_token_threshold=0)
|
|
172
|
+
|
|
173
|
+
# custom threshold
|
|
174
|
+
axor = AxorMiddleware(bypass_token_threshold=8000)
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
Budget tracking and tool governance still apply even when compression is bypassed.
|
|
178
|
+
|
|
179
|
+
**Impact on savings (real data from claude-sonnet-4-6 benchmark):**
|
|
180
|
+
|
|
181
|
+
| | Without bypass | With bypass (4000) |
|
|
182
|
+
|--|---------------|-------------------|
|
|
183
|
+
| Total savings (4-turn + 8-turn runs combined) | +26.4% | **+24.8%** |
|
|
184
|
+
| Risk of negative savings on small contexts | Yes (-9% at 6 turns) | **No** (0% — passed through) |
|
|
185
|
+
| Large context savings (8t+) | +26-48% | **+26-48%** (unchanged) |
|
|
186
|
+
|
|
187
|
+
Bypass trades ~1.6 percentage points of total savings (26.4% → 24.8%) for stable, predictable behavior — you never pay more than without axor.
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## LangGraph integration
|
|
192
|
+
|
|
193
|
+
Works with any LangGraph `StateGraph` that uses LangChain agents as nodes:
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
from langgraph.graph import StateGraph, END
|
|
197
|
+
from langchain.agents import create_agent
|
|
198
|
+
from axor_langchain import AxorMiddleware
|
|
199
|
+
|
|
200
|
+
# each node gets its own governance config
|
|
201
|
+
research_axor = AxorMiddleware(
|
|
202
|
+
allowed_tools=["search", "web_search"],
|
|
203
|
+
soft_token_limit=50_000,
|
|
204
|
+
verbose=True,
|
|
205
|
+
)
|
|
206
|
+
writer_axor = AxorMiddleware(
|
|
207
|
+
allowed_tools=["read", "write"],
|
|
208
|
+
soft_token_limit=30_000,
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
research_agent = create_agent(
|
|
212
|
+
"anthropic:claude-sonnet-4-5",
|
|
213
|
+
tools=[search_tool, web_search_tool],
|
|
214
|
+
middleware=[research_axor],
|
|
215
|
+
)
|
|
216
|
+
writer_agent = create_agent(
|
|
217
|
+
"anthropic:claude-sonnet-4-5",
|
|
218
|
+
tools=[read_tool, write_tool],
|
|
219
|
+
middleware=[writer_axor],
|
|
220
|
+
)
|
|
221
|
+
|
|
222
|
+
workflow = StateGraph(State)
|
|
223
|
+
workflow.add_node("research", research_agent)
|
|
224
|
+
workflow.add_node("write", writer_agent)
|
|
225
|
+
workflow.add_edge("research", "write")
|
|
226
|
+
workflow.add_edge("write", END)
|
|
227
|
+
|
|
228
|
+
app = workflow.compile()
|
|
229
|
+
result = await app.ainvoke({"messages": [...]})
|
|
230
|
+
|
|
231
|
+
print(f"Research tokens: {research_axor.total_tokens_spent}")
|
|
232
|
+
print(f"Writer tokens: {writer_axor.total_tokens_spent}")
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
Per-node governance: each agent compresses its own context independently.
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
239
|
+
## Configuration reference
|
|
240
|
+
|
|
241
|
+
```python
|
|
242
|
+
AxorMiddleware(
|
|
243
|
+
soft_token_limit=None, # int | None — warning threshold
|
|
244
|
+
hard_token_limit=None, # int | None — stop threshold (default: soft * 1.5)
|
|
245
|
+
compression_mode="auto", # "auto" | "minimal" | "moderate" | "broad"
|
|
246
|
+
bypass_token_threshold=4000, # int — skip compression below this token count
|
|
247
|
+
allowed_tools=None, # list[str] | None — whitelist
|
|
248
|
+
denied_tools=None, # list[str] | None — blacklist
|
|
249
|
+
personality=None, # str | None — pinned system message
|
|
250
|
+
memory_provider=None, # MemoryProvider | None
|
|
251
|
+
memory_namespace="axor", # str
|
|
252
|
+
tool_error_handler=None, # Callable[[str, Exception], str] | None
|
|
253
|
+
tool_max_retries=0, # int — extra retry attempts
|
|
254
|
+
tool_retry_delay=0.0, # float — seconds between retries
|
|
255
|
+
track_tool_stats=False, # bool — per-tool call/latency/error tracking
|
|
256
|
+
verbose=False, # bool — log governance decisions
|
|
257
|
+
)
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
---
|
|
261
|
+
|
|
262
|
+
## Difference from axor-claude
|
|
263
|
+
|
|
264
|
+
| | axor-claude | axor-langchain |
|
|
265
|
+
|--|-------------|----------------|
|
|
266
|
+
| Provider | Anthropic only | any (OpenAI, Anthropic, Google…) |
|
|
267
|
+
| Framework | axor-core GovernedSession | LangChain create_agent() |
|
|
268
|
+
| Governance depth | full (context shaping, IntentLoop) | middleware (message compression, tool filter) |
|
|
269
|
+
| Best for | standalone coding agents | multi-agent LangGraph pipelines |
|
|
270
|
+
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
## Requirements
|
|
274
|
+
|
|
275
|
+
- Python 3.11+
|
|
276
|
+
- `langchain >= 1.0.0`
|
|
277
|
+
- `langgraph >= 1.0.0`
|
|
278
|
+
|
|
279
|
+
---
|
|
280
|
+
|
|
281
|
+
## License
|
|
282
|
+
|
|
283
|
+
MIT
|
|
284
|
+
|
|
285
|
+
---
|
|
286
|
+
|
|
287
|
+
## Benchmarks
|
|
288
|
+
|
|
289
|
+
### Live results (claude-sonnet-4-6, 3-node research pipeline)
|
|
290
|
+
|
|
291
|
+
Real API calls, real `usage_metadata` token counts. Pipeline: planner → researcher → writer.
|
|
292
|
+
Default `bypass_token_threshold=4000` — small contexts pass through without compression.
|
|
293
|
+
|
|
294
|
+
**Per-node breakdown (8 turns, auto mode):**
|
|
295
|
+
|
|
296
|
+
| Node | Without axor | With axor | Saved |
|
|
297
|
+
|------|-------------|-----------|-------|
|
|
298
|
+
| planner | 13,678 tok | 7,112 tok | **48.0%** |
|
|
299
|
+
| researcher | 27,677 tok | 19,750 tok | **28.6%** |
|
|
300
|
+
| writer | 44,963 tok | 36,811 tok | **18.1%** |
|
|
301
|
+
| **TOTAL** | **86,318 tok** | **63,673 tok** | **26.2%** |
|
|
302
|
+
|
|
303
|
+
Writer sees all accumulated context from planner + researcher — this is where token explosion happens in production.
|
|
304
|
+
|
|
305
|
+
**Across configurations:**
|
|
306
|
+
|
|
307
|
+
| Prior turns | Mode | Without axor | With axor | Savings | $/10K runs saved |
|
|
308
|
+
|------------|------|-------------|-----------|---------|-----------------|
|
|
309
|
+
| 4 turns | auto | 28,366 tok | 20,717 tok | **27.0%** | **$274** |
|
|
310
|
+
| 8 turns | auto | 86,318 tok | 63,673 tok | **26.2%** | **$733** |
|
|
311
|
+
| 8 turns | minimal | 65,243 tok | 52,451 tok | **19.6%** | **$438** |
|
|
312
|
+
|
|
313
|
+
> Pricing: claude-sonnet-4-6 @ $3/M input, $15/M output tokens.
|
|
314
|
+
> Results vary between runs due to LLM non-determinism. Use `--runs 3` for averaged results.
|
|
315
|
+
|
|
316
|
+
**Bypass impact (calculated from real data across the 4-turn and 8-turn runs):**
|
|
317
|
+
|
|
318
|
+
| | Without bypass | With bypass (default) |
|
|
319
|
+
|--|---------------|----------------------|
|
|
320
|
+
| Total savings | +26.4% | **+24.8%** |
|
|
321
|
+
| Negative savings risk | Yes | **No** |
|
|
322
|
+
| Large context savings | +26-48% | +26-48% (same) |
|
|
323
|
+
|
|
324
|
+
~1.6 percentage points less total savings (26.4% → 24.8%), but guaranteed no overhead on small contexts.
|
|
325
|
+
|
|
326
|
+
### Simulated benchmark (no API key needed)
|
|
327
|
+
|
|
328
|
+
Tests all middleware features: compression, tool governance, budget, tool retry, bypass detection.
|
|
329
|
+
|
|
330
|
+
```bash
|
|
331
|
+
python benchmark/run.py # all 17 scenarios
|
|
332
|
+
python benchmark/run.py --scenario bypass # test bypass only
|
|
333
|
+
python benchmark/run.py --json # CI-friendly output
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
### Live benchmark
|
|
337
|
+
|
|
338
|
+
```bash
|
|
339
|
+
export ANTHROPIC_API_KEY=sk-ant-...
|
|
340
|
+
python benchmark/live_graph.py --provider anthropic --turns 8
|
|
341
|
+
python benchmark/live_graph.py --provider anthropic --runs 3 # averaged
|
|
342
|
+
|
|
343
|
+
# OpenAI
|
|
344
|
+
export OPENAI_API_KEY=sk-...
|
|
345
|
+
python benchmark/live_graph.py --provider openai
|
|
346
|
+
```
|
|
347
|
+
|
|
348
|
+
| Flag | Default | Description |
|
|
349
|
+
|------|---------|-------------|
|
|
350
|
+
| `--provider` | `anthropic` | `anthropic` or `openai` |
|
|
351
|
+
| `--model` | `claude-sonnet-4-6` / `gpt-4.1-mini` | Override model |
|
|
352
|
+
| `--task` | research topic | Task for the agent |
|
|
353
|
+
| `--mode` | `auto` | Compression mode |
|
|
354
|
+
| `--turns` | `6` | Prior history turns |
|
|
355
|
+
| `--runs` | `1` | Number of runs for averaging |
|
|
356
|
+
| `--no-axor` | — | Baseline only |
|
|
357
|
+
| `--axor-only` | — | axor run only |
|