banna 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- banna-0.1.0/PKG-INFO +225 -0
- banna-0.1.0/README.md +181 -0
- banna-0.1.0/pyproject.toml +80 -0
- banna-0.1.0/setup.cfg +4 -0
- banna-0.1.0/src/banna.egg-info/PKG-INFO +225 -0
- banna-0.1.0/src/banna.egg-info/SOURCES.txt +100 -0
- banna-0.1.0/src/banna.egg-info/dependency_links.txt +1 -0
- banna-0.1.0/src/banna.egg-info/entry_points.txt +3 -0
- banna-0.1.0/src/banna.egg-info/requires.txt +22 -0
- banna-0.1.0/src/banna.egg-info/top_level.txt +1 -0
- banna-0.1.0/src/banna_agent/__init__.py +3 -0
- banna-0.1.0/src/banna_agent/adapters/__init__.py +1 -0
- banna-0.1.0/src/banna_agent/benchmarks/__init__.py +1 -0
- banna-0.1.0/src/banna_agent/benchmarks/gaia/__init__.py +1 -0
- banna-0.1.0/src/banna_agent/benchmarks/gaia/loader.py +172 -0
- banna-0.1.0/src/banna_agent/benchmarks/gaia/report.py +184 -0
- banna-0.1.0/src/banna_agent/benchmarks/gaia/runner.py +463 -0
- banna-0.1.0/src/banna_agent/benchmarks/gaia/scorer.py +245 -0
- banna-0.1.0/src/banna_agent/cli/__init__.py +9 -0
- banna-0.1.0/src/banna_agent/cli/__main__.py +6 -0
- banna-0.1.0/src/banna_agent/cli/app.py +579 -0
- banna-0.1.0/src/banna_agent/cli/commands.py +1211 -0
- banna-0.1.0/src/banna_agent/cli/display.py +356 -0
- banna-0.1.0/src/banna_agent/cli/session.py +213 -0
- banna-0.1.0/src/banna_agent/cli/theme.py +486 -0
- banna-0.1.0/src/banna_agent/core/__init__.py +1 -0
- banna-0.1.0/src/banna_agent/core/agent.py +320 -0
- banna-0.1.0/src/banna_agent/core/budget.py +63 -0
- banna-0.1.0/src/banna_agent/core/events.py +114 -0
- banna-0.1.0/src/banna_agent/core/state.py +137 -0
- banna-0.1.0/src/banna_agent/core/types.py +229 -0
- banna-0.1.0/src/banna_agent/llm/__init__.py +1 -0
- banna-0.1.0/src/banna_agent/llm/anthropic.py +335 -0
- banna-0.1.0/src/banna_agent/llm/base.py +248 -0
- banna-0.1.0/src/banna_agent/llm/config.py +107 -0
- banna-0.1.0/src/banna_agent/llm/gemini.py +296 -0
- banna-0.1.0/src/banna_agent/llm/ollama.py +231 -0
- banna-0.1.0/src/banna_agent/llm/openai.py +329 -0
- banna-0.1.0/src/banna_agent/llm/pricing.py +197 -0
- banna-0.1.0/src/banna_agent/llm/registry.py +131 -0
- banna-0.1.0/src/banna_agent/memory/__init__.py +5 -0
- banna-0.1.0/src/banna_agent/memory/base.py +149 -0
- banna-0.1.0/src/banna_agent/memory/chroma_store.py +171 -0
- banna-0.1.0/src/banna_agent/memory/compactor.py +172 -0
- banna-0.1.0/src/banna_agent/memory/embeddings.py +181 -0
- banna-0.1.0/src/banna_agent/memory/in_memory_store.py +106 -0
- banna-0.1.0/src/banna_agent/memory/jsonl_store.py +119 -0
- banna-0.1.0/src/banna_agent/memory/skill_harvester.py +144 -0
- banna-0.1.0/src/banna_agent/memory/skill_library.py +120 -0
- banna-0.1.0/src/banna_agent/policies/__init__.py +1 -0
- banna-0.1.0/src/banna_agent/policies/_plan_exec.py +507 -0
- banna-0.1.0/src/banna_agent/policies/_planning.py +188 -0
- banna-0.1.0/src/banna_agent/policies/base.py +36 -0
- banna-0.1.0/src/banna_agent/policies/best_first_over_plans.py +192 -0
- banna-0.1.0/src/banna_agent/policies/best_of_n.py +266 -0
- banna-0.1.0/src/banna_agent/policies/bfs_over_plans.py +223 -0
- banna-0.1.0/src/banna_agent/policies/dfs_over_plans.py +189 -0
- banna-0.1.0/src/banna_agent/policies/planner_react.py +284 -0
- banna-0.1.0/src/banna_agent/policies/react.py +807 -0
- banna-0.1.0/src/banna_agent/policies/verifier_retry.py +227 -0
- banna-0.1.0/src/banna_agent/tools/__init__.py +1 -0
- banna-0.1.0/src/banna_agent/tools/_command_runner.py +222 -0
- banna-0.1.0/src/banna_agent/tools/_http_cache.py +291 -0
- banna-0.1.0/src/banna_agent/tools/_parsers/__init__.py +23 -0
- banna-0.1.0/src/banna_agent/tools/_parsers/mypy.py +60 -0
- banna-0.1.0/src/banna_agent/tools/_parsers/pytest.py +70 -0
- banna-0.1.0/src/banna_agent/tools/_parsers/ruff.py +65 -0
- banna-0.1.0/src/banna_agent/tools/audio_transcribe.py +109 -0
- banna-0.1.0/src/banna_agent/tools/base.py +116 -0
- banna-0.1.0/src/banna_agent/tools/browser.py +470 -0
- banna-0.1.0/src/banna_agent/tools/calculator.py +100 -0
- banna-0.1.0/src/banna_agent/tools/file_reader.py +359 -0
- banna-0.1.0/src/banna_agent/tools/final_answer.py +99 -0
- banna-0.1.0/src/banna_agent/tools/grep.py +180 -0
- banna-0.1.0/src/banna_agent/tools/image_extract.py +172 -0
- banna-0.1.0/src/banna_agent/tools/list_files.py +113 -0
- banna-0.1.0/src/banna_agent/tools/memory.py +160 -0
- banna-0.1.0/src/banna_agent/tools/pdf_reader.py +312 -0
- banna-0.1.0/src/banna_agent/tools/plan.py +176 -0
- banna-0.1.0/src/banna_agent/tools/python_sandbox.py +173 -0
- banna-0.1.0/src/banna_agent/tools/run_shell.py +247 -0
- banna-0.1.0/src/banna_agent/tools/run_tests.py +147 -0
- banna-0.1.0/src/banna_agent/tools/search/__init__.py +42 -0
- banna-0.1.0/src/banna_agent/tools/search/backends/__init__.py +8 -0
- banna-0.1.0/src/banna_agent/tools/search/backends/arxiv.py +130 -0
- banna-0.1.0/src/banna_agent/tools/search/backends/biorxiv.py +108 -0
- banna-0.1.0/src/banna_agent/tools/search/backends/duckduckgo.py +122 -0
- banna-0.1.0/src/banna_agent/tools/search/backends/github.py +147 -0
- banna-0.1.0/src/banna_agent/tools/search/backends/google.py +108 -0
- banna-0.1.0/src/banna_agent/tools/search/backends/tavily.py +72 -0
- banna-0.1.0/src/banna_agent/tools/search/backends/yacy.py +244 -0
- banna-0.1.0/src/banna_agent/tools/search/base.py +145 -0
- banna-0.1.0/src/banna_agent/tools/search/tool.py +343 -0
- banna-0.1.0/src/banna_agent/tools/url_reader.py +117 -0
- banna-0.1.0/src/banna_agent/tools/xlsx_reader.py +251 -0
- banna-0.1.0/src/banna_agent/verifiers/__init__.py +41 -0
- banna-0.1.0/src/banna_agent/verifiers/arithmetic.py +244 -0
- banna-0.1.0/src/banna_agent/verifiers/base.py +170 -0
- banna-0.1.0/src/banna_agent/verifiers/citation.py +351 -0
- banna-0.1.0/src/banna_agent/verifiers/command.py +132 -0
- banna-0.1.0/src/banna_agent/verifiers/coverage.py +124 -0
- banna-0.1.0/src/banna_agent/verifiers/format.py +264 -0
banna-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: banna
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A from-scratch, provider-agnostic reasoning agent with a typed state substrate and verifier-guided search. Primary benchmark: GAIA.
|
|
5
|
+
Author-email: Siavash Monfared <monfared@alum.mit.edu>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/siavashmonfared/banna
|
|
8
|
+
Project-URL: Repository, https://github.com/siavashmonfared/banna
|
|
9
|
+
Project-URL: Issues, https://github.com/siavashmonfared/banna/issues
|
|
10
|
+
Keywords: agent,llm,react,verifier,gaia,reasoning,anthropic,openai,gemini
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
16
|
+
Classifier: Operating System :: MacOS
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
Requires-Dist: anthropic>=0.40
|
|
25
|
+
Requires-Dist: openai>=1.50
|
|
26
|
+
Requires-Dist: google-generativeai>=0.8
|
|
27
|
+
Requires-Dist: requests>=2.31
|
|
28
|
+
Requires-Dist: pydantic>=2.6
|
|
29
|
+
Requires-Dist: datasets>=2.19
|
|
30
|
+
Requires-Dist: pyyaml>=6.0
|
|
31
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
32
|
+
Requires-Dist: pypdf>=4.0
|
|
33
|
+
Requires-Dist: openpyxl>=3.1
|
|
34
|
+
Requires-Dist: pandas>=2.2
|
|
35
|
+
Requires-Dist: pillow>=10.0
|
|
36
|
+
Requires-Dist: rich>=13.7
|
|
37
|
+
Provides-Extra: dev
|
|
38
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
39
|
+
Requires-Dist: pytest-cov>=5.0; extra == "dev"
|
|
40
|
+
Requires-Dist: ruff>=0.5; extra == "dev"
|
|
41
|
+
Requires-Dist: mypy>=1.10; extra == "dev"
|
|
42
|
+
Provides-Extra: pdf
|
|
43
|
+
Requires-Dist: pdfplumber>=0.11; extra == "pdf"
|
|
44
|
+
|
|
45
|
+
# banna
|
|
46
|
+
|
|
47
|
+
A from-scratch, provider-agnostic reasoning agent with a **typed state substrate** and a **verifier-guided** loop. Built to study where ReAct-style agents fail on the **GAIA** benchmark and to fix those failures structurally — not with prompt patches.
|
|
48
|
+
|
|
49
|
+
No LangChain, no LlamaIndex, no smolagents in the core. The reasoning loop is a typed transition function over `(state, action, observation) → state'`; ReAct, verifier-retry, planner-ReAct, BFS/DFS/best-first-over-plans, and best-of-N are each ~200 LOC `Policy` implementations over that same substrate.
|
|
50
|
+
|
|
51
|
+
## What's interesting about this repo
|
|
52
|
+
|
|
53
|
+
1. **Forensic GAIA debugging.** A full-validation run on `gpt-5-nano` was instrumented end-to-end, traces were dumped per task, and seven distinct structural failure modes were diagnosed and fixed — not by prompt-tweaking, but by changing the loop. See [Failure modes & fixes](#failure-modes--fixes-the-c1c6-pass) below.
|
|
54
|
+
2. **Multi-axis budget tracker** that separates *productive* steps from *repair* steps. A model stuck in an `[empty_reply]` loop no longer burns its productive-step budget; instead it trips a separate `max_repair_steps` axis with a forced tool-choice escape.
|
|
55
|
+
3. **Per-verifier actionable nudges.** Each verifier (Arithmetic, Citation, Coverage, Format) attaches a `meta["nudge"]` to its fail verdicts that names the missing thing (the recomputed value, the missing evidence_id, the unsupported number, the empty field). The retry policy groups these by verifier and emits one line per kind — short enough that the model actually reads them.
|
|
56
|
+
4. **Budget-exhaustion synthesis.** When the agent runs out of steps mid-task, instead of returning `null`, a final forced-`final_answer` call gives it one last shot with a cheap fallback chain (last claim → last short text → none).
|
|
57
|
+
5. **Provider-agnostic tool forcing.** A single helper translates "force any tool" into OpenAI's `tool_choice: "required"`, Anthropic's `{type: "any"}`, and Gemini's `ANY` mode — used to break out of empty-reply loops.
|
|
58
|
+
|
|
59
|
+
## Architecture
|
|
60
|
+
|
|
61
|
+
```mermaid
|
|
62
|
+
flowchart TB
|
|
63
|
+
Q[User question<br/>+ optional attachment] --> AG[Agent.run_policy]
|
|
64
|
+
|
|
65
|
+
subgraph SUB[banna_agent runtime]
|
|
66
|
+
AG --> ST[AgentState<br/>typed: Trace · Evidence · Claim · Budget]
|
|
67
|
+
ST -->|propose| POL{Policy}
|
|
68
|
+
|
|
69
|
+
POL --> REACT[ReActPolicy]
|
|
70
|
+
POL --> VR[VerifierRetryPolicy<br/>wraps inner]
|
|
71
|
+
POL --> PL[PlannerReActPolicy]
|
|
72
|
+
POL --> BFS[BFS / DFS / Best-First<br/>over plans]
|
|
73
|
+
POL --> BON[BestOfNPolicy<br/>K trajectories + selector]
|
|
74
|
+
|
|
75
|
+
REACT -->|Action| EX[Execute]
|
|
76
|
+
VR -->|FINAL_ANSWER| VERS{Verifiers}
|
|
77
|
+
VERS -- pass --> COMMIT[commit]
|
|
78
|
+
VERS -- fail --> NUDGE[per-verifier nudge<br/>→ THINK feedback]
|
|
79
|
+
NUDGE --> POL
|
|
80
|
+
|
|
81
|
+
EX --> TOOLS[ToolRegistry]
|
|
82
|
+
TOOLS --> T1[search]
|
|
83
|
+
TOOLS --> T2[read_url]
|
|
84
|
+
TOOLS --> T3[read_file<br/>+ pdf / xlsx tools]
|
|
85
|
+
TOOLS --> T4[python_sandbox]
|
|
86
|
+
TOOLS --> T5[calculator]
|
|
87
|
+
TOOLS --> T6[run_shell · grep · list_files]
|
|
88
|
+
|
|
89
|
+
EX -->|observation| ST
|
|
90
|
+
ST --> BUD[BudgetTracker<br/>steps · repair_steps · wall · tokens · cost]
|
|
91
|
+
BUD -. trip .-> SYN[synthesize_on_exhaustion<br/>forced final_answer]
|
|
92
|
+
SYN --> COMMIT
|
|
93
|
+
end
|
|
94
|
+
|
|
95
|
+
subgraph VERIFIERS[Verifiers]
|
|
96
|
+
VF[FormatVerifier<br/>shape / empty answer]
|
|
97
|
+
VA[ArithmeticVerifier<br/>safe-AST recompute]
|
|
98
|
+
VC[CitationVerifier<br/>numeric + Jaccard support]
|
|
99
|
+
VG[CoverageVerifier<br/>claim ↔ evidence]
|
|
100
|
+
end
|
|
101
|
+
VERS --- VF
|
|
102
|
+
VERS --- VA
|
|
103
|
+
VERS --- VC
|
|
104
|
+
VERS --- VG
|
|
105
|
+
|
|
106
|
+
COMMIT --> ANS[answer]
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Install
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
# 1. From PyPI
|
|
113
|
+
pip install banna
|
|
114
|
+
|
|
115
|
+
# 2. From GitHub directly (no clone, no PyPI required)
|
|
116
|
+
pip install git+https://github.com/siavashmonfared/banna.git
|
|
117
|
+
|
|
118
|
+
# 3. Isolated install with pipx (recommended for CLI use)
|
|
119
|
+
pipx install git+https://github.com/siavashmonfared/banna.git
|
|
120
|
+
|
|
121
|
+
# 4. From a local clone (for development)
|
|
122
|
+
git clone https://github.com/siavashmonfared/banna.git
|
|
123
|
+
cd banna
|
|
124
|
+
pip install -e ".[dev]"
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
Any install path drops a `banna` (and `banna-agent`) executable on your `$PATH`.
|
|
128
|
+
|
|
129
|
+
## Quickstart
|
|
130
|
+
|
|
131
|
+
```bash
|
|
132
|
+
# set at least one provider key
|
|
133
|
+
export OPENAI_API_KEY=sk-... # or ANTHROPIC_API_KEY=... / GEMINI_API_KEY=...
|
|
134
|
+
|
|
135
|
+
# open the interactive REPL
|
|
136
|
+
banna --policy verifier_retry --provider openai --model gpt-5-nano
|
|
137
|
+
|
|
138
|
+
# or run a single GAIA Level-1 question (no REPL)
|
|
139
|
+
python -m banna_agent.benchmarks.gaia.runner \
|
|
140
|
+
--policy verifier_retry --provider openai --model gpt-5-nano \
|
|
141
|
+
--level 1 --n 1
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Example REPL session
|
|
145
|
+
|
|
146
|
+
```
|
|
147
|
+
$ banna --policy verifier_retry --provider openai --model gpt-5-nano
|
|
148
|
+
|
|
149
|
+
● banna · v0.1.0 provider=openai model=gpt-5-nano policy=verifier_retry
|
|
150
|
+
|
|
151
|
+
> How many studio albums did Mercedes Sosa release between 2000 and 2009?
|
|
152
|
+
|
|
153
|
+
thinking…
|
|
154
|
+
▸ search(query="Mercedes Sosa discography studio albums 2000-2009")
|
|
155
|
+
↳ 8 results · evidence_id ev_a3f
|
|
156
|
+
▸ read_url(url="https://en.wikipedia.org/wiki/Mercedes_Sosa")
|
|
157
|
+
↳ 12.4 kB · evidence_id ev_91c
|
|
158
|
+
thinking…
|
|
159
|
+
▸ final_answer(answer="3", evidence_ids=["ev_a3f", "ev_91c"])
|
|
160
|
+
verifiers: format ✓ citation ✓ coverage ✓ arithmetic skip
|
|
161
|
+
|
|
162
|
+
● banna
|
|
163
|
+
3
|
|
164
|
+
|
|
165
|
+
3 steps · 4.7s · 1840→210 tok · $0.0021
|
|
166
|
+
|
|
167
|
+
> /show trace
|
|
168
|
+
…step-by-step dump of action + observation + meta…
|
|
169
|
+
|
|
170
|
+
> /exit
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
The full GAIA validation runner (165 questions across L1/L2/L3) is in `experiments/02_gaia_full/run.py`.
|
|
174
|
+
|
|
175
|
+
## Failure modes & fixes (the C1–C6 pass)
|
|
176
|
+
|
|
177
|
+
Diagnosed from a full GAIA validation run on `gpt-5-nano`. Each fix lands as a structural change to the loop, with unit tests pinning the new behavior.
|
|
178
|
+
|
|
179
|
+
| ID | Failure mode | Root cause | Fix |
|
|
180
|
+
|----|--------------|------------|-----|
|
|
181
|
+
| C1 | `[empty_reply]` loops eat the step budget | Repair-style THINKs counted as productive steps | New `Budget.repair_steps_used` axis + `max_repair_steps=6`; `meta["repair"]=True` routes off the main counter |
|
|
182
|
+
| C2 | Model returns empty content + no tool call | No detection / no escape | After 2 consecutive empties with no evidence, force `tool_choice` to any tool (provider-agnostic) |
|
|
183
|
+
| C3 | `pred_answer=null` on budget exhaustion | Loop exits with no commit | `policy.synthesize_on_exhaustion(state)`: one threaded LLM call with forced `final_answer` + cheap fallback chain |
|
|
184
|
+
| C4 | L1 step cap too tight (8 steps) | Default budget profile | L1/L2/L3 caps bumped to 12/18/24 |
|
|
185
|
+
| C5 | Rich file tools never used on attachments | Hint steered model toward `read_file` even for PDF/XLSX | Extension-routed `_file_hint()` + cheap `_file_summary()` (pypdf page count, openpyxl sheet names, CSV header) |
|
|
186
|
+
| C6 | Verifier retries have a low repair rate | Feedback was generic | Each verifier populates `meta["nudge"]` with a verifier-specific actionable instruction; retry feedback groups by verifier |
|
|
187
|
+
|
|
188
|
+
## GAIA validation results
|
|
189
|
+
|
|
190
|
+
> The fixes are landed and tested (562 tests passing in this public repo, 568 in the private superset). The post-fix re-run on GAIA validation is the next thing on the queue.
|
|
191
|
+
|
|
192
|
+
| Run | Provider · model | Policy | Set | Accuracy | Notes |
|
|
193
|
+
|-----|------------------|--------|-----|----------|-------|
|
|
194
|
+
| Pre-C1–C6 | OpenAI · `gpt-5-nano` | verifier_retry | GAIA val (165 Q) | **33.9 %** (56 / 165) | $0.94, 7 structural bugs surfaced |
|
|
195
|
+
| Post-C1–C6 | OpenAI · `gpt-5-nano` | verifier_retry | GAIA val (165 Q) | _re-run pending_ | target ≥ 40 % |
|
|
196
|
+
| Post-C1–C6 | OpenAI · `gpt-5-nano` | best_of_n (K=3) | GAIA val (165 Q) | _re-run pending_ | stretch ≥ 45 % |
|
|
197
|
+
|
|
198
|
+
(Numbers populate once the re-run completes; CI is wired to gate on a held-out smoke subset.)
|
|
199
|
+
|
|
200
|
+
## Repo layout
|
|
201
|
+
|
|
202
|
+
```
|
|
203
|
+
src/banna_agent/
|
|
204
|
+
├── core/ AgentState, Trace, Action, Budget, EventLog, run_policy
|
|
205
|
+
├── llm/ provider-agnostic LLMClient + adapters (anthropic, openai, gemini, ollama)
|
|
206
|
+
├── tools/ search, read_url, read_file, pdf/xlsx tools, python_sandbox,
|
|
207
|
+
│ calculator, run_shell, grep, list_files, plan, memory, final_answer
|
|
208
|
+
├── policies/ react, planner_react, verifier_retry, bfs/dfs/best_first_over_plans, best_of_n
|
|
209
|
+
├── verifiers/ arithmetic, citation, coverage, format, command (+ base protocol)
|
|
210
|
+
├── benchmarks/ gaia/ (loader, runner, scorer, report)
|
|
211
|
+
├── memory/ in_memory_store, jsonl_store, chroma_store, compactor, skill_library, skill_harvester, embeddings
|
|
212
|
+
└── cli/ Rich-based REPL: /policy /budget /show /skills /compact /save /load …
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
Tests live in `tests/` and are organized to mirror `src/`. Run them with:
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
pytest -q
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
Current status: **562 passed, 0 failed** on this public branch (no external substrate dependencies).
|
|
222
|
+
|
|
223
|
+
## License
|
|
224
|
+
|
|
225
|
+
MIT
|
banna-0.1.0/README.md
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# banna
|
|
2
|
+
|
|
3
|
+
A from-scratch, provider-agnostic reasoning agent with a **typed state substrate** and a **verifier-guided** loop. Built to study where ReAct-style agents fail on the **GAIA** benchmark and to fix those failures structurally — not with prompt patches.
|
|
4
|
+
|
|
5
|
+
No LangChain, no LlamaIndex, no smolagents in the core. The reasoning loop is a typed transition function over `(state, action, observation) → state'`; ReAct, verifier-retry, planner-ReAct, BFS/DFS/best-first-over-plans, and best-of-N are each ~200 LOC `Policy` implementations over that same substrate.
|
|
6
|
+
|
|
7
|
+
## What's interesting about this repo
|
|
8
|
+
|
|
9
|
+
1. **Forensic GAIA debugging.** A full-validation run on `gpt-5-nano` was instrumented end-to-end, traces were dumped per task, and seven distinct structural failure modes were diagnosed and fixed — not by prompt-tweaking, but by changing the loop. See [Failure modes & fixes](#failure-modes--fixes-the-c1c6-pass) below.
|
|
10
|
+
2. **Multi-axis budget tracker** that separates *productive* steps from *repair* steps. A model stuck in an `[empty_reply]` loop no longer burns its productive-step budget; instead it trips a separate `max_repair_steps` axis with a forced tool-choice escape.
|
|
11
|
+
3. **Per-verifier actionable nudges.** Each verifier (Arithmetic, Citation, Coverage, Format) attaches a `meta["nudge"]` to its fail verdicts that names the missing thing (the recomputed value, the missing evidence_id, the unsupported number, the empty field). The retry policy groups these by verifier and emits one line per kind — short enough that the model actually reads them.
|
|
12
|
+
4. **Budget-exhaustion synthesis.** When the agent runs out of steps mid-task, instead of returning `null`, a final forced-`final_answer` call gives it one last shot with a cheap fallback chain (last claim → last short text → none).
|
|
13
|
+
5. **Provider-agnostic tool forcing.** A single helper translates "force any tool" into OpenAI's `tool_choice: "required"`, Anthropic's `{type: "any"}`, and Gemini's `ANY` mode — used to break out of empty-reply loops.
|
|
14
|
+
|
|
15
|
+
## Architecture
|
|
16
|
+
|
|
17
|
+
```mermaid
|
|
18
|
+
flowchart TB
|
|
19
|
+
Q[User question<br/>+ optional attachment] --> AG[Agent.run_policy]
|
|
20
|
+
|
|
21
|
+
subgraph SUB[banna_agent runtime]
|
|
22
|
+
AG --> ST[AgentState<br/>typed: Trace · Evidence · Claim · Budget]
|
|
23
|
+
ST -->|propose| POL{Policy}
|
|
24
|
+
|
|
25
|
+
POL --> REACT[ReActPolicy]
|
|
26
|
+
POL --> VR[VerifierRetryPolicy<br/>wraps inner]
|
|
27
|
+
POL --> PL[PlannerReActPolicy]
|
|
28
|
+
POL --> BFS[BFS / DFS / Best-First<br/>over plans]
|
|
29
|
+
POL --> BON[BestOfNPolicy<br/>K trajectories + selector]
|
|
30
|
+
|
|
31
|
+
REACT -->|Action| EX[Execute]
|
|
32
|
+
VR -->|FINAL_ANSWER| VERS{Verifiers}
|
|
33
|
+
VERS -- pass --> COMMIT[commit]
|
|
34
|
+
VERS -- fail --> NUDGE[per-verifier nudge<br/>→ THINK feedback]
|
|
35
|
+
NUDGE --> POL
|
|
36
|
+
|
|
37
|
+
EX --> TOOLS[ToolRegistry]
|
|
38
|
+
TOOLS --> T1[search]
|
|
39
|
+
TOOLS --> T2[read_url]
|
|
40
|
+
TOOLS --> T3[read_file<br/>+ pdf / xlsx tools]
|
|
41
|
+
TOOLS --> T4[python_sandbox]
|
|
42
|
+
TOOLS --> T5[calculator]
|
|
43
|
+
TOOLS --> T6[run_shell · grep · list_files]
|
|
44
|
+
|
|
45
|
+
EX -->|observation| ST
|
|
46
|
+
ST --> BUD[BudgetTracker<br/>steps · repair_steps · wall · tokens · cost]
|
|
47
|
+
BUD -. trip .-> SYN[synthesize_on_exhaustion<br/>forced final_answer]
|
|
48
|
+
SYN --> COMMIT
|
|
49
|
+
end
|
|
50
|
+
|
|
51
|
+
subgraph VERIFIERS[Verifiers]
|
|
52
|
+
VF[FormatVerifier<br/>shape / empty answer]
|
|
53
|
+
VA[ArithmeticVerifier<br/>safe-AST recompute]
|
|
54
|
+
VC[CitationVerifier<br/>numeric + Jaccard support]
|
|
55
|
+
VG[CoverageVerifier<br/>claim ↔ evidence]
|
|
56
|
+
end
|
|
57
|
+
VERS --- VF
|
|
58
|
+
VERS --- VA
|
|
59
|
+
VERS --- VC
|
|
60
|
+
VERS --- VG
|
|
61
|
+
|
|
62
|
+
COMMIT --> ANS[answer]
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## Install
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
# 1. From PyPI
|
|
69
|
+
pip install banna
|
|
70
|
+
|
|
71
|
+
# 2. From GitHub directly (no clone, no PyPI required)
|
|
72
|
+
pip install git+https://github.com/siavashmonfared/banna.git
|
|
73
|
+
|
|
74
|
+
# 3. Isolated install with pipx (recommended for CLI use)
|
|
75
|
+
pipx install git+https://github.com/siavashmonfared/banna.git
|
|
76
|
+
|
|
77
|
+
# 4. From a local clone (for development)
|
|
78
|
+
git clone https://github.com/siavashmonfared/banna.git
|
|
79
|
+
cd banna
|
|
80
|
+
pip install -e ".[dev]"
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Any install path drops a `banna` (and `banna-agent`) executable on your `$PATH`.
|
|
84
|
+
|
|
85
|
+
## Quickstart
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
# set at least one provider key
|
|
89
|
+
export OPENAI_API_KEY=sk-... # or ANTHROPIC_API_KEY=... / GEMINI_API_KEY=...
|
|
90
|
+
|
|
91
|
+
# open the interactive REPL
|
|
92
|
+
banna --policy verifier_retry --provider openai --model gpt-5-nano
|
|
93
|
+
|
|
94
|
+
# or run a single GAIA Level-1 question (no REPL)
|
|
95
|
+
python -m banna_agent.benchmarks.gaia.runner \
|
|
96
|
+
--policy verifier_retry --provider openai --model gpt-5-nano \
|
|
97
|
+
--level 1 --n 1
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Example REPL session
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
$ banna --policy verifier_retry --provider openai --model gpt-5-nano
|
|
104
|
+
|
|
105
|
+
● banna · v0.1.0 provider=openai model=gpt-5-nano policy=verifier_retry
|
|
106
|
+
|
|
107
|
+
> How many studio albums did Mercedes Sosa release between 2000 and 2009?
|
|
108
|
+
|
|
109
|
+
thinking…
|
|
110
|
+
▸ search(query="Mercedes Sosa discography studio albums 2000-2009")
|
|
111
|
+
↳ 8 results · evidence_id ev_a3f
|
|
112
|
+
▸ read_url(url="https://en.wikipedia.org/wiki/Mercedes_Sosa")
|
|
113
|
+
↳ 12.4 kB · evidence_id ev_91c
|
|
114
|
+
thinking…
|
|
115
|
+
▸ final_answer(answer="3", evidence_ids=["ev_a3f", "ev_91c"])
|
|
116
|
+
verifiers: format ✓ citation ✓ coverage ✓ arithmetic skip
|
|
117
|
+
|
|
118
|
+
● banna
|
|
119
|
+
3
|
|
120
|
+
|
|
121
|
+
3 steps · 4.7s · 1840→210 tok · $0.0021
|
|
122
|
+
|
|
123
|
+
> /show trace
|
|
124
|
+
…step-by-step dump of action + observation + meta…
|
|
125
|
+
|
|
126
|
+
> /exit
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
The full GAIA validation runner (165 questions across L1/L2/L3) is in `experiments/02_gaia_full/run.py`.
|
|
130
|
+
|
|
131
|
+
## Failure modes & fixes (the C1–C6 pass)
|
|
132
|
+
|
|
133
|
+
Diagnosed from a full GAIA validation run on `gpt-5-nano`. Each fix lands as a structural change to the loop, with unit tests pinning the new behavior.
|
|
134
|
+
|
|
135
|
+
| ID | Failure mode | Root cause | Fix |
|
|
136
|
+
|----|--------------|------------|-----|
|
|
137
|
+
| C1 | `[empty_reply]` loops eat the step budget | Repair-style THINKs counted as productive steps | New `Budget.repair_steps_used` axis + `max_repair_steps=6`; `meta["repair"]=True` routes off the main counter |
|
|
138
|
+
| C2 | Model returns empty content + no tool call | No detection / no escape | After 2 consecutive empties with no evidence, force `tool_choice` to any tool (provider-agnostic) |
|
|
139
|
+
| C3 | `pred_answer=null` on budget exhaustion | Loop exits with no commit | `policy.synthesize_on_exhaustion(state)`: one threaded LLM call with forced `final_answer` + cheap fallback chain |
|
|
140
|
+
| C4 | L1 step cap too tight (8 steps) | Default budget profile | L1/L2/L3 caps bumped to 12/18/24 |
|
|
141
|
+
| C5 | Rich file tools never used on attachments | Hint steered model toward `read_file` even for PDF/XLSX | Extension-routed `_file_hint()` + cheap `_file_summary()` (pypdf page count, openpyxl sheet names, CSV header) |
|
|
142
|
+
| C6 | Verifier retries have a low repair rate | Feedback was generic | Each verifier populates `meta["nudge"]` with a verifier-specific actionable instruction; retry feedback groups by verifier |
|
|
143
|
+
|
|
144
|
+
## GAIA validation results
|
|
145
|
+
|
|
146
|
+
> The fixes are landed and tested (562 tests passing in this public repo, 568 in the private superset). The post-fix re-run on GAIA validation is the next thing on the queue.
|
|
147
|
+
|
|
148
|
+
| Run | Provider · model | Policy | Set | Accuracy | Notes |
|
|
149
|
+
|-----|------------------|--------|-----|----------|-------|
|
|
150
|
+
| Pre-C1–C6 | OpenAI · `gpt-5-nano` | verifier_retry | GAIA val (165 Q) | **33.9 %** (56 / 165) | $0.94, 7 structural bugs surfaced |
|
|
151
|
+
| Post-C1–C6 | OpenAI · `gpt-5-nano` | verifier_retry | GAIA val (165 Q) | _re-run pending_ | target ≥ 40 % |
|
|
152
|
+
| Post-C1–C6 | OpenAI · `gpt-5-nano` | best_of_n (K=3) | GAIA val (165 Q) | _re-run pending_ | stretch ≥ 45 % |
|
|
153
|
+
|
|
154
|
+
(Numbers populate once the re-run completes; CI is wired to gate on a held-out smoke subset.)
|
|
155
|
+
|
|
156
|
+
## Repo layout
|
|
157
|
+
|
|
158
|
+
```
|
|
159
|
+
src/banna_agent/
|
|
160
|
+
├── core/ AgentState, Trace, Action, Budget, EventLog, run_policy
|
|
161
|
+
├── llm/ provider-agnostic LLMClient + adapters (anthropic, openai, gemini, ollama)
|
|
162
|
+
├── tools/ search, read_url, read_file, pdf/xlsx tools, python_sandbox,
|
|
163
|
+
│ calculator, run_shell, grep, list_files, plan, memory, final_answer
|
|
164
|
+
├── policies/ react, planner_react, verifier_retry, bfs/dfs/best_first_over_plans, best_of_n
|
|
165
|
+
├── verifiers/ arithmetic, citation, coverage, format, command (+ base protocol)
|
|
166
|
+
├── benchmarks/ gaia/ (loader, runner, scorer, report)
|
|
167
|
+
├── memory/ in_memory_store, jsonl_store, chroma_store, compactor, skill_library, skill_harvester, embeddings
|
|
168
|
+
└── cli/ Rich-based REPL: /policy /budget /show /skills /compact /save /load …
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
Tests live in `tests/` and are organized to mirror `src/`. Run them with:
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
pytest -q
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
Current status: **562 passed, 0 failed** on this public branch (no external substrate dependencies).
|
|
178
|
+
|
|
179
|
+
## License
|
|
180
|
+
|
|
181
|
+
MIT
|
banna-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "banna"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "A from-scratch, provider-agnostic reasoning agent with a typed state substrate and verifier-guided search. Primary benchmark: GAIA."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
authors = [{ name = "Siavash Monfared", email = "monfared@alum.mit.edu" }]
|
|
12
|
+
license = { text = "MIT" }
|
|
13
|
+
keywords = ["agent", "llm", "react", "verifier", "gaia", "reasoning", "anthropic", "openai", "gemini"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Intended Audience :: Science/Research",
|
|
18
|
+
"License :: OSI Approved :: MIT License",
|
|
19
|
+
"Operating System :: POSIX :: Linux",
|
|
20
|
+
"Operating System :: MacOS",
|
|
21
|
+
"Programming Language :: Python :: 3",
|
|
22
|
+
"Programming Language :: Python :: 3.10",
|
|
23
|
+
"Programming Language :: Python :: 3.11",
|
|
24
|
+
"Programming Language :: Python :: 3.12",
|
|
25
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
dependencies = [
|
|
29
|
+
"anthropic>=0.40",
|
|
30
|
+
"openai>=1.50",
|
|
31
|
+
"google-generativeai>=0.8",
|
|
32
|
+
"requests>=2.31",
|
|
33
|
+
"pydantic>=2.6",
|
|
34
|
+
"datasets>=2.19",
|
|
35
|
+
"pyyaml>=6.0",
|
|
36
|
+
"beautifulsoup4>=4.12",
|
|
37
|
+
"pypdf>=4.0",
|
|
38
|
+
"openpyxl>=3.1",
|
|
39
|
+
"pandas>=2.2",
|
|
40
|
+
"pillow>=10.0",
|
|
41
|
+
"rich>=13.7",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
[project.urls]
|
|
45
|
+
Homepage = "https://github.com/siavashmonfared/banna"
|
|
46
|
+
Repository = "https://github.com/siavashmonfared/banna"
|
|
47
|
+
Issues = "https://github.com/siavashmonfared/banna/issues"
|
|
48
|
+
|
|
49
|
+
[project.optional-dependencies]
|
|
50
|
+
dev = [
|
|
51
|
+
"pytest>=8.0",
|
|
52
|
+
"pytest-cov>=5.0",
|
|
53
|
+
"ruff>=0.5",
|
|
54
|
+
"mypy>=1.10",
|
|
55
|
+
]
|
|
56
|
+
# Optional PDF-tables support. pypdf (core dep) handles text extraction;
|
|
57
|
+
# pdfplumber adds table-structure parsing for GAIA tasks where the
|
|
58
|
+
# answer lives in a PDF table. Install with `pip install ".[pdf]"`.
|
|
59
|
+
pdf = [
|
|
60
|
+
"pdfplumber>=0.11",
|
|
61
|
+
]
|
|
62
|
+
|
|
63
|
+
[project.scripts]
|
|
64
|
+
banna = "banna_agent.cli:main"
|
|
65
|
+
banna-agent = "banna_agent.cli:main"
|
|
66
|
+
|
|
67
|
+
[tool.setuptools.packages.find]
|
|
68
|
+
where = ["src"]
|
|
69
|
+
|
|
70
|
+
[tool.ruff]
|
|
71
|
+
line-length = 100
|
|
72
|
+
target-version = "py310"
|
|
73
|
+
|
|
74
|
+
[tool.ruff.lint]
|
|
75
|
+
select = ["E", "F", "W", "I", "UP", "B", "SIM", "TID", "RUF"]
|
|
76
|
+
ignore = ["E501"]
|
|
77
|
+
|
|
78
|
+
[tool.pytest.ini_options]
|
|
79
|
+
testpaths = ["tests"]
|
|
80
|
+
addopts = "-ra --strict-markers"
|
banna-0.1.0/setup.cfg
ADDED