banna 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. banna-0.1.0/PKG-INFO +225 -0
  2. banna-0.1.0/README.md +181 -0
  3. banna-0.1.0/pyproject.toml +80 -0
  4. banna-0.1.0/setup.cfg +4 -0
  5. banna-0.1.0/src/banna.egg-info/PKG-INFO +225 -0
  6. banna-0.1.0/src/banna.egg-info/SOURCES.txt +100 -0
  7. banna-0.1.0/src/banna.egg-info/dependency_links.txt +1 -0
  8. banna-0.1.0/src/banna.egg-info/entry_points.txt +3 -0
  9. banna-0.1.0/src/banna.egg-info/requires.txt +22 -0
  10. banna-0.1.0/src/banna.egg-info/top_level.txt +1 -0
  11. banna-0.1.0/src/banna_agent/__init__.py +3 -0
  12. banna-0.1.0/src/banna_agent/adapters/__init__.py +1 -0
  13. banna-0.1.0/src/banna_agent/benchmarks/__init__.py +1 -0
  14. banna-0.1.0/src/banna_agent/benchmarks/gaia/__init__.py +1 -0
  15. banna-0.1.0/src/banna_agent/benchmarks/gaia/loader.py +172 -0
  16. banna-0.1.0/src/banna_agent/benchmarks/gaia/report.py +184 -0
  17. banna-0.1.0/src/banna_agent/benchmarks/gaia/runner.py +463 -0
  18. banna-0.1.0/src/banna_agent/benchmarks/gaia/scorer.py +245 -0
  19. banna-0.1.0/src/banna_agent/cli/__init__.py +9 -0
  20. banna-0.1.0/src/banna_agent/cli/__main__.py +6 -0
  21. banna-0.1.0/src/banna_agent/cli/app.py +579 -0
  22. banna-0.1.0/src/banna_agent/cli/commands.py +1211 -0
  23. banna-0.1.0/src/banna_agent/cli/display.py +356 -0
  24. banna-0.1.0/src/banna_agent/cli/session.py +213 -0
  25. banna-0.1.0/src/banna_agent/cli/theme.py +486 -0
  26. banna-0.1.0/src/banna_agent/core/__init__.py +1 -0
  27. banna-0.1.0/src/banna_agent/core/agent.py +320 -0
  28. banna-0.1.0/src/banna_agent/core/budget.py +63 -0
  29. banna-0.1.0/src/banna_agent/core/events.py +114 -0
  30. banna-0.1.0/src/banna_agent/core/state.py +137 -0
  31. banna-0.1.0/src/banna_agent/core/types.py +229 -0
  32. banna-0.1.0/src/banna_agent/llm/__init__.py +1 -0
  33. banna-0.1.0/src/banna_agent/llm/anthropic.py +335 -0
  34. banna-0.1.0/src/banna_agent/llm/base.py +248 -0
  35. banna-0.1.0/src/banna_agent/llm/config.py +107 -0
  36. banna-0.1.0/src/banna_agent/llm/gemini.py +296 -0
  37. banna-0.1.0/src/banna_agent/llm/ollama.py +231 -0
  38. banna-0.1.0/src/banna_agent/llm/openai.py +329 -0
  39. banna-0.1.0/src/banna_agent/llm/pricing.py +197 -0
  40. banna-0.1.0/src/banna_agent/llm/registry.py +131 -0
  41. banna-0.1.0/src/banna_agent/memory/__init__.py +5 -0
  42. banna-0.1.0/src/banna_agent/memory/base.py +149 -0
  43. banna-0.1.0/src/banna_agent/memory/chroma_store.py +171 -0
  44. banna-0.1.0/src/banna_agent/memory/compactor.py +172 -0
  45. banna-0.1.0/src/banna_agent/memory/embeddings.py +181 -0
  46. banna-0.1.0/src/banna_agent/memory/in_memory_store.py +106 -0
  47. banna-0.1.0/src/banna_agent/memory/jsonl_store.py +119 -0
  48. banna-0.1.0/src/banna_agent/memory/skill_harvester.py +144 -0
  49. banna-0.1.0/src/banna_agent/memory/skill_library.py +120 -0
  50. banna-0.1.0/src/banna_agent/policies/__init__.py +1 -0
  51. banna-0.1.0/src/banna_agent/policies/_plan_exec.py +507 -0
  52. banna-0.1.0/src/banna_agent/policies/_planning.py +188 -0
  53. banna-0.1.0/src/banna_agent/policies/base.py +36 -0
  54. banna-0.1.0/src/banna_agent/policies/best_first_over_plans.py +192 -0
  55. banna-0.1.0/src/banna_agent/policies/best_of_n.py +266 -0
  56. banna-0.1.0/src/banna_agent/policies/bfs_over_plans.py +223 -0
  57. banna-0.1.0/src/banna_agent/policies/dfs_over_plans.py +189 -0
  58. banna-0.1.0/src/banna_agent/policies/planner_react.py +284 -0
  59. banna-0.1.0/src/banna_agent/policies/react.py +807 -0
  60. banna-0.1.0/src/banna_agent/policies/verifier_retry.py +227 -0
  61. banna-0.1.0/src/banna_agent/tools/__init__.py +1 -0
  62. banna-0.1.0/src/banna_agent/tools/_command_runner.py +222 -0
  63. banna-0.1.0/src/banna_agent/tools/_http_cache.py +291 -0
  64. banna-0.1.0/src/banna_agent/tools/_parsers/__init__.py +23 -0
  65. banna-0.1.0/src/banna_agent/tools/_parsers/mypy.py +60 -0
  66. banna-0.1.0/src/banna_agent/tools/_parsers/pytest.py +70 -0
  67. banna-0.1.0/src/banna_agent/tools/_parsers/ruff.py +65 -0
  68. banna-0.1.0/src/banna_agent/tools/audio_transcribe.py +109 -0
  69. banna-0.1.0/src/banna_agent/tools/base.py +116 -0
  70. banna-0.1.0/src/banna_agent/tools/browser.py +470 -0
  71. banna-0.1.0/src/banna_agent/tools/calculator.py +100 -0
  72. banna-0.1.0/src/banna_agent/tools/file_reader.py +359 -0
  73. banna-0.1.0/src/banna_agent/tools/final_answer.py +99 -0
  74. banna-0.1.0/src/banna_agent/tools/grep.py +180 -0
  75. banna-0.1.0/src/banna_agent/tools/image_extract.py +172 -0
  76. banna-0.1.0/src/banna_agent/tools/list_files.py +113 -0
  77. banna-0.1.0/src/banna_agent/tools/memory.py +160 -0
  78. banna-0.1.0/src/banna_agent/tools/pdf_reader.py +312 -0
  79. banna-0.1.0/src/banna_agent/tools/plan.py +176 -0
  80. banna-0.1.0/src/banna_agent/tools/python_sandbox.py +173 -0
  81. banna-0.1.0/src/banna_agent/tools/run_shell.py +247 -0
  82. banna-0.1.0/src/banna_agent/tools/run_tests.py +147 -0
  83. banna-0.1.0/src/banna_agent/tools/search/__init__.py +42 -0
  84. banna-0.1.0/src/banna_agent/tools/search/backends/__init__.py +8 -0
  85. banna-0.1.0/src/banna_agent/tools/search/backends/arxiv.py +130 -0
  86. banna-0.1.0/src/banna_agent/tools/search/backends/biorxiv.py +108 -0
  87. banna-0.1.0/src/banna_agent/tools/search/backends/duckduckgo.py +122 -0
  88. banna-0.1.0/src/banna_agent/tools/search/backends/github.py +147 -0
  89. banna-0.1.0/src/banna_agent/tools/search/backends/google.py +108 -0
  90. banna-0.1.0/src/banna_agent/tools/search/backends/tavily.py +72 -0
  91. banna-0.1.0/src/banna_agent/tools/search/backends/yacy.py +244 -0
  92. banna-0.1.0/src/banna_agent/tools/search/base.py +145 -0
  93. banna-0.1.0/src/banna_agent/tools/search/tool.py +343 -0
  94. banna-0.1.0/src/banna_agent/tools/url_reader.py +117 -0
  95. banna-0.1.0/src/banna_agent/tools/xlsx_reader.py +251 -0
  96. banna-0.1.0/src/banna_agent/verifiers/__init__.py +41 -0
  97. banna-0.1.0/src/banna_agent/verifiers/arithmetic.py +244 -0
  98. banna-0.1.0/src/banna_agent/verifiers/base.py +170 -0
  99. banna-0.1.0/src/banna_agent/verifiers/citation.py +351 -0
  100. banna-0.1.0/src/banna_agent/verifiers/command.py +132 -0
  101. banna-0.1.0/src/banna_agent/verifiers/coverage.py +124 -0
  102. banna-0.1.0/src/banna_agent/verifiers/format.py +264 -0
banna-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,225 @@
1
+ Metadata-Version: 2.4
2
+ Name: banna
3
+ Version: 0.1.0
4
+ Summary: A from-scratch, provider-agnostic reasoning agent with a typed state substrate and verifier-guided search. Primary benchmark: GAIA.
5
+ Author-email: Siavash Monfared <monfared@alum.mit.edu>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/siavashmonfared/banna
8
+ Project-URL: Repository, https://github.com/siavashmonfared/banna
9
+ Project-URL: Issues, https://github.com/siavashmonfared/banna/issues
10
+ Keywords: agent,llm,react,verifier,gaia,reasoning,anthropic,openai,gemini
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: POSIX :: Linux
16
+ Classifier: Operating System :: MacOS
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
22
+ Requires-Python: >=3.10
23
+ Description-Content-Type: text/markdown
24
+ Requires-Dist: anthropic>=0.40
25
+ Requires-Dist: openai>=1.50
26
+ Requires-Dist: google-generativeai>=0.8
27
+ Requires-Dist: requests>=2.31
28
+ Requires-Dist: pydantic>=2.6
29
+ Requires-Dist: datasets>=2.19
30
+ Requires-Dist: pyyaml>=6.0
31
+ Requires-Dist: beautifulsoup4>=4.12
32
+ Requires-Dist: pypdf>=4.0
33
+ Requires-Dist: openpyxl>=3.1
34
+ Requires-Dist: pandas>=2.2
35
+ Requires-Dist: pillow>=10.0
36
+ Requires-Dist: rich>=13.7
37
+ Provides-Extra: dev
38
+ Requires-Dist: pytest>=8.0; extra == "dev"
39
+ Requires-Dist: pytest-cov>=5.0; extra == "dev"
40
+ Requires-Dist: ruff>=0.5; extra == "dev"
41
+ Requires-Dist: mypy>=1.10; extra == "dev"
42
+ Provides-Extra: pdf
43
+ Requires-Dist: pdfplumber>=0.11; extra == "pdf"
44
+
45
+ # banna
46
+
47
+ A from-scratch, provider-agnostic reasoning agent with a **typed state substrate** and a **verifier-guided** loop. Built to study where ReAct-style agents fail on the **GAIA** benchmark and to fix those failures structurally — not with prompt patches.
48
+
49
+ No LangChain, no LlamaIndex, no smolagents in the core. The reasoning loop is a typed transition function over `(state, action, observation) → state'`; ReAct, verifier-retry, planner-ReAct, BFS/DFS/best-first-over-plans, and best-of-N are each ~200 LOC `Policy` implementations over that same substrate.
50
+
51
+ ## What's interesting about this repo
52
+
53
+ 1. **Forensic GAIA debugging.** A full-validation run on `gpt-5-nano` was instrumented end-to-end, traces were dumped per task, and seven distinct structural failure modes were diagnosed and fixed — not by prompt-tweaking, but by changing the loop. See [Failure modes & fixes](#failure-modes--fixes-the-c1c6-pass) below.
54
+ 2. **Multi-axis budget tracker** that separates *productive* steps from *repair* steps. A model stuck in an `[empty_reply]` loop no longer burns its productive-step budget; instead it trips a separate `max_repair_steps` axis with a forced tool-choice escape.
55
+ 3. **Per-verifier actionable nudges.** Each verifier (Arithmetic, Citation, Coverage, Format) attaches a `meta["nudge"]` to its fail verdicts that names the missing thing (the recomputed value, the missing evidence_id, the unsupported number, the empty field). The retry policy groups these by verifier and emits one line per kind — short enough that the model actually reads them.
56
+ 4. **Budget-exhaustion synthesis.** When the agent runs out of steps mid-task, instead of returning `null`, a final forced-`final_answer` call gives it one last shot with a cheap fallback chain (last claim → last short text → none).
57
+ 5. **Provider-agnostic tool forcing.** A single helper translates "force any tool" into OpenAI's `tool_choice: "required"`, Anthropic's `{type: "any"}`, and Gemini's `ANY` mode — used to break out of empty-reply loops.
58
+
59
+ ## Architecture
60
+
61
+ ```mermaid
62
+ flowchart TB
63
+ Q[User question<br/>+ optional attachment] --> AG[Agent.run_policy]
64
+
65
+ subgraph SUB[banna_agent runtime]
66
+ AG --> ST[AgentState<br/>typed: Trace · Evidence · Claim · Budget]
67
+ ST -->|propose| POL{Policy}
68
+
69
+ POL --> REACT[ReActPolicy]
70
+ POL --> VR[VerifierRetryPolicy<br/>wraps inner]
71
+ POL --> PL[PlannerReActPolicy]
72
+ POL --> BFS[BFS / DFS / Best-First<br/>over plans]
73
+ POL --> BON[BestOfNPolicy<br/>K trajectories + selector]
74
+
75
+ REACT -->|Action| EX[Execute]
76
+ VR -->|FINAL_ANSWER| VERS{Verifiers}
77
+ VERS -- pass --> COMMIT[commit]
78
+ VERS -- fail --> NUDGE[per-verifier nudge<br/>→ THINK feedback]
79
+ NUDGE --> POL
80
+
81
+ EX --> TOOLS[ToolRegistry]
82
+ TOOLS --> T1[search]
83
+ TOOLS --> T2[read_url]
84
+ TOOLS --> T3[read_file<br/>+ pdf / xlsx tools]
85
+ TOOLS --> T4[python_sandbox]
86
+ TOOLS --> T5[calculator]
87
+ TOOLS --> T6[run_shell · grep · list_files]
88
+
89
+ EX -->|observation| ST
90
+ ST --> BUD[BudgetTracker<br/>steps · repair_steps · wall · tokens · cost]
91
+ BUD -. trip .-> SYN[synthesize_on_exhaustion<br/>forced final_answer]
92
+ SYN --> COMMIT
93
+ end
94
+
95
+ subgraph VERIFIERS[Verifiers]
96
+ VF[FormatVerifier<br/>shape / empty answer]
97
+ VA[ArithmeticVerifier<br/>safe-AST recompute]
98
+ VC[CitationVerifier<br/>numeric + Jaccard support]
99
+ VG[CoverageVerifier<br/>claim ↔ evidence]
100
+ end
101
+ VERS --- VF
102
+ VERS --- VA
103
+ VERS --- VC
104
+ VERS --- VG
105
+
106
+ COMMIT --> ANS[answer]
107
+ ```
108
+
109
+ ## Install
110
+
111
+ ```bash
112
+ # 1. From PyPI (once published)
113
+ pip install banna
114
+
115
+ # 2. From GitHub directly (no clone, no PyPI required)
116
+ pip install git+https://github.com/siavashmonfared/banna.git
117
+
118
+ # 3. Isolated install with pipx (recommended for CLI use)
119
+ pipx install git+https://github.com/siavashmonfared/banna.git
120
+
121
+ # 4. From a local clone (for development)
122
+ git clone https://github.com/siavashmonfared/banna.git
123
+ cd banna
124
+ pip install -e ".[dev]"
125
+ ```
126
+
127
+ Any install path drops a `banna` (and `banna-agent`) executable on your `$PATH`.
128
+
129
+ ## Quickstart
130
+
131
+ ```bash
132
+ # set at least one provider key
133
+ export OPENAI_API_KEY=sk-... # or ANTHROPIC_API_KEY=... / GEMINI_API_KEY=...
134
+
135
+ # open the interactive REPL
136
+ banna --policy verifier_retry --provider openai --model gpt-5-nano
137
+
138
+ # or run a single GAIA Level-1 question (no REPL)
139
+ python -m banna_agent.benchmarks.gaia.runner \
140
+ --policy verifier_retry --provider openai --model gpt-5-nano \
141
+ --level 1 --n 1
142
+ ```
143
+
144
+ ### Example REPL session
145
+
146
+ ```
147
+ $ banna --policy verifier_retry --provider openai --model gpt-5-nano
148
+
149
+ ● banna · v0.1.0 provider=openai model=gpt-5-nano policy=verifier_retry
150
+
151
+ > How many studio albums did Mercedes Sosa release between 2000 and 2009?
152
+
153
+ thinking…
154
+ ▸ search(query="Mercedes Sosa discography studio albums 2000-2009")
155
+ ↳ 8 results · evidence_id ev_a3f
156
+ ▸ read_url(url="https://en.wikipedia.org/wiki/Mercedes_Sosa")
157
+ ↳ 12.4 kB · evidence_id ev_91c
158
+ thinking…
159
+ ▸ final_answer(answer="3", evidence_ids=["ev_a3f", "ev_91c"])
160
+ verifiers: format ✓ citation ✓ coverage ✓ arithmetic skip
161
+
162
+ ● banna
163
+ 3
164
+
165
+ 3 steps · 4.7s · 1840→210 tok · $0.0021
166
+
167
+ > /show trace
168
+ …step-by-step dump of action + observation + meta…
169
+
170
+ > /exit
171
+ ```
172
+
173
+ The full GAIA validation runner (165 questions across L1/L2/L3) is in `experiments/02_gaia_full/run.py`.
174
+
175
+ ## Failure modes & fixes (the C1–C6 pass)
176
+
177
+ Diagnosed from a full GAIA validation run on `gpt-5-nano`. Each fix lands as a structural change to the loop, with unit tests pinning the new behavior.
178
+
179
+ | ID | Failure mode | Root cause | Fix |
180
+ |----|--------------|------------|-----|
181
+ | C1 | `[empty_reply]` loops eat the step budget | Repair-style THINKs counted as productive steps | New `Budget.repair_steps_used` axis + `max_repair_steps=6`; `meta["repair"]=True` routes off the main counter |
182
+ | C2 | Model returns empty content + no tool call | No detection / no escape | After 2 consecutive empties with no evidence, force `tool_choice` to any tool (provider-agnostic) |
183
+ | C3 | `pred_answer=null` on budget exhaustion | Loop exits with no commit | `policy.synthesize_on_exhaustion(state)`: one threaded LLM call with forced `final_answer` + cheap fallback chain |
184
+ | C4 | L1 step cap too tight (8 steps) | Default budget profile | L1/L2/L3 caps bumped to 12/18/24 |
185
+ | C5 | Rich file tools never used on attachments | Hint steered model toward `read_file` even for PDF/XLSX | Extension-routed `_file_hint()` + cheap `_file_summary()` (pypdf page count, openpyxl sheet names, CSV header) |
186
+ | C6 | Verifier retries have a low repair rate | Feedback was generic | Each verifier populates `meta["nudge"]` with a verifier-specific actionable instruction; retry feedback groups by verifier |
187
+
188
+ ## GAIA validation results
189
+
190
+ > The fixes have landed and are tested (562 tests passing in this public repo, 568 in the private superset). The post-fix re-run on GAIA validation is next in the queue.
191
+
192
+ | Run | Provider · model | Policy | Set | Accuracy | Notes |
193
+ |-----|------------------|--------|-----|----------|-------|
194
+ | Pre-C1–C6 | OpenAI · `gpt-5-nano` | verifier_retry | GAIA val (165 Q) | **33.9 %** (56 / 165) | $0.94, 7 structural bugs surfaced |
195
+ | Post-C1–C6 | OpenAI · `gpt-5-nano` | verifier_retry | GAIA val (165 Q) | _re-run pending_ | target ≥ 40 % |
196
+ | Post-C1–C6 | OpenAI · `gpt-5-nano` | best_of_n (K=3) | GAIA val (165 Q) | _re-run pending_ | stretch ≥ 45 % |
197
+
198
+ (Numbers populate once the re-run completes; CI is wired to gate on a held-out smoke subset.)
199
+
200
+ ## Repo layout
201
+
202
+ ```
203
+ src/banna_agent/
204
+ ├── core/ AgentState, Trace, Action, Budget, EventLog, run_policy
205
+ ├── llm/ provider-agnostic LLMClient + adapters (anthropic, openai, gemini, ollama)
206
+ ├── tools/ search, read_url, read_file, pdf/xlsx tools, python_sandbox,
207
+ │ calculator, run_shell, grep, list_files, plan, memory, final_answer
208
+ ├── policies/ react, planner_react, verifier_retry, bfs/dfs/best_first_over_plans, best_of_n
209
+ ├── verifiers/ arithmetic, citation, coverage, format, command (+ base protocol)
210
+ ├── benchmarks/ gaia/ (loader, runner, scorer, report)
211
+ ├── memory/ in_memory_store, jsonl_store, chroma_store, skill_library, skill_harvester, compactor, embeddings
212
+ └── cli/ Rich-based REPL: /policy /budget /show /skills /compact /save /load …
213
+ ```
214
+
215
+ Tests live in `tests/` and are organized to mirror `src/`. Run them with:
216
+
217
+ ```bash
218
+ pytest -q
219
+ ```
220
+
221
+ Current status: **562 passed, 0 failed** on this public branch (no external substrate dependencies).
222
+
223
+ ## License
224
+
225
+ MIT
banna-0.1.0/README.md ADDED
@@ -0,0 +1,181 @@
1
+ # banna
2
+
3
+ A from-scratch, provider-agnostic reasoning agent with a **typed state substrate** and a **verifier-guided** loop. Built to study where ReAct-style agents fail on the **GAIA** benchmark and to fix those failures structurally — not with prompt patches.
4
+
5
+ No LangChain, no LlamaIndex, no smolagents in the core. The reasoning loop is a typed transition function over `(state, action, observation) → state'`; ReAct, verifier-retry, planner-ReAct, BFS/DFS/best-first-over-plans, and best-of-N are each ~200 LOC `Policy` implementations over that same substrate.
6
+
7
+ ## What's interesting about this repo
8
+
9
+ 1. **Forensic GAIA debugging.** A full-validation run on `gpt-5-nano` was instrumented end-to-end, traces were dumped per task, and seven distinct structural failure modes were diagnosed and fixed — not by prompt-tweaking, but by changing the loop. See [Failure modes & fixes](#failure-modes--fixes-the-c1c6-pass) below.
10
+ 2. **Multi-axis budget tracker** that separates *productive* steps from *repair* steps. A model stuck in an `[empty_reply]` loop no longer burns its productive-step budget; instead it trips a separate `max_repair_steps` axis with a forced tool-choice escape.
11
+ 3. **Per-verifier actionable nudges.** Each verifier (Arithmetic, Citation, Coverage, Format) attaches a `meta["nudge"]` to its fail verdicts that names the missing thing (the recomputed value, the missing evidence_id, the unsupported number, the empty field). The retry policy groups these by verifier and emits one line per kind — short enough that the model actually reads them.
12
+ 4. **Budget-exhaustion synthesis.** When the agent runs out of steps mid-task, instead of returning `null`, a final forced-`final_answer` call gives it one last shot with a cheap fallback chain (last claim → last short text → none).
13
+ 5. **Provider-agnostic tool forcing.** A single helper translates "force any tool" into OpenAI's `tool_choice: "required"`, Anthropic's `{type: "any"}`, and Gemini's `ANY` mode — used to break out of empty-reply loops.
14
+
15
+ ## Architecture
16
+
17
+ ```mermaid
18
+ flowchart TB
19
+ Q[User question<br/>+ optional attachment] --> AG[Agent.run_policy]
20
+
21
+ subgraph SUB[banna_agent runtime]
22
+ AG --> ST[AgentState<br/>typed: Trace · Evidence · Claim · Budget]
23
+ ST -->|propose| POL{Policy}
24
+
25
+ POL --> REACT[ReActPolicy]
26
+ POL --> VR[VerifierRetryPolicy<br/>wraps inner]
27
+ POL --> PL[PlannerReActPolicy]
28
+ POL --> BFS[BFS / DFS / Best-First<br/>over plans]
29
+ POL --> BON[BestOfNPolicy<br/>K trajectories + selector]
30
+
31
+ REACT -->|Action| EX[Execute]
32
+ VR -->|FINAL_ANSWER| VERS{Verifiers}
33
+ VERS -- pass --> COMMIT[commit]
34
+ VERS -- fail --> NUDGE[per-verifier nudge<br/>→ THINK feedback]
35
+ NUDGE --> POL
36
+
37
+ EX --> TOOLS[ToolRegistry]
38
+ TOOLS --> T1[search]
39
+ TOOLS --> T2[read_url]
40
+ TOOLS --> T3[read_file<br/>+ pdf / xlsx tools]
41
+ TOOLS --> T4[python_sandbox]
42
+ TOOLS --> T5[calculator]
43
+ TOOLS --> T6[run_shell · grep · list_files]
44
+
45
+ EX -->|observation| ST
46
+ ST --> BUD[BudgetTracker<br/>steps · repair_steps · wall · tokens · cost]
47
+ BUD -. trip .-> SYN[synthesize_on_exhaustion<br/>forced final_answer]
48
+ SYN --> COMMIT
49
+ end
50
+
51
+ subgraph VERIFIERS[Verifiers]
52
+ VF[FormatVerifier<br/>shape / empty answer]
53
+ VA[ArithmeticVerifier<br/>safe-AST recompute]
54
+ VC[CitationVerifier<br/>numeric + Jaccard support]
55
+ VG[CoverageVerifier<br/>claim ↔ evidence]
56
+ end
57
+ VERS --- VF
58
+ VERS --- VA
59
+ VERS --- VC
60
+ VERS --- VG
61
+
62
+ COMMIT --> ANS[answer]
63
+ ```
64
+
65
+ ## Install
66
+
67
+ ```bash
68
+ # 1. From PyPI (once published)
69
+ pip install banna
70
+
71
+ # 2. From GitHub directly (no clone, no PyPI required)
72
+ pip install git+https://github.com/siavashmonfared/banna.git
73
+
74
+ # 3. Isolated install with pipx (recommended for CLI use)
75
+ pipx install git+https://github.com/siavashmonfared/banna.git
76
+
77
+ # 4. From a local clone (for development)
78
+ git clone https://github.com/siavashmonfared/banna.git
79
+ cd banna
80
+ pip install -e ".[dev]"
81
+ ```
82
+
83
+ Any install path drops a `banna` (and `banna-agent`) executable on your `$PATH`.
84
+
85
+ ## Quickstart
86
+
87
+ ```bash
88
+ # set at least one provider key
89
+ export OPENAI_API_KEY=sk-... # or ANTHROPIC_API_KEY=... / GEMINI_API_KEY=...
90
+
91
+ # open the interactive REPL
92
+ banna --policy verifier_retry --provider openai --model gpt-5-nano
93
+
94
+ # or run a single GAIA Level-1 question (no REPL)
95
+ python -m banna_agent.benchmarks.gaia.runner \
96
+ --policy verifier_retry --provider openai --model gpt-5-nano \
97
+ --level 1 --n 1
98
+ ```
99
+
100
+ ### Example REPL session
101
+
102
+ ```
103
+ $ banna --policy verifier_retry --provider openai --model gpt-5-nano
104
+
105
+ ● banna · v0.1.0 provider=openai model=gpt-5-nano policy=verifier_retry
106
+
107
+ > How many studio albums did Mercedes Sosa release between 2000 and 2009?
108
+
109
+ thinking…
110
+ ▸ search(query="Mercedes Sosa discography studio albums 2000-2009")
111
+ ↳ 8 results · evidence_id ev_a3f
112
+ ▸ read_url(url="https://en.wikipedia.org/wiki/Mercedes_Sosa")
113
+ ↳ 12.4 kB · evidence_id ev_91c
114
+ thinking…
115
+ ▸ final_answer(answer="3", evidence_ids=["ev_a3f", "ev_91c"])
116
+ verifiers: format ✓ citation ✓ coverage ✓ arithmetic skip
117
+
118
+ ● banna
119
+ 3
120
+
121
+ 3 steps · 4.7s · 1840→210 tok · $0.0021
122
+
123
+ > /show trace
124
+ …step-by-step dump of action + observation + meta…
125
+
126
+ > /exit
127
+ ```
128
+
129
+ The full GAIA validation runner (165 questions across L1/L2/L3) is in `experiments/02_gaia_full/run.py`.
130
+
131
+ ## Failure modes & fixes (the C1–C6 pass)
132
+
133
+ Diagnosed from a full GAIA validation run on `gpt-5-nano`. Each fix lands as a structural change to the loop, with unit tests pinning the new behavior.
134
+
135
+ | ID | Failure mode | Root cause | Fix |
136
+ |----|--------------|------------|-----|
137
+ | C1 | `[empty_reply]` loops eat the step budget | Repair-style THINKs counted as productive steps | New `Budget.repair_steps_used` axis + `max_repair_steps=6`; `meta["repair"]=True` routes off the main counter |
138
+ | C2 | Model returns empty content + no tool call | No detection / no escape | After 2 consecutive empties with no evidence, force `tool_choice` to any tool (provider-agnostic) |
139
+ | C3 | `pred_answer=null` on budget exhaustion | Loop exits with no commit | `policy.synthesize_on_exhaustion(state)`: one threaded LLM call with forced `final_answer` + cheap fallback chain |
140
+ | C4 | L1 step cap too tight (8 steps) | Default budget profile | L1/L2/L3 caps bumped to 12/18/24 |
141
+ | C5 | Rich file tools never used on attachments | Hint steered model toward `read_file` even for PDF/XLSX | Extension-routed `_file_hint()` + cheap `_file_summary()` (pypdf page count, openpyxl sheet names, CSV header) |
142
+ | C6 | Verifier retries have a low repair rate | Feedback was generic | Each verifier populates `meta["nudge"]` with a verifier-specific actionable instruction; retry feedback groups by verifier |
143
+
144
+ ## GAIA validation results
145
+
146
+ > The fixes have landed and are tested (562 tests passing in this public repo, 568 in the private superset). The post-fix re-run on GAIA validation is next in the queue.
147
+
148
+ | Run | Provider · model | Policy | Set | Accuracy | Notes |
149
+ |-----|------------------|--------|-----|----------|-------|
150
+ | Pre-C1–C6 | OpenAI · `gpt-5-nano` | verifier_retry | GAIA val (165 Q) | **33.9 %** (56 / 165) | $0.94, 7 structural bugs surfaced |
151
+ | Post-C1–C6 | OpenAI · `gpt-5-nano` | verifier_retry | GAIA val (165 Q) | _re-run pending_ | target ≥ 40 % |
152
+ | Post-C1–C6 | OpenAI · `gpt-5-nano` | best_of_n (K=3) | GAIA val (165 Q) | _re-run pending_ | stretch ≥ 45 % |
153
+
154
+ (Numbers populate once the re-run completes; CI is wired to gate on a held-out smoke subset.)
155
+
156
+ ## Repo layout
157
+
158
+ ```
159
+ src/banna_agent/
160
+ ├── core/ AgentState, Trace, Action, Budget, EventLog, run_policy
161
+ ├── llm/ provider-agnostic LLMClient + adapters (anthropic, openai, gemini, ollama)
162
+ ├── tools/ search, read_url, read_file, pdf/xlsx tools, python_sandbox,
163
+ │ calculator, run_shell, grep, list_files, plan, memory, final_answer
164
+ ├── policies/ react, planner_react, verifier_retry, bfs/dfs/best_first_over_plans, best_of_n
165
+ ├── verifiers/ arithmetic, citation, coverage, format, command (+ base protocol)
166
+ ├── benchmarks/ gaia/ (loader, runner, scorer, report)
167
+ ├── memory/ in_memory_store, jsonl_store, chroma_store, skill_library, skill_harvester, compactor, embeddings
168
+ └── cli/ Rich-based REPL: /policy /budget /show /skills /compact /save /load …
169
+ ```
170
+
171
+ Tests live in `tests/` and are organized to mirror `src/`. Run them with:
172
+
173
+ ```bash
174
+ pytest -q
175
+ ```
176
+
177
+ Current status: **562 passed, 0 failed** on this public branch (no external substrate dependencies).
178
+
179
+ ## License
180
+
181
+ MIT
banna-0.1.0/pyproject.toml ADDED
@@ -0,0 +1,80 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "banna"
7
+ version = "0.1.0"
8
+ description = "A from-scratch, provider-agnostic reasoning agent with a typed state substrate and verifier-guided search. Primary benchmark: GAIA."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ authors = [{ name = "Siavash Monfared", email = "monfared@alum.mit.edu" }]
12
+ license = { text = "MIT" }
13
+ keywords = ["agent", "llm", "react", "verifier", "gaia", "reasoning", "anthropic", "openai", "gemini"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Developers",
17
+ "Intended Audience :: Science/Research",
18
+ "License :: OSI Approved :: MIT License",
19
+ "Operating System :: POSIX :: Linux",
20
+ "Operating System :: MacOS",
21
+ "Programming Language :: Python :: 3",
22
+ "Programming Language :: Python :: 3.10",
23
+ "Programming Language :: Python :: 3.11",
24
+ "Programming Language :: Python :: 3.12",
25
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
26
+ ]
27
+
28
+ dependencies = [
29
+ "anthropic>=0.40",
30
+ "openai>=1.50",
31
+ "google-generativeai>=0.8",
32
+ "requests>=2.31",
33
+ "pydantic>=2.6",
34
+ "datasets>=2.19",
35
+ "pyyaml>=6.0",
36
+ "beautifulsoup4>=4.12",
37
+ "pypdf>=4.0",
38
+ "openpyxl>=3.1",
39
+ "pandas>=2.2",
40
+ "pillow>=10.0",
41
+ "rich>=13.7",
42
+ ]
43
+
44
+ [project.urls]
45
+ Homepage = "https://github.com/siavashmonfared/banna"
46
+ Repository = "https://github.com/siavashmonfared/banna"
47
+ Issues = "https://github.com/siavashmonfared/banna/issues"
48
+
49
+ [project.optional-dependencies]
50
+ dev = [
51
+ "pytest>=8.0",
52
+ "pytest-cov>=5.0",
53
+ "ruff>=0.5",
54
+ "mypy>=1.10",
55
+ ]
56
+ # Optional PDF-tables support. pypdf (core dep) handles text extraction;
57
+ # pdfplumber adds table-structure parsing for GAIA tasks where the
58
+ # answer lives in a PDF table. Install with `pip install ".[pdf]"`.
59
+ pdf = [
60
+ "pdfplumber>=0.11",
61
+ ]
62
+
63
+ [project.scripts]
64
+ banna = "banna_agent.cli:main"
65
+ banna-agent = "banna_agent.cli:main"
66
+
67
+ [tool.setuptools.packages.find]
68
+ where = ["src"]
69
+
70
+ [tool.ruff]
71
+ line-length = 100
72
+ target-version = "py310"
73
+
74
+ [tool.ruff.lint]
75
+ select = ["E", "F", "W", "I", "UP", "B", "SIM", "TID", "RUF"]
76
+ ignore = ["E501"]
77
+
78
+ [tool.pytest.ini_options]
79
+ testpaths = ["tests"]
80
+ addopts = "-ra --strict-markers"
banna-0.1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+