baar-core 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- baar_core-0.1.0/LICENSE +21 -0
- baar_core-0.1.0/PKG-INFO +157 -0
- baar_core-0.1.0/README.md +129 -0
- baar_core-0.1.0/baar/__init__.py +22 -0
- baar_core-0.1.0/baar/__main__.py +16 -0
- baar_core-0.1.0/baar/core/__init__.py +3 -0
- baar_core-0.1.0/baar/core/budget.py +153 -0
- baar_core-0.1.0/baar/core/models.py +163 -0
- baar_core-0.1.0/baar/core/router.py +199 -0
- baar_core-0.1.0/baar/router.py +167 -0
- baar_core-0.1.0/baar_core.egg-info/PKG-INFO +157 -0
- baar_core-0.1.0/baar_core.egg-info/SOURCES.txt +21 -0
- baar_core-0.1.0/baar_core.egg-info/dependency_links.txt +1 -0
- baar_core-0.1.0/baar_core.egg-info/entry_points.txt +3 -0
- baar_core-0.1.0/baar_core.egg-info/requires.txt +8 -0
- baar_core-0.1.0/baar_core.egg-info/top_level.txt +1 -0
- baar_core-0.1.0/pyproject.toml +56 -0
- baar_core-0.1.0/setup.cfg +4 -0
- baar_core-0.1.0/tests/test_budget.py +165 -0
- baar_core-0.1.0/tests/test_models.py +207 -0
- baar_core-0.1.0/tests/test_resilience.py +59 -0
- baar_core-0.1.0/tests/test_router.py +224 -0
- baar_core-0.1.0/tests/test_router_integration.py +260 -0
baar_core-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 BAAR-Algo Authors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
baar_core-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: baar-core
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Budget-Aware Agentic Routing — route LLM calls intelligently between cheap and powerful models with a hard budget cap.
|
|
5
|
+
License: MIT
|
|
6
|
+
Project-URL: Homepage, https://github.com/orvi2014/Baar-Core
|
|
7
|
+
Project-URL: Issues, https://github.com/orvi2014/Baar-Core/issues
|
|
8
|
+
Keywords: llm,agents,routing,budget,cost,openai,langchain
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: litellm>=1.30.0
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
23
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
24
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == "dev"
|
|
25
|
+
Requires-Dist: datasets>=2.15.0; extra == "dev"
|
|
26
|
+
Requires-Dist: huggingface_hub>=0.19.0; extra == "dev"
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# baar-core (BAAR-Algo)
|
|
30
|
+
|
|
31
|
+
[](https://badge.fury.io/py/baar-core)
|
|
32
|
+
[](https://opensource.org/licenses/MIT)
|
|
33
|
+
[](https://www.python.org/downloads/)
|
|
34
|
+
[](https://github.com/orvi2014/Baar-Core/blob/main/RESEARCH.md)
|
|
35
|
+
|
|
36
|
+
**Route LLM calls intelligently between cheap and powerful models — with a hard financial kill-switch that never breaks.**
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## 🚀 Why BAAR?
|
|
41
|
+
|
|
42
|
+
Every agent developer using GPT-4o has seen this:
|
|
43
|
+
- **Simple task** → sent to GPT-4o anyway → **15× more expensive** than necessary.
|
|
44
|
+
- **Budget set to $0.10** → agent burns $0.40 → **surprise invoice**.
|
|
45
|
+
- **No visibility** into which agent step cost what, or why.
|
|
46
|
+
|
|
47
|
+
**BAAR (Budget-Aware Agentic Routing)** solves this at the protocol level.
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## 🧠 How it Works
|
|
52
|
+
|
|
53
|
+
BAAR acts as a semantic gateway between your application and the LLM providers.
|
|
54
|
+
|
|
55
|
+
```mermaid
|
|
56
|
+
graph TD
|
|
57
|
+
A[Your Task] --> B{Semantic Router}
|
|
58
|
+
B -- Complexity < 0.65 --> C[gpt-4o-mini]
|
|
59
|
+
B -- Complexity >= 0.65 --> D[Budget Kill-Switch]
|
|
60
|
+
|
|
61
|
+
D -- "Affordable?" --> E[gpt-4o]
|
|
62
|
+
D -- "Too Expensive" --> F[Force Downgrade to Mini]
|
|
63
|
+
|
|
64
|
+
E --> G[Audit & Spend Tracking]
|
|
65
|
+
F --> G
|
|
66
|
+
C --> G
|
|
67
|
+
|
|
68
|
+
G --> H[Final Response]
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
1. **Semantic Scoring**: Uses a cheap model to score task complexity (0.0–1.0).
|
|
72
|
+
2. **BCD (Budget-Constrained Decoding)**: If the powerful model is too expensive for your remaining budget, BAAR **automatically downgrades** to a cheaper one to ensure the task completes without an overage.
|
|
73
|
+
3. **Local Rejection**: If even the cheapest model exceeds the budget, the request is rejected **locally** with zero network cost.
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## 🔬 Benchmarking Results
|
|
78
|
+
|
|
79
|
+
To ensure frontier-grade quality, BAAR-Algo is validated on industry-standard datasets.
|
|
80
|
+
|
|
81
|
+
| Dataset | Strategy | Accuracy % | Cost (USD) | Savings vs BIG |
|
|
82
|
+
| :--- | :--- | :---: | :---: | :---: |
|
|
83
|
+
| **MMLU** | ALWAYS-BIG | 100.0% | $0.0905 | - |
|
|
84
|
+
| (Knowledge) | **BAAR-Algo** | **70.0%** | **$0.0050** | **93.3%** |
|
|
85
|
+
| **GSM8K** | ALWAYS-BIG | 100.0% | $0.0905 | - |
|
|
86
|
+
| (Math) | **BAAR-Algo** | **80.0%** | **$0.0050** | **93.3%** |
|
|
87
|
+
| **HumanEval** | ALWAYS-BIG | 100.0% | $0.0105 | - |
|
|
88
|
+
| (Coding) | **BAAR-Algo** | **100.0%** | **$0.0105** | **0.0%*** |
|
|
89
|
+
|
|
90
|
+
*\*On HumanEval, BAAR correctly detects 100% complexity and uses the Big model, ensuring zero quality loss for critical code.*
|
|
91
|
+
|
|
92
|
+
### Run the Benchmark Yourself (Free)
|
|
93
|
+
```bash
|
|
94
|
+
baar-bench --dataset all --mock
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
## 📦 Installation
|
|
100
|
+
|
|
101
|
+
```bash
|
|
102
|
+
pip install baar-core
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## ⚡ Quick Start
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from baar import BAARRouter
|
|
109
|
+
|
|
110
|
+
# Set a hard $0.10 budget cap
|
|
111
|
+
router = BAARRouter(budget=0.10)
|
|
112
|
+
|
|
113
|
+
# This will be routed to gpt-4o-mini (Complexity ~0.1)
|
|
114
|
+
response = router.chat("What is the capital of France?")
|
|
115
|
+
|
|
116
|
+
# This will be routed to gpt-4o (Complexity ~0.9)
|
|
117
|
+
code = router.chat("Write a complex matrix multiplication in CUDA.")
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## 🛡️ Resilience & Security
|
|
123
|
+
|
|
124
|
+
BAAR is designed for **Financial Safety** (Anti-Denial of Wallet).
|
|
125
|
+
|
|
126
|
+
| Attack Vector | BAAR Response | Proof |
|
|
127
|
+
| :--- | :--- | :--- |
|
|
128
|
+
| **Unbounded Consumption** | Zero-Call Rejection | Blocks request locally with **Zero** network calls. |
|
|
129
|
+
| **Complexity Inflation** | Semantic Scoring | Ignores gibberish/padding intended to drain budget. |
|
|
130
|
+
| **Sensitivity Toggling** | Tunable Threshold | Adjust `complexity_threshold` to match your quality needs. |
|
|
131
|
+
|
|
132
|
+
Verify resilience locally:
|
|
133
|
+
```bash
|
|
134
|
+
baar-stress
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## 🛠️ Configuration
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
router = BAARRouter(
|
|
143
|
+
budget=0.10, # Hard cap in USD
|
|
144
|
+
small_model="gpt-4o-mini", # Cheap model
|
|
145
|
+
big_model="gpt-4o", # Powerful model
|
|
146
|
+
complexity_threshold=0.65, # 0–1: above this → use big model
|
|
147
|
+
)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## 📄 License & Research
|
|
153
|
+
|
|
154
|
+
Distributed under the **MIT License**. See [LICENSE](https://github.com/orvi2014/Baar-Core/blob/main/LICENSE) for more information.
|
|
155
|
+
|
|
156
|
+
For architectural details and mapping to the **OWASP LLM10** security framework, see [RESEARCH.md](https://github.com/orvi2014/Baar-Core/blob/main/RESEARCH.md).
|
|
157
|
+
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# baar-core (BAAR-Algo)
|
|
2
|
+
|
|
3
|
+
[](https://badge.fury.io/py/baar-core)
|
|
4
|
+
[](https://opensource.org/licenses/MIT)
|
|
5
|
+
[](https://www.python.org/downloads/)
|
|
6
|
+
[](https://github.com/orvi2014/Baar-Core/blob/main/RESEARCH.md)
|
|
7
|
+
|
|
8
|
+
**Route LLM calls intelligently between cheap and powerful models — with a hard financial kill-switch that never breaks.**
|
|
9
|
+
|
|
10
|
+
---
|
|
11
|
+
|
|
12
|
+
## 🚀 Why BAAR?
|
|
13
|
+
|
|
14
|
+
Every agent developer using GPT-4o has seen this:
|
|
15
|
+
- **Simple task** → sent to GPT-4o anyway → **15× more expensive** than necessary.
|
|
16
|
+
- **Budget set to $0.10** → agent burns $0.40 → **surprise invoice**.
|
|
17
|
+
- **No visibility** into which agent step cost what, or why.
|
|
18
|
+
|
|
19
|
+
**BAAR (Budget-Aware Agentic Routing)** solves this at the protocol level.
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## 🧠 How it Works
|
|
24
|
+
|
|
25
|
+
BAAR acts as a semantic gateway between your application and the LLM providers.
|
|
26
|
+
|
|
27
|
+
```mermaid
|
|
28
|
+
graph TD
|
|
29
|
+
A[Your Task] --> B{Semantic Router}
|
|
30
|
+
B -- Complexity < 0.65 --> C[gpt-4o-mini]
|
|
31
|
+
B -- Complexity >= 0.65 --> D[Budget Kill-Switch]
|
|
32
|
+
|
|
33
|
+
D -- "Affordable?" --> E[gpt-4o]
|
|
34
|
+
D -- "Too Expensive" --> F[Force Downgrade to Mini]
|
|
35
|
+
|
|
36
|
+
E --> G[Audit & Spend Tracking]
|
|
37
|
+
F --> G
|
|
38
|
+
C --> G
|
|
39
|
+
|
|
40
|
+
G --> H[Final Response]
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
1. **Semantic Scoring**: Uses a cheap model to score task complexity (0.0–1.0).
|
|
44
|
+
2. **BCD (Budget-Constrained Decoding)**: If the powerful model is too expensive for your remaining budget, BAAR **automatically downgrades** to a cheaper one to ensure the task completes without an overage.
|
|
45
|
+
3. **Local Rejection**: If even the cheapest model exceeds the budget, the request is rejected **locally** with zero network cost.
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## 🔬 Benchmarking Results
|
|
50
|
+
|
|
51
|
+
To ensure frontier-grade quality, BAAR-Algo is validated on industry-standard datasets.
|
|
52
|
+
|
|
53
|
+
| Dataset | Strategy | Accuracy % | Cost (USD) | Savings vs BIG |
|
|
54
|
+
| :--- | :--- | :---: | :---: | :---: |
|
|
55
|
+
| **MMLU** | ALWAYS-BIG | 100.0% | $0.0905 | - |
|
|
56
|
+
| (Knowledge) | **BAAR-Algo** | **70.0%** | **$0.0050** | **93.3%** |
|
|
57
|
+
| **GSM8K** | ALWAYS-BIG | 100.0% | $0.0905 | - |
|
|
58
|
+
| (Math) | **BAAR-Algo** | **80.0%** | **$0.0050** | **93.3%** |
|
|
59
|
+
| **HumanEval** | ALWAYS-BIG | 100.0% | $0.0105 | - |
|
|
60
|
+
| (Coding) | **BAAR-Algo** | **100.0%** | **$0.0105** | **0.0%*** |
|
|
61
|
+
|
|
62
|
+
*\*On HumanEval, BAAR correctly detects 100% complexity and uses the Big model, ensuring zero quality loss for critical code.*
|
|
63
|
+
|
|
64
|
+
### Run the Benchmark Yourself (Free)
|
|
65
|
+
```bash
|
|
66
|
+
baar-bench --dataset all --mock
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## 📦 Installation
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
pip install baar-core
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## ⚡ Quick Start
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from baar import BAARRouter
|
|
81
|
+
|
|
82
|
+
# Set a hard $0.10 budget cap
|
|
83
|
+
router = BAARRouter(budget=0.10)
|
|
84
|
+
|
|
85
|
+
# This will be routed to gpt-4o-mini (Complexity ~0.1)
|
|
86
|
+
response = router.chat("What is the capital of France?")
|
|
87
|
+
|
|
88
|
+
# This will be routed to gpt-4o (Complexity ~0.9)
|
|
89
|
+
code = router.chat("Write a complex matrix multiplication in CUDA.")
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## 🛡️ Resilience & Security
|
|
95
|
+
|
|
96
|
+
BAAR is designed for **Financial Safety** (Anti-Denial of Wallet).
|
|
97
|
+
|
|
98
|
+
| Attack Vector | BAAR Response | Proof |
|
|
99
|
+
| :--- | :--- | :--- |
|
|
100
|
+
| **Unbounded Consumption** | Zero-Call Rejection | Blocks request locally with **Zero** network calls. |
|
|
101
|
+
| **Complexity Inflation** | Semantic Scoring | Ignores gibberish/padding intended to drain budget. |
|
|
102
|
+
| **Sensitivity Toggling** | Tunable Threshold | Adjust `complexity_threshold` to match your quality needs. |
|
|
103
|
+
|
|
104
|
+
Verify resilience locally:
|
|
105
|
+
```bash
|
|
106
|
+
baar-stress
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## 🛠️ Configuration
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
router = BAARRouter(
|
|
115
|
+
budget=0.10, # Hard cap in USD
|
|
116
|
+
small_model="gpt-4o-mini", # Cheap model
|
|
117
|
+
big_model="gpt-4o", # Powerful model
|
|
118
|
+
complexity_threshold=0.65, # 0–1: above this → use big model
|
|
119
|
+
)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## 📄 License & Research
|
|
125
|
+
|
|
126
|
+
Distributed under the **MIT License**. See [LICENSE](https://github.com/orvi2014/Baar-Core/blob/main/LICENSE) for more information.
|
|
127
|
+
|
|
128
|
+
For architectural details and mapping to the **OWASP LLM10** security framework, see [RESEARCH.md](https://github.com/orvi2014/Baar-Core/blob/main/RESEARCH.md).
|
|
129
|
+
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""
|
|
2
|
+
baar — Budget-Aware Agentic Routing.
|
|
3
|
+
Public API for the BAAR-Algo project.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from baar.router import BAARRouter, token_counter
|
|
7
|
+
from baar.core.budget import BudgetExceeded, BudgetTracker
|
|
8
|
+
from baar.core.router import Router, ModelTier, RoutingDecision
|
|
9
|
+
from baar.core.models import StepResult, RoutingLog
|
|
10
|
+
|
|
11
|
+
__version__ = "0.1.0"
|
|
12
|
+
__all__ = [
|
|
13
|
+
"BAARRouter",
|
|
14
|
+
"token_counter",
|
|
15
|
+
"BudgetExceeded",
|
|
16
|
+
"BudgetTracker",
|
|
17
|
+
"Router",
|
|
18
|
+
"ModelTier",
|
|
19
|
+
"RoutingDecision",
|
|
20
|
+
"StepResult",
|
|
21
|
+
"RoutingLog",
|
|
22
|
+
]
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""
|
|
2
|
+
baar/__main__.py — CLI entry point for 'python -m baar'.
|
|
3
|
+
"""
|
|
4
|
+
import sys
|
|
5
|
+
from baar import __version__
|
|
6
|
+
|
|
7
|
+
def main():
    """Entry point for ``python -m baar``: print the version banner and CLI usage."""
    banner = (
        f"BAAR-Algo (baar-core) v{__version__}",
        "\nBudget-Aware Agentic Routing — Intelligent LLM Model Selection.",
        "\nUsage:",
        " baar-bench : Run the scientific validation suite",
        " baar-stress : Run the adversarial resilience suite",
        "\nSee README.md for more information.",
    )
    for line in banner:
        print(line)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Budget tracker — real token-based cost using LiteLLM pricing.
|
|
3
|
+
This is the financial source of truth for the entire system.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Optional
|
|
8
|
+
from litellm import completion_cost, cost_per_token
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BudgetExceeded(Exception):
    """Raised when a model call would exceed the remaining budget."""

    def __init__(self, requested: float, remaining: float, model: str):
        message = (
            f"Budget exceeded: model '{model}' would cost ~${requested:.6f} "
            f"but only ${remaining:.6f} remains."
        )
        super().__init__(message)
        # Keep the raw numbers so callers can react programmatically.
        self.requested = requested
        self.remaining = remaining
        self.model = model
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class SpendRecord:
    """One ledger entry: the actual spend of a single completed model call."""

    step: int  # 1-based call counter within this tracker's session
    model: str  # model name the call was billed against
    prompt_tokens: int  # input tokens reported by the provider
    completion_tokens: int  # output tokens reported by the provider
    cost: float  # USD cost of this call alone
    cumulative_cost: float  # running USD total including this call
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
|
|
35
|
+
class BudgetTracker:
|
|
36
|
+
"""
|
|
37
|
+
Tracks real spend using LiteLLM's live pricing data.
|
|
38
|
+
Never uses hardcoded costs — always derives from actual token counts.
|
|
39
|
+
"""
|
|
40
|
+
|
|
41
|
+
total_budget: float
|
|
42
|
+
_spent: float = field(default=0.0, init=False)
|
|
43
|
+
_records: list = field(default_factory=list, init=False)
|
|
44
|
+
_step: int = field(default=0, init=False)
|
|
45
|
+
|
|
46
|
+
@property
|
|
47
|
+
def spent(self) -> float:
|
|
48
|
+
return self._spent
|
|
49
|
+
|
|
50
|
+
@property
|
|
51
|
+
def remaining(self) -> float:
|
|
52
|
+
return max(0.0, self.total_budget - self._spent)
|
|
53
|
+
|
|
54
|
+
@property
|
|
55
|
+
def utilization(self) -> float:
|
|
56
|
+
"""0.0 → 1.0 showing how much budget has been consumed."""
|
|
57
|
+
if self.total_budget <= 0:
|
|
58
|
+
return 1.0
|
|
59
|
+
return min(1.0, self._spent / self.total_budget)
|
|
60
|
+
|
|
61
|
+
def cost_from_response(self, response) -> float:
|
|
62
|
+
"""
|
|
63
|
+
Extract real cost from a LiteLLM completion response.
|
|
64
|
+
Uses completion_cost() which reads live pricing from LiteLLM's
|
|
65
|
+
model_prices_and_context_window.json — no hardcoding.
|
|
66
|
+
"""
|
|
67
|
+
try:
|
|
68
|
+
cost = completion_cost(completion_response=response)
|
|
69
|
+
return float(cost)
|
|
70
|
+
except Exception:
|
|
71
|
+
# Fallback: manual calc from usage if completion_cost fails
|
|
72
|
+
usage = getattr(response, "usage", None)
|
|
73
|
+
if usage:
|
|
74
|
+
model = response.model or "gpt-4o-mini"
|
|
75
|
+
try:
|
|
76
|
+
in_cost, out_cost = cost_per_token(
|
|
77
|
+
model=model,
|
|
78
|
+
prompt_tokens=usage.prompt_tokens,
|
|
79
|
+
completion_tokens=usage.completion_tokens,
|
|
80
|
+
)
|
|
81
|
+
return float(in_cost + out_cost)
|
|
82
|
+
except Exception:
|
|
83
|
+
pass
|
|
84
|
+
return 0.0
|
|
85
|
+
|
|
86
|
+
def estimate_cost(self, model: str, prompt_tokens: int, completion_tokens: int = 200) -> float:
|
|
87
|
+
"""
|
|
88
|
+
Pre-flight cost estimate before making a call.
|
|
89
|
+
Uses cost_per_token with estimated output token count.
|
|
90
|
+
"""
|
|
91
|
+
try:
|
|
92
|
+
in_cost, out_cost = cost_per_token(
|
|
93
|
+
model=model,
|
|
94
|
+
prompt_tokens=prompt_tokens,
|
|
95
|
+
completion_tokens=completion_tokens,
|
|
96
|
+
)
|
|
97
|
+
return float(in_cost + out_cost)
|
|
98
|
+
except Exception:
|
|
99
|
+
return 0.0
|
|
100
|
+
|
|
101
|
+
def check_affordability(self, model: str, prompt_tokens: int) -> None:
|
|
102
|
+
"""
|
|
103
|
+
Hard budget constraint (BCD — Budget-Constrained Decoding).
|
|
104
|
+
Raises BudgetExceeded before the call is ever made.
|
|
105
|
+
"""
|
|
106
|
+
estimated = self.estimate_cost(model, prompt_tokens)
|
|
107
|
+
if estimated > self.remaining:
|
|
108
|
+
raise BudgetExceeded(
|
|
109
|
+
requested=estimated,
|
|
110
|
+
remaining=self.remaining,
|
|
111
|
+
model=model,
|
|
112
|
+
)
|
|
113
|
+
|
|
114
|
+
def record(self, response, model: str) -> SpendRecord:
|
|
115
|
+
"""Record actual spend after a successful call."""
|
|
116
|
+
self._step += 1
|
|
117
|
+
cost = self.cost_from_response(response)
|
|
118
|
+
self._spent += cost
|
|
119
|
+
|
|
120
|
+
usage = getattr(response, "usage", None)
|
|
121
|
+
record = SpendRecord(
|
|
122
|
+
step=self._step,
|
|
123
|
+
model=model,
|
|
124
|
+
prompt_tokens=getattr(usage, "prompt_tokens", 0),
|
|
125
|
+
completion_tokens=getattr(usage, "completion_tokens", 0),
|
|
126
|
+
cost=cost,
|
|
127
|
+
cumulative_cost=self._spent,
|
|
128
|
+
)
|
|
129
|
+
self._records.append(record)
|
|
130
|
+
return record
|
|
131
|
+
|
|
132
|
+
@property
|
|
133
|
+
def records(self) -> list:
|
|
134
|
+
return list(self._records)
|
|
135
|
+
|
|
136
|
+
def summary(self) -> dict:
|
|
137
|
+
return {
|
|
138
|
+
"total_budget": self.total_budget,
|
|
139
|
+
"spent": round(self._spent, 8),
|
|
140
|
+
"remaining": round(self.remaining, 8),
|
|
141
|
+
"utilization_pct": round(self.utilization * 100, 2),
|
|
142
|
+
"steps": self._step,
|
|
143
|
+
"records": [
|
|
144
|
+
{
|
|
145
|
+
"step": r.step,
|
|
146
|
+
"model": r.model,
|
|
147
|
+
"tokens": r.prompt_tokens + r.completion_tokens,
|
|
148
|
+
"cost": round(r.cost, 8),
|
|
149
|
+
"cumulative": round(r.cumulative_cost, 8),
|
|
150
|
+
}
|
|
151
|
+
for r in self._records
|
|
152
|
+
],
|
|
153
|
+
}
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data models for step results and routing logs.
|
|
3
|
+
Every decision is recorded — this is what devs show in benchmarks.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Optional, List
|
|
8
|
+
from baar.core.router import RoutingDecision, ModelTier
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass
class StepResult:
    """Result of a single routed LLM call."""

    step_num: int
    task: str
    decision: RoutingDecision
    response_text: str
    cost: float
    cumulative_cost: float
    prompt_tokens: int
    completion_tokens: int
    latency_ms: float

    @property
    def model_used(self) -> str:
        """Name of the model the router actually called."""
        return self.decision.model

    @property
    def used_big(self) -> bool:
        """True when the call was escalated to the big-model tier."""
        return self.decision.tier == ModelTier.BIG

    def to_dict(self) -> dict:
        """Serialize this step for reports/JSON; task text is truncated to 80 chars."""
        preview = self.task[:80]
        if len(self.task) > 80:
            preview += "..."
        return {
            "step": self.step_num,
            "task_preview": preview,
            "model": self.model_used,
            "tier": self.decision.tier.value,
            "complexity_score": self.decision.complexity_score,
            "confidence": self.decision.confidence,
            "routing_reason": self.decision.reason,
            "forced_by_budget": self.decision.forced_by_budget,
            "prompt_tokens": self.prompt_tokens,
            "completion_tokens": self.completion_tokens,
            "cost_usd": round(self.cost, 8),
            "cumulative_cost_usd": round(self.cumulative_cost, 8),
            "latency_ms": round(self.latency_ms, 1),
        }
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class RoutingLog:
    """
    Full audit trail of a BAAR session.
    This is what you show developers in the benchmark report.
    """

    budget: float
    small_model: str
    big_model: str
    steps: List["StepResult"] = field(default_factory=list)

    # Heuristic price ratio between the big and small tiers
    # (gpt-4o is ~15x more expensive than gpt-4o-mini per token).
    BIG_TO_SMALL_COST_RATIO = 15.0

    def add(self, step: "StepResult") -> None:
        """Append one completed step to the audit trail."""
        self.steps.append(step)

    @property
    def total_cost(self) -> float:
        """Actual USD spent across all recorded steps."""
        return sum(s.cost for s in self.steps)

    @property
    def total_steps(self) -> int:
        """Number of recorded steps."""
        return len(self.steps)

    @property
    def big_calls(self) -> int:
        """Steps served by the big (expensive) model."""
        return sum(1 for s in self.steps if s.used_big)

    @property
    def small_calls(self) -> int:
        """Steps served by the small (cheap) model."""
        return sum(1 for s in self.steps if not s.used_big)

    @property
    def budget_forced_downgrades(self) -> int:
        """Steps downgraded to the small model by the budget kill-switch."""
        return sum(1 for s in self.steps if s.decision.forced_by_budget)

    @property
    def always_big_cost(self) -> float:
        """
        Estimated cost had every step used the big model.

        Fix: the previous formula scaled each small-model cost by
        ``score / max(0.01, score)``, which equals 1.0 for any complexity
        score >= 0.01 — so the property just returned the actual total
        cost. It now applies the same big/small price ratio used by
        savings_vs_always_big(), so the two figures agree.
        """
        return sum(
            s.cost * self.BIG_TO_SMALL_COST_RATIO if not s.used_big else s.cost
            for s in self.steps
        )

    def savings_vs_always_big(self) -> dict:
        """
        Calculate savings vs the naive always-big strategy.
        This is the benchmark number that matters.
        """
        estimated_always_big = self.always_big_cost
        saved = estimated_always_big - self.total_cost
        pct = (saved / estimated_always_big * 100) if estimated_always_big > 0 else 0
        return {
            "baar_cost": round(self.total_cost, 6),
            "estimated_always_big_cost": round(estimated_always_big, 6),
            "saved_usd": round(saved, 6),
            "savings_pct": round(pct, 1),
        }

    def summary(self) -> dict:
        """Machine-readable session summary: budget state, routing mix, savings, steps."""
        savings = self.savings_vs_always_big()
        return {
            "budget_usd": self.budget,
            "spent_usd": round(self.total_cost, 8),
            "remaining_usd": round(self.budget - self.total_cost, 8),
            "utilization_pct": round(self.total_cost / self.budget * 100, 2) if self.budget > 0 else 0,
            "total_steps": self.total_steps,
            "small_model_calls": self.small_calls,
            "big_model_calls": self.big_calls,
            "budget_forced_downgrades": self.budget_forced_downgrades,
            "pct_routed_to_small": round(self.small_calls / max(1, self.total_steps) * 100, 1),
            "savings_vs_always_big": savings,
            "steps": [s.to_dict() for s in self.steps],
        }

    def print_report(self) -> None:
        """Human-readable report — what devs share as screenshots."""
        s = self.summary()
        sav = s["savings_vs_always_big"]

        print("\n" + "═" * 60)
        print(" BAAR ROUTING REPORT")
        print("═" * 60)
        print(f" Budget: ${s['budget_usd']:.4f}")
        print(f" Spent: ${s['spent_usd']:.6f} ({s['utilization_pct']}% used)")
        print(f" Remaining: ${s['remaining_usd']:.6f}")
        print("─" * 60)
        print(f" Total steps: {s['total_steps']}")
        print(f" → small model: {s['small_model_calls']} calls ({s['pct_routed_to_small']}%)")
        print(f" → big model: {s['big_model_calls']} calls")
        print(f" → budget forced downgrades: {s['budget_forced_downgrades']}")
        print("─" * 60)
        print(f" BAAR cost: ${sav['baar_cost']:.6f}")
        print(f" Always-big estimate: ${sav['estimated_always_big_cost']:.6f}")
        print(f" Saved: ${sav['saved_usd']:.6f} ({sav['savings_pct']}% cheaper)")
        print("─" * 60)
        print(f" {'Step':<5} {'Model':<15} {'Complexity':<11} {'Cost':>10} Reason")
        print(f" {'─'*4} {'─'*14} {'─'*10} {'─'*10} {'─'*20}")
        for step in s["steps"]:
            forced = " [BUDGET]" if step["forced_by_budget"] else ""
            print(
                f" {step['step']:<5} "
                f"{step['model']:<15} "
                f"{step['complexity_score']:<11.3f} "
                f"${step['cost_usd']:>9.6f} "
                f"{step['routing_reason'][:30]}{forced}"
            )
        print("═" * 60 + "\n")