agentic-sre-gateway 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentic_sre_gateway-1.0.0/LICENSE +21 -0
- agentic_sre_gateway-1.0.0/PKG-INFO +105 -0
- agentic_sre_gateway-1.0.0/README.md +74 -0
- agentic_sre_gateway-1.0.0/agentic_sre_gateway.egg-info/PKG-INFO +105 -0
- agentic_sre_gateway-1.0.0/agentic_sre_gateway.egg-info/SOURCES.txt +12 -0
- agentic_sre_gateway-1.0.0/agentic_sre_gateway.egg-info/dependency_links.txt +1 -0
- agentic_sre_gateway-1.0.0/agentic_sre_gateway.egg-info/entry_points.txt +2 -0
- agentic_sre_gateway-1.0.0/agentic_sre_gateway.egg-info/requires.txt +5 -0
- agentic_sre_gateway-1.0.0/agentic_sre_gateway.egg-info/top_level.txt +1 -0
- agentic_sre_gateway-1.0.0/app/__init__.py +1 -0
- agentic_sre_gateway-1.0.0/app/cli.py +13 -0
- agentic_sre_gateway-1.0.0/app/main.py +199 -0
- agentic_sre_gateway-1.0.0/setup.cfg +4 -0
- agentic_sre_gateway-1.0.0/setup.py +36 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Manik Bodamwad
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agentic-sre-gateway
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: An SRE-Optimized API Gateway for dynamic LLM routing, Redis caching, and Prometheus telemetry.
|
|
5
|
+
Home-page: https://github.com/ManikBodamwad/LLM-Latency-Cost-Router
|
|
6
|
+
Author: Manik Bodamwad
|
|
7
|
+
Author-email: bodamwadm@gmail.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Requires-Python: >=3.9
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: fastapi>=0.100.0
|
|
17
|
+
Requires-Dist: uvicorn>=0.23.0
|
|
18
|
+
Requires-Dist: litellm>=1.0.0
|
|
19
|
+
Requires-Dist: redis>=5.0.0
|
|
20
|
+
Requires-Dist: prometheus_client>=0.17.0
|
|
21
|
+
Dynamic: author
|
|
22
|
+
Dynamic: author-email
|
|
23
|
+
Dynamic: classifier
|
|
24
|
+
Dynamic: description
|
|
25
|
+
Dynamic: description-content-type
|
|
26
|
+
Dynamic: home-page
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
Dynamic: requires-dist
|
|
29
|
+
Dynamic: requires-python
|
|
30
|
+
Dynamic: summary
|
|
31
|
+
|
|
32
|
+
# Agentic API Gateway | SRE Edge Router
|
|
33
|
+
|
|
34
|
+
A production-grade, highly resilient API Gateway that dynamically routes Large Language Model (LLM) prompts based on complexity and minimizes cost through semantic caching.
|
|
35
|
+
|
|
36
|
+
Built strictly with **Site Reliability Engineering (SRE)** principles, it implements automated failovers, Redis-backed rate limiters, token-cost telemetry routing to Prometheus, and features a glowing, immersive UI dashboard built beautifully in Vanilla JS/Vite.
|
|
37
|
+
|
|
38
|
+
## 🚀 Core SRE Features
|
|
39
|
+
|
|
40
|
+
- **Dynamic Tier Routing**: Uses a heuristics engine to parse prompt intent. Simple queries route to blazing-fast models (`groq/llama-3.1-8b`), while complex queries (e.g. system design) automatically route to heavy models (`groq/llama-3.1-70b`).
|
|
41
|
+
- **Zero-Latency Semantic Caching**: SHA-256 prompt hashes intercept inbound requests. Identical requests skip the LLM network entirely, serving an exact match directly from Redis memory in `~0ms` for `$0USD` cost.
|
|
42
|
+
- **Intelligent Failover Resiliency**: Wraps primary model calls in strict `asyncio` timeouts. If a provider throws a quota limit, `503`, or hangs, the router gracefully degrades to alternative models before ever throwing an error to the user.
|
|
43
|
+
- **Vite SRE Telemetry Dashboard**: Complete visual interface built without bulky frameworks—utilizing raw CSS glassmorphism, flexbox scaling, and micro-animated charts showcasing true request latency and cost updates on every stream.
|
|
44
|
+
- **DDoS/Billing Defense**: Implements a Redis-backed fixed-window API rate limiter (50 req/min per key) requiring an `x-api-key` header to prevent billing exhaustion.
|
|
45
|
+
- **Prometheus & Grafana Observability**: Instrumented with custom Python metrics exposing End-to-End Latency Histograms, LLM Token Cost accumulations, and Routing Cache Hit/Miss rates to `/metrics`.
|
|
46
|
+
|
|
47
|
+
## 🛠️ Tech Stack & Architecture
|
|
48
|
+
|
|
49
|
+
- **Backend Route Logic**: `Python 3.11`, `FastAPI`, `LiteLLM` (for multi-provider standardization)
|
|
50
|
+
- **Frontend Dashboard**: Raw `HTML5`, Vanilla Base `CSS`, `Vite` Node-Server, `marked.js`
|
|
51
|
+
- **Cache & Memory**: `Redis` alpine container
|
|
52
|
+
- **Orchestration**: `Docker Compose`
|
|
53
|
+
- **Observability**: `Prometheus` (Scraping), `Grafana` (Visualization)
|
|
54
|
+
- **Inference Hardware**: `Groq LPU` (Llama 3.1 models config standard)
|
|
55
|
+
|
|
56
|
+
## ⚡ Quickstart
|
|
57
|
+
|
|
58
|
+
1. **Clone the repository:**
|
|
59
|
+
```bash
|
|
60
|
+
git clone https://github.com/ManikBodamwad/LLM-Latency-Cost-Router.git
|
|
61
|
+
cd LLM-Latency-Cost-Router
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
2. **Supply your API Keys:**
|
|
65
|
+
Create a `.env` file in the root directory:
|
|
66
|
+
```env
|
|
67
|
+
GROQ_API_KEY="gsk_your_groq_key_here"
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
3. **Deploy via Docker Compose:**
|
|
71
|
+
```bash
|
|
72
|
+
docker compose up -d --build
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
4. **Experience the Application:**
|
|
76
|
+
Open [http://localhost:5173](http://localhost:5173) in your web browser. Type a complex prompt like *"Can you explain the Medallion Architecture?"* and observe the SRE dashboard dynamically tracking the latency, the exact Token Cost, and the routing strategy in real-time.
|
|
77
|
+
|
|
78
|
+
## 📦 Usage as a Python Package
|
|
79
|
+
|
|
80
|
+
This repository is built as a portable Python package so engineering teams can inject edge-routing into their own systems natively without bulky Docker containers!
|
|
81
|
+
|
|
82
|
+
If you install this via pip:
|
|
83
|
+
```bash
|
|
84
|
+
pip install agentic-sre-gateway
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
You can instantly spin up the SRE-optimized Routing API on your local terminal using the globally injected command:
|
|
88
|
+
```bash
|
|
89
|
+
export GROQ_API_KEY="your_key"
|
|
90
|
+
export REDIS_URL="redis://localhost:6379/0"
|
|
91
|
+
agentic-gateway
|
|
92
|
+
```
|
|
93
|
+
This serves teams that want a drop-in API proxy to massively reduce LLM bills and monitor token consumption locally without rewriting complex LiteLLM and Prometheus wrappers themselves.
|
|
94
|
+
|
|
95
|
+
## 📊 View Local Development Telemetry
|
|
96
|
+
|
|
97
|
+
For local visualization during development, the `docker-compose` orchestration automatically spins up standard metrics scrape targets.
|
|
98
|
+
|
|
99
|
+
- **Prometheus Scraper UI**: `http://localhost:9090`
|
|
100
|
+
- **Grafana Workspace**: `http://localhost:3000`
|
|
101
|
+
*(Note: This uses the default local-dev credentials Login: `admin` / Password: `admin`)*
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
*Developed by Manik Bodamwad to solve enterprise-level LLM deployment friction points: Cost Runaway, High Latency, and Provider Downtime.*
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# Agentic API Gateway | SRE Edge Router
|
|
2
|
+
|
|
3
|
+
A production-grade, highly resilient API Gateway that dynamically routes Large Language Model (LLM) prompts based on complexity and minimizes cost through semantic caching.
|
|
4
|
+
|
|
5
|
+
Built strictly with **Site Reliability Engineering (SRE)** principles, it implements automated failovers, Redis-backed rate limiters, token-cost telemetry routing to Prometheus, and features a glowing, immersive UI dashboard built beautifully in Vanilla JS/Vite.
|
|
6
|
+
|
|
7
|
+
## 🚀 Core SRE Features
|
|
8
|
+
|
|
9
|
+
- **Dynamic Tier Routing**: Uses a heuristics engine to parse prompt intent. Simple queries route to blazing-fast models (`groq/llama-3.1-8b`), while complex queries (e.g. system design) automatically route to heavy models (`groq/llama-3.1-70b`).
|
|
10
|
+
- **Zero-Latency Semantic Caching**: SHA-256 prompt hashes intercept inbound requests. Identical requests skip the LLM network entirely, serving an exact match directly from Redis memory in `~0ms` for `$0USD` cost.
|
|
11
|
+
- **Intelligent Failover Resiliency**: Wraps primary model calls in strict `asyncio` timeouts. If a provider throws a quota limit, `503`, or hangs, the router gracefully degrades to alternative models before ever throwing an error to the user.
|
|
12
|
+
- **Vite SRE Telemetry Dashboard**: Complete visual interface built without bulky frameworks—utilizing raw CSS glassmorphism, flexbox scaling, and micro-animated charts showcasing true request latency and cost updates on every stream.
|
|
13
|
+
- **DDoS/Billing Defense**: Implements a Redis-backed fixed-window API rate limiter (50 req/min per key) requiring an `x-api-key` header to prevent billing exhaustion.
|
|
14
|
+
- **Prometheus & Grafana Observability**: Instrumented with custom Python metrics exposing End-to-End Latency Histograms, LLM Token Cost accumulations, and Routing Cache Hit/Miss rates to `/metrics`.
|
|
15
|
+
|
|
16
|
+
## 🛠️ Tech Stack & Architecture
|
|
17
|
+
|
|
18
|
+
- **Backend Route Logic**: `Python 3.11`, `FastAPI`, `LiteLLM` (for multi-provider standardization)
|
|
19
|
+
- **Frontend Dashboard**: Raw `HTML5`, Vanilla Base `CSS`, `Vite` Node-Server, `marked.js`
|
|
20
|
+
- **Cache & Memory**: `Redis` alpine container
|
|
21
|
+
- **Orchestration**: `Docker Compose`
|
|
22
|
+
- **Observability**: `Prometheus` (Scraping), `Grafana` (Visualization)
|
|
23
|
+
- **Inference Hardware**: `Groq LPU` (Llama 3.1 models config standard)
|
|
24
|
+
|
|
25
|
+
## ⚡ Quickstart
|
|
26
|
+
|
|
27
|
+
1. **Clone the repository:**
|
|
28
|
+
```bash
|
|
29
|
+
git clone https://github.com/ManikBodamwad/LLM-Latency-Cost-Router.git
|
|
30
|
+
cd LLM-Latency-Cost-Router
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
2. **Supply your API Keys:**
|
|
34
|
+
Create a `.env` file in the root directory:
|
|
35
|
+
```env
|
|
36
|
+
GROQ_API_KEY="gsk_your_groq_key_here"
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
3. **Deploy via Docker Compose:**
|
|
40
|
+
```bash
|
|
41
|
+
docker compose up -d --build
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
4. **Experience the Application:**
|
|
45
|
+
Open [http://localhost:5173](http://localhost:5173) in your web browser. Type a complex prompt like *"Can you explain the Medallion Architecture?"* and observe the SRE dashboard dynamically tracking the latency, the exact Token Cost, and the routing strategy in real-time.
|
|
46
|
+
|
|
47
|
+
## 📦 Usage as a Python Package
|
|
48
|
+
|
|
49
|
+
This repository is built as a portable Python package so engineering teams can inject edge-routing into their own systems natively without bulky Docker containers!
|
|
50
|
+
|
|
51
|
+
If you install this via pip:
|
|
52
|
+
```bash
|
|
53
|
+
pip install agentic-sre-gateway
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
You can instantly spin up the SRE-optimized Routing API on your local terminal using the globally injected command:
|
|
57
|
+
```bash
|
|
58
|
+
export GROQ_API_KEY="your_key"
|
|
59
|
+
export REDIS_URL="redis://localhost:6379/0"
|
|
60
|
+
agentic-gateway
|
|
61
|
+
```
|
|
62
|
+
This serves teams that want a drop-in API proxy to massively reduce LLM bills and monitor token consumption locally without rewriting complex LiteLLM and Prometheus wrappers themselves.
|
|
63
|
+
|
|
64
|
+
## 📊 View Local Development Telemetry
|
|
65
|
+
|
|
66
|
+
For local visualization during development, the `docker-compose` orchestration automatically spins up standard metrics scrape targets.
|
|
67
|
+
|
|
68
|
+
- **Prometheus Scraper UI**: `http://localhost:9090`
|
|
69
|
+
- **Grafana Workspace**: `http://localhost:3000`
|
|
70
|
+
*(Note: This uses the default local-dev credentials Login: `admin` / Password: `admin`)*
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
*Developed by Manik Bodamwad to solve enterprise-level LLM deployment friction points: Cost Runaway, High Latency, and Provider Downtime.*
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: agentic-sre-gateway
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: An SRE-Optimized API Gateway for dynamic LLM routing, Redis caching, and Prometheus telemetry.
|
|
5
|
+
Home-page: https://github.com/ManikBodamwad/LLM-Latency-Cost-Router
|
|
6
|
+
Author: Manik Bodamwad
|
|
7
|
+
Author-email: bodamwadm@gmail.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
12
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
13
|
+
Requires-Python: >=3.9
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
License-File: LICENSE
|
|
16
|
+
Requires-Dist: fastapi>=0.100.0
|
|
17
|
+
Requires-Dist: uvicorn>=0.23.0
|
|
18
|
+
Requires-Dist: litellm>=1.0.0
|
|
19
|
+
Requires-Dist: redis>=5.0.0
|
|
20
|
+
Requires-Dist: prometheus_client>=0.17.0
|
|
21
|
+
Dynamic: author
|
|
22
|
+
Dynamic: author-email
|
|
23
|
+
Dynamic: classifier
|
|
24
|
+
Dynamic: description
|
|
25
|
+
Dynamic: description-content-type
|
|
26
|
+
Dynamic: home-page
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
Dynamic: requires-dist
|
|
29
|
+
Dynamic: requires-python
|
|
30
|
+
Dynamic: summary
|
|
31
|
+
|
|
32
|
+
# Agentic API Gateway | SRE Edge Router
|
|
33
|
+
|
|
34
|
+
A production-grade, highly resilient API Gateway that dynamically routes Large Language Model (LLM) prompts based on complexity and minimizes cost through semantic caching.
|
|
35
|
+
|
|
36
|
+
Built strictly with **Site Reliability Engineering (SRE)** principles, it implements automated failovers, Redis-backed rate limiters, token-cost telemetry routing to Prometheus, and features a glowing, immersive UI dashboard built beautifully in Vanilla JS/Vite.
|
|
37
|
+
|
|
38
|
+
## 🚀 Core SRE Features
|
|
39
|
+
|
|
40
|
+
- **Dynamic Tier Routing**: Uses a heuristics engine to parse prompt intent. Simple queries route to blazing-fast models (`groq/llama-3.1-8b`), while complex queries (e.g. system design) automatically route to heavy models (`groq/llama-3.1-70b`).
|
|
41
|
+
- **Zero-Latency Semantic Caching**: SHA-256 prompt hashes intercept inbound requests. Identical requests skip the LLM network entirely, serving an exact match directly from Redis memory in `~0ms` for `$0USD` cost.
|
|
42
|
+
- **Intelligent Failover Resiliency**: Wraps primary model calls in strict `asyncio` timeouts. If a provider throws a quota limit, `503`, or hangs, the router gracefully degrades to alternative models before ever throwing an error to the user.
|
|
43
|
+
- **Vite SRE Telemetry Dashboard**: Complete visual interface built without bulky frameworks—utilizing raw CSS glassmorphism, flexbox scaling, and micro-animated charts showcasing true request latency and cost updates on every stream.
|
|
44
|
+
- **DDoS/Billing Defense**: Implements a Redis-backed fixed-window API rate limiter (50 req/min per key) requiring an `x-api-key` header to prevent billing exhaustion.
|
|
45
|
+
- **Prometheus & Grafana Observability**: Instrumented with custom Python metrics exposing End-to-End Latency Histograms, LLM Token Cost accumulations, and Routing Cache Hit/Miss rates to `/metrics`.
|
|
46
|
+
|
|
47
|
+
## 🛠️ Tech Stack & Architecture
|
|
48
|
+
|
|
49
|
+
- **Backend Route Logic**: `Python 3.11`, `FastAPI`, `LiteLLM` (for multi-provider standardization)
|
|
50
|
+
- **Frontend Dashboard**: Raw `HTML5`, Vanilla Base `CSS`, `Vite` Node-Server, `marked.js`
|
|
51
|
+
- **Cache & Memory**: `Redis` alpine container
|
|
52
|
+
- **Orchestration**: `Docker Compose`
|
|
53
|
+
- **Observability**: `Prometheus` (Scraping), `Grafana` (Visualization)
|
|
54
|
+
- **Inference Hardware**: `Groq LPU` (Llama 3.1 models config standard)
|
|
55
|
+
|
|
56
|
+
## ⚡ Quickstart
|
|
57
|
+
|
|
58
|
+
1. **Clone the repository:**
|
|
59
|
+
```bash
|
|
60
|
+
git clone https://github.com/ManikBodamwad/LLM-Latency-Cost-Router.git
|
|
61
|
+
cd LLM-Latency-Cost-Router
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
2. **Supply your API Keys:**
|
|
65
|
+
Create a `.env` file in the root directory:
|
|
66
|
+
```env
|
|
67
|
+
GROQ_API_KEY="gsk_your_groq_key_here"
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
3. **Deploy via Docker Compose:**
|
|
71
|
+
```bash
|
|
72
|
+
docker compose up -d --build
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
4. **Experience the Application:**
|
|
76
|
+
Open [http://localhost:5173](http://localhost:5173) in your web browser. Type a complex prompt like *"Can you explain the Medallion Architecture?"* and observe the SRE dashboard dynamically tracking the latency, the exact Token Cost, and the routing strategy in real-time.
|
|
77
|
+
|
|
78
|
+
## 📦 Usage as a Python Package
|
|
79
|
+
|
|
80
|
+
This repository is built as a portable Python package so engineering teams can inject edge-routing into their own systems natively without bulky Docker containers!
|
|
81
|
+
|
|
82
|
+
If you install this via pip:
|
|
83
|
+
```bash
|
|
84
|
+
pip install agentic-sre-gateway
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
You can instantly spin up the SRE-optimized Routing API on your local terminal using the globally injected command:
|
|
88
|
+
```bash
|
|
89
|
+
export GROQ_API_KEY="your_key"
|
|
90
|
+
export REDIS_URL="redis://localhost:6379/0"
|
|
91
|
+
agentic-gateway
|
|
92
|
+
```
|
|
93
|
+
This serves teams that want a drop-in API proxy to massively reduce LLM bills and monitor token consumption locally without rewriting complex LiteLLM and Prometheus wrappers themselves.
|
|
94
|
+
|
|
95
|
+
## 📊 View Local Development Telemetry
|
|
96
|
+
|
|
97
|
+
For local visualization during development, the `docker-compose` orchestration automatically spins up standard metrics scrape targets.
|
|
98
|
+
|
|
99
|
+
- **Prometheus Scraper UI**: `http://localhost:9090`
|
|
100
|
+
- **Grafana Workspace**: `http://localhost:3000`
|
|
101
|
+
*(Note: This uses the default local-dev credentials Login: `admin` / Password: `admin`)*
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
*Developed by Manik Bodamwad to solve enterprise-level LLM deployment friction points: Cost Runaway, High Latency, and Provider Downtime.*
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
setup.py
|
|
4
|
+
agentic_sre_gateway.egg-info/PKG-INFO
|
|
5
|
+
agentic_sre_gateway.egg-info/SOURCES.txt
|
|
6
|
+
agentic_sre_gateway.egg-info/dependency_links.txt
|
|
7
|
+
agentic_sre_gateway.egg-info/entry_points.txt
|
|
8
|
+
agentic_sre_gateway.egg-info/requires.txt
|
|
9
|
+
agentic_sre_gateway.egg-info/top_level.txt
|
|
10
|
+
app/__init__.py
|
|
11
|
+
app/cli.py
|
|
12
|
+
app/main.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
app
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Making the app directory a discoverable python package.
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import uvicorn
|
|
2
|
+
import os
|
|
3
|
+
|
|
4
|
+
def start_gateway():
    """Launch the gateway's ASGI application under Uvicorn.

    Prints a two-line startup notice, then serves ``app.main:app`` on
    ``0.0.0.0:8000`` with auto-reload disabled. The call blocks until the
    server stops.
    """
    startup_notice = (
        "🚀 Starting Agentic SRE Gateway on http://0.0.0.0:8000",
        "Make sure you have Redis running and GROQ_API_KEY in your environment.",
    )
    for notice_line in startup_notice:
        print(notice_line)

    server_options = {"host": "0.0.0.0", "port": 8000, "reload": False}
    uvicorn.run("app.main:app", **server_options)
|
|
11
|
+
|
|
12
|
+
# Start the gateway when this module is executed directly as a script.
if __name__ == "__main__":
    start_gateway()
|
|
@@ -0,0 +1,199 @@
|
|
|
1
|
+
from fastapi import FastAPI, HTTPException, Request, Header, Depends
|
|
2
|
+
from fastapi.responses import StreamingResponse
|
|
3
|
+
from pydantic import BaseModel
|
|
4
|
+
import time
|
|
5
|
+
import asyncio
|
|
6
|
+
import os
|
|
7
|
+
import hashlib
|
|
8
|
+
import redis.asyncio as redis
|
|
9
|
+
from prometheus_client import make_asgi_app, Counter, Histogram
|
|
10
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
11
|
+
|
|
12
|
+
# ASGI application object; cli.py serves it as "app.main:app".
app = FastAPI(title="LLM Router API")

# CORS policy: every origin, method and header is allowed, but credentialed
# requests are not.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize Redis client
# The URL comes from REDIS_URL (default: local Redis, database 0).
# decode_responses=True makes replies come back as str instead of bytes.
redis_client = redis.from_url(os.getenv("REDIS_URL", "redis://localhost:6379/0"), decode_responses=True)

# Metrics definitions
# Request latency in seconds, labelled by the model that answered
# ("cache" for cache hits) and the routing tier.
REQUEST_LATENCY = Histogram(
    'llm_request_latency_seconds',
    'Latency of LLM requests',
    ['model', 'tier']
)

# Running total of estimated spend in USD, per model and tier.
TOKEN_COST = Counter(
    'llm_token_cost_total',
    'Estimated cost in USD',
    ['model', 'tier']
)

# Incremented each time the primary model call fails and the fallback is tried.
FAILOVER_COUNT = Counter(
    'llm_failover_count_total',
    'Number of failovers',
    ['original_model', 'fallback_model', 'reason']
)

# Unlabelled cache hit / miss counters for the prompt cache.
CACHE_HITS = Counter('llm_cache_hits_total', 'Number of cache hits')
CACHE_MISSES = Counter('llm_cache_misses_total', 'Number of cache misses')
|
|
46
|
+
|
|
47
|
+
import litellm
|
|
48
|
+
|
|
49
|
+
# Constants for pricing mock will be handled by litellm's completion_cost
|
|
50
|
+
class ChatRequest(BaseModel):
    """JSON request body accepted by the chat completions endpoint."""

    # Prompt text. It is sent to the model as a single "user" message, drives
    # the simple/complex tier decision, and its SHA-256 hash is the cache key.
    prompt: str
    # When true the endpoint answers with a StreamingResponse instead of a
    # JSON object.
    stream: bool = False
|
|
53
|
+
|
|
54
|
+
async def verify_api_key(x_api_key: str = Header(...)):
    """Require an ``x-api-key`` header and enforce the per-key rate limit.

    Used as a FastAPI dependency. Each distinct key value gets a Redis counter
    named ``rate_limit:<key>`` that allows 50 requests per 60-second fixed
    window.

    Args:
        x_api_key: Value of the ``x-api-key`` request header (required).

    Returns:
        The API key value, unchanged.

    Raises:
        HTTPException: 401 when the header value is empty, 429 when more than
            50 requests were made with this key in the current window.
    """
    # NOTE(review): the key is only checked for being non-empty; it is not
    # compared against any list of issued keys in this function.
    if not x_api_key:
        raise HTTPException(status_code=401, detail="API Key required")

    # Fixed window rate limiter (50 req / minute)
    key = f"rate_limit:{x_api_key}"

    # Increment the counter and read its TTL in one MULTI/EXEC round trip.
    async with redis_client.pipeline(transaction=True) as pipe:
        pipe.incr(key)
        pipe.ttl(key)
        requests, ttl = await pipe.execute()

    # TTL == -1 means the counter exists without an expiry. That is the case
    # for the first request of a window, but also when an earlier EXPIRE call
    # never ran (crash / connection loss right after INCR). Arming the expiry
    # whenever it is missing keeps such a counter from locking the key out
    # forever.
    if ttl == -1:
        await redis_client.expire(key, 60)

    if requests > 50:
        raise HTTPException(status_code=429, detail="Rate limit exceeded: 50 requests per minute")
    return x_api_key
|
|
65
|
+
|
|
66
|
+
def determine_complexity(prompt: str) -> str:
    """Classify a prompt as ``"complex"`` or ``"simple"`` for tier routing.

    A prompt is complex when it is longer than 1000 characters, or when its
    lower-cased text contains one of the trigger keywords as a substring
    (so ``"decode"`` matches ``"code"``). Anything else is simple.

    Args:
        prompt: The raw prompt text.

    Returns:
        ``"complex"`` or ``"simple"``.
    """
    trigger_words = ("analyze", "code", "architecture", "summarize", "complex")
    lowered = prompt.lower()

    is_long = len(prompt) > 1000
    mentions_trigger = any(word in lowered for word in trigger_words)

    return "complex" if (is_long or mentions_trigger) else "simple"
|
|
73
|
+
|
|
74
|
+
def _failover_reason(error):
    """Return the ``reason`` label recorded on FAILOVER_COUNT for ``error``."""
    # LiteLLM reports request timeouts as litellm.Timeout rather than
    # asyncio.TimeoutError, so both are treated as a timeout here.
    if isinstance(error, (asyncio.TimeoutError, litellm.Timeout)):
        return "timeout"
    return "error_503"


async def _complete_with_failover(target_model, fallback_model, messages, timeout_seconds, stream, failure_prefix):
    """Call ``target_model`` and retry once on ``fallback_model`` if it fails.

    Args:
        target_model: Model tried first.
        fallback_model: Model tried when the first call raises.
        messages: Chat messages passed to ``litellm.acompletion``.
        timeout_seconds: Timeout passed to both calls.
        stream: When true, ``stream=True`` is passed to ``litellm.acompletion``.
        failure_prefix: Text placed before the error in the 503 detail message.

    Returns:
        A ``(response, used_model)`` tuple.

    Raises:
        HTTPException: 503 when the fallback call fails as well.
    """
    call_kwargs = {"messages": messages, "timeout": timeout_seconds}
    if stream:
        call_kwargs["stream"] = True

    try:
        response = await litellm.acompletion(model=target_model, **call_kwargs)
    except Exception as primary_error:
        # Any failure of the primary call (timeout, quota, 5xx, ...) triggers
        # the failover; only the metric label distinguishes the cause.
        FAILOVER_COUNT.labels(
            original_model=target_model,
            fallback_model=fallback_model,
            reason=_failover_reason(primary_error),
        ).inc()
    else:
        return response, target_model

    try:
        response = await litellm.acompletion(model=fallback_model, **call_kwargs)
    except Exception as fallback_error:
        raise HTTPException(
            status_code=503,
            detail=f"{failure_prefix}: {str(fallback_error)}",
        ) from fallback_error
    return response, fallback_model


@app.post("/api/v1/chat/completions")
async def chat_completions(req: ChatRequest, api_key: str = Depends(verify_api_key)):
    """Route a prompt to a model tier, with caching, failover and metrics.

    Flow:
      1. Classify the prompt as simple/complex and pick target model,
         fallback model and timeout for that tier.
      2. Look the SHA-256 hash of the prompt up in Redis; on a hit, answer
         from the cache without calling any model.
      3. Otherwise call the target model, falling back once on failure, and
         store the answer in Redis for one hour.

    Args:
        req: Request body (prompt text and stream flag).
        api_key: Result of the ``verify_api_key`` dependency; not used in the
            body, it is declared so authentication and rate limiting run.

    Returns:
        A ``StreamingResponse`` when ``req.stream`` is true; otherwise a dict
        with ``tier``, ``latency``, ``used_model``, ``response`` and
        ``cost_usd``.

    Raises:
        HTTPException: 503 when both the target and the fallback model fail.
    """
    tier = determine_complexity(req.prompt)

    if tier == "simple":
        target_model = "groq/llama-3.1-8b-instant"
        # Same model as the target: for this tier the "failover" is a retry.
        fallback_model = "groq/llama-3.1-8b-instant"
        timeout_seconds = 5.0
    else:
        target_model = "groq/llama-3.1-70b-versatile"
        fallback_model = "groq/llama-3.1-8b-instant"
        timeout_seconds = 10.0

    # Monotonic clock: the value is only used to compute elapsed time.
    start_time = time.perf_counter()

    messages = [{"role": "user", "content": req.prompt}]

    # Check cache first
    prompt_hash = hashlib.sha256(req.prompt.encode()).hexdigest()
    cached_response = await redis_client.get(prompt_hash)

    if cached_response:
        CACHE_HITS.inc()
        latency = time.perf_counter() - start_time
        REQUEST_LATENCY.labels(model="cache", tier=tier).observe(latency)

        if req.stream:
            async def cache_streamer():
                # The cached text is sent as one chunk.
                yield cached_response
            return StreamingResponse(cache_streamer(), media_type="text/event-stream")

        return {
            "tier": tier,
            "latency": round(latency, 3),
            "used_model": "cache",
            "response": cached_response,
            "cost_usd": 0.0
        }

    CACHE_MISSES.inc()

    if req.stream:
        response_stream, used_model = await _complete_with_failover(
            target_model,
            fallback_model,
            messages,
            timeout_seconds,
            stream=True,
            failure_prefix="Core and Fallback LLM failed",
        )

        async def stream_generator():
            pieces = []
            async for chunk in response_stream:
                content = chunk.choices[0].delta.content or ""
                pieces.append(content)
                yield content

            # Latency for a streamed answer covers the whole stream.
            latency = time.perf_counter() - start_time
            REQUEST_LATENCY.labels(model=used_model, tier=tier).observe(latency)

            try:
                await redis_client.setex(prompt_hash, 3600, "".join(pieces))
            except Exception:
                pass  # Cache writes are best-effort; the client already has the text

        return StreamingResponse(stream_generator(), media_type="text/event-stream")

    response, used_model = await _complete_with_failover(
        target_model,
        fallback_model,
        messages,
        timeout_seconds,
        stream=False,
        failure_prefix="LLM Quota Exceeded or fully down",
    )

    latency = time.perf_counter() - start_time
    REQUEST_LATENCY.labels(model=used_model, tier=tier).observe(latency)

    try:
        cost = litellm.completion_cost(completion_response=response, model=used_model) or 0.0
    except Exception:
        # Cost lookup failures must not fail the request; report zero cost.
        cost = 0.0

    TOKEN_COST.labels(model=used_model, tier=tier).inc(cost)

    # Extract string from response format
    content_str = response.choices[0].message.content

    # Store in cache for 1 hour (3600 seconds)
    try:
        await redis_client.setex(prompt_hash, 3600, content_str)
    except Exception:
        pass  # Ignore cache write errors

    return {
        "tier": tier,
        "latency": round(latency, 3),
        "used_model": used_model,
        "response": content_str,
        "cost_usd": cost
    }
|
|
196
|
+
|
|
197
|
+
# Mount prometheus metrics endpoint
# make_asgi_app() builds an ASGI application from prometheus_client; mounting
# it makes the metrics available to scrapers under the /metrics path.
metrics_app = make_asgi_app()
app.mount("/metrics", metrics_app)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from setuptools import setup, find_packages

# The README is reused verbatim as the long description shown on the registry.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setup(
    name="agentic-sre-gateway",
    version="1.0.0",
    author="Manik Bodamwad",
    author_email="bodamwadm@gmail.com",
    description="An SRE-Optimized API Gateway for dynamic LLM routing, Redis caching, and Prometheus telemetry.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/ManikBodamwad/LLM-Latency-Cost-Router",
    # NOTE(review): this installs a top-level package literally named "app",
    # which can clash with any other distribution or project using that name.
    packages=find_packages(),
    install_requires=[
        "fastapi>=0.100.0",
        "uvicorn>=0.23.0",
        "litellm>=1.0.0",
        "redis>=5.0.0",
        "prometheus_client>=0.17.0",
        # app/main.py imports pydantic directly, so it is declared explicitly
        # instead of relying on it arriving as a dependency of fastapi.
        "pydantic",
    ],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Topic :: Software Development :: Libraries :: Python Modules",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    python_requires=">=3.9",
    entry_points={
        # Installs the `agentic-gateway` command, which calls app.cli.start_gateway().
        "console_scripts": [
            "agentic-gateway=app.cli:start_gateway",
        ],
    },
)
|