slimzero 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- slimzero-0.1.0/.gitignore +57 -0
- slimzero-0.1.0/LICENSE +21 -0
- slimzero-0.1.0/PKG-INFO +581 -0
- slimzero-0.1.0/README.md +535 -0
- slimzero-0.1.0/pyproject.toml +131 -0
- slimzero-0.1.0/slimzero/__init__.py +53 -0
- slimzero-0.1.0/slimzero/__main__.py +124 -0
- slimzero-0.1.0/slimzero/agent/__init__.py +0 -0
- slimzero-0.1.0/slimzero/agent/gsd.py +328 -0
- slimzero-0.1.0/slimzero/agent/ralph.py +357 -0
- slimzero-0.1.0/slimzero/core.py +421 -0
- slimzero-0.1.0/slimzero/dashboard/__init__.py +237 -0
- slimzero-0.1.0/slimzero/exceptions.py +202 -0
- slimzero-0.1.0/slimzero/plugins/__init__.py +154 -0
- slimzero-0.1.0/slimzero/post/__init__.py +0 -0
- slimzero-0.1.0/slimzero/post/flagger.py +221 -0
- slimzero-0.1.0/slimzero/post/logger.py +228 -0
- slimzero-0.1.0/slimzero/post/validator.py +176 -0
- slimzero-0.1.0/slimzero/schemas.py +232 -0
- slimzero-0.1.0/slimzero/stages/__init__.py +0 -0
- slimzero-0.1.0/slimzero/stages/budget.py +290 -0
- slimzero-0.1.0/slimzero/stages/few_shot.py +238 -0
- slimzero-0.1.0/slimzero/stages/hallucination.py +226 -0
- slimzero-0.1.0/slimzero/stages/history.py +210 -0
- slimzero-0.1.0/slimzero/stages/injector.py +155 -0
- slimzero-0.1.0/slimzero/stages/intent.py +245 -0
- slimzero-0.1.0/slimzero/stages/rewriter.py +315 -0
- slimzero-0.1.0/slimzero/stages/semantic_guard.py +204 -0
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
|
|
7
|
+
# Virtual environments
|
|
8
|
+
venv/
|
|
9
|
+
.venv/
|
|
10
|
+
env/
|
|
11
|
+
.env/
|
|
12
|
+
|
|
13
|
+
# IDE
|
|
14
|
+
.vscode/
|
|
15
|
+
.idea/
|
|
16
|
+
*.swp
|
|
17
|
+
*.swo
|
|
18
|
+
|
|
19
|
+
# Testing
|
|
20
|
+
.pytest_cache/
|
|
21
|
+
.coverage
|
|
22
|
+
htmlcov/
|
|
23
|
+
.tox/
|
|
24
|
+
tests/
|
|
25
|
+
|
|
26
|
+
# Development
|
|
27
|
+
.gsd/
|
|
28
|
+
scripts/
|
|
29
|
+
archive/
|
|
30
|
+
.claude/
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Build artifacts
|
|
34
|
+
build/
|
|
35
|
+
dist/
|
|
36
|
+
*.egg-info/
|
|
37
|
+
*.egg
|
|
38
|
+
|
|
39
|
+
# OS
|
|
40
|
+
.DS_Store
|
|
41
|
+
Thumbs.db
|
|
42
|
+
|
|
43
|
+
# Logs
|
|
44
|
+
*.log
|
|
45
|
+
*.jsonl
|
|
46
|
+
|
|
47
|
+
# Local config
|
|
48
|
+
.env
|
|
49
|
+
.env.local
|
|
50
|
+
config.local.py
|
|
51
|
+
|
|
52
|
+
# Documentation build
|
|
53
|
+
site/
|
|
54
|
+
docs/_build/
|
|
55
|
+
|
|
56
|
+
# Checkpoints
|
|
57
|
+
*.checkpoint.json
|
slimzero-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 SlimZero Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
slimzero-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,581 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: slimzero
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Zero-overhead prompt compression, response minimisation, hallucination guarding, and autonomous agent orchestration
|
|
5
|
+
Project-URL: Homepage, https://github.com/xt67/SlimZero
|
|
6
|
+
Project-URL: Documentation, https://slimzero.readthedocs.io
|
|
7
|
+
Project-URL: Repository, https://github.com/xt67/SlimZero.git
|
|
8
|
+
Project-URL: Issues, https://github.com/xt67/SlimZero/issues
|
|
9
|
+
Author: SlimZero Contributors
|
|
10
|
+
Author-email: xt67 <xrahman088@gmail.com>
|
|
11
|
+
License: MIT
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Keywords: agent,compression,cost,hallucination,llm,optimization,prompt,ralph,savings,token
|
|
14
|
+
Classifier: Development Status :: 3 - Alpha
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Provides-Extra: agent
|
|
24
|
+
Requires-Dist: networkx>=3.3; extra == 'agent'
|
|
25
|
+
Provides-Extra: all
|
|
26
|
+
Requires-Dist: networkx>=3.3; extra == 'all'
|
|
27
|
+
Requires-Dist: rich>=13.7; extra == 'all'
|
|
28
|
+
Requires-Dist: sentence-transformers>=2.7; extra == 'all'
|
|
29
|
+
Requires-Dist: spacy>=3.7; extra == 'all'
|
|
30
|
+
Requires-Dist: tiktoken>=0.7; extra == 'all'
|
|
31
|
+
Requires-Dist: transformers>=4.40; extra == 'all'
|
|
32
|
+
Provides-Extra: base
|
|
33
|
+
Requires-Dist: sentence-transformers>=2.7; extra == 'base'
|
|
34
|
+
Requires-Dist: spacy>=3.7; extra == 'base'
|
|
35
|
+
Requires-Dist: tiktoken>=0.7; extra == 'base'
|
|
36
|
+
Requires-Dist: transformers>=4.40; extra == 'base'
|
|
37
|
+
Provides-Extra: dashboard
|
|
38
|
+
Requires-Dist: rich>=13.7; extra == 'dashboard'
|
|
39
|
+
Provides-Extra: dev
|
|
40
|
+
Requires-Dist: hypothesis>=6.100; extra == 'dev'
|
|
41
|
+
Requires-Dist: mypy>=1.9; extra == 'dev'
|
|
42
|
+
Requires-Dist: pytest-cov>=4.1; extra == 'dev'
|
|
43
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
44
|
+
Requires-Dist: ruff>=0.3; extra == 'dev'
|
|
45
|
+
Description-Content-Type: text/markdown
|
|
46
|
+
|
|
47
|
+
# SlimZero
|
|
48
|
+
|
|
49
|
+
**Zero-overhead token compression for LLM APIs.**
|
|
50
|
+
|
|
51
|
+
SlimZero sits between your app and any LLM — Claude, GPT-4, Gemini, or local models served through Ollama — and quietly makes every call cheaper, faster, and less likely to hallucinate. It rewrites your prompts locally, tells the model to respond concisely, and validates the response, all without spending a single token doing it.
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from slimzero import SlimZero
|
|
55
|
+
|
|
56
|
+
sz = SlimZero(model="claude-sonnet-4-6")
|
|
57
|
+
response = sz.call("Can you please maybe explain what gradient descent is in simple terms?")
|
|
58
|
+
# → internally rewrites to: "Explain gradient descent simply."
|
|
59
|
+
# → appends: "One paragraph max. No summary."
|
|
60
|
+
# → hits API once
|
|
61
|
+
# → saved 31 input tokens, ~55 output tokens estimated
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
|
|
66
|
+
## Why SlimZero?
|
|
67
|
+
|
|
68
|
+
Every other token optimisation tool — LangChain optimisers, DSPy, LLMLingua — burns API tokens to optimise your prompts. SlimZero does all of that locally. The API is called exactly once, for your actual query, never for the optimisation work itself.
|
|
69
|
+
|
|
70
|
+
| Tool | Meta-token cost | Hallucination guard | Agent fault control | Drop-in? |
|
|
71
|
+
|---|---|---|---|---|
|
|
72
|
+
| LangChain optimiser | HIGH | None | None | No |
|
|
73
|
+
| DSPy | HIGH | None | None | No |
|
|
74
|
+
| LLMLingua | Low | None | None | Partial |
|
|
75
|
+
| **SlimZero** | **Zero** | **Built-in** | **Full** | **Yes** |
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## What it does
|
|
80
|
+
|
|
81
|
+
### 1. Compresses your prompt locally
|
|
82
|
+
Strips filler words, merges redundant clauses, converts hedged phrasing to direct instructions. Uses a local T5-small model — never the target API.
|
|
83
|
+
|
|
84
|
+
### 2. Pre-conditions minimal responses
|
|
85
|
+
Appends a tiny system instruction fragment (under 12 tokens) that tells the LLM to skip preamble, not restate the question, and answer only what was asked. This happens before the API call, not after — so you pay for fewer output tokens too.
|
|
86
|
+
|
|
87
|
+
### 3. Guards against hallucinations
|
|
88
|
+
Classifies your query by risk level locally — dates, numbers, citations are HIGH risk; open-ended questions are LOW. High-risk queries get an uncertainty instruction appended. After the response arrives, a local validator checks it actually addressed your question.
|
|
89
|
+
|
|
90
|
+
### 4. Runs a full autonomous agent loop
|
|
91
|
+
Built-in Ralph loop + GSD task graph for long-horizon goals. Decomposes goals into checkpointed sub-tasks, applies compression on every agent step, and includes circuit breakers, semantic drift detection, and tool-call validation. No runaway loops, no silent failures.
|
|
92
|
+
|
|
93
|
+
### 5. Tracks everything
|
|
94
|
+
A live savings dashboard shows tokens saved per call, cumulative cost reduction, which stages fired, and hallucination flags — exportable as JSON.
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
## Architecture Diagrams
|
|
99
|
+
|
|
100
|
+
### System Architecture
|
|
101
|
+
|
|
102
|
+

|
|
103
|
+
|
|
104
|
+
### Agent Loop (Ralph + GSD)
|
|
105
|
+
|
|
106
|
+

|
|
107
|
+
|
|
108
|
+
### Request Flowchart
|
|
109
|
+
|
|
110
|
+

|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## Installation
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
pip install slimzero
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
Optional extras:
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
pip install slimzero[agent] # Ralph loop + GSD task graph
|
|
124
|
+
pip install slimzero[dashboard] # Rich live terminal dashboard
|
|
125
|
+
pip install slimzero[all] # Everything
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
---
|
|
129
|
+
|
|
130
|
+
## Quick start
|
|
131
|
+
|
|
132
|
+
### Basic call
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
from slimzero import SlimZero
|
|
136
|
+
|
|
137
|
+
sz = SlimZero(model="claude-sonnet-4-6")
|
|
138
|
+
result = sz.call("Explain what a transformer model is in detail please.")
|
|
139
|
+
|
|
140
|
+
print(result.response) # LLM response
|
|
141
|
+
print(result.input_token_savings_percent) # e.g. 38%
|
|
142
|
+
print(result.flags_raised) # hallucination flags, if any
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### With an existing client
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
import anthropic
|
|
149
|
+
from slimzero import SlimZero
|
|
150
|
+
|
|
151
|
+
client = anthropic.Anthropic()
|
|
152
|
+
sz = SlimZero(api_client=client, model="claude-opus-4-6")
|
|
153
|
+
|
|
154
|
+
result = sz.call("Write a Python function to reverse a linked list.")
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
### Multi-turn conversation
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
sz = SlimZero(model="gpt-4o", history_window=4)
|
|
161
|
+
|
|
162
|
+
result1 = sz.call("What is gradient descent?")
|
|
163
|
+
result2 = sz.call("Now give me a Python example.")
|
|
164
|
+
# Old turns are auto-compressed. Recent turns stay verbatim.
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### Autonomous agent mode
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
from slimzero import SlimZero
|
|
171
|
+
|
|
172
|
+
sz = SlimZero(
|
|
173
|
+
model="claude-opus-4-6",
|
|
174
|
+
agent_mode=True,
|
|
175
|
+
max_agent_steps=30,
|
|
176
|
+
dashboard=True
|
|
177
|
+
)
|
|
178
|
+
|
|
179
|
+
result = sz.run_goal(
|
|
180
|
+
goal="Research the top 5 open-source vector databases and write a comparison report.",
|
|
181
|
+
tools=[search_tool, write_tool, read_tool]
|
|
182
|
+
)
|
|
183
|
+
|
|
184
|
+
print(result.output) # Final result
|
|
185
|
+
print(result.audit_trail) # Every tool call logged
|
|
186
|
+
print(result.total_tokens_saved)
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
## Comprehensive Examples
|
|
192
|
+
|
|
193
|
+
### Web Framework Integration
|
|
194
|
+
|
|
195
|
+
#### Flask API
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
from flask import Flask, request, jsonify
|
|
199
|
+
from slimzero import SlimZero
|
|
200
|
+
|
|
201
|
+
app = Flask(__name__)
|
|
202
|
+
sz = SlimZero(model="gpt-4o")
|
|
203
|
+
|
|
204
|
+
@app.route("/chat", methods=["POST"])
|
|
205
|
+
def chat():
|
|
206
|
+
data = request.json
|
|
207
|
+
result = sz.call(
|
|
208
|
+
prompt=data["message"],
|
|
209
|
+
system_prompt=data.get("system", "You are a helpful assistant.")
|
|
210
|
+
)
|
|
211
|
+
return jsonify({
|
|
212
|
+
"response": result.response,
|
|
213
|
+
"savings_percent": result.input_token_savings_percent,
|
|
214
|
+
"flags": result.flags_raised
|
|
215
|
+
})
|
|
216
|
+
|
|
217
|
+
if __name__ == "__main__":
|
|
218
|
+
app.run(debug=True)
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
#### FastAPI API
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
from fastapi import FastAPI, HTTPException
|
|
225
|
+
from pydantic import BaseModel
|
|
226
|
+
from slimzero import SlimZero
|
|
227
|
+
|
|
228
|
+
app = FastAPI()
|
|
229
|
+
sz = SlimZero(model="gpt-4o")
|
|
230
|
+
|
|
231
|
+
class ChatRequest(BaseModel):
|
|
232
|
+
message: str
|
|
233
|
+
system: str | None = "You are a helpful assistant."
|
|
234
|
+
|
|
235
|
+
class ChatResponse(BaseModel):
|
|
236
|
+
response: str
|
|
237
|
+
savings_percent: float
|
|
238
|
+
flags: list[str]
|
|
239
|
+
|
|
240
|
+
@app.post("/chat", response_model=ChatResponse)
|
|
241
|
+
async def chat(req: ChatRequest):
|
|
242
|
+
result = sz.call(prompt=req.message, system_prompt=req.system)
|
|
243
|
+
return ChatResponse(
|
|
244
|
+
response=result.response,
|
|
245
|
+
savings_percent=result.input_token_savings_percent,
|
|
246
|
+
flags=result.flags_raised
|
|
247
|
+
)
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
#### LangChain Agent
|
|
251
|
+
|
|
252
|
+
```python
|
|
253
|
+
from langchain_openai import ChatOpenAI
|
|
254
|
+
from slimzero import SlimZero
|
|
255
|
+
|
|
256
|
+
llm = ChatOpenAI(model="gpt-4o")
|
|
257
|
+
sz = SlimZero(api_client=llm, model="gpt-4o")
|
|
258
|
+
|
|
259
|
+
def slimzero_llm(prompt: str) -> str:
|
|
260
|
+
result = sz.call(prompt=prompt)
|
|
261
|
+
return result.response
|
|
262
|
+
|
|
263
|
+
# Use slimzero_llm as your LLM in LangChain chains
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
### CLI Tool
|
|
267
|
+
|
|
268
|
+
```python
|
|
269
|
+
#!/usr/bin/env python3
|
|
270
|
+
"""SlimZero CLI - Compress prompts from the command line."""
|
|
271
|
+
|
|
272
|
+
import argparse
|
|
273
|
+
import sys
|
|
274
|
+
from slimzero import SlimZero
|
|
275
|
+
|
|
276
|
+
def main():
|
|
277
|
+
parser = argparse.ArgumentParser(description="SlimZero - Zero-overhead token compression")
|
|
278
|
+
parser.add_argument("prompt", nargs="*", help="Prompt to compress")
|
|
279
|
+
parser.add_argument("--model", "-m", default="gpt-4o", help="Model to use")
|
|
280
|
+
parser.add_argument("--compare", "-c", action="store_true", help="Show before/after comparison")
|
|
281
|
+
|
|
282
|
+
args = parser.parse_args()
|
|
283
|
+
|
|
284
|
+
if not args.prompt:
|
|
285
|
+
print("Enter a prompt (end with Ctrl-D):", end=" ", flush=True)
|
|
286
|
+
prompt = sys.stdin.read().strip()
|
|
287
|
+
else:
|
|
288
|
+
prompt = " ".join(args.prompt)
|
|
289
|
+
|
|
290
|
+
sz = SlimZero(model=args.model)
|
|
291
|
+
result = sz.call(prompt)
|
|
292
|
+
|
|
293
|
+
if args.compare:
|
|
294
|
+
print(f"\n📝 Original ({result.original_tokens} tokens):")
|
|
295
|
+
print(f" {result.original_prompt}")
|
|
296
|
+
print(f"\n✨ Compressed ({result.sent_tokens} tokens):")
|
|
297
|
+
print(f" {result.sent_prompt}")
|
|
298
|
+
print(f"\n💰 Savings: {result.input_token_savings_percent:.1f}%")
|
|
299
|
+
else:
|
|
300
|
+
print(result.response)
|
|
301
|
+
|
|
302
|
+
if result.flags_raised:
|
|
303
|
+
print(f"\n⚠️ Flags: {result.flags_raised}")
|
|
304
|
+
|
|
305
|
+
if __name__ == "__main__":
|
|
306
|
+
main()
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
### Batch Processing
|
|
310
|
+
|
|
311
|
+
```python
|
|
312
|
+
from slimzero import SlimZero
|
|
313
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
314
|
+
|
|
315
|
+
sz = SlimZero(model="gpt-4o")
|
|
316
|
+
|
|
317
|
+
prompts = [
|
|
318
|
+
"Please could you explain what machine learning is?",
|
|
319
|
+
"Can you maybe give me an example of recursion?",
|
|
320
|
+
"I was wondering if you could help me understand Python lists?",
|
|
321
|
+
"What is the difference between a stack and a queue?",
|
|
322
|
+
"Could you possibly explain what an API is?",
|
|
323
|
+
]
|
|
324
|
+
|
|
325
|
+
def process_prompt(prompt: str) -> dict:
|
|
326
|
+
result = sz.call(prompt)
|
|
327
|
+
return {
|
|
328
|
+
"prompt": prompt,
|
|
329
|
+
"response": result.response,
|
|
330
|
+
"savings_percent": result.input_token_savings_percent
|
|
331
|
+
}
|
|
332
|
+
|
|
333
|
+
with ThreadPoolExecutor(max_workers=5) as executor:
|
|
334
|
+
results = list(executor.map(process_prompt, prompts))
|
|
335
|
+
|
|
336
|
+
# Summary
|
|
337
|
+
total_savings = sum(r["savings_percent"] for r in results)
|
|
338
|
+
print(f"Average savings: {total_savings / len(results):.1f}%")
|
|
339
|
+
```
|
|
340
|
+
|
|
341
|
+
### Streaming Responses
|
|
342
|
+
|
|
343
|
+
```python
|
|
344
|
+
from slimzero import SlimZero
|
|
345
|
+
|
|
346
|
+
sz = SlimZero(model="gpt-4o")
|
|
347
|
+
|
|
348
|
+
result = sz.call(
|
|
349
|
+
prompt="Write a detailed explanation of how async/await works in Python.",
|
|
350
|
+
stream=True
|
|
351
|
+
)
|
|
352
|
+
|
|
353
|
+
for chunk in result.stream:
|
|
354
|
+
print(chunk, end="", flush=True)
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
### Custom Plugins
|
|
358
|
+
|
|
359
|
+
```python
|
|
360
|
+
from slimzero.plugins import BaseStage, StageInput, StageOutput
|
|
361
|
+
|
|
362
|
+
class MarkdownFormatter(BaseStage):
|
|
363
|
+
"""Ensure responses are formatted as markdown."""
|
|
364
|
+
name = "markdown_formatter"
|
|
365
|
+
|
|
366
|
+
def process(self, inp: StageInput) -> StageOutput:
|
|
367
|
+
new_prompt = inp.prompt + "\n\nFormat your response in markdown."
|
|
368
|
+
return StageOutput(prompt=new_prompt, modified=True, notes="added markdown instruction")
|
|
369
|
+
|
|
370
|
+
sz = SlimZero(
|
|
371
|
+
model="gpt-4o",
|
|
372
|
+
extra_stages=[MarkdownFormatter()]
|
|
373
|
+
)
|
|
374
|
+
|
|
375
|
+
result = sz.call("Explain Python decorators")
|
|
376
|
+
```
|
|
377
|
+
|
|
378
|
+
### Conversation with History
|
|
379
|
+
|
|
380
|
+
```python
|
|
381
|
+
from slimzero import SlimZero
|
|
382
|
+
|
|
383
|
+
sz = SlimZero(
|
|
384
|
+
model="gpt-4o",
|
|
385
|
+
history_window=5, # Keep last 5 turns verbatim
|
|
386
|
+
hallucination_guard=True
|
|
387
|
+
)
|
|
388
|
+
|
|
389
|
+
# First turn
|
|
390
|
+
result1 = sz.call("What is gradient descent?")
|
|
391
|
+
print(result1.response)
|
|
392
|
+
|
|
393
|
+
# Second turn - history is managed automatically
|
|
394
|
+
result2 = sz.call("Show me a Python example.")
|
|
395
|
+
print(result2.response)
|
|
396
|
+
|
|
397
|
+
# Check compression stats
|
|
398
|
+
print(f"History turns: {len(result2.conversation_history)}")
|
|
399
|
+
print(f"Compressed history: {result2.history_was_compressed}")
|
|
400
|
+
```
|
|
401
|
+
|
|
402
|
+
### JSON Structured Logging
|
|
403
|
+
|
|
404
|
+
```python
|
|
405
|
+
from slimzero import SlimZero
|
|
406
|
+
|
|
407
|
+
sz = SlimZero(
|
|
408
|
+
model="gpt-4o",
|
|
409
|
+
log_file="slimzero_sessions.jsonl"
|
|
410
|
+
)
|
|
411
|
+
|
|
412
|
+
# Each call appends structured JSON to the log file
|
|
413
|
+
result = sz.call("Explain quantum entanglement")
|
|
414
|
+
|
|
415
|
+
# Or read the log
|
|
416
|
+
import json
|
|
417
|
+
with open("slimzero_sessions.jsonl") as f:
|
|
418
|
+
for line in f:
|
|
419
|
+
entry = json.loads(line)
|
|
420
|
+
print(f"Call {entry['call_id']}: {entry['savings_percent']}% saved")
|
|
421
|
+
```
|
|
422
|
+
|
|
423
|
+
### Dashboard Mode
|
|
424
|
+
|
|
425
|
+
```python
|
|
426
|
+
from slimzero import SlimZero
|
|
427
|
+
|
|
428
|
+
sz = SlimZero(
|
|
429
|
+
model="gpt-4o",
|
|
430
|
+
dashboard=True # Rich live terminal dashboard
|
|
431
|
+
)
|
|
432
|
+
|
|
433
|
+
# Run several calls and watch the dashboard update
|
|
434
|
+
for topic in ["Python", "Rust", "Go", "JavaScript"]:
|
|
435
|
+
result = sz.call(f"Explain {topic} in one paragraph.")
|
|
436
|
+
print(f"\n{topic}: {result.response[:100]}...")
|
|
437
|
+
```
|
|
438
|
+
|
|
439
|
+
---
|
|
440
|
+
|
|
441
|
+
## How the pipeline works
|
|
442
|
+
|
|
443
|
+
Every call passes through 8 local stages before hitting the API, then 3 local stages after.
|
|
444
|
+
|
|
445
|
+
```
|
|
446
|
+
User prompt
|
|
447
|
+
│
|
|
448
|
+
▼
|
|
449
|
+
┌─────────────────────────────── LOCAL (0 tokens) ──────────────────────────────┐
|
|
450
|
+
│ Stage 1 Intent Extractor spaCy · parse task, entities, format │
|
|
451
|
+
│ Stage 2 Prompt Rewriter T5-small · compress to imperative form │
|
|
452
|
+
│ Stage 3 Semantic Guard ⚠ MiniLM · reject if similarity < 0.92 │
|
|
453
|
+
│ Stage 4 Few-Shot Ranker cosine sim · keep top-k examples │
|
|
454
|
+
│ Stage 5 History Compressor T5-small · summarise old turns │
|
|
455
|
+
│ Stage 6 Format Injector rule engine · append response fragment │
|
|
456
|
+
│ Stage 7 Hallucination Scorer heuristics · classify risk · inject if HIGH │
|
|
457
|
+
│ Stage 8 Budget Enforcer tiktoken · hard cap · trim by priority │
|
|
458
|
+
└───────────────────────────────────────────────────────────────────────────────┘
|
|
459
|
+
│
|
|
460
|
+
▼
|
|
461
|
+
┌─────────────────────────── API CALL (tokens spent) ───────────────────────────┐
|
|
462
|
+
│ Target LLM — optimised payload only │
|
|
463
|
+
└───────────────────────────────────────────────────────────────────────────────┘
|
|
464
|
+
│
|
|
465
|
+
▼
|
|
466
|
+
┌─────────────────────────────── LOCAL (0 tokens) ──────────────────────────────┐
|
|
467
|
+
│ Post 1 Response Validator MiniLM · check reply addresses intent │
|
|
468
|
+
│ Post 2 Hallucination Flagger 80 heuristic patterns · annotate metadata │
|
|
469
|
+
│ Post 3 Savings Logger token delta · cost · cumulative dashboard │
|
|
470
|
+
└───────────────────────────────────────────────────────────────────────────────┘
|
|
471
|
+
│
|
|
472
|
+
▼
|
|
473
|
+
SlimZeroResult (response + metadata + savings)
|
|
474
|
+
```
|
|
475
|
+
|
|
476
|
+
**Stage 3 is the only non-bypassable stage.** If the rewrite changes meaning (similarity drops below 0.92), the original prompt is sent instead. Savings never come at the cost of correctness.
|
|
477
|
+
|
|
478
|
+
---
|
|
479
|
+
|
|
480
|
+
## Configuration
|
|
481
|
+
|
|
482
|
+
```python
|
|
483
|
+
sz = SlimZero(
|
|
484
|
+
model="claude-sonnet-4-6", # any OpenAI-spec model string
|
|
485
|
+
api_client=None, # pass existing client, or SlimZero creates one
|
|
486
|
+
token_budget=4096, # hard token ceiling for outbound prompt
|
|
487
|
+
sim_threshold=0.92, # semantic guard threshold (min 0.80)
|
|
488
|
+
few_shot_k=3, # max few-shot examples to keep
|
|
489
|
+
history_window=4, # recent turns kept verbatim
|
|
490
|
+
hallucination_guard=True, # enable risk scoring + uncertainty injection
|
|
491
|
+
response_validation=True, # enable post-response intent check
|
|
492
|
+
agent_mode=False, # enable Ralph loop + GSD
|
|
493
|
+
max_agent_steps=20, # circuit breaker: max steps
|
|
494
|
+
max_retries=3, # circuit breaker: retries per sub-task
|
|
495
|
+
drift_threshold=0.75, # semantic drift detection threshold
|
|
496
|
+
dashboard=False, # Rich live terminal dashboard
|
|
497
|
+
log_file=None, # path for JSON structured log
|
|
498
|
+
)
|
|
499
|
+
```
|
|
500
|
+
|
|
501
|
+
---
|
|
502
|
+
|
|
503
|
+
## Fault prevention
|
|
504
|
+
|
|
505
|
+
SlimZero is built around eight layered defences:
|
|
506
|
+
|
|
507
|
+
| Layer | Mechanism | Response |
|
|
508
|
+
|---|---|---|
|
|
509
|
+
| L1 Input validation | Schema check on every call | Raise `SlimZeroInputError` |
|
|
510
|
+
| L2 Semantic gate | Cosine similarity post-rewrite | Reject rewrite, use original |
|
|
511
|
+
| L3 Budget enforcer | Token count check | Trim in priority order |
|
|
512
|
+
| L4 Response gate | Intent similarity post-call | Flag `OFF_TASK` with warning |
|
|
513
|
+
| L5 Hallucination flag | Heuristic pattern scan | Annotate response metadata |
|
|
514
|
+
| L6 Circuit breaker | Step/retry/token budget | Halt loop, checkpoint state |
|
|
515
|
+
| L7 Drift detector | Plan embedding divergence | Re-ground agent, halt if re-drifts |
|
|
516
|
+
| L8 Tool validator | Argument schema check | Reject call, return structured error |
|
|
517
|
+
|
|
518
|
+
Three rules that never change:
|
|
519
|
+
|
|
520
|
+
- SlimZero **never suppresses** an LLM response. Flags are metadata — the response always reaches your code.
|
|
521
|
+
- SlimZero **never blocks** a request. If every stage fails, the original prompt is sent unchanged.
|
|
522
|
+
- The semantic guard **cannot be disabled** — only its threshold can be lowered (minimum 0.80).
|
|
523
|
+
|
|
524
|
+
---
|
|
525
|
+
|
|
526
|
+
## Agent mode — Ralph + GSD
|
|
527
|
+
|
|
528
|
+
`agent_mode=True` activates the full autonomous agent orchestration layer.
|
|
529
|
+
|
|
530
|
+
**GSD (Get-Shit-Done)** decomposes your goal into a directed graph of sub-tasks using one LLM call — the single deliberate exception to the zero-meta-token rule, since goal decomposition requires model reasoning. Each sub-task is checkpointed — if the session dies, it resumes from where it left off.
|
|
531
|
+
|
|
532
|
+
**Ralph** runs each sub-task in an observe → plan → act → reflect loop. SlimZero applies the full compression pipeline to every LLM call inside the loop, so agent steps are as cheap as regular calls.
|
|
533
|
+
|
|
534
|
+
**Circuit breakers** halt the loop if any budget is exhausted (max steps, max retries, max tokens). State is always checkpointed before halting — no work is lost.
|
|
535
|
+
|
|
536
|
+
**Semantic drift detection** watches whether the agent's plan stays aligned with the original goal. If it diverges for three consecutive steps, it is re-grounded. If it diverges again, the loop halts.
|
|
537
|
+
|
|
538
|
+
---
|
|
539
|
+
|
|
540
|
+
## Plugin API
|
|
541
|
+
|
|
542
|
+
Every pipeline stage is a plugin. Add your own compressor without touching SlimZero's core:
|
|
543
|
+
|
|
544
|
+
```python
|
|
545
|
+
from slimzero.plugins import BaseStage, StageInput, StageOutput
|
|
546
|
+
|
|
547
|
+
class MyCompressor(BaseStage):
|
|
548
|
+
name = "my_compressor"
|
|
549
|
+
|
|
550
|
+
def process(self, inp: StageInput) -> StageOutput:
|
|
551
|
+
new_prompt = your_compression_logic(inp.prompt)
|
|
552
|
+
return StageOutput(prompt=new_prompt, modified=True, notes="custom compression")
|
|
553
|
+
|
|
554
|
+
sz = SlimZero(model="...", extra_stages=[MyCompressor()])
|
|
555
|
+
```
|
|
556
|
+
|
|
557
|
+
Plugins auto-discovered via Python entry points (`slimzero.stages` group). Community plugins installable as separate packages.
|
|
558
|
+
|
|
559
|
+
---
|
|
560
|
+
|
|
561
|
+
## Tech stack
|
|
562
|
+
|
|
563
|
+
| Library | Purpose |
|
|
564
|
+
|---|---|
|
|
565
|
+
| spaCy `en_core_web_sm` | Intent extraction |
|
|
566
|
+
| sentence-transformers `all-MiniLM-L6-v2` | Semantic guard + response validation |
|
|
567
|
+
| tiktoken | Token counting (all major models) |
|
|
568
|
+
| transformers + T5-small | Local prompt rewriter |
|
|
569
|
+
| Ollama Python client | Fallback local rewriter |
|
|
570
|
+
| networkx | GSD task graph |
|
|
571
|
+
| Rich | Live dashboard + logging |
|
|
572
|
+
|
|
573
|
+
---
|
|
574
|
+
|
|
575
|
+
## License
|
|
576
|
+
|
|
577
|
+
MIT — use it in anything.
|
|
578
|
+
|
|
579
|
+
---
|
|
580
|
+
|
|
581
|
+
*Built to solve a real problem: every token should do real work.*
|