ak-primus 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ak_primus-0.2.0/LICENSE +21 -0
- ak_primus-0.2.0/PKG-INFO +287 -0
- ak_primus-0.2.0/README.md +231 -0
- ak_primus-0.2.0/ak_primus/__init__.py +8 -0
- ak_primus-0.2.0/ak_primus/audit.py +201 -0
- ak_primus-0.2.0/ak_primus/classifier.py +1428 -0
- ak_primus-0.2.0/ak_primus/layers/__init__.py +102 -0
- ak_primus-0.2.0/ak_primus/layers/cache.py +268 -0
- ak_primus-0.2.0/ak_primus/layers/compression.py +758 -0
- ak_primus-0.2.0/ak_primus/layers/memory.py +537 -0
- ak_primus-0.2.0/ak_primus/layers/metrics.py +316 -0
- ak_primus-0.2.0/ak_primus/layers/native_compress.py +783 -0
- ak_primus-0.2.0/ak_primus/layers/prompt_opt.py +1010 -0
- ak_primus-0.2.0/ak_primus/layers/quality.py +411 -0
- ak_primus-0.2.0/ak_primus/layers/search.py +532 -0
- ak_primus-0.2.0/ak_primus/ml/__init__.py +4 -0
- ak_primus-0.2.0/ak_primus/ml/classifier_ml.py +667 -0
- ak_primus-0.2.0/ak_primus/proxy.py +895 -0
- ak_primus-0.2.0/ak_primus/router.py +447 -0
- ak_primus-0.2.0/ak_primus/server.py +908 -0
- ak_primus-0.2.0/ak_primus/storage/__init__.py +19 -0
- ak_primus-0.2.0/ak_primus/storage/session_store.py +526 -0
- ak_primus-0.2.0/ak_primus/storage/vector_store.py +375 -0
- ak_primus-0.2.0/ak_primus.egg-info/PKG-INFO +287 -0
- ak_primus-0.2.0/ak_primus.egg-info/SOURCES.txt +29 -0
- ak_primus-0.2.0/ak_primus.egg-info/dependency_links.txt +1 -0
- ak_primus-0.2.0/ak_primus.egg-info/entry_points.txt +3 -0
- ak_primus-0.2.0/ak_primus.egg-info/requires.txt +38 -0
- ak_primus-0.2.0/ak_primus.egg-info/top_level.txt +1 -0
- ak_primus-0.2.0/pyproject.toml +99 -0
- ak_primus-0.2.0/setup.cfg +4 -0
ak_primus-0.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 AK-Primus Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
ak_primus-0.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ak-primus
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Intelligent token compression + routing MCP server
|
|
5
|
+
Author: AK-Primus Contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/ak-primus/ak-primus
|
|
8
|
+
Project-URL: Repository, https://github.com/ak-primus/ak-primus
|
|
9
|
+
Project-URL: Issues, https://github.com/ak-primus/ak-primus/issues
|
|
10
|
+
Keywords: mcp,llm,compression,token,optimization,context-window
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Requires-Python: >=3.11
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
License-File: LICENSE
|
|
23
|
+
Requires-Dist: mcp>=1.0.0
|
|
24
|
+
Requires-Dist: tiktoken>=0.7.0
|
|
25
|
+
Requires-Dist: httpx>=0.27.0
|
|
26
|
+
Requires-Dist: pydantic>=2.7.0
|
|
27
|
+
Requires-Dist: anyio>=4.4.0
|
|
28
|
+
Requires-Dist: structlog>=24.0.0
|
|
29
|
+
Provides-Extra: compress
|
|
30
|
+
Requires-Dist: llmlingua>=0.2.0; extra == "compress"
|
|
31
|
+
Requires-Dist: transformers>=4.41.0; extra == "compress"
|
|
32
|
+
Requires-Dist: torch>=2.2.0; extra == "compress"
|
|
33
|
+
Provides-Extra: retrieval
|
|
34
|
+
Requires-Dist: sentence-transformers>=3.0.0; extra == "retrieval"
|
|
35
|
+
Requires-Dist: chromadb>=0.5.0; extra == "retrieval"
|
|
36
|
+
Requires-Dist: numpy>=1.26.0; extra == "retrieval"
|
|
37
|
+
Requires-Dist: scikit-learn>=1.5.0; extra == "retrieval"
|
|
38
|
+
Requires-Dist: openai>=1.30.0; extra == "retrieval"
|
|
39
|
+
Requires-Dist: anthropic>=0.29.0; extra == "retrieval"
|
|
40
|
+
Provides-Extra: optimize
|
|
41
|
+
Requires-Dist: dspy-ai>=2.4.0; extra == "optimize"
|
|
42
|
+
Requires-Dist: openai>=1.30.0; extra == "optimize"
|
|
43
|
+
Requires-Dist: evaluate>=0.4.0; extra == "optimize"
|
|
44
|
+
Provides-Extra: http
|
|
45
|
+
Requires-Dist: uvicorn[standard]>=0.29.0; extra == "http"
|
|
46
|
+
Requires-Dist: starlette>=0.37.0; extra == "http"
|
|
47
|
+
Provides-Extra: dev
|
|
48
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
49
|
+
Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
|
|
50
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == "dev"
|
|
51
|
+
Requires-Dist: ruff>=0.4.0; extra == "dev"
|
|
52
|
+
Requires-Dist: mypy>=1.10.0; extra == "dev"
|
|
53
|
+
Provides-Extra: all
|
|
54
|
+
Requires-Dist: ak-primus[compress,http,optimize,retrieval]; extra == "all"
|
|
55
|
+
Dynamic: license-file
|
|
56
|
+
|
|
57
|
+
# AK-Primus v0.2.0
|
|
58
|
+
|
|
59
|
+
> Intelligent token compression + routing MCP server with adaptive self-healing.
|
|
60
|
+
|
|
61
|
+
Every LLM request passes through a 7-layer pipeline selected by an 8-class hybrid classifier. The right compression and retrieval stack runs automatically for each request type. Nothing is applied blindly. The system gets cheaper the more it is used.
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
Request → Classifier → Router → [Cache | Memory | Compress | Search | Prompt-Opt] → LLM
|
|
65
|
+
↓
|
|
66
|
+
Quality Score ← Response ←────┘
|
|
67
|
+
↓
|
|
68
|
+
Adaptive Profile Update
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## What's new in 0.2.0
|
|
74
|
+
|
|
75
|
+
| Area | Change |
|
|
76
|
+
|---|---|
|
|
77
|
+
| Compression | Expansion guard — never inflates token count; LLMLingua-1 uses GPT-2 (was 7B Llama) |
|
|
78
|
+
| Classifier | ML hybrid (400-example logistic regression + rule fast-path); 88%+ accuracy on 275-scenario suite |
|
|
79
|
+
| Cache | 3-level lookup: SHA-256 exact → ChromaDB HNSW ANN → SQLite cosine fallback |
|
|
80
|
+
| Memory | L1 working memory + L2 episodic (session consolidation) + L3 semantic (cross-session ChromaDB) |
|
|
81
|
+
| DSPy | 4 typed Signatures wired end-to-end; BootstrapFewShot + MIPRO2 with disk cache |
|
|
82
|
+
| Quality | ROUGE-L + BERTScore + LLM-as-judge blended score; `score_async()` variant |
|
|
83
|
+
| Transport | `AK_PRIMUS_TRANSPORT=http` — Starlette ASGI with `/health`, `/ready`, `/metrics`, SSE |
|
|
84
|
+
| Testing | 84 pytest tests + 275-scenario benchmark suite with per-class accuracy CI gate |
|
|
85
|
+
|
|
86
|
+
---
|
|
87
|
+
|
|
88
|
+
## Architecture
|
|
89
|
+
|
|
90
|
+
```
|
|
91
|
+
ak-primus/
|
|
92
|
+
├── core/
|
|
93
|
+
│ └── ak_primus/
|
|
94
|
+
│ ├── classifier.py # 8-class hybrid (rule + ML logistic regression)
|
|
95
|
+
│ ├── router.py # Stack selection per request type
|
|
96
|
+
│ ├── server.py # MCP server — 7 tools, stdio + HTTP/SSE
|
|
97
|
+
│ ├── layers/
|
|
98
|
+
│ │ ├── cache.py # 3-level semantic cache (exact / HNSW / cosine)
|
|
99
|
+
│ │ ├── compression.py # LLMLingua-2, LLMLingua-1, LongLLMLingua, SelectiveContext
|
|
100
|
+
│ │ ├── memory.py # L1 working | L2 episodic | L3 semantic (ChromaDB)
|
|
101
|
+
│ │ ├── metrics.py # tiktoken real token counting + cost accounting
|
|
102
|
+
│ │ ├── prompt_opt.py # DSPy BootstrapFewShot + MIPRO2, OPRO, Medprompt
|
|
103
|
+
│ │ ├── quality.py # ROUGE-L + BERTScore + LLM-as-judge + adaptive profile
|
|
104
|
+
│ │ └── search.py # HyDE, RAPTOR, FLARE, ColBERT retrieval
|
|
105
|
+
│ ├── ml/
|
|
106
|
+
│ │ └── classifier_ml.py # Embedding-based logistic regression (400 training examples)
|
|
107
|
+
│ └── storage/
|
|
108
|
+
│ ├── session_store.py # SQLite WAL — sessions, cache, profiles, memory_facts
|
|
109
|
+
│ └── vector_store.py # ChromaDB HNSW — semantic_cache + memory_facts
|
|
110
|
+
├── extension/
|
|
111
|
+
│ └── src/
|
|
112
|
+
│ ├── extension.ts # VS Code extension entry point
|
|
113
|
+
│ ├── dashboard.ts # Real-time metrics webview
|
|
114
|
+
│ └── mcp-client.ts # MCP stdio bridge
|
|
115
|
+
├── tests/
|
|
116
|
+
│ ├── unit/ # 70 unit tests (all layers)
|
|
117
|
+
│ └── integration/ # 14 integration + benchmark threshold tests
|
|
118
|
+
└── benchmarks/
|
|
119
|
+
└── run_1200_scenarios.py # 275-scenario accuracy + latency suite
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## 7 MCP Tools
|
|
125
|
+
|
|
126
|
+
| Tool | Purpose |
|
|
127
|
+
|---|---|
|
|
128
|
+
| `classify_request` | Detect request type + return recommended stack with expected token reduction |
|
|
129
|
+
| `compress_history` | Apply compression stack to message history; returns compressed messages + savings |
|
|
130
|
+
| `build_context` | HyDE / RAPTOR / FLARE retrieval-augmented context building |
|
|
131
|
+
| `optimize_prompt` | DSPy BootstrapFewShot / OPRO / Medprompt prompt optimisation |
|
|
132
|
+
| `get_token_report` | Real tiktoken metrics, cost savings, session stats |
|
|
133
|
+
| `process_request` | **Master pipeline**: classify → cache → memory → compress → quality → adapt |
|
|
134
|
+
| `report_quality` | Feed quality signal (0–1) back into adaptive compression profile |
|
|
135
|
+
|
|
136
|
+
### `process_request` — master pipeline
|
|
137
|
+
|
|
138
|
+
```json
|
|
139
|
+
{
|
|
140
|
+
"optimized_messages": [...],
|
|
141
|
+
"tokens_before": 1240,
|
|
142
|
+
"tokens_after": 487,
|
|
143
|
+
"tokens_saved": 753,
|
|
144
|
+
"savings_pct": 60.7,
|
|
145
|
+
"request_type": "code",
|
|
146
|
+
"confidence": 0.91,
|
|
147
|
+
"quality_score": 0.876,
|
|
148
|
+
"adapted_ratio": 0.382,
|
|
149
|
+
"cache_hit": false,
|
|
150
|
+
"session_id": "sess-abc123",
|
|
151
|
+
"lifetime_savings": 41250
|
|
152
|
+
}
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
## 8 Request Types
|
|
158
|
+
|
|
159
|
+
| Type | Classifier Trigger | Default Stack |
|
|
160
|
+
|---|---|---|
|
|
161
|
+
| `code` | Code keywords + task verbs + C++/C# detection | SelectiveContext → PrefixCache |
|
|
162
|
+
| `rag_doc` | Documents present + QA-style question | HyDE retrieval + LLMLingua-2 |
|
|
163
|
+
| `agent_session` | Multi-turn history + follow-up phrases | WorkingMemory + L3 semantic |
|
|
164
|
+
| `domain_expert` | Legal / medical / finance domain terms | Medprompt + SelectiveContext |
|
|
165
|
+
| `multi_hop` | "relationship", "compare", "trace" patterns | RAPTOR + ChainOfThought |
|
|
166
|
+
| `math` | Equations, proof, calculate keywords | LLMLingua-1 (formula-aware) |
|
|
167
|
+
| `fixed_template` | Long system prompt (>600 tokens) | PrefixCache (no compression) |
|
|
168
|
+
| `simple_qa` | Short conversational question | Light SelectiveContext |
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Installation
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
# Minimal (core MCP server, no ML)
|
|
176
|
+
pip install ak-primus
|
|
177
|
+
|
|
178
|
+
# Full (all optional groups)
|
|
179
|
+
pip install "ak-primus[all]"
|
|
180
|
+
|
|
181
|
+
# From source
|
|
182
|
+
git clone https://github.com/ak-primus/ak-primus
|
|
183
|
+
pip install -e "ak-primus/core[all]"
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
| Group | Installs | When to use |
|
|
187
|
+
|---|---|---|
|
|
188
|
+
| `compress` | LLMLingua, transformers, torch | Token compression |
|
|
189
|
+
| `retrieval` | sentence-transformers, chromadb, numpy, scikit-learn, openai, anthropic | Semantic cache + search |
|
|
190
|
+
| `optimize` | dspy-ai, openai, evaluate | DSPy + ROUGE/BERTScore |
|
|
191
|
+
| `http` | uvicorn, starlette | HTTP/SSE transport |
|
|
192
|
+
| `dev` | pytest, ruff, mypy | Development |
|
|
193
|
+
|
|
194
|
+
---
|
|
195
|
+
|
|
196
|
+
## Quick Start
|
|
197
|
+
|
|
198
|
+
### Claude Desktop
|
|
199
|
+
|
|
200
|
+
```json
|
|
201
|
+
{
|
|
202
|
+
"mcpServers": {
|
|
203
|
+
"ak-primus": {
|
|
204
|
+
"command": "ak-primus",
|
|
205
|
+
"args": ["serve"],
|
|
206
|
+
"env": {
|
|
207
|
+
"ANTHROPIC_API_KEY": "sk-ant-..."
|
|
208
|
+
}
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
}
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
### Docker
|
|
215
|
+
|
|
216
|
+
```bash
|
|
217
|
+
# stdio (MCP)
|
|
218
|
+
docker build -f Dockerfile.akprimus --target runtime -t ak-primus:0.2.0 .
|
|
219
|
+
docker run -i --rm -e ANTHROPIC_API_KEY="$ANTHROPIC_API_KEY" -v ak_data:/data ak-primus:0.2.0
|
|
220
|
+
|
|
221
|
+
# HTTP/SSE with health probes
|
|
222
|
+
docker compose --profile http up
|
|
223
|
+
curl http://localhost:8080/health
|
|
224
|
+
```
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
## Self-Healing Compression
|
|
229
|
+
|
|
230
|
+
The adaptive profile tunes compression ratios over time per (workspace, request_type):
|
|
231
|
+
|
|
232
|
+
```
|
|
233
|
+
quality ≥ 0.85 → compress more aggressively next time
|
|
234
|
+
quality < 0.70 → back off compression
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
After ~50 samples per type, the ratio converges to the Pareto-optimal point.
|
|
238
|
+
|
|
239
|
+
```bash
|
|
240
|
+
# Feed quality signal back after reviewing LLM response
|
|
241
|
+
curl -s http://localhost:8080/mcp -d '{"tool":"report_quality","quality_score":0.92,"request_type":"code"}'
|
|
242
|
+
```
|
|
243
|
+
|
|
244
|
+
---
|
|
245
|
+
|
|
246
|
+
## Running Tests
|
|
247
|
+
|
|
248
|
+
```bash
|
|
249
|
+
pytest tests/ -q # 84 tests
|
|
250
|
+
python benchmarks/run_1200_scenarios.py --quick # accuracy + latency
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
### Benchmark (v0.2.0, 275 scenarios)
|
|
254
|
+
|
|
255
|
+
| Class | Accuracy |
|
|
256
|
+
|---|---|
|
|
257
|
+
| `math` | 96% |
|
|
258
|
+
| `fixed_template` | 100% |
|
|
259
|
+
| `agent_session` | 92% |
|
|
260
|
+
| `domain_expert` | 90% |
|
|
261
|
+
| `multi_hop` | 90% |
|
|
262
|
+
| `simple_qa` | 88% |
|
|
263
|
+
| `code` | 82% |
|
|
264
|
+
| `rag_doc` | 80% |
|
|
265
|
+
| **Overall** | **~88%** |
|
|
266
|
+
|
|
267
|
+
Classifier latency: p50 < 1ms, p95 < 5ms (rule-based path, no model load).
|
|
268
|
+
|
|
269
|
+
---
|
|
270
|
+
|
|
271
|
+
## Environment Variables
|
|
272
|
+
|
|
273
|
+
| Variable | Default | Description |
|
|
274
|
+
|---|---|---|
|
|
275
|
+
| `AK_PRIMUS_MODEL` | `claude-sonnet-4-6` | LLM model for DSPy / OPRO / judge |
|
|
276
|
+
| `AK_PRIMUS_TRANSPORT` | `stdio` | `stdio` or `http` |
|
|
277
|
+
| `AK_PRIMUS_HOST` | `127.0.0.1` | HTTP bind host |
|
|
278
|
+
| `AK_PRIMUS_PORT` | `8080` | HTTP bind port |
|
|
279
|
+
| `AK_PRIMUS_DB` | `~/.ak_primus/store.db` | SQLite path |
|
|
280
|
+
| `AK_PRIMUS_DSPY_CACHE` | `~/.ak_primus/dspy_cache` | DSPy program cache |
|
|
281
|
+
|
|
282
|
+
---
|
|
283
|
+
|
|
284
|
+
## License
|
|
285
|
+
|
|
286
|
+
MIT
|
|
287
|
+
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
# AK-Primus v0.2.0
|
|
2
|
+
|
|
3
|
+
> Intelligent token compression + routing MCP server with adaptive self-healing.
|
|
4
|
+
|
|
5
|
+
Every LLM request passes through a 7-layer pipeline selected by an 8-class hybrid classifier. The right compression and retrieval stack runs automatically for each request type. Nothing is applied blindly. The system gets cheaper the more it is used.
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
Request → Classifier → Router → [Cache | Memory | Compress | Search | Prompt-Opt] → LLM
|
|
9
|
+
↓
|
|
10
|
+
Quality Score ← Response ←────┘
|
|
11
|
+
↓
|
|
12
|
+
Adaptive Profile Update
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## What's new in 0.2.0
|
|
18
|
+
|
|
19
|
+
| Area | Change |
|
|
20
|
+
|---|---|
|
|
21
|
+
| Compression | Expansion guard — never inflates token count; LLMLingua-1 uses GPT-2 (was 7B Llama) |
|
|
22
|
+
| Classifier | ML hybrid (400-example logistic regression + rule fast-path); 88%+ accuracy on 275-scenario suite |
|
|
23
|
+
| Cache | 3-level lookup: SHA-256 exact → ChromaDB HNSW ANN → SQLite cosine fallback |
|
|
24
|
+
| Memory | L1 working memory + L2 episodic (session consolidation) + L3 semantic (cross-session ChromaDB) |
|
|
25
|
+
| DSPy | 4 typed Signatures wired end-to-end; BootstrapFewShot + MIPRO2 with disk cache |
|
|
26
|
+
| Quality | ROUGE-L + BERTScore + LLM-as-judge blended score; `score_async()` variant |
|
|
27
|
+
| Transport | `AK_PRIMUS_TRANSPORT=http` — Starlette ASGI with `/health`, `/ready`, `/metrics`, SSE |
|
|
28
|
+
| Testing | 84 pytest tests + 275-scenario benchmark suite with per-class accuracy CI gate |
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## Architecture
|
|
33
|
+
|
|
34
|
+
```
|
|
35
|
+
ak-primus/
|
|
36
|
+
├── core/
|
|
37
|
+
│ └── ak_primus/
|
|
38
|
+
│ ├── classifier.py # 8-class hybrid (rule + ML logistic regression)
|
|
39
|
+
│ ├── router.py # Stack selection per request type
|
|
40
|
+
│ ├── server.py # MCP server — 7 tools, stdio + HTTP/SSE
|
|
41
|
+
│ ├── layers/
|
|
42
|
+
│ │ ├── cache.py # 3-level semantic cache (exact / HNSW / cosine)
|
|
43
|
+
│ │ ├── compression.py # LLMLingua-2, LLMLingua-1, LongLLMLingua, SelectiveContext
|
|
44
|
+
│ │ ├── memory.py # L1 working | L2 episodic | L3 semantic (ChromaDB)
|
|
45
|
+
│ │ ├── metrics.py # tiktoken real token counting + cost accounting
|
|
46
|
+
│ │ ├── prompt_opt.py # DSPy BootstrapFewShot + MIPRO2, OPRO, Medprompt
|
|
47
|
+
│ │ ├── quality.py # ROUGE-L + BERTScore + LLM-as-judge + adaptive profile
|
|
48
|
+
│ │ └── search.py # HyDE, RAPTOR, FLARE, ColBERT retrieval
|
|
49
|
+
│ ├── ml/
|
|
50
|
+
│ │ └── classifier_ml.py # Embedding-based logistic regression (400 training examples)
|
|
51
|
+
│ └── storage/
|
|
52
|
+
│ ├── session_store.py # SQLite WAL — sessions, cache, profiles, memory_facts
|
|
53
|
+
│ └── vector_store.py # ChromaDB HNSW — semantic_cache + memory_facts
|
|
54
|
+
├── extension/
|
|
55
|
+
│ └── src/
|
|
56
|
+
│ ├── extension.ts # VS Code extension entry point
|
|
57
|
+
│ ├── dashboard.ts # Real-time metrics webview
|
|
58
|
+
│ └── mcp-client.ts # MCP stdio bridge
|
|
59
|
+
├── tests/
|
|
60
|
+
│ ├── unit/ # 70 unit tests (all layers)
|
|
61
|
+
│ └── integration/ # 14 integration + benchmark threshold tests
|
|
62
|
+
└── benchmarks/
|
|
63
|
+
└── run_1200_scenarios.py # 275-scenario accuracy + latency suite
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## 7 MCP Tools
|
|
69
|
+
|
|
70
|
+
| Tool | Purpose |
|
|
71
|
+
|---|---|
|
|
72
|
+
| `classify_request` | Detect request type + return recommended stack with expected token reduction |
|
|
73
|
+
| `compress_history` | Apply compression stack to message history; returns compressed messages + savings |
|
|
74
|
+
| `build_context` | HyDE / RAPTOR / FLARE retrieval-augmented context building |
|
|
75
|
+
| `optimize_prompt` | DSPy BootstrapFewShot / OPRO / Medprompt prompt optimisation |
|
|
76
|
+
| `get_token_report` | Real tiktoken metrics, cost savings, session stats |
|
|
77
|
+
| `process_request` | **Master pipeline**: classify → cache → memory → compress → quality → adapt |
|
|
78
|
+
| `report_quality` | Feed quality signal (0–1) back into adaptive compression profile |
|
|
79
|
+
|
|
80
|
+
### `process_request` — master pipeline
|
|
81
|
+
|
|
82
|
+
```json
|
|
83
|
+
{
|
|
84
|
+
"optimized_messages": [...],
|
|
85
|
+
"tokens_before": 1240,
|
|
86
|
+
"tokens_after": 487,
|
|
87
|
+
"tokens_saved": 753,
|
|
88
|
+
"savings_pct": 60.7,
|
|
89
|
+
"request_type": "code",
|
|
90
|
+
"confidence": 0.91,
|
|
91
|
+
"quality_score": 0.876,
|
|
92
|
+
"adapted_ratio": 0.382,
|
|
93
|
+
"cache_hit": false,
|
|
94
|
+
"session_id": "sess-abc123",
|
|
95
|
+
"lifetime_savings": 41250
|
|
96
|
+
}
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
---
|
|
100
|
+
|
|
101
|
+
## 8 Request Types
|
|
102
|
+
|
|
103
|
+
| Type | Classifier Trigger | Default Stack |
|
|
104
|
+
|---|---|---|
|
|
105
|
+
| `code` | Code keywords + task verbs + C++/C# detection | SelectiveContext → PrefixCache |
|
|
106
|
+
| `rag_doc` | Documents present + QA-style question | HyDE retrieval + LLMLingua-2 |
|
|
107
|
+
| `agent_session` | Multi-turn history + follow-up phrases | WorkingMemory + L3 semantic |
|
|
108
|
+
| `domain_expert` | Legal / medical / finance domain terms | Medprompt + SelectiveContext |
|
|
109
|
+
| `multi_hop` | "relationship", "compare", "trace" patterns | RAPTOR + ChainOfThought |
|
|
110
|
+
| `math` | Equations, proof, calculate keywords | LLMLingua-1 (formula-aware) |
|
|
111
|
+
| `fixed_template` | Long system prompt (>600 tokens) | PrefixCache (no compression) |
|
|
112
|
+
| `simple_qa` | Short conversational question | Light SelectiveContext |
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## Installation
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
# Minimal (core MCP server, no ML)
|
|
120
|
+
pip install ak-primus
|
|
121
|
+
|
|
122
|
+
# Full (all optional groups)
|
|
123
|
+
pip install "ak-primus[all]"
|
|
124
|
+
|
|
125
|
+
# From source
|
|
126
|
+
git clone https://github.com/ak-primus/ak-primus
|
|
127
|
+
pip install -e "ak-primus/core[all]"
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
| Group | Installs | When to use |
|
|
131
|
+
|---|---|---|
|
|
132
|
+
| `compress` | LLMLingua, transformers, torch | Token compression |
|
|
133
|
+
| `retrieval` | sentence-transformers, chromadb, numpy, scikit-learn, openai, anthropic | Semantic cache + search |
|
|
134
|
+
| `optimize` | dspy-ai, openai, evaluate | DSPy + ROUGE/BERTScore |
|
|
135
|
+
| `http` | uvicorn, starlette | HTTP/SSE transport |
|
|
136
|
+
| `dev` | pytest, ruff, mypy | Development |
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
## Quick Start
|
|
141
|
+
|
|
142
|
+
### Claude Desktop
|
|
143
|
+
|
|
144
|
+
```json
|
|
145
|
+
{
|
|
146
|
+
"mcpServers": {
|
|
147
|
+
"ak-primus": {
|
|
148
|
+
"command": "ak-primus",
|
|
149
|
+
"args": ["serve"],
|
|
150
|
+
"env": {
|
|
151
|
+
"ANTHROPIC_API_KEY": "sk-ant-..."
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Docker
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
# stdio (MCP)
|
|
162
|
+
docker build -f Dockerfile.akprimus --target runtime -t ak-primus:0.2.0 .
|
|
163
|
+
docker run -i --rm -e ANTHROPIC_API_KEY="$ANTHROPIC_API_KEY" -v ak_data:/data ak-primus:0.2.0
|
|
164
|
+
|
|
165
|
+
# HTTP/SSE with health probes
|
|
166
|
+
docker compose --profile http up
|
|
167
|
+
curl http://localhost:8080/health
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Self-Healing Compression
|
|
173
|
+
|
|
174
|
+
The adaptive profile tunes compression ratios over time per (workspace, request_type):
|
|
175
|
+
|
|
176
|
+
```
|
|
177
|
+
quality ≥ 0.85 → compress more aggressively next time
|
|
178
|
+
quality < 0.70 → back off compression
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
After ~50 samples per type, the ratio converges to the Pareto-optimal point.
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
# Feed quality signal back after reviewing LLM response
|
|
185
|
+
curl -s http://localhost:8080/mcp -d '{"tool":"report_quality","quality_score":0.92,"request_type":"code"}'
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
---
|
|
189
|
+
|
|
190
|
+
## Running Tests
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
pytest tests/ -q # 84 tests
|
|
194
|
+
python benchmarks/run_1200_scenarios.py --quick # accuracy + latency
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
### Benchmark (v0.2.0, 275 scenarios)
|
|
198
|
+
|
|
199
|
+
| Class | Accuracy |
|
|
200
|
+
|---|---|
|
|
201
|
+
| `math` | 96% |
|
|
202
|
+
| `fixed_template` | 100% |
|
|
203
|
+
| `agent_session` | 92% |
|
|
204
|
+
| `domain_expert` | 90% |
|
|
205
|
+
| `multi_hop` | 90% |
|
|
206
|
+
| `simple_qa` | 88% |
|
|
207
|
+
| `code` | 82% |
|
|
208
|
+
| `rag_doc` | 80% |
|
|
209
|
+
| **Overall** | **~88%** |
|
|
210
|
+
|
|
211
|
+
Classifier latency: p50 < 1ms, p95 < 5ms (rule-based path, no model load).
|
|
212
|
+
|
|
213
|
+
---
|
|
214
|
+
|
|
215
|
+
## Environment Variables
|
|
216
|
+
|
|
217
|
+
| Variable | Default | Description |
|
|
218
|
+
|---|---|---|
|
|
219
|
+
| `AK_PRIMUS_MODEL` | `claude-sonnet-4-6` | LLM model for DSPy / OPRO / judge |
|
|
220
|
+
| `AK_PRIMUS_TRANSPORT` | `stdio` | `stdio` or `http` |
|
|
221
|
+
| `AK_PRIMUS_HOST` | `127.0.0.1` | HTTP bind host |
|
|
222
|
+
| `AK_PRIMUS_PORT` | `8080` | HTTP bind port |
|
|
223
|
+
| `AK_PRIMUS_DB` | `~/.ak_primus/store.db` | SQLite path |
|
|
224
|
+
| `AK_PRIMUS_DSPY_CACHE` | `~/.ak_primus/dspy_cache` | DSPy program cache |
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
228
|
+
## License
|
|
229
|
+
|
|
230
|
+
MIT
|
|
231
|
+
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AK-Primus — Intelligent token compression & routing MCP server.
|
|
3
|
+
"""
|
|
4
|
+
from .classifier import RequestClassifier, RequestType
|
|
5
|
+
from .router import StackRouter, CompressionStack
|
|
6
|
+
|
|
7
|
+
__version__ = "0.2.0"
|
|
8
|
+
__all__ = ["RequestClassifier", "RequestType", "StackRouter", "CompressionStack"]
|