ak-primus 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. ak_primus-0.2.0/LICENSE +21 -0
  2. ak_primus-0.2.0/PKG-INFO +287 -0
  3. ak_primus-0.2.0/README.md +231 -0
  4. ak_primus-0.2.0/ak_primus/__init__.py +8 -0
  5. ak_primus-0.2.0/ak_primus/audit.py +201 -0
  6. ak_primus-0.2.0/ak_primus/classifier.py +1428 -0
  7. ak_primus-0.2.0/ak_primus/layers/__init__.py +102 -0
  8. ak_primus-0.2.0/ak_primus/layers/cache.py +268 -0
  9. ak_primus-0.2.0/ak_primus/layers/compression.py +758 -0
  10. ak_primus-0.2.0/ak_primus/layers/memory.py +537 -0
  11. ak_primus-0.2.0/ak_primus/layers/metrics.py +316 -0
  12. ak_primus-0.2.0/ak_primus/layers/native_compress.py +783 -0
  13. ak_primus-0.2.0/ak_primus/layers/prompt_opt.py +1010 -0
  14. ak_primus-0.2.0/ak_primus/layers/quality.py +411 -0
  15. ak_primus-0.2.0/ak_primus/layers/search.py +532 -0
  16. ak_primus-0.2.0/ak_primus/ml/__init__.py +4 -0
  17. ak_primus-0.2.0/ak_primus/ml/classifier_ml.py +667 -0
  18. ak_primus-0.2.0/ak_primus/proxy.py +895 -0
  19. ak_primus-0.2.0/ak_primus/router.py +447 -0
  20. ak_primus-0.2.0/ak_primus/server.py +908 -0
  21. ak_primus-0.2.0/ak_primus/storage/__init__.py +19 -0
  22. ak_primus-0.2.0/ak_primus/storage/session_store.py +526 -0
  23. ak_primus-0.2.0/ak_primus/storage/vector_store.py +375 -0
  24. ak_primus-0.2.0/ak_primus.egg-info/PKG-INFO +287 -0
  25. ak_primus-0.2.0/ak_primus.egg-info/SOURCES.txt +29 -0
  26. ak_primus-0.2.0/ak_primus.egg-info/dependency_links.txt +1 -0
  27. ak_primus-0.2.0/ak_primus.egg-info/entry_points.txt +3 -0
  28. ak_primus-0.2.0/ak_primus.egg-info/requires.txt +38 -0
  29. ak_primus-0.2.0/ak_primus.egg-info/top_level.txt +1 -0
  30. ak_primus-0.2.0/pyproject.toml +99 -0
  31. ak_primus-0.2.0/setup.cfg +4 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 AK-Primus Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,287 @@
1
+ Metadata-Version: 2.4
2
+ Name: ak-primus
3
+ Version: 0.2.0
4
+ Summary: Intelligent token compression + routing MCP server
5
+ Author: AK-Primus Contributors
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/ak-primus/ak-primus
8
+ Project-URL: Repository, https://github.com/ak-primus/ak-primus
9
+ Project-URL: Issues, https://github.com/ak-primus/ak-primus/issues
10
+ Keywords: mcp,llm,compression,token,optimization,context-window
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Requires-Python: >=3.11
21
+ Description-Content-Type: text/markdown
22
+ License-File: LICENSE
23
+ Requires-Dist: mcp>=1.0.0
24
+ Requires-Dist: tiktoken>=0.7.0
25
+ Requires-Dist: httpx>=0.27.0
26
+ Requires-Dist: pydantic>=2.7.0
27
+ Requires-Dist: anyio>=4.4.0
28
+ Requires-Dist: structlog>=24.0.0
29
+ Provides-Extra: compress
30
+ Requires-Dist: llmlingua>=0.2.0; extra == "compress"
31
+ Requires-Dist: transformers>=4.41.0; extra == "compress"
32
+ Requires-Dist: torch>=2.2.0; extra == "compress"
33
+ Provides-Extra: retrieval
34
+ Requires-Dist: sentence-transformers>=3.0.0; extra == "retrieval"
35
+ Requires-Dist: chromadb>=0.5.0; extra == "retrieval"
36
+ Requires-Dist: numpy>=1.26.0; extra == "retrieval"
37
+ Requires-Dist: scikit-learn>=1.5.0; extra == "retrieval"
38
+ Requires-Dist: openai>=1.30.0; extra == "retrieval"
39
+ Requires-Dist: anthropic>=0.29.0; extra == "retrieval"
40
+ Provides-Extra: optimize
41
+ Requires-Dist: dspy-ai>=2.4.0; extra == "optimize"
42
+ Requires-Dist: openai>=1.30.0; extra == "optimize"
43
+ Requires-Dist: evaluate>=0.4.0; extra == "optimize"
44
+ Provides-Extra: http
45
+ Requires-Dist: uvicorn[standard]>=0.29.0; extra == "http"
46
+ Requires-Dist: starlette>=0.37.0; extra == "http"
47
+ Provides-Extra: dev
48
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
49
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
50
+ Requires-Dist: pytest-cov>=5.0.0; extra == "dev"
51
+ Requires-Dist: ruff>=0.4.0; extra == "dev"
52
+ Requires-Dist: mypy>=1.10.0; extra == "dev"
53
+ Provides-Extra: all
54
+ Requires-Dist: ak-primus[compress,http,optimize,retrieval]; extra == "all"
55
+ Dynamic: license-file
56
+
57
+ # AK-Primus v0.2.0
58
+
59
+ > Intelligent token compression + routing MCP server with adaptive self-healing.
60
+
61
+ Every LLM request passes through a 7-layer pipeline selected by an 8-class hybrid classifier. The right compression and retrieval stack runs automatically for each request type. Nothing is applied blindly. The system gets cheaper the more it is used.
62
+
63
+ ```
64
+ Request → Classifier → Router → [Cache | Memory | Compress | Search | Prompt-Opt] → LLM
65
+
66
+ Quality Score ← Response ←────┘
67
+
68
+ Adaptive Profile Update
69
+ ```
70
+
71
+ ---
72
+
73
+ ## What's new in 0.2.0
74
+
75
+ | Area | Change |
76
+ |---|---|
77
+ | Compression | Expansion guard — never inflates token count; LLMLingua-1 uses GPT-2 (was 7B Llama) |
78
+ | Classifier | ML hybrid (400-example logistic regression + rule fast-path); 88%+ accuracy on 275-scenario suite |
79
+ | Cache | 3-level lookup: SHA-256 exact → ChromaDB HNSW ANN → SQLite cosine fallback |
80
+ | Memory | L1 working memory + L2 episodic (session consolidation) + L3 semantic (cross-session ChromaDB) |
81
+ | DSPy | 4 typed Signatures wired end-to-end; BootstrapFewShot + MIPRO2 with disk cache |
82
+ | Quality | ROUGE-L + BERTScore + LLM-as-judge blended score; `score_async()` variant |
83
+ | Transport | `AK_PRIMUS_TRANSPORT=http` — Starlette ASGI with `/health`, `/ready`, `/metrics`, SSE |
84
+ | Testing | 84 pytest tests + 275-scenario benchmark suite with per-class accuracy CI gate |
85
+
86
+ ---
87
+
88
+ ## Architecture
89
+
90
+ ```
91
+ ak-primus/
92
+ ├── core/
93
+ │ └── ak_primus/
94
+ │ ├── classifier.py # 8-class hybrid (rule + ML logistic regression)
95
+ │ ├── router.py # Stack selection per request type
96
+ │ ├── server.py # MCP server — 7 tools, stdio + HTTP/SSE
97
+ │ ├── layers/
98
+ │ │ ├── cache.py # 3-level semantic cache (exact / HNSW / cosine)
99
+ │ │ ├── compression.py # LLMLingua-2, LLMLingua-1, LongLLMLingua, SelectiveContext
100
+ │ │ ├── memory.py # L1 working | L2 episodic | L3 semantic (ChromaDB)
101
+ │ │ ├── metrics.py # tiktoken real token counting + cost accounting
102
+ │ │ ├── prompt_opt.py # DSPy BootstrapFewShot + MIPRO2, OPRO, Medprompt
103
+ │ │ ├── quality.py # ROUGE-L + BERTScore + LLM-as-judge + adaptive profile
104
+ │ │ └── search.py # HyDE, RAPTOR, FLARE, ColBERT retrieval
105
+ │ ├── ml/
106
+ │ │ └── classifier_ml.py # Embedding-based logistic regression (400 training examples)
107
+ │ └── storage/
108
+ │ ├── session_store.py # SQLite WAL — sessions, cache, profiles, memory_facts
109
+ │ └── vector_store.py # ChromaDB HNSW — semantic_cache + memory_facts
110
+ ├── extension/
111
+ │ └── src/
112
+ │ ├── extension.ts # VS Code extension entry point
113
+ │ ├── dashboard.ts # Real-time metrics webview
114
+ │ └── mcp-client.ts # MCP stdio bridge
115
+ ├── tests/
116
+ │ ├── unit/ # 70 unit tests (all layers)
117
+ │ └── integration/ # 14 integration + benchmark threshold tests
118
+ └── benchmarks/
119
+ └── run_1200_scenarios.py # 275-scenario accuracy + latency suite
120
+ ```
121
+
122
+ ---
123
+
124
+ ## 7 MCP Tools
125
+
126
+ | Tool | Purpose |
127
+ |---|---|
128
+ | `classify_request` | Detect request type + return recommended stack with expected token reduction |
129
+ | `compress_history` | Apply compression stack to message history; returns compressed messages + savings |
130
+ | `build_context` | HyDE / RAPTOR / FLARE retrieval-augmented context building |
131
+ | `optimize_prompt` | DSPy BootstrapFewShot / OPRO / Medprompt prompt optimisation |
132
+ | `get_token_report` | Real tiktoken metrics, cost savings, session stats |
133
+ | `process_request` | **Master pipeline**: classify → cache → memory → compress → quality → adapt |
134
+ | `report_quality` | Feed quality signal (0–1) back into adaptive compression profile |
135
+
136
+ ### `process_request` — master pipeline
137
+
138
+ ```json
139
+ {
140
+ "optimized_messages": [...],
141
+ "tokens_before": 1240,
142
+ "tokens_after": 487,
143
+ "tokens_saved": 753,
144
+ "savings_pct": 60.7,
145
+ "request_type": "code",
146
+ "confidence": 0.91,
147
+ "quality_score": 0.876,
148
+ "adapted_ratio": 0.382,
149
+ "cache_hit": false,
150
+ "session_id": "sess-abc123",
151
+ "lifetime_savings": 41250
152
+ }
153
+ ```
154
+
155
+ ---
156
+
157
+ ## 8 Request Types
158
+
159
+ | Type | Classifier Trigger | Default Stack |
160
+ |---|---|---|
161
+ | `code` | Code keywords + task verbs + C++/C# detection | SelectiveContext → PrefixCache |
162
+ | `rag_doc` | Documents present + QA-style question | HyDE retrieval + LLMLingua-2 |
163
+ | `agent_session` | Multi-turn history + follow-up phrases | WorkingMemory + L3 semantic |
164
+ | `domain_expert` | Legal / medical / finance domain terms | Medprompt + SelectiveContext |
165
+ | `multi_hop` | "relationship", "compare", "trace" patterns | RAPTOR + ChainOfThought |
166
+ | `math` | Equations, proof, calculate keywords | LLMLingua-1 (formula-aware) |
167
+ | `fixed_template` | Long system prompt (>600 tokens) | PrefixCache (no compression) |
168
+ | `simple_qa` | Short conversational question | Light SelectiveContext |
169
+
170
+ ---
171
+
172
+ ## Installation
173
+
174
+ ```bash
175
+ # Minimal (core MCP server, no ML)
176
+ pip install ak-primus
177
+
178
+ # Full (all optional groups)
179
+ pip install "ak-primus[all]"
180
+
181
+ # From source
182
+ git clone https://github.com/ak-primus/ak-primus
183
+ pip install -e "ak-primus/core[all]"
184
+ ```
185
+
186
+ | Group | Installs | When to use |
187
+ |---|---|---|
188
+ | `compress` | LLMLingua, transformers, torch | Token compression |
189
+ | `retrieval` | sentence-transformers, chromadb, numpy, scikit-learn, openai, anthropic | Semantic cache + search |
190
+ | `optimize` | dspy-ai, openai, evaluate | DSPy + ROUGE/BERTScore |
191
+ | `http` | uvicorn, starlette | HTTP/SSE transport |
192
+ | `dev` | pytest, ruff, mypy | Development |
193
+
194
+ ---
195
+
196
+ ## Quick Start
197
+
198
+ ### Claude Desktop
199
+
200
+ ```json
201
+ {
202
+ "mcpServers": {
203
+ "ak-primus": {
204
+ "command": "ak-primus",
205
+ "args": ["serve"],
206
+ "env": {
207
+ "ANTHROPIC_API_KEY": "sk-ant-..."
208
+ }
209
+ }
210
+ }
211
+ }
212
+ ```
213
+
214
+ ### Docker
215
+
216
+ ```bash
217
+ # stdio (MCP)
218
+ docker build -f Dockerfile.akprimus --target runtime -t ak-primus:0.2.0 .
219
+ docker run -i --rm -e ANTHROPIC_API_KEY="$ANTHROPIC_API_KEY" -v ak_data:/data ak-primus:0.2.0
220
+
221
+ # HTTP/SSE with health probes
222
+ docker compose --profile http up
223
+ curl http://localhost:8080/health
224
+ ```
225
+
226
+ ---
227
+
228
+ ## Self-Healing Compression
229
+
230
+ The adaptive profile tunes compression ratios over time per (workspace, request_type):
231
+
232
+ ```
233
+ quality ≥ 0.85 → compress more aggressively next time
234
+ quality < 0.70 → back off compression
235
+ ```
236
+
237
+ After ~50 samples per type, the ratio converges to the Pareto-optimal point.
238
+
239
+ ```bash
240
+ # Feed quality signal back after reviewing LLM response
241
+ curl -s http://localhost:8080/mcp -d '{"tool":"report_quality","quality_score":0.92,"request_type":"code"}'
242
+ ```
243
+
244
+ ---
245
+
246
+ ## Running Tests
247
+
248
+ ```bash
249
+ pytest tests/ -q # 84 tests
250
+ python benchmarks/run_1200_scenarios.py --quick # accuracy + latency
251
+ ```
252
+
253
+ ### Benchmark (v0.2.0, 275 scenarios)
254
+
255
+ | Class | Accuracy |
256
+ |---|---|
257
+ | `math` | 96% |
258
+ | `fixed_template` | 100% |
259
+ | `agent_session` | 92% |
260
+ | `domain_expert` | 90% |
261
+ | `multi_hop` | 90% |
262
+ | `simple_qa` | 88% |
263
+ | `code` | 82% |
264
+ | `rag_doc` | 80% |
265
+ | **Overall** | **~88%** |
266
+
267
+ Classifier latency: p50 < 1ms, p95 < 5ms (rule-based path, no model load).
268
+
269
+ ---
270
+
271
+ ## Environment Variables
272
+
273
+ | Variable | Default | Description |
274
+ |---|---|---|
275
+ | `AK_PRIMUS_MODEL` | `claude-sonnet-4-6` | LLM model for DSPy / OPRO / judge |
276
+ | `AK_PRIMUS_TRANSPORT` | `stdio` | `stdio` or `http` |
277
+ | `AK_PRIMUS_HOST` | `127.0.0.1` | HTTP bind host |
278
+ | `AK_PRIMUS_PORT` | `8080` | HTTP bind port |
279
+ | `AK_PRIMUS_DB` | `~/.ak_primus/store.db` | SQLite path |
280
+ | `AK_PRIMUS_DSPY_CACHE` | `~/.ak_primus/dspy_cache` | DSPy program cache |
281
+
282
+ ---
283
+
284
+ ## License
285
+
286
+ MIT
287
+
@@ -0,0 +1,231 @@
1
+ # AK-Primus v0.2.0
2
+
3
+ > Intelligent token compression + routing MCP server with adaptive self-healing.
4
+
5
+ Every LLM request passes through a 7-layer pipeline selected by an 8-class hybrid classifier. The right compression and retrieval stack runs automatically for each request type. Nothing is applied blindly. The system gets cheaper the more it is used.
6
+
7
+ ```
8
+ Request → Classifier → Router → [Cache | Memory | Compress | Search | Prompt-Opt] → LLM
9
+
10
+ Quality Score ← Response ←────┘
11
+
12
+ Adaptive Profile Update
13
+ ```
14
+
15
+ ---
16
+
17
+ ## What's new in 0.2.0
18
+
19
+ | Area | Change |
20
+ |---|---|
21
+ | Compression | Expansion guard — never inflates token count; LLMLingua-1 uses GPT-2 (was 7B Llama) |
22
+ | Classifier | ML hybrid (400-example logistic regression + rule fast-path); 88%+ accuracy on 275-scenario suite |
23
+ | Cache | 3-level lookup: SHA-256 exact → ChromaDB HNSW ANN → SQLite cosine fallback |
24
+ | Memory | L1 working memory + L2 episodic (session consolidation) + L3 semantic (cross-session ChromaDB) |
25
+ | DSPy | 4 typed Signatures wired end-to-end; BootstrapFewShot + MIPRO2 with disk cache |
26
+ | Quality | ROUGE-L + BERTScore + LLM-as-judge blended score; `score_async()` variant |
27
+ | Transport | `AK_PRIMUS_TRANSPORT=http` — Starlette ASGI with `/health`, `/ready`, `/metrics`, SSE |
28
+ | Testing | 84 pytest tests + 275-scenario benchmark suite with per-class accuracy CI gate |
29
+
30
+ ---
31
+
32
+ ## Architecture
33
+
34
+ ```
35
+ ak-primus/
36
+ ├── core/
37
+ │ └── ak_primus/
38
+ │ ├── classifier.py # 8-class hybrid (rule + ML logistic regression)
39
+ │ ├── router.py # Stack selection per request type
40
+ │ ├── server.py # MCP server — 7 tools, stdio + HTTP/SSE
41
+ │ ├── layers/
42
+ │ │ ├── cache.py # 3-level semantic cache (exact / HNSW / cosine)
43
+ │ │ ├── compression.py # LLMLingua-2, LLMLingua-1, LongLLMLingua, SelectiveContext
44
+ │ │ ├── memory.py # L1 working | L2 episodic | L3 semantic (ChromaDB)
45
+ │ │ ├── metrics.py # tiktoken real token counting + cost accounting
46
+ │ │ ├── prompt_opt.py # DSPy BootstrapFewShot + MIPRO2, OPRO, Medprompt
47
+ │ │ ├── quality.py # ROUGE-L + BERTScore + LLM-as-judge + adaptive profile
48
+ │ │ └── search.py # HyDE, RAPTOR, FLARE, ColBERT retrieval
49
+ │ ├── ml/
50
+ │ │ └── classifier_ml.py # Embedding-based logistic regression (400 training examples)
51
+ │ └── storage/
52
+ │ ├── session_store.py # SQLite WAL — sessions, cache, profiles, memory_facts
53
+ │ └── vector_store.py # ChromaDB HNSW — semantic_cache + memory_facts
54
+ ├── extension/
55
+ │ └── src/
56
+ │ ├── extension.ts # VS Code extension entry point
57
+ │ ├── dashboard.ts # Real-time metrics webview
58
+ │ └── mcp-client.ts # MCP stdio bridge
59
+ ├── tests/
60
+ │ ├── unit/ # 70 unit tests (all layers)
61
+ │ └── integration/ # 14 integration + benchmark threshold tests
62
+ └── benchmarks/
63
+ └── run_1200_scenarios.py # 275-scenario accuracy + latency suite
64
+ ```
65
+
66
+ ---
67
+
68
+ ## 7 MCP Tools
69
+
70
+ | Tool | Purpose |
71
+ |---|---|
72
+ | `classify_request` | Detect request type + return recommended stack with expected token reduction |
73
+ | `compress_history` | Apply compression stack to message history; returns compressed messages + savings |
74
+ | `build_context` | HyDE / RAPTOR / FLARE retrieval-augmented context building |
75
+ | `optimize_prompt` | DSPy BootstrapFewShot / OPRO / Medprompt prompt optimisation |
76
+ | `get_token_report` | Real tiktoken metrics, cost savings, session stats |
77
+ | `process_request` | **Master pipeline**: classify → cache → memory → compress → quality → adapt |
78
+ | `report_quality` | Feed quality signal (0–1) back into adaptive compression profile |
79
+
80
+ ### `process_request` — master pipeline
81
+
82
+ ```json
83
+ {
84
+ "optimized_messages": [...],
85
+ "tokens_before": 1240,
86
+ "tokens_after": 487,
87
+ "tokens_saved": 753,
88
+ "savings_pct": 60.7,
89
+ "request_type": "code",
90
+ "confidence": 0.91,
91
+ "quality_score": 0.876,
92
+ "adapted_ratio": 0.382,
93
+ "cache_hit": false,
94
+ "session_id": "sess-abc123",
95
+ "lifetime_savings": 41250
96
+ }
97
+ ```
98
+
99
+ ---
100
+
101
+ ## 8 Request Types
102
+
103
+ | Type | Classifier Trigger | Default Stack |
104
+ |---|---|---|
105
+ | `code` | Code keywords + task verbs + C++/C# detection | SelectiveContext → PrefixCache |
106
+ | `rag_doc` | Documents present + QA-style question | HyDE retrieval + LLMLingua-2 |
107
+ | `agent_session` | Multi-turn history + follow-up phrases | WorkingMemory + L3 semantic |
108
+ | `domain_expert` | Legal / medical / finance domain terms | Medprompt + SelectiveContext |
109
+ | `multi_hop` | "relationship", "compare", "trace" patterns | RAPTOR + ChainOfThought |
110
+ | `math` | Equations, proof, calculate keywords | LLMLingua-1 (formula-aware) |
111
+ | `fixed_template` | Long system prompt (>600 tokens) | PrefixCache (no compression) |
112
+ | `simple_qa` | Short conversational question | Light SelectiveContext |
113
+
114
+ ---
115
+
116
+ ## Installation
117
+
118
+ ```bash
119
+ # Minimal (core MCP server, no ML)
120
+ pip install ak-primus
121
+
122
+ # Full (all optional groups)
123
+ pip install "ak-primus[all]"
124
+
125
+ # From source
126
+ git clone https://github.com/ak-primus/ak-primus
127
+ pip install -e "ak-primus/core[all]"
128
+ ```
129
+
130
+ | Group | Installs | When to use |
131
+ |---|---|---|
132
+ | `compress` | LLMLingua, transformers, torch | Token compression |
133
+ | `retrieval` | sentence-transformers, chromadb, numpy, scikit-learn, openai, anthropic | Semantic cache + search |
134
+ | `optimize` | dspy-ai, openai, evaluate | DSPy + ROUGE/BERTScore |
135
+ | `http` | uvicorn, starlette | HTTP/SSE transport |
136
+ | `dev` | pytest, ruff, mypy | Development |
137
+
138
+ ---
139
+
140
+ ## Quick Start
141
+
142
+ ### Claude Desktop
143
+
144
+ ```json
145
+ {
146
+ "mcpServers": {
147
+ "ak-primus": {
148
+ "command": "ak-primus",
149
+ "args": ["serve"],
150
+ "env": {
151
+ "ANTHROPIC_API_KEY": "sk-ant-..."
152
+ }
153
+ }
154
+ }
155
+ }
156
+ ```
157
+
158
+ ### Docker
159
+
160
+ ```bash
161
+ # stdio (MCP)
162
+ docker build -f Dockerfile.akprimus --target runtime -t ak-primus:0.2.0 .
163
+ docker run -i --rm -e ANTHROPIC_API_KEY="$ANTHROPIC_API_KEY" -v ak_data:/data ak-primus:0.2.0
164
+
165
+ # HTTP/SSE with health probes
166
+ docker compose --profile http up
167
+ curl http://localhost:8080/health
168
+ ```
169
+
170
+ ---
171
+
172
+ ## Self-Healing Compression
173
+
174
+ The adaptive profile tunes compression ratios over time per (workspace, request_type):
175
+
176
+ ```
177
+ quality ≥ 0.85 → compress more aggressively next time
178
+ quality < 0.70 → back off compression
179
+ ```
180
+
181
+ After ~50 samples per type, the ratio converges to the Pareto-optimal point.
182
+
183
+ ```bash
184
+ # Feed quality signal back after reviewing LLM response
185
+ curl -s http://localhost:8080/mcp -d '{"tool":"report_quality","quality_score":0.92,"request_type":"code"}'
186
+ ```
187
+
188
+ ---
189
+
190
+ ## Running Tests
191
+
192
+ ```bash
193
+ pytest tests/ -q # 84 tests
194
+ python benchmarks/run_1200_scenarios.py --quick # accuracy + latency
195
+ ```
196
+
197
+ ### Benchmark (v0.2.0, 275 scenarios)
198
+
199
+ | Class | Accuracy |
200
+ |---|---|
201
+ | `math` | 96% |
202
+ | `fixed_template` | 100% |
203
+ | `agent_session` | 92% |
204
+ | `domain_expert` | 90% |
205
+ | `multi_hop` | 90% |
206
+ | `simple_qa` | 88% |
207
+ | `code` | 82% |
208
+ | `rag_doc` | 80% |
209
+ | **Overall** | **~88%** |
210
+
211
+ Classifier latency: p50 < 1ms, p95 < 5ms (rule-based path, no model load).
212
+
213
+ ---
214
+
215
+ ## Environment Variables
216
+
217
+ | Variable | Default | Description |
218
+ |---|---|---|
219
+ | `AK_PRIMUS_MODEL` | `claude-sonnet-4-6` | LLM model for DSPy / OPRO / judge |
220
+ | `AK_PRIMUS_TRANSPORT` | `stdio` | `stdio` or `http` |
221
+ | `AK_PRIMUS_HOST` | `127.0.0.1` | HTTP bind host |
222
+ | `AK_PRIMUS_PORT` | `8080` | HTTP bind port |
223
+ | `AK_PRIMUS_DB` | `~/.ak_primus/store.db` | SQLite path |
224
+ | `AK_PRIMUS_DSPY_CACHE` | `~/.ak_primus/dspy_cache` | DSPy program cache |
225
+
226
+ ---
227
+
228
+ ## License
229
+
230
+ MIT
231
+
@@ -0,0 +1,8 @@
1
+ """
2
+ AK-Primus — Intelligent token compression & routing MCP server.
3
+ """
4
+ from .classifier import RequestClassifier, RequestType
5
+ from .router import StackRouter, CompressionStack
6
+
7
+ __version__ = "0.2.0"
8
+ __all__ = ["RequestClassifier", "RequestType", "StackRouter", "CompressionStack"]