ocp-router 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ocp_router-0.2.0/.gitignore +19 -0
- ocp_router-0.2.0/PKG-INFO +296 -0
- ocp_router-0.2.0/README.md +276 -0
- ocp_router-0.2.0/ocp_router/__init__.py +37 -0
- ocp_router-0.2.0/ocp_router/backends/__init__.py +0 -0
- ocp_router-0.2.0/ocp_router/backends/anthropic.py +83 -0
- ocp_router-0.2.0/ocp_router/backends/base.py +81 -0
- ocp_router-0.2.0/ocp_router/backends/ollama.py +124 -0
- ocp_router-0.2.0/ocp_router/backends/openai.py +88 -0
- ocp_router-0.2.0/ocp_router/classifier.py +203 -0
- ocp_router-0.2.0/ocp_router/factory.py +71 -0
- ocp_router-0.2.0/ocp_router/router.py +76 -0
- ocp_router-0.2.0/pyproject.toml +32 -0
- ocp_router-0.2.0/tests/test_classifier.py +190 -0
- ocp_router-0.2.0/tests/test_ollama_backend.py +201 -0
- ocp_router-0.2.0/tests/test_router.py +234 -0
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ocp-router
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: OCP Router — hybrid local/cloud model routing layer for Open Context Protocol
|
|
5
|
+
Project-URL: Homepage, https://github.com/Rajesh1213/OCP
|
|
6
|
+
Project-URL: Repository, https://github.com/Rajesh1213/OCP
|
|
7
|
+
License: Apache-2.0
|
|
8
|
+
Requires-Python: >=3.11
|
|
9
|
+
Requires-Dist: aiohttp>=3.9
|
|
10
|
+
Requires-Dist: pydantic>=2.7
|
|
11
|
+
Requires-Dist: tiktoken>=0.7
|
|
12
|
+
Provides-Extra: anthropic
|
|
13
|
+
Requires-Dist: anthropic>=0.40; extra == 'anthropic'
|
|
14
|
+
Provides-Extra: dev
|
|
15
|
+
Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
|
|
16
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
17
|
+
Provides-Extra: openai
|
|
18
|
+
Requires-Dist: openai>=1.0; extra == 'openai'
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# ocp-router
|
|
22
|
+
|
|
23
|
+
Hybrid local/cloud model routing layer for [Open Context Protocol](https://github.com/Rajesh1213/OCP).
|
|
24
|
+
|
|
25
|
+
Scores each request for complexity and dispatches it to the right model tier — local model for simple tasks, paid provider for complex reasoning. Vendor-neutral: works with any backend that implements the `ModelBackend` protocol.
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install ocp-router # core — Ollama local backend included
|
|
33
|
+
pip install ocp-router[anthropic] # + Anthropic Claude paid backend
|
|
34
|
+
pip install ocp-router[openai] # + OpenAI paid backend
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
Requires Python 3.11+ and a running [Ollama](https://ollama.com) instance for local model support.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Quick start
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
# One-time: install Ollama and pull a model
|
|
45
|
+
brew install ollama # macOS — see ollama.com for other platforms
|
|
46
|
+
ollama pull llama3.2
|
|
47
|
+
ollama serve
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
import asyncio
|
|
52
|
+
from ocp_router import make_router
|
|
53
|
+
|
|
54
|
+
async def main():
|
|
55
|
+
router = make_router() # reads all config from env vars
|
|
56
|
+
|
|
57
|
+
# Simple request — handled locally, no paid API call
|
|
58
|
+
result = await router.route("explain the verify_token function")
|
|
59
|
+
print(result.route_to) # "local"
|
|
60
|
+
print(result.classify.complexity_score) # 0.0
|
|
61
|
+
print(result.model) # "llama3.2"
|
|
62
|
+
print(result.text)
|
|
63
|
+
|
|
64
|
+
# Complex request — escalated to paid provider
|
|
65
|
+
result = await router.route(
|
|
66
|
+
"review security vulnerabilities across all API endpoints"
|
|
67
|
+
)
|
|
68
|
+
print(result.route_to) # "paid"
|
|
69
|
+
print(result.classify.complexity_score) # 0.55
|
|
70
|
+
print(result.classify.signals) # ["security-sensitive"]
|
|
71
|
+
print(result.model) # "claude-sonnet-4-6"
|
|
72
|
+
print(result.text)
|
|
73
|
+
|
|
74
|
+
asyncio.run(main())
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## How routing works
|
|
80
|
+
|
|
81
|
+
Every request passes through the `TaskClassifier` before reaching any model. The classifier scores complexity from `0.0` (trivial) to `1.0` (maximum) using five deterministic heuristic layers — no model required, runs in microseconds.
|
|
82
|
+
|
|
83
|
+
```
|
|
84
|
+
Request prompt
|
|
85
|
+
│
|
|
86
|
+
▼
|
|
87
|
+
┌─────────────────────────────────────────────┐
|
|
88
|
+
│ TaskClassifier │
|
|
89
|
+
│ │
|
|
90
|
+
│ 1. Token length (tiktoken cl100k) │
|
|
91
|
+
│ 2. Code block size (fenced ``` blocks) │
|
|
92
|
+
│ 3. Complex signals security +0.55 │
|
|
93
|
+
│ architecture +0.55 │
|
|
94
|
+
│ migration +0.55 │
|
|
95
|
+
│ deadlock +0.40 │
|
|
96
|
+
│ multi-file +0.35 │
|
|
97
|
+
│ refactor +0.25 ... │
|
|
98
|
+
│ 4. Simple signals explain -0.10 │
|
|
99
|
+
│ summarise -0.10 │
|
|
100
|
+
│ search -0.10 ... │
|
|
101
|
+
│ 5. File references 3-4 files +0.10 │
|
|
102
|
+
│ 5+ files +0.20 │
|
|
103
|
+
│ │
|
|
104
|
+
│ score = clamp(sum, 0.0, 1.0) │
|
|
105
|
+
└──────────────┬──────────────────────────────┘
|
|
106
|
+
│
|
|
107
|
+
┌───────┴────────┐
|
|
108
|
+
score < 0.5 score ≥ 0.5
|
|
109
|
+
│ │
|
|
110
|
+
▼ ▼
|
|
111
|
+
Local model Paid provider
|
|
112
|
+
(Ollama) (Claude / GPT-4 / any)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### What goes where
|
|
116
|
+
|
|
117
|
+
| Request | Score | Route |
|
|
118
|
+
|---|---|---|
|
|
119
|
+
| "explain this function" | 0.00 | local |
|
|
120
|
+
| "what does add() do?" | 0.00 | local |
|
|
121
|
+
| "find all usages of db.connect" | 0.00 | local |
|
|
122
|
+
| "summarise the last session" | 0.00 | local |
|
|
123
|
+
| "refactor the login function" | 0.25 | local |
|
|
124
|
+
| — threshold (default 0.5) — | | |
|
|
125
|
+
| "refactor auth across all files" | 0.80 | paid |
|
|
126
|
+
| "review security vulnerabilities" | 0.55 | paid |
|
|
127
|
+
| "design the payment architecture" | 0.55 | paid |
|
|
128
|
+
| "debug this production deadlock" | 0.60 | paid |
|
|
129
|
+
| "migrate the database schema" | 0.55 | paid |
|
|
130
|
+
|
|
131
|
+
Threshold is configurable via `OCP_ROUTE_THRESHOLD`.
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## RouteResult — what you get back
|
|
136
|
+
|
|
137
|
+
Every `router.route()` call returns a `RouteResult` with the answer and a full trace of the routing decision:
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
@dataclass
|
|
141
|
+
class RouteResult:
|
|
142
|
+
text: str # the model's response
|
|
143
|
+
route_to: str # "local" or "paid"
|
|
144
|
+
classify: ClassifyResult # full classification trace
|
|
145
|
+
model: str # exact model identifier used
|
|
146
|
+
prompt_tokens: int
|
|
147
|
+
completion_tokens: int
|
|
148
|
+
duration_ms: float
|
|
149
|
+
|
|
150
|
+
@dataclass
|
|
151
|
+
class ClassifyResult:
|
|
152
|
+
complexity_score: float # 0.0 – 1.0
|
|
153
|
+
task_type: str # "explain" | "refactor" | "debug" | "architect" | ...
|
|
154
|
+
signals: list[str] # which heuristics fired
|
|
155
|
+
route_to: str # "local" or "paid"
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Configuration
|
|
161
|
+
|
|
162
|
+
All options are set via environment variables — no code changes needed.
|
|
163
|
+
|
|
164
|
+
### Local backend
|
|
165
|
+
|
|
166
|
+
| Variable | Default | Description |
|
|
167
|
+
|---|---|---|
|
|
168
|
+
| `OCP_LOCAL_BACKEND` | `ollama` | Backend type. `ollama` is the only built-in option. |
|
|
169
|
+
| `OCP_LOCAL_MODEL` | `llama3.2` | Model name passed to Ollama. |
|
|
170
|
+
| `OCP_OLLAMA_URL` | `http://localhost:11434` | Ollama base URL. Point to a remote GPU box if needed. |
|
|
171
|
+
| `OCP_LOCAL_TIMEOUT` | `60` | Inference timeout in seconds. |
|
|
172
|
+
|
|
173
|
+
### Paid backend
|
|
174
|
+
|
|
175
|
+
| Variable | Default | Description |
|
|
176
|
+
|---|---|---|
|
|
177
|
+
| `OCP_PAID_BACKEND` | `anthropic` | Backend type: `anthropic` or `openai`. |
|
|
178
|
+
| `OCP_PAID_MODEL` | `claude-sonnet-4-6` | Model identifier for the paid provider. |
|
|
179
|
+
| `OCP_PAID_MAX_TOKENS` | `4096` | Max tokens for paid responses. |
|
|
180
|
+
| `ANTHROPIC_API_KEY` | *(required for `anthropic`)* | API key for Anthropic backend. Needed only when `OCP_PAID_BACKEND=anthropic`. |
|
|
181
|
+
| `OPENAI_API_KEY` | *(required for `openai`)* | API key for OpenAI backend. Needed only when `OCP_PAID_BACKEND=openai`. |
|
|
182
|
+
|
|
183
|
+
### Router
|
|
184
|
+
|
|
185
|
+
| Variable | Default | Description |
|
|
186
|
+
|---|---|---|
|
|
187
|
+
| `OCP_ROUTE_THRESHOLD` | `0.5` | Complexity score at or above which requests go to paid. |
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
# Example: Mistral locally, GPT-4o for complex tasks, higher threshold (fewer requests escalate to paid)
|
|
191
|
+
OCP_LOCAL_MODEL=mistral \
|
|
192
|
+
OCP_PAID_BACKEND=openai \
|
|
193
|
+
OCP_PAID_MODEL=gpt-4o \
|
|
194
|
+
OCP_ROUTE_THRESHOLD=0.6 \
|
|
195
|
+
python my_agent.py
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
---
|
|
199
|
+
|
|
200
|
+
## IDE integration (Claude Code, Cursor, Windsurf)
|
|
201
|
+
|
|
202
|
+
Add the routing env vars to your `.mcp.json` — OCP handles the rest:
|
|
203
|
+
|
|
204
|
+
```json
|
|
205
|
+
{
|
|
206
|
+
"mcpServers": {
|
|
207
|
+
"ocp": {
|
|
208
|
+
"command": "uvx",
|
|
209
|
+
"args": ["ocp-server"],
|
|
210
|
+
"env": {
|
|
211
|
+
"OCP_DB_PATH": "${workspaceFolder}/.ocp.db",
|
|
212
|
+
"OCP_LOCAL_MODEL": "llama3.2",
|
|
213
|
+
"OCP_OLLAMA_URL": "http://localhost:11434",
|
|
214
|
+
"OCP_PAID_BACKEND": "anthropic",
|
|
215
|
+
"OCP_ROUTE_THRESHOLD": "0.5"
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
Simple tasks (explain, search, summarise) are answered locally by Ollama. Complex requests (security, architecture, multi-file refactor) escalate to your paid provider. Your IDE workflow is unchanged.
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## Supported local models
|
|
227
|
+
|
|
228
|
+
Any model available in Ollama works. Recommended starting points:
|
|
229
|
+
|
|
230
|
+
| Model | Size | Good for |
|
|
231
|
+
|---|---|---|
|
|
232
|
+
| `llama3.2` | 3B | Classification, summarisation, simple Q&A |
|
|
233
|
+
| `phi4-mini` | 3.8B | Code explanation, short answers |
|
|
234
|
+
| `mistral` | 7B | Context compression, draft generation |
|
|
235
|
+
| `codellama` | 7B | Code-specific tasks |
|
|
236
|
+
|
|
237
|
+
```bash
|
|
238
|
+
ollama pull llama3.2
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
---
|
|
242
|
+
|
|
243
|
+
## Bring your own backend
|
|
244
|
+
|
|
245
|
+
Both the local and paid slots accept any object that implements the `ModelBackend` protocol — three members (a `model` property plus the async `is_available()` and `generate()` methods), no base class required:
|
|
246
|
+
|
|
247
|
+
```python
|
|
248
|
+
from ocp_router import OCPRouter, TaskClassifier
|
|
249
|
+
from ocp_router.backends.base import GenerateRequest, GenerateResponse
|
|
250
|
+
|
|
251
|
+
class MyVLLMBackend:
|
|
252
|
+
@property
|
|
253
|
+
def model(self) -> str:
|
|
254
|
+
return "mistral-7b-instruct"
|
|
255
|
+
|
|
256
|
+
async def is_available(self) -> bool:
|
|
257
|
+
return True # check your endpoint
|
|
258
|
+
|
|
259
|
+
async def generate(self, request: GenerateRequest) -> GenerateResponse:
|
|
260
|
+
# call your inference endpoint
|
|
261
|
+
...
|
|
262
|
+
return GenerateResponse(
|
|
263
|
+
text="...",
|
|
264
|
+
model=self.model,
|
|
265
|
+
prompt_tokens=0,
|
|
266
|
+
completion_tokens=0,
|
|
267
|
+
duration_ms=0.0,
|
|
268
|
+
)
|
|
269
|
+
|
|
270
|
+
# Plug in directly — no factory change needed
|
|
271
|
+
router = OCPRouter(
|
|
272
|
+
local=MyVLLMBackend(),
|
|
273
|
+
paid=MyVLLMBackend(), # or any other backend
|
|
274
|
+
classifier=TaskClassifier(),
|
|
275
|
+
)
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
This is the intended extension point. `ocp-router` ships `OllamaBackend`, `AnthropicBackend`, and `OpenAIBackend` as convenience implementations — not as the only options.
|
|
279
|
+
|
|
280
|
+
---
|
|
281
|
+
|
|
282
|
+
## Running tests
|
|
283
|
+
|
|
284
|
+
```bash
|
|
285
|
+
# Unit tests — no Ollama or API keys required
|
|
286
|
+
pytest packages/ocp-router/tests/ -k "not integration" -v
|
|
287
|
+
|
|
288
|
+
# Integration test — requires: ollama serve + ollama pull llama3.2
|
|
289
|
+
pytest packages/ocp-router/tests/ -m integration -v
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
---
|
|
293
|
+
|
|
294
|
+
## What's next
|
|
295
|
+
|
|
296
|
+
- `ocp.prompt.prepare` — local SLM compresses and optimises prompts before they reach the paid provider, reducing token usage and improving answer quality
|
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
# ocp-router
|
|
2
|
+
|
|
3
|
+
Hybrid local/cloud model routing layer for [Open Context Protocol](https://github.com/Rajesh1213/OCP).
|
|
4
|
+
|
|
5
|
+
Scores each request for complexity and dispatches it to the right model tier — local model for simple tasks, paid provider for complex reasoning. Vendor-neutral: works with any backend that implements the `ModelBackend` protocol.
|
|
6
|
+
|
|
7
|
+
---
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install ocp-router # core — Ollama local backend included
|
|
13
|
+
pip install ocp-router[anthropic] # + Anthropic Claude paid backend
|
|
14
|
+
pip install ocp-router[openai] # + OpenAI paid backend
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Requires Python 3.11+ and a running [Ollama](https://ollama.com) instance for local model support.
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## Quick start
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
# One-time: install Ollama and pull a model
|
|
25
|
+
brew install ollama # macOS — see ollama.com for other platforms
|
|
26
|
+
ollama pull llama3.2
|
|
27
|
+
ollama serve
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
```python
|
|
31
|
+
import asyncio
|
|
32
|
+
from ocp_router import make_router
|
|
33
|
+
|
|
34
|
+
async def main():
|
|
35
|
+
router = make_router() # reads all config from env vars
|
|
36
|
+
|
|
37
|
+
# Simple request — handled locally, no paid API call
|
|
38
|
+
result = await router.route("explain the verify_token function")
|
|
39
|
+
print(result.route_to) # "local"
|
|
40
|
+
print(result.classify.complexity_score) # 0.0
|
|
41
|
+
print(result.model) # "llama3.2"
|
|
42
|
+
print(result.text)
|
|
43
|
+
|
|
44
|
+
# Complex request — escalated to paid provider
|
|
45
|
+
result = await router.route(
|
|
46
|
+
"review security vulnerabilities across all API endpoints"
|
|
47
|
+
)
|
|
48
|
+
print(result.route_to) # "paid"
|
|
49
|
+
print(result.classify.complexity_score) # 0.55
|
|
50
|
+
print(result.classify.signals) # ["security-sensitive"]
|
|
51
|
+
print(result.model) # "claude-sonnet-4-6"
|
|
52
|
+
print(result.text)
|
|
53
|
+
|
|
54
|
+
asyncio.run(main())
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
---
|
|
58
|
+
|
|
59
|
+
## How routing works
|
|
60
|
+
|
|
61
|
+
Every request passes through the `TaskClassifier` before reaching any model. The classifier scores complexity from `0.0` (trivial) to `1.0` (maximum) using five deterministic heuristic layers — no model required, runs in microseconds.
|
|
62
|
+
|
|
63
|
+
```
|
|
64
|
+
Request prompt
|
|
65
|
+
│
|
|
66
|
+
▼
|
|
67
|
+
┌─────────────────────────────────────────────┐
|
|
68
|
+
│ TaskClassifier │
|
|
69
|
+
│ │
|
|
70
|
+
│ 1. Token length (tiktoken cl100k) │
|
|
71
|
+
│ 2. Code block size (fenced ``` blocks) │
|
|
72
|
+
│ 3. Complex signals security +0.55 │
|
|
73
|
+
│ architecture +0.55 │
|
|
74
|
+
│ migration +0.55 │
|
|
75
|
+
│ deadlock +0.40 │
|
|
76
|
+
│ multi-file +0.35 │
|
|
77
|
+
│ refactor +0.25 ... │
|
|
78
|
+
│ 4. Simple signals explain -0.10 │
|
|
79
|
+
│ summarise -0.10 │
|
|
80
|
+
│ search -0.10 ... │
|
|
81
|
+
│ 5. File references 3-4 files +0.10 │
|
|
82
|
+
│ 5+ files +0.20 │
|
|
83
|
+
│ │
|
|
84
|
+
│ score = clamp(sum, 0.0, 1.0) │
|
|
85
|
+
└──────────────┬──────────────────────────────┘
|
|
86
|
+
│
|
|
87
|
+
┌───────┴────────┐
|
|
88
|
+
score < 0.5 score ≥ 0.5
|
|
89
|
+
│ │
|
|
90
|
+
▼ ▼
|
|
91
|
+
Local model Paid provider
|
|
92
|
+
(Ollama) (Claude / GPT-4 / any)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### What goes where
|
|
96
|
+
|
|
97
|
+
| Request | Score | Route |
|
|
98
|
+
|---|---|---|
|
|
99
|
+
| "explain this function" | 0.00 | local |
|
|
100
|
+
| "what does add() do?" | 0.00 | local |
|
|
101
|
+
| "find all usages of db.connect" | 0.00 | local |
|
|
102
|
+
| "summarise the last session" | 0.00 | local |
|
|
103
|
+
| "refactor the login function" | 0.25 | local |
|
|
104
|
+
| — threshold (default 0.5) — | | |
|
|
105
|
+
| "refactor auth across all files" | 0.80 | paid |
|
|
106
|
+
| "review security vulnerabilities" | 0.55 | paid |
|
|
107
|
+
| "design the payment architecture" | 0.55 | paid |
|
|
108
|
+
| "debug this production deadlock" | 0.60 | paid |
|
|
109
|
+
| "migrate the database schema" | 0.55 | paid |
|
|
110
|
+
|
|
111
|
+
Threshold is configurable via `OCP_ROUTE_THRESHOLD`.
|
|
112
|
+
|
|
113
|
+
---
|
|
114
|
+
|
|
115
|
+
## RouteResult — what you get back
|
|
116
|
+
|
|
117
|
+
Every `router.route()` call returns a `RouteResult` with the answer and a full trace of the routing decision:
|
|
118
|
+
|
|
119
|
+
```python
|
|
120
|
+
@dataclass
|
|
121
|
+
class RouteResult:
|
|
122
|
+
text: str # the model's response
|
|
123
|
+
route_to: str # "local" or "paid"
|
|
124
|
+
classify: ClassifyResult # full classification trace
|
|
125
|
+
model: str # exact model identifier used
|
|
126
|
+
prompt_tokens: int
|
|
127
|
+
completion_tokens: int
|
|
128
|
+
duration_ms: float
|
|
129
|
+
|
|
130
|
+
@dataclass
|
|
131
|
+
class ClassifyResult:
|
|
132
|
+
complexity_score: float # 0.0 – 1.0
|
|
133
|
+
task_type: str # "explain" | "refactor" | "debug" | "architect" | ...
|
|
134
|
+
signals: list[str] # which heuristics fired
|
|
135
|
+
route_to: str # "local" or "paid"
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
---
|
|
139
|
+
|
|
140
|
+
## Configuration
|
|
141
|
+
|
|
142
|
+
All options are set via environment variables — no code changes needed.
|
|
143
|
+
|
|
144
|
+
### Local backend
|
|
145
|
+
|
|
146
|
+
| Variable | Default | Description |
|
|
147
|
+
|---|---|---|
|
|
148
|
+
| `OCP_LOCAL_BACKEND` | `ollama` | Backend type. `ollama` is the only built-in option. |
|
|
149
|
+
| `OCP_LOCAL_MODEL` | `llama3.2` | Model name passed to Ollama. |
|
|
150
|
+
| `OCP_OLLAMA_URL` | `http://localhost:11434` | Ollama base URL. Point to a remote GPU box if needed. |
|
|
151
|
+
| `OCP_LOCAL_TIMEOUT` | `60` | Inference timeout in seconds. |
|
|
152
|
+
|
|
153
|
+
### Paid backend
|
|
154
|
+
|
|
155
|
+
| Variable | Default | Description |
|
|
156
|
+
|---|---|---|
|
|
157
|
+
| `OCP_PAID_BACKEND` | `anthropic` | Backend type: `anthropic` or `openai`. |
|
|
158
|
+
| `OCP_PAID_MODEL` | `claude-sonnet-4-6` | Model identifier for the paid provider. |
|
|
159
|
+
| `OCP_PAID_MAX_TOKENS` | `4096` | Max tokens for paid responses. |
|
|
160
|
+
| `ANTHROPIC_API_KEY` | *(required for `anthropic`)* | API key for Anthropic backend. Needed only when `OCP_PAID_BACKEND=anthropic`. |
|
|
161
|
+
| `OPENAI_API_KEY` | *(required for `openai`)* | API key for OpenAI backend. Needed only when `OCP_PAID_BACKEND=openai`. |
|
|
162
|
+
|
|
163
|
+
### Router
|
|
164
|
+
|
|
165
|
+
| Variable | Default | Description |
|
|
166
|
+
|---|---|---|
|
|
167
|
+
| `OCP_ROUTE_THRESHOLD` | `0.5` | Complexity score at or above which requests go to paid. |
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
# Example: Mistral locally, GPT-4o for complex tasks, higher threshold (fewer requests escalate to paid)
|
|
171
|
+
OCP_LOCAL_MODEL=mistral \
|
|
172
|
+
OCP_PAID_BACKEND=openai \
|
|
173
|
+
OCP_PAID_MODEL=gpt-4o \
|
|
174
|
+
OCP_ROUTE_THRESHOLD=0.6 \
|
|
175
|
+
python my_agent.py
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
## IDE integration (Claude Code, Cursor, Windsurf)
|
|
181
|
+
|
|
182
|
+
Add the routing env vars to your `.mcp.json` — OCP handles the rest:
|
|
183
|
+
|
|
184
|
+
```json
|
|
185
|
+
{
|
|
186
|
+
"mcpServers": {
|
|
187
|
+
"ocp": {
|
|
188
|
+
"command": "uvx",
|
|
189
|
+
"args": ["ocp-server"],
|
|
190
|
+
"env": {
|
|
191
|
+
"OCP_DB_PATH": "${workspaceFolder}/.ocp.db",
|
|
192
|
+
"OCP_LOCAL_MODEL": "llama3.2",
|
|
193
|
+
"OCP_OLLAMA_URL": "http://localhost:11434",
|
|
194
|
+
"OCP_PAID_BACKEND": "anthropic",
|
|
195
|
+
"OCP_ROUTE_THRESHOLD": "0.5"
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
Simple tasks (explain, search, summarise) are answered locally by Ollama. Complex requests (security, architecture, multi-file refactor) escalate to your paid provider. Your IDE workflow is unchanged.
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
## Supported local models
|
|
207
|
+
|
|
208
|
+
Any model available in Ollama works. Recommended starting points:
|
|
209
|
+
|
|
210
|
+
| Model | Size | Good for |
|
|
211
|
+
|---|---|---|
|
|
212
|
+
| `llama3.2` | 3B | Classification, summarisation, simple Q&A |
|
|
213
|
+
| `phi4-mini` | 3.8B | Code explanation, short answers |
|
|
214
|
+
| `mistral` | 7B | Context compression, draft generation |
|
|
215
|
+
| `codellama` | 7B | Code-specific tasks |
|
|
216
|
+
|
|
217
|
+
```bash
|
|
218
|
+
ollama pull llama3.2
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
---
|
|
222
|
+
|
|
223
|
+
## Bring your own backend
|
|
224
|
+
|
|
225
|
+
Both the local and paid slots accept any object that implements the `ModelBackend` protocol — three members (a `model` property plus the async `is_available()` and `generate()` methods), no base class required:
|
|
226
|
+
|
|
227
|
+
```python
|
|
228
|
+
from ocp_router import OCPRouter, TaskClassifier
|
|
229
|
+
from ocp_router.backends.base import GenerateRequest, GenerateResponse
|
|
230
|
+
|
|
231
|
+
class MyVLLMBackend:
|
|
232
|
+
@property
|
|
233
|
+
def model(self) -> str:
|
|
234
|
+
return "mistral-7b-instruct"
|
|
235
|
+
|
|
236
|
+
async def is_available(self) -> bool:
|
|
237
|
+
return True # check your endpoint
|
|
238
|
+
|
|
239
|
+
async def generate(self, request: GenerateRequest) -> GenerateResponse:
|
|
240
|
+
# call your inference endpoint
|
|
241
|
+
...
|
|
242
|
+
return GenerateResponse(
|
|
243
|
+
text="...",
|
|
244
|
+
model=self.model,
|
|
245
|
+
prompt_tokens=0,
|
|
246
|
+
completion_tokens=0,
|
|
247
|
+
duration_ms=0.0,
|
|
248
|
+
)
|
|
249
|
+
|
|
250
|
+
# Plug in directly — no factory change needed
|
|
251
|
+
router = OCPRouter(
|
|
252
|
+
local=MyVLLMBackend(),
|
|
253
|
+
paid=MyVLLMBackend(), # or any other backend
|
|
254
|
+
classifier=TaskClassifier(),
|
|
255
|
+
)
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
This is the intended extension point. `ocp-router` ships `OllamaBackend`, `AnthropicBackend`, and `OpenAIBackend` as convenience implementations — not as the only options.
|
|
259
|
+
|
|
260
|
+
---
|
|
261
|
+
|
|
262
|
+
## Running tests
|
|
263
|
+
|
|
264
|
+
```bash
|
|
265
|
+
# Unit tests — no Ollama or API keys required
|
|
266
|
+
pytest packages/ocp-router/tests/ -k "not integration" -v
|
|
267
|
+
|
|
268
|
+
# Integration test — requires: ollama serve + ollama pull llama3.2
|
|
269
|
+
pytest packages/ocp-router/tests/ -m integration -v
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
---
|
|
273
|
+
|
|
274
|
+
## What's next
|
|
275
|
+
|
|
276
|
+
- `ocp.prompt.prepare` — local SLM compresses and optimises prompts before they reach the paid provider, reducing token usage and improving answer quality
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""OCP Router — hybrid local/cloud model routing layer."""
|
|
2
|
+
from ocp_router.backends.base import (
|
|
3
|
+
ClassifyResult,
|
|
4
|
+
GenerateRequest,
|
|
5
|
+
GenerateResponse,
|
|
6
|
+
LocalModelBackend,
|
|
7
|
+
ModelBackend,
|
|
8
|
+
RouteResult,
|
|
9
|
+
RouteTarget,
|
|
10
|
+
TaskType,
|
|
11
|
+
)
|
|
12
|
+
from ocp_router.backends.ollama import OllamaBackend
|
|
13
|
+
from ocp_router.classifier import TaskClassifier
|
|
14
|
+
from ocp_router.factory import make_local_backend, make_paid_backend, make_router
|
|
15
|
+
from ocp_router.router import OCPRouter
|
|
16
|
+
|
|
17
|
+
__all__ = [
|
|
18
|
+
# Router
|
|
19
|
+
"OCPRouter",
|
|
20
|
+
"RouteResult",
|
|
21
|
+
# Backends
|
|
22
|
+
"OllamaBackend",
|
|
23
|
+
"LocalModelBackend",
|
|
24
|
+
"ModelBackend",
|
|
25
|
+
# Inference types
|
|
26
|
+
"GenerateRequest",
|
|
27
|
+
"GenerateResponse",
|
|
28
|
+
# Classifier
|
|
29
|
+
"TaskClassifier",
|
|
30
|
+
"ClassifyResult",
|
|
31
|
+
"TaskType",
|
|
32
|
+
"RouteTarget",
|
|
33
|
+
# Factories
|
|
34
|
+
"make_local_backend",
|
|
35
|
+
"make_paid_backend",
|
|
36
|
+
"make_router",
|
|
37
|
+
]
|
|
File without changes
|