lynkr 7.2.4 → 8.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/config/model-tiers.json +89 -0
- package/docs/docs.html +1 -0
- package/docs/index.md +7 -0
- package/docs/toon-integration-spec.md +130 -0
- package/documentation/README.md +3 -2
- package/documentation/claude-code-cli.md +23 -16
- package/documentation/cursor-integration.md +17 -14
- package/documentation/docker.md +11 -4
- package/documentation/embeddings.md +7 -5
- package/documentation/faq.md +66 -12
- package/documentation/features.md +22 -15
- package/documentation/installation.md +66 -14
- package/documentation/production.md +43 -8
- package/documentation/providers.md +145 -42
- package/documentation/routing.md +476 -0
- package/documentation/token-optimization.md +7 -5
- package/documentation/troubleshooting.md +81 -5
- package/install.sh +6 -1
- package/package.json +5 -3
- package/scripts/setup.js +0 -1
- package/src/agents/executor.js +14 -6
- package/src/api/middleware/session.js +15 -2
- package/src/api/openai-router.js +130 -37
- package/src/api/providers-handler.js +15 -1
- package/src/api/router.js +107 -2
- package/src/budget/index.js +4 -3
- package/src/clients/databricks.js +431 -234
- package/src/clients/gpt-utils.js +181 -0
- package/src/clients/ollama-utils.js +66 -140
- package/src/clients/routing.js +0 -1
- package/src/clients/standard-tools.js +82 -5
- package/src/config/index.js +119 -35
- package/src/context/toon.js +173 -0
- package/src/headroom/launcher.js +8 -3
- package/src/logger/index.js +23 -0
- package/src/orchestrator/index.js +765 -212
- package/src/routing/agentic-detector.js +320 -0
- package/src/routing/complexity-analyzer.js +202 -2
- package/src/routing/cost-optimizer.js +305 -0
- package/src/routing/index.js +168 -159
- package/src/routing/model-registry.js +437 -0
- package/src/routing/model-tiers.js +365 -0
- package/src/server.js +2 -2
- package/src/sessions/cleanup.js +3 -3
- package/src/sessions/record.js +10 -1
- package/src/sessions/store.js +7 -2
- package/src/tools/agent-task.js +48 -1
- package/src/tools/index.js +15 -2
- package/src/tools/workspace.js +35 -4
- package/src/workspace/index.js +30 -0
- package/te +11622 -0
- package/test/README.md +1 -1
- package/test/azure-openai-config.test.js +17 -8
- package/test/azure-openai-integration.test.js +7 -1
- package/test/azure-openai-routing.test.js +41 -43
- package/test/bedrock-integration.test.js +18 -32
- package/test/hybrid-routing-integration.test.js +35 -20
- package/test/hybrid-routing-performance.test.js +74 -64
- package/test/llamacpp-integration.test.js +28 -9
- package/test/lmstudio-integration.test.js +20 -8
- package/test/openai-integration.test.js +17 -20
- package/test/performance-tests.js +1 -1
- package/test/routing.test.js +65 -59
- package/test/toon-compression.test.js +131 -0
- package/CLAWROUTER_ROUTING_PLAN.md +0 -910
- package/ROUTER_COMPARISON.md +0 -173
- package/TIER_ROUTING_PLAN.md +0 -771
package/documentation/routing.md
ADDED
@@ -0,0 +1,476 @@
# Intelligent Routing & Model Tiering

Lynkr's intelligent routing system automatically selects the optimal model and provider for each request based on complexity analysis, agentic workflow detection, and cost optimization.

---

## Overview

```
Request → Force Patterns → Tool Thresholds → Complexity Analysis → Agentic Detection → Tier Selection → Cost Optimization → Provider
```

The routing pipeline evaluates every incoming request through multiple stages to determine which model tier and provider should handle it. Simple requests go to cheap or local models; complex ones go to powerful cloud models.

**Key benefits:**
- 60-80% cost reduction by routing simple tasks to cheaper models
- Better quality on complex tasks by using capable models when needed
- Automatic agentic workflow detection with tier upgrades
- Multi-source pricing for optimal cost decisions

---

## 4-Tier Model System

Every request is mapped to one of four complexity tiers:

| Tier | Score Range | Description | Example Tasks |
|------|-------------|-------------|---------------|
| **SIMPLE** | 0-25 | Greetings, simple Q&A, confirmations | "Hello", "What is a variable?", "Yes" |
| **MEDIUM** | 26-50 | Code reading, simple edits, research | "Read this file", "Fix this typo", "Search for X" |
| **COMPLEX** | 51-75 | Multi-file changes, debugging, architecture | "Refactor auth module", "Debug this race condition" |
| **REASONING** | 76-100 | Complex analysis, security audits, novel problems | "Security audit", "Design microservices architecture" |
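Tier selection itself is a simple range check on the final score. A minimal sketch (illustrative only; the actual logic lives in `src/routing/model-tiers.js`):

```js
// Minimal sketch of score → tier mapping, mirroring the table above.
function scoreToTier(score) {
  if (score <= 25) return "SIMPLE";
  if (score <= 50) return "MEDIUM";
  if (score <= 75) return "COMPLEX";
  return "REASONING";
}

scoreToTier(5);  // "SIMPLE"  → handled by the TIER_SIMPLE model
scoreToTier(65); // "COMPLEX" → handled by the TIER_COMPLEX model
```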
### Configuration

Tiers are configured via mandatory environment variables in `provider:model` format:

```bash
# Required - one per tier
TIER_SIMPLE=ollama:llama3.2
TIER_MEDIUM=openai:gpt-4o
TIER_COMPLEX=openai:o1-mini
TIER_REASONING=openai:o1

# Examples with other providers
TIER_SIMPLE=ollama:qwen2.5-coder
TIER_MEDIUM=databricks:databricks-claude-sonnet-4-5
TIER_COMPLEX=azure-openai:gpt-5.2-chat
TIER_REASONING=databricks:databricks-claude-opus-4-6
```

If a model name is given without a provider prefix, the default provider (`MODEL_PROVIDER`) is used.
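A minimal sketch of splitting a `TIER_*` value into provider and model under the format above (the function name is hypothetical). Only the first colon separates the provider, since model names may themselves contain colons (e.g., Ollama tags like `qwen2.5-coder:7b`):

```js
// Split "provider:model" on the FIRST colon only; Ollama tags such as
// "qwen2.5-coder:7b" keep their own colon inside the model name.
function parseTierValue(value, defaultProvider) {
  const idx = value.indexOf(":");
  if (idx === -1) {
    return { provider: defaultProvider, model: value }; // no prefix → MODEL_PROVIDER
  }
  return { provider: value.slice(0, idx), model: value.slice(idx + 1) };
}

parseTierValue("ollama:qwen2.5-coder:7b", "openai");
// → { provider: "ollama", model: "qwen2.5-coder:7b" }
parseTierValue("llama3.2", "ollama");
// → { provider: "ollama", model: "llama3.2" }
```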
### Routing Precedence

There are three routing-related settings. Here is exactly how they interact:

#### 1. `TIER_*` Environment Variables (Highest Priority)

When **all four** `TIER_*` vars are set (`TIER_SIMPLE`, `TIER_MEDIUM`, `TIER_COMPLEX`, `TIER_REASONING`), tiered routing is **active**. Every incoming request is scored for complexity (0-100), mapped to a tier, and routed to the `provider:model` specified in the matching `TIER_*` var.

In this mode, `MODEL_PROVIDER` is **not consulted** for routing decisions. The provider comes directly from the `TIER_*` value (e.g., `ollama:llama3.2` routes to Ollama, `openai:gpt-4o` routes to OpenAI).

If any of the four `TIER_*` vars is missing, tiered routing is **completely disabled** and the system falls back to `MODEL_PROVIDER`.

#### 2. `MODEL_PROVIDER` (Default / Fallback)

`MODEL_PROVIDER` controls routing in two scenarios:

- **When tiered routing is disabled** (any `TIER_*` var missing) — all requests go to the provider set in `MODEL_PROVIDER`, regardless of complexity. This is static routing.
- **When a `TIER_*` value has no provider prefix** (e.g., `TIER_SIMPLE=llama3.2` instead of `TIER_SIMPLE=ollama:llama3.2`) — `MODEL_PROVIDER` is used as the default provider for that tier.

Even when tiered routing is active and overrides it for request routing, `MODEL_PROVIDER` is still used for:

- **Startup checks** — e.g., if `MODEL_PROVIDER=ollama`, the server waits for Ollama to be reachable before accepting requests
- **Provider discovery API** (`/v1/providers`) — marks which provider is "primary" in the response
- **Embeddings routing** — the OpenAI-compatible router checks `MODEL_PROVIDER` for embedding provider selection

**Always set `MODEL_PROVIDER`**, even when using tier routing.

#### 3. `PREFER_OLLAMA` (Removed)

`PREFER_OLLAMA` is **deprecated and has no effect**. If set, a warning is logged at startup:

```
[DEPRECATION] PREFER_OLLAMA is removed. Use TIER_* env vars for routing.
```

To route simple requests to Ollama, use `TIER_SIMPLE=ollama:<model>` instead.

#### Summary Table

| Configuration | Routing Behavior |
|---|---|
| All 4 `TIER_*` set | Tier routing active. Each request scored and routed to its tier's `provider:model`. `MODEL_PROVIDER` ignored for routing. |
| 1-3 `TIER_*` set | Tier routing **disabled**. All requests go to `MODEL_PROVIDER` (static). |
| No `TIER_*` set | Static routing. All requests go to `MODEL_PROVIDER`. |
| `TIER_*` value without provider prefix | `MODEL_PROVIDER` used as the default provider for that tier. |
| `PREFER_OLLAMA` set | No effect. Deprecation warning logged. |

#### Example: Mixed Local + Cloud Setup

```bash
MODEL_PROVIDER=ollama                       # Startup checks + default provider
TIER_SIMPLE=ollama:llama3.2                 # Score 0-25   → Ollama (free, local)
TIER_MEDIUM=openai:gpt-4o                   # Score 26-50  → OpenAI
TIER_COMPLEX=databricks:claude-sonnet-4-5   # Score 51-75  → Databricks
TIER_REASONING=databricks:claude-opus-4-6   # Score 76-100 → Databricks
```

In this setup, a "Hello" message (score ~5) routes to Ollama. A "Refactor the auth module" message (score ~65) routes to Databricks. `MODEL_PROVIDER=ollama` ensures the server waits for Ollama at startup but does not affect where complex requests go.

### Tier Config File

Additional tier preferences (fallback models per provider) can be defined in `config/model-tiers.json`:

```json
{
  "tiers": {
    "SIMPLE": { "preferred": { "ollama": ["llama3.2"], "openai": ["gpt-4o-mini"] } },
    "MEDIUM": { "preferred": { "openai": ["gpt-4o"], "anthropic": ["claude-sonnet-4-20250514"] } },
    "COMPLEX": { "preferred": { "openai": ["o1-mini"], "anthropic": ["claude-sonnet-4-20250514"] } },
    "REASONING": { "preferred": { "openai": ["o1"], "anthropic": ["claude-opus-4-20250514"] } }
  },
  "localProviders": {
    "ollama": { "free": true, "defaultTier": "SIMPLE" },
    "llamacpp": { "free": true, "defaultTier": "SIMPLE" },
    "lmstudio": { "free": true, "defaultTier": "SIMPLE" }
  }
}
```

---

## Complexity Scoring Algorithm

The complexity analyzer runs four phases to produce a score from 0 to 100.

### Phase 1: Basic Scoring

Three components are scored independently; a minimal sketch of the bucket logic follows the tables below.

**Token Count (0-20 points):**

| Tokens | Score |
|--------|-------|
| < 500 | 0 |
| 500-999 | 4 |
| 1,000-1,999 | 8 |
| 2,000-3,999 | 12 |
| 4,000-7,999 | 16 |
| 8,000+ | 20 |

**Tool Count (0-20 points):**

| Tools | Score |
|-------|-------|
| 0 | 0 |
| 1-3 | 4 |
| 4-6 | 8 |
| 7-10 | 12 |
| 11-15 | 16 |
| 16+ | 20 |

**Task Type (0-25 points):**
- Greetings / yes-no: 0-2
- Simple questions: 3
- General non-technical: 5
- Technical content: 10
- Refactoring: 16
- New implementation: 18
- From scratch: 20
- Entire codebase scope: 22
- Force-cloud patterns (security audit, architecture review): 25
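As a rough sketch, the token and tool buckets above translate directly into threshold checks (function names are hypothetical; thresholds mirror the tables):

```js
// Token Count component (0-20 points), per the table above.
function tokenCountScore(tokens) {
  if (tokens < 500) return 0;
  if (tokens < 1000) return 4;
  if (tokens < 2000) return 8;
  if (tokens < 4000) return 12;
  if (tokens < 8000) return 16;
  return 20;
}

// Tool Count component (0-20 points), per the table above.
function toolCountScore(tools) {
  if (tools === 0) return 0;
  if (tools <= 3) return 4;
  if (tools <= 6) return 8;
  if (tools <= 10) return 12;
  if (tools <= 15) return 16;
  return 20;
}
```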
### Phase 2: Advanced Classification

Additional scoring on top of Phase 1:

**Code Complexity (0-20 points):**

| Pattern | Points |
|---------|--------|
| Multi-file operations | +5 |
| Architecture concerns | +5 |
| Security | +4 |
| Concurrency | +3 |
| Performance | +3 |
| Database operations | +3 |
| Testing | +2 |

**Reasoning Requirements (0-15 points):**

| Pattern | Points |
|---------|--------|
| Step-by-step reasoning | +4 |
| Trade-off analysis | +4 |
| General analysis | +3 |
| Planning | +3 |
| Edge cases | +2 |

**Conversation Bonus:**
- 6-10 messages: +2
- 11+ messages: +5

The standard score is the sum of all components, capped at 100.

### Weighted Scoring Mode (15 Dimensions)

When `ROUTING_WEIGHTED_SCORING=true`, the analyzer uses a 15-dimension weighted scoring system instead of the standard additive scoring:

```
Score = Sum of (dimension_value * weight) for all 15 dimensions
```

#### Dimension Weights

**Content Analysis (35% total):**

| Dimension | Weight | Measures |
|-----------|--------|----------|
| tokenCount | 0.08 | Request size (token estimate) |
| promptComplexity | 0.10 | Sentence structure, average length |
| technicalDepth | 0.10 | Technical keyword density |
| domainSpecificity | 0.07 | Number of specialized domains (security, ML, distributed, database, frontend, devops) |

**Tool Analysis (25% total):**

| Dimension | Weight | Measures |
|-----------|--------|----------|
| toolCount | 0.08 | Number of tools in request |
| toolComplexity | 0.10 | Weighted average of tool complexity (Bash=0.9, Write=0.8, Edit=0.7, Read=0.3, Glob/Grep=0.2) |
| toolChainPotential | 0.07 | Sequential operation indicators ("then", "after", "step 1") |

**Reasoning Requirements (25% total):**

| Dimension | Weight | Measures |
|-----------|--------|----------|
| multiStepReasoning | 0.10 | Step-by-step / planning patterns |
| codeGeneration | 0.08 | Code creation requests |
| analysisDepth | 0.07 | Trade-off / analysis patterns |

**Context Factors (15% total):**

| Dimension | Weight | Measures |
|-----------|--------|----------|
| conversationDepth | 0.05 | Message count in conversation |
| priorToolUsage | 0.05 | Tool results already in conversation |
| ambiguity | 0.05 | Inverse of request specificity |

Each dimension is scored 0-100 independently, then multiplied by its weight. The final score is the rounded sum.
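A minimal sketch of the weighted combination, using the thirteen dimensions tabulated above (the remaining advertised dimensions are not documented here, and per-dimension scoring is elided):

```js
// Weights from the tables above; each dimension is assumed pre-scored 0-100.
const WEIGHTS = {
  tokenCount: 0.08, promptComplexity: 0.10, technicalDepth: 0.10, domainSpecificity: 0.07,
  toolCount: 0.08, toolComplexity: 0.10, toolChainPotential: 0.07,
  multiStepReasoning: 0.10, codeGeneration: 0.08, analysisDepth: 0.07,
  conversationDepth: 0.05, priorToolUsage: 0.05, ambiguity: 0.05,
};

function weightedScore(dimensions) {
  let total = 0;
  for (const [name, weight] of Object.entries(WEIGHTS)) {
    total += (dimensions[name] ?? 0) * weight;
  }
  return Math.round(total); // weights sum to 1.0, so the result stays within 0-100
}
```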
### Phase 3: Metrics Tracking

Every routing decision is recorded in memory (last 1,000 decisions) for analytics:
- Total decisions, local vs. cloud split
- Average complexity score
- Per-provider and per-tier distribution

Metrics are exposed via the `/metrics` endpoint and `X-Lynkr-*` response headers.

### Phase 4: Embeddings-Based Similarity (Optional)

When an embeddings model is configured (`OLLAMA_EMBEDDINGS_MODEL`), the analyzer can compare request content against reference embeddings for complex and simple tasks using cosine similarity. This produces a score adjustment of -10 to +10 points.
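A minimal sketch of such an adjustment. Only the -10 to +10 clamp comes from the documentation; the reference vectors and the ×50 scaling factor are assumptions for illustration:

```js
// Cosine similarity of two equal-length embedding vectors.
function cosine(a, b) {
  let dot = 0, na = 0, nb = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    na += a[i] * a[i];
    nb += b[i] * b[i];
  }
  return dot / (Math.sqrt(na) * Math.sqrt(nb));
}

// Positive when the request is closer to "complex" exemplars, negative when
// closer to "simple" ones. The ×50 scaling is an illustrative assumption;
// the -10..+10 clamp matches the documented adjustment range.
function embeddingAdjustment(requestVec, complexRef, simpleRef) {
  const delta = (cosine(requestVec, complexRef) - cosine(requestVec, simpleRef)) * 50;
  return Math.max(-10, Math.min(10, Math.round(delta)));
}
```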
---

## Agentic Workflow Detection

The agentic detector identifies multi-step tool chains and autonomous agent patterns, boosting the complexity tier accordingly.

### Agent Types

| Type | Score Boost | Min Tier | Description |
|------|-------------|----------|-------------|
| **SINGLE_SHOT** | +0 | SIMPLE | Simple request-response, no tool chains |
| **TOOL_CHAIN** | +15 | MEDIUM | Sequential tool usage (read → edit → test) |
| **ITERATIVE** | +25 | COMPLEX | Retry loops, debugging cycles, iterative refinement |
| **AUTONOMOUS** | +35 | REASONING | Open-ended tasks, full autonomy, complex decision making |

### Detection Signals

The detector evaluates six signal categories:

**1. Tool Count**
- 4-5 tools: +8
- 6-10 tools: +15
- 11+ tools: +25

**2. Agentic Tools Present** (Bash, Write, Edit, Task, Git, Test)
- 1 agentic tool: +8
- 2-3 agentic tools: +15
- 4+ agentic tools: +25

**3. Prior Tool Results** (already in an agentic loop)
- 1-2 tool results: +10
- 3-5 tool results: +20
- 6+ tool results: +30

**4. Content Pattern Matching**
- Autonomous patterns ("figure out", "solve", "make it work"): +25
- Iterative patterns ("keep trying", "debug", "retry"): +20
- Tool chain patterns ("then use", "next step", "step 1"): +15
- Multi-file work: +15
- Planning required: +10
- Implementation + testing: +15

**5. Conversation Depth**
- 5-8 messages: +6
- 9-15 messages: +12
- 16+ messages: +20

**6. Content Length**
- 2,000+ characters: +10

### Classification Thresholds

| Agent Type | Score Threshold | Additional Conditions |
|------------|-----------------|-----------------------|
| AUTONOMOUS | >= 60 | or autonomous pattern + score >= 40 |
| ITERATIVE | >= 40 | or deep tool loop + score >= 30 |
| TOOL_CHAIN | >= 20 | or many agentic tools present |
| SINGLE_SHOT | < 20 | Default |

When an agentic workflow is detected (`score >= 25`), the complexity score is boosted by the agent type's `scoreBoost` value, and the tier is upgraded to at least the agent type's `minTier`, as in the sketch below.
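A minimal sketch of the thresholds and boost rules above. The boolean signals are assumed outputs of the pattern checks described earlier; names are hypothetical:

```js
const AGENT_TYPES = {
  AUTONOMOUS:  { scoreBoost: 35, minTier: "REASONING" },
  ITERATIVE:   { scoreBoost: 25, minTier: "COMPLEX" },
  TOOL_CHAIN:  { scoreBoost: 15, minTier: "MEDIUM" },
  SINGLE_SHOT: { scoreBoost: 0,  minTier: "SIMPLE" },
};

function classifyAgentType(score, signals) {
  if (score >= 60 || (signals.autonomousPattern && score >= 40)) return "AUTONOMOUS";
  if (score >= 40 || (signals.deepToolLoop && score >= 30)) return "ITERATIVE";
  if (score >= 20 || signals.manyAgenticTools) return "TOOL_CHAIN";
  return "SINGLE_SHOT";
}

// Detected workflows (score >= 25) boost complexity and raise the tier floor.
classifyAgentType(55, { autonomousPattern: false });
// → "ITERATIVE": +25 to the complexity score, tier upgraded to at least COMPLEX
```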
---

## Force Patterns

Certain requests bypass the scoring algorithm entirely (a matching sketch follows the lists):

### Force Local (always local model)
- Greetings: "hi", "hello", "thanks", "bye"
- Time queries: "what time is it"
- Confirmations: "yes", "no", "ok", "sure"
- Help requests: "help", "commands"

### Force Cloud (always cloud model)
- Security audits/reviews
- Architecture design/review
- Complete codebase refactoring
- Code/PR reviews
- Complex debugging
- Production incidents
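A minimal sketch of force-pattern matching. The regexes are illustrative stand-ins, not the exact patterns shipped in `src/routing`:

```js
// Illustrative force patterns; a match short-circuits complexity scoring.
const FORCE_LOCAL = [
  /^(hi|hello|thanks|bye)\b/i,
  /^what time is it/i,
  /^(yes|no|ok|sure)[.!]?$/i,
];
const FORCE_CLOUD = [
  /security (audit|review)/i,
  /architecture (design|review)/i,
  /production incident/i,
];

function forcedRoute(content) {
  const text = content.trim();
  if (FORCE_LOCAL.some((re) => re.test(text))) return "local";
  if (FORCE_CLOUD.some((re) => re.test(text))) return "cloud";
  return null; // fall through to complexity analysis
}
```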
---

## Cost Optimization

When `ROUTING_COST_OPTIMIZATION=true`, the router checks whether a cheaper model can handle the determined tier.

### Model Registry

Pricing data is fetched from three sources, in priority order:

1. **LiteLLM** (highest priority) - Community-maintained pricing from [BerriAI/litellm](https://github.com/BerriAI/litellm)
2. **models.dev** - API pricing aggregator
3. **Databricks fallback** - Hardcoded pricing for common models (Claude, Llama, GPT, Gemini, DBRX)

Pricing data is cached locally in `data/model-prices-cache.json` with a 24-hour TTL. Background refresh happens automatically when the cache is stale.
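A minimal sketch of the staleness check; the `fetchedAt` field name is an assumption about the cache layout:

```js
const fs = require("node:fs");

const CACHE_PATH = "data/model-prices-cache.json";
const TTL_MS = 24 * 60 * 60 * 1000; // 24-hour TTL, per the documentation

function isCacheFresh() {
  try {
    const { fetchedAt } = JSON.parse(fs.readFileSync(CACHE_PATH, "utf-8"));
    return Date.now() - fetchedAt < TTL_MS;
  } catch {
    return false; // missing/unreadable cache → treat as stale and refresh
  }
}
```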
### Cost Tracking

The optimizer tracks costs at both session and global levels (see the sketch below):
- Per-request cost recording (input + output tokens)
- Per-model, per-provider, per-tier breakdowns
- Savings calculation when routing to cheaper alternatives
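A minimal sketch of per-request cost recording, assuming prices are expressed per million tokens (an assumption about the cached pricing format):

```js
// Cost of one request given token counts and per-million-token prices.
function requestCost(inputTokens, outputTokens, price) {
  return (inputTokens * price.inputPerMTok + outputTokens * price.outputPerMTok) / 1e6;
}

// e.g. 1,200 input + 300 output tokens at $3 / $15 per 1M tokens:
requestCost(1200, 300, { inputPerMTok: 3, outputPerMTok: 15 }); // 0.0081 → ~$0.008
```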
### Pricing Lookup

The registry supports flexible model name lookup (sketched below):
- Direct match: `gpt-4o`
- Provider prefix stripping: `databricks-claude-sonnet-4-5` → `claude-sonnet-4-5`
- Fuzzy matching for partial names
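A minimal sketch of those fallbacks applied in order (the prefix list and fuzzy rule are illustrative):

```js
// registry maps canonical model names → pricing entries.
function lookupPrice(registry, name) {
  if (registry[name]) return registry[name];                      // 1. direct match
  const stripped = name.replace(/^(databricks|azure|aws)-/, "");  // 2. strip provider prefix
  if (registry[stripped]) return registry[stripped];
  const key = Object.keys(registry).find(                         // 3. fuzzy partial match
    (k) => k.includes(stripped) || stripped.includes(k)
  );
  return key ? registry[key] : null;
}
```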
---

## Routing Headers

Every response includes routing metadata in `X-Lynkr-*` headers:

| Header | Description | Example |
|--------|-------------|---------|
| `X-Lynkr-Routing-Method` | How the decision was made | `tier_config`, `force`, `tool_threshold`, `agentic`, `cost_optimized` |
| `X-Lynkr-Provider` | Selected provider | `databricks`, `ollama`, `openrouter` |
| `X-Lynkr-Complexity-Score` | Complexity score (0-100) | `42` |
| `X-Lynkr-Complexity-Threshold` | Score threshold for cloud routing | `40` |
| `X-Lynkr-Routing-Reason` | Human-readable reason | `force_local_pattern`, `autonomous_workflow` |
| `X-Lynkr-Tier` | Selected model tier | `SIMPLE`, `MEDIUM`, `COMPLEX`, `REASONING` |
| `X-Lynkr-Model` | Selected model | `llama3.2`, `gpt-4o`, `claude-opus-4-6` |
| `X-Lynkr-Agentic` | Agentic workflow type (if detected) | `TOOL_CHAIN`, `ITERATIVE`, `AUTONOMOUS` |
| `X-Lynkr-Cost-Optimized` | Whether cost optimization was applied | `true` |
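A usage sketch for inspecting these headers from a client (Node 18+). It assumes Lynkr is listening on `localhost:8080` and exposes an Anthropic-style `/v1/messages` endpoint; adjust both to your deployment:

```js
// Run as an ES module (top-level await) against a local Lynkr instance.
const res = await fetch("http://localhost:8080/v1/messages", {
  method: "POST",
  headers: { "content-type": "application/json" },
  body: JSON.stringify({
    model: "claude-sonnet-4-5",
    max_tokens: 64,
    messages: [{ role: "user", content: "Hello" }],
  }),
});

console.log(res.headers.get("x-lynkr-tier"));             // e.g. "SIMPLE"
console.log(res.headers.get("x-lynkr-provider"));         // e.g. "ollama"
console.log(res.headers.get("x-lynkr-complexity-score")); // e.g. "5"
```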
---

## Configuration Reference

### Environment Variables

| Variable | Default | Description |
|----------|---------|-------------|
| `TIER_SIMPLE` | *required* | Model for simple tier (`provider:model`) |
| `TIER_MEDIUM` | *required* | Model for medium tier (`provider:model`) |
| `TIER_COMPLEX` | *required* | Model for complex tier (`provider:model`) |
| `TIER_REASONING` | *required* | Model for reasoning tier (`provider:model`) |
| `SMART_TOOL_SELECTION_MODE` | `heuristic` | Scoring mode: `aggressive` (threshold=60), `heuristic` (threshold=40), `conservative` (threshold=25) |
| `ROUTING_WEIGHTED_SCORING` | `false` | Enable 15-dimension weighted scoring |
| `ROUTING_AGENTIC_DETECTION` | `true` | Enable agentic workflow detection |
| `ROUTING_COST_OPTIMIZATION` | `false` | Enable cost-based model selection |
| `OLLAMA_MAX_TOOLS_FOR_ROUTING` | `3` | Max tools before routing away from Ollama |
| `OPENROUTER_MAX_TOOLS_FOR_ROUTING` | `15` | Max tools before routing away from OpenRouter |
| `OLLAMA_EMBEDDINGS_MODEL` | *(none)* | Embeddings model for Phase 4 similarity |

### Smart Tool Selection Modes

| Mode | Threshold | Behavior |
|------|-----------|----------|
| `aggressive` | 60 | More requests stay local (saves cost) |
| `heuristic` | 40 | Balanced local/cloud split |
| `conservative` | 25 | More requests go to cloud (better quality) |

---

## Routing Decision Flow

```
1. Are all 4 TIER_* env vars configured?
   └─ No → Return static provider (MODEL_PROVIDER), skip all routing

2. Does content match FORCE_LOCAL patterns?
   └─ Yes → Route to local provider

3. Does content match FORCE_CLOUD patterns?
   └─ Yes → Route to best cloud provider (requires FALLBACK_ENABLED)

4. Analyze complexity:
   └─ Calculate score 0-100 (standard or weighted mode)

5. Optional: Embeddings adjustment:
   └─ Adjust score by -10 to +10 based on semantic similarity

6. Agentic detection:
   └─ If agentic → Boost score, enforce minimum tier
   └─ If AUTONOMOUS → Force cloud provider

7. Map score to tier (SIMPLE/MEDIUM/COMPLEX/REASONING)

8. Select provider:model from matching TIER_* env var

9. Optional: Cost optimization
   └─ Check for cheaper model that can handle the tier

10. Return { provider, model, tier, score, method }
```

---

## Source Files

| File | Description |
|------|-------------|
| `src/routing/index.js` | Main routing orchestrator (`determineProviderSmart()`) |
| `src/routing/complexity-analyzer.js` | 4-phase complexity analysis, 15-dimension weighted scoring |
| `src/routing/agentic-detector.js` | Agentic workflow detection and classification |
| `src/routing/model-tiers.js` | Tier definitions, model selection from `TIER_*` env vars |
| `src/routing/model-registry.js` | Multi-source pricing (LiteLLM, models.dev, Databricks fallback) |
| `src/routing/cost-optimizer.js` | Cost tracking, cheapest model finder, savings calculation |

---

## Next Steps

- **[Features Overview](features.md)** - Architecture and request flow
- **[Token Optimization](token-optimization.md)** - Cost reduction strategies
- **[Provider Configuration](providers.md)** - Setting up providers
- **[Production Guide](production.md)** - Deploy with routing enabled
@@ -252,14 +252,16 @@ LOG_LEVEL=info
 # No configuration needed
 ```
 
-### 2. Use
+### 2. Use Tier-Based Routing
 
 ```bash
-# Route simple requests to free Ollama
-
+# Route simple requests to free Ollama, complex to cloud
+# Set all 4 TIER_* env vars to enable tier-based routing
+TIER_SIMPLE=ollama:llama3.2
+TIER_MEDIUM=openrouter:openai/gpt-4o-mini
+TIER_COMPLEX=azure-openai:gpt-4o
+TIER_REASONING=azure-openai:gpt-4o
 FALLBACK_ENABLED=true
-
-# Complex requests automatically go to cloud
 FALLBACK_PROVIDER=databricks
 ```
 
package/documentation/troubleshooting.md
CHANGED
@@ -400,6 +400,75 @@ Error: Cannot find module 'xxx'
 
 ---
 
+### Moonshot AI (Kimi)
+
+**Issue:** Rate limited (429)
+
+**Symptoms:**
+- `429 Too Many Requests`
+- `Rate limit exceeded`
+- Responses failing intermittently
+
+**Solutions:**
+
+1. **Reduce concurrency:**
+   Moonshot has a max concurrency of ~3 requests. Lynkr retries automatically with backoff, but sustained high concurrency will trigger 429s.
+
+2. **Use turbo model:**
+   ```bash
+   # Turbo has higher rate limits than the thinking model
+   export MOONSHOT_MODEL=kimi-k2-turbo-preview
+   ```
+
+3. **Enable fallback:**
+   ```bash
+   export FALLBACK_ENABLED=true
+   export FALLBACK_PROVIDER=openrouter
+   ```
+
+**Issue:** Authentication failed
+
+**Symptoms:**
+- `401 Unauthorized`
+- `Invalid API key`
+
+**Solutions:**
+
+1. **Check API key format:**
+   ```bash
+   echo $MOONSHOT_API_KEY
+   # Should start with: sk-
+   ```
+
+2. **Regenerate API key:**
+   - Visit [platform.moonshot.ai](https://platform.moonshot.ai)
+   - Generate a new key
+   - Update environment variable
+
+3. **Check endpoint:**
+   ```bash
+   echo $MOONSHOT_ENDPOINT
+   # Should be: https://api.moonshot.ai/v1/chat/completions
+   ```
+
+**Issue:** Reasoning content displayed in output
+
+**Symptoms:**
+- Response includes chain-of-thought text before the actual answer
+- Long preambles like "The user is asking me to..."
+
+**Solutions:**
+
+This happens when using the `kimi-k2-thinking` model. Lynkr should automatically strip reasoning content and show only the final answer. If you still see reasoning in the output:
+
+1. **Update Lynkr** to the latest version
+2. **Switch to the turbo model** if reasoning output is not needed:
+   ```bash
+   export MOONSHOT_MODEL=kimi-k2-turbo-preview
+   ```
+
+---
+
 ### llama.cpp
 
 **Issue:** Server not responding
@@ -556,9 +625,13 @@
 export OLLAMA_MODEL=llama3.1:8b
 ```
 
-3. **Enable
+3. **Enable tier-based routing:**
 ```bash
-
+# Set all 4 TIER_* env vars to enable tier-based routing
+export TIER_SIMPLE=ollama:llama3.2
+export TIER_MEDIUM=openrouter:openai/gpt-4o-mini
+export TIER_COMPLEX=azure-openai:gpt-4o
+export TIER_REASONING=azure-openai:gpt-4o
 export FALLBACK_ENABLED=true
 ```
 
@@ -746,10 +819,13 @@ Restart Lynkr after configuration.
 export LOAD_SHEDDING_ACTIVE_REQUESTS_THRESHOLD=100
 ```
 
-2. **Use
+2. **Use tier-based routing to send simple requests to local models:**
 ```bash
-
-export
+# Set all 4 TIER_* env vars to enable tier-based routing
+export TIER_SIMPLE=ollama:llama3.2
+export TIER_MEDIUM=openrouter:openai/gpt-4o-mini
+export TIER_COMPLEX=azure-openai:gpt-4o
+export TIER_REASONING=azure-openai:gpt-4o
 ```
 
 3. **Enable circuit breaker:**
package/install.sh
CHANGED
@@ -134,10 +134,15 @@ MODEL_PROVIDER=ollama
 PORT=8080
 
 # Ollama Configuration (default for local development)
-PREFER_OLLAMA=true
 OLLAMA_MODEL=qwen2.5-coder:7b
 OLLAMA_ENDPOINT=http://localhost:11434
 
+# Tier-based routing (uncomment and configure to enable)
+# TIER_SIMPLE=ollama:qwen2.5-coder:7b
+# TIER_MEDIUM=ollama:qwen2.5-coder:7b
+# TIER_COMPLEX=ollama:qwen2.5-coder:7b
+# TIER_REASONING=ollama:qwen2.5-coder:7b
+
 # Long-Term Memory System (Titans-Inspired) - Enabled by default
 MEMORY_ENABLED=true
 MEMORY_RETRIEVAL_LIMIT=5
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "lynkr",
-  "version": "7.2.4",
+  "version": "8.0.0",
   "description": "Self-hosted Claude Code & Cursor proxy with Databricks,AWS BedRock,Azure adapters, openrouter, Ollama,llamacpp,LM Studio, workspace tooling, and MCP integration.",
   "main": "index.js",
   "bin": {
@@ -14,7 +14,7 @@
     "dev": "nodemon index.js",
     "lint": "eslint src index.js",
     "test": "npm run test:unit && npm run test:performance",
-    "test:unit": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test test/routing.test.js test/hybrid-routing-integration.test.js test/web-tools.test.js test/passthrough-mode.test.js test/openrouter-error-resilience.test.js test/format-conversion.test.js test/azure-openai-config.test.js test/azure-openai-format-conversion.test.js test/azure-openai-routing.test.js test/azure-openai-streaming.test.js test/azure-openai-error-resilience.test.js test/azure-openai-integration.test.js test/openai-integration.test.js test/llamacpp-integration.test.js test/memory/store.test.js test/memory/surprise.test.js test/memory/extractor.test.js test/memory/search.test.js test/memory/retriever.test.js",
+    "test:unit": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test test/routing.test.js test/hybrid-routing-integration.test.js test/web-tools.test.js test/passthrough-mode.test.js test/openrouter-error-resilience.test.js test/format-conversion.test.js test/azure-openai-config.test.js test/azure-openai-format-conversion.test.js test/azure-openai-routing.test.js test/azure-openai-streaming.test.js test/azure-openai-error-resilience.test.js test/azure-openai-integration.test.js test/openai-integration.test.js test/toon-compression.test.js test/llamacpp-integration.test.js test/memory/store.test.js test/memory/surprise.test.js test/memory/extractor.test.js test/memory/search.test.js test/memory/retriever.test.js",
     "test:memory": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test test/memory/store.test.js test/memory/surprise.test.js test/memory/extractor.test.js test/memory/search.test.js test/memory/retriever.test.js",
     "test:new-features": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test test/passthrough-mode.test.js test/openrouter-error-resilience.test.js test/format-conversion.test.js",
     "test:performance": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node test/hybrid-routing-performance.test.js && DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node test/performance-tests.js",
@@ -47,9 +47,9 @@
     "@azure/openai": "^2.0.0",
     "@babel/parser": "^7.29.0",
     "@babel/traverse": "^7.29.0",
+    "@toon-format/toon": "^2.1.0",
     "compression": "^1.7.4",
     "diff": "^5.2.0",
-    "dockerode": "^4.0.2",
     "dotenv": "^16.4.5",
     "express": "^5.1.0",
     "express-rate-limit": "^8.2.1",
@@ -58,10 +58,12 @@
     "openai": "^6.14.0",
     "pino": "^8.17.2",
     "pino-http": "^8.6.0",
+    "pino-roll": "^4.0.0",
     "undici": "^6.22.0"
   },
   "optionalDependencies": {
     "better-sqlite3": "^12.6.2",
+    "dockerode": "^4.0.2",
     "tree-sitter": "^0.21.1",
     "tree-sitter-javascript": "^0.21.0",
     "tree-sitter-python": "^0.21.0",
package/scripts/setup.js
CHANGED
@@ -251,7 +251,6 @@ async function createEnvFile() {
   if (ollamaOnly) {
     let envContent = fs.readFileSync(envPath, "utf-8");
     envContent = envContent.replace(/^# MODEL_PROVIDER=databricks/m, "MODEL_PROVIDER=ollama");
-    envContent = envContent.replace(/^PREFER_OLLAMA=true/m, "# PREFER_OLLAMA=true # Not needed when MODEL_PROVIDER=ollama");
     envContent = envContent.replace(/^FALLBACK_ENABLED=true/m, "FALLBACK_ENABLED=false");
     fs.writeFileSync(envPath, envContent);
   }