lynkr 7.2.4 → 8.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. package/README.md +2 -2
  2. package/config/model-tiers.json +89 -0
  3. package/docs/docs.html +1 -0
  4. package/docs/index.md +7 -0
  5. package/docs/toon-integration-spec.md +130 -0
  6. package/documentation/README.md +3 -2
  7. package/documentation/claude-code-cli.md +23 -16
  8. package/documentation/cursor-integration.md +17 -14
  9. package/documentation/docker.md +11 -4
  10. package/documentation/embeddings.md +7 -5
  11. package/documentation/faq.md +66 -12
  12. package/documentation/features.md +22 -15
  13. package/documentation/installation.md +66 -14
  14. package/documentation/production.md +43 -8
  15. package/documentation/providers.md +145 -42
  16. package/documentation/routing.md +476 -0
  17. package/documentation/token-optimization.md +7 -5
  18. package/documentation/troubleshooting.md +81 -5
  19. package/install.sh +6 -1
  20. package/package.json +5 -3
  21. package/scripts/setup.js +0 -1
  22. package/src/agents/executor.js +14 -6
  23. package/src/api/middleware/session.js +15 -2
  24. package/src/api/openai-router.js +130 -37
  25. package/src/api/providers-handler.js +15 -1
  26. package/src/api/router.js +107 -2
  27. package/src/budget/index.js +4 -3
  28. package/src/clients/databricks.js +431 -234
  29. package/src/clients/gpt-utils.js +181 -0
  30. package/src/clients/ollama-utils.js +66 -140
  31. package/src/clients/routing.js +0 -1
  32. package/src/clients/standard-tools.js +82 -5
  33. package/src/config/index.js +119 -35
  34. package/src/context/toon.js +173 -0
  35. package/src/headroom/launcher.js +8 -3
  36. package/src/logger/index.js +23 -0
  37. package/src/orchestrator/index.js +765 -212
  38. package/src/routing/agentic-detector.js +320 -0
  39. package/src/routing/complexity-analyzer.js +202 -2
  40. package/src/routing/cost-optimizer.js +305 -0
  41. package/src/routing/index.js +168 -159
  42. package/src/routing/model-registry.js +437 -0
  43. package/src/routing/model-tiers.js +365 -0
  44. package/src/server.js +2 -2
  45. package/src/sessions/cleanup.js +3 -3
  46. package/src/sessions/record.js +10 -1
  47. package/src/sessions/store.js +7 -2
  48. package/src/tools/agent-task.js +48 -1
  49. package/src/tools/index.js +15 -2
  50. package/src/tools/workspace.js +35 -4
  51. package/src/workspace/index.js +30 -0
  52. package/te +11622 -0
  53. package/test/README.md +1 -1
  54. package/test/azure-openai-config.test.js +17 -8
  55. package/test/azure-openai-integration.test.js +7 -1
  56. package/test/azure-openai-routing.test.js +41 -43
  57. package/test/bedrock-integration.test.js +18 -32
  58. package/test/hybrid-routing-integration.test.js +35 -20
  59. package/test/hybrid-routing-performance.test.js +74 -64
  60. package/test/llamacpp-integration.test.js +28 -9
  61. package/test/lmstudio-integration.test.js +20 -8
  62. package/test/openai-integration.test.js +17 -20
  63. package/test/performance-tests.js +1 -1
  64. package/test/routing.test.js +65 -59
  65. package/test/toon-compression.test.js +131 -0
  66. package/CLAWROUTER_ROUTING_PLAN.md +0 -910
  67. package/ROUTER_COMPARISON.md +0 -173
  68. package/TIER_ROUTING_PLAN.md +0 -771
@@ -0,0 +1,476 @@
1
+ # Intelligent Routing & Model Tiering
2
+
3
+ Lynkr's intelligent routing system automatically selects the optimal model and provider for each request based on complexity analysis, agentic workflow detection, and cost optimization.
4
+
5
+ ---
6
+
7
+ ## Overview
8
+
9
+ ```
10
+ Request → Force Patterns → Tool Thresholds → Complexity Analysis → Agentic Detection → Tier Selection → Cost Optimization → Provider
11
+ ```
12
+
13
+ The routing pipeline evaluates every incoming request through multiple stages to determine which model tier and provider should handle it. Simple requests go to cheap/local models, while complex ones go to powerful cloud models.
14
+
15
+ **Key benefits:**
16
+ - 60-80% cost reduction by routing simple tasks to cheaper models
17
+ - Better quality on complex tasks by using capable models when needed
18
+ - Automatic agentic workflow detection with tier upgrades
19
+ - Multi-source pricing for optimal cost decisions
20
+
21
+ ---
22
+
23
+ ## 4-Tier Model System
24
+
25
+ Every request is mapped to one of four complexity tiers:
26
+
27
+ | Tier | Score Range | Description | Example Tasks |
28
+ |------|-----------|-------------|---------------|
29
+ | **SIMPLE** | 0-25 | Greetings, simple Q&A, confirmations | "Hello", "What is a variable?", "Yes" |
30
+ | **MEDIUM** | 26-50 | Code reading, simple edits, research | "Read this file", "Fix this typo", "Search for X" |
31
+ | **COMPLEX** | 51-75 | Multi-file changes, debugging, architecture | "Refactor auth module", "Debug this race condition" |
32
+ | **REASONING** | 76-100 | Complex analysis, security audits, novel problems | "Security audit", "Design microservices architecture" |
33
+
34
+ ### Configuration
35
+
36
+ Tiers are configured via mandatory environment variables in `provider:model` format:
37
+
38
+ ```bash
39
+ # Required - one per tier
40
+ TIER_SIMPLE=ollama:llama3.2
41
+ TIER_MEDIUM=openai:gpt-4o
42
+ TIER_COMPLEX=openai:o1-mini
43
+ TIER_REASONING=openai:o1
44
+
45
+ # Examples with other providers
46
+ TIER_SIMPLE=ollama:qwen2.5-coder
47
+ TIER_MEDIUM=databricks:databricks-claude-sonnet-4-5
48
+ TIER_COMPLEX=azure-openai:gpt-5.2-chat
49
+ TIER_REASONING=databricks:databricks-claude-opus-4-6
50
+ ```
51
+
52
+ If a model name is given without a provider prefix, the default provider (`MODEL_PROVIDER`) is used.
53
+
54
+ ### Routing Precedence
55
+
56
+ There are three routing-related settings. Here is exactly how they interact:
57
+
58
+ #### 1. `TIER_*` Environment Variables (Highest Priority)
59
+
60
+ When **all four** `TIER_*` vars are set (`TIER_SIMPLE`, `TIER_MEDIUM`, `TIER_COMPLEX`, `TIER_REASONING`), tiered routing is **active**. Every incoming request is scored for complexity (0-100), mapped to a tier, and routed to the `provider:model` specified in the matching `TIER_*` var.
61
+
62
+ In this mode, `MODEL_PROVIDER` is **not consulted** for routing decisions. The provider comes directly from the `TIER_*` value (e.g., `ollama:llama3.2` routes to Ollama, `openai:gpt-4o` routes to OpenAI).
63
+
64
+ If any of the four `TIER_*` vars are missing, tiered routing is **completely disabled** and the system falls back to `MODEL_PROVIDER`.
65
+
66
+ #### 2. `MODEL_PROVIDER` (Default / Fallback)
67
+
68
+ `MODEL_PROVIDER` controls routing in two scenarios:
69
+
70
+ - **When tiered routing is disabled** (any `TIER_*` var missing) — all requests go to the provider set in `MODEL_PROVIDER`, regardless of complexity. This is static routing.
71
+ - **When a `TIER_*` value has no provider prefix** (e.g., `TIER_SIMPLE=llama3.2` instead of `TIER_SIMPLE=ollama:llama3.2`) — `MODEL_PROVIDER` is used as the default provider for that tier.
72
+
73
+ Even when tiered routing is active and overrides it for request routing, `MODEL_PROVIDER` is still used for:
74
+ - **Startup checks** — e.g., if `MODEL_PROVIDER=ollama`, the server waits for Ollama to be reachable before accepting requests
75
+ - **Provider discovery API** (`/v1/providers`) — marks which provider is "primary" in the response
76
+ - **Embeddings routing** — the OpenAI-compatible router checks `MODEL_PROVIDER` for embedding provider selection
77
+
78
+ **Always set `MODEL_PROVIDER`** even when using tier routing.
79
+
80
+ #### 3. `PREFER_OLLAMA` (Removed)
81
+
82
+ `PREFER_OLLAMA` is **deprecated and has no effect**. If set, a warning is logged at startup:
83
+
84
+ ```
85
+ [DEPRECATION] PREFER_OLLAMA is removed. Use TIER_* env vars for routing.
86
+ ```
87
+
88
+ To route simple requests to Ollama, use `TIER_SIMPLE=ollama:<model>` instead.
89
+
90
+ #### Summary Table
91
+
92
+ | Configuration | Routing Behavior |
93
+ |---|---|
94
+ | All 4 `TIER_*` set | Tier routing active. Each request scored and routed to its tier's `provider:model`. `MODEL_PROVIDER` ignored for routing. |
95
+ | 1-3 `TIER_*` set | Tier routing **disabled**. All requests go to `MODEL_PROVIDER` (static). |
96
+ | No `TIER_*` set | Static routing. All requests go to `MODEL_PROVIDER`. |
97
+ | `TIER_*` value without provider prefix | `MODEL_PROVIDER` used as the default provider for that tier. |
98
+ | `PREFER_OLLAMA` set | No effect. Deprecation warning logged. |
99
+
100
+ #### Example: Mixed Local + Cloud Setup
101
+
102
+ ```bash
103
+ MODEL_PROVIDER=ollama # Startup checks + default provider
104
+ TIER_SIMPLE=ollama:llama3.2 # Score 0-25 → Ollama (free, local)
105
+ TIER_MEDIUM=openai:gpt-4o # Score 26-50 → OpenAI
106
+ TIER_COMPLEX=databricks:claude-sonnet-4-5 # Score 51-75 → Databricks
107
+ TIER_REASONING=databricks:claude-opus-4-6 # Score 76-100 → Databricks
108
+ ```
109
+
110
+ In this setup, a "Hello" message (score ~5) routes to Ollama. A "Refactor the auth module" message (score ~65) routes to Databricks. `MODEL_PROVIDER=ollama` ensures the server waits for Ollama at startup but does not affect where complex requests go.
111
+
112
+ ### Tier Config File
113
+
114
+ Additional tier preferences (fallback models per provider) can be defined in `config/model-tiers.json`:
115
+
116
+ ```json
117
+ {
118
+ "tiers": {
119
+ "SIMPLE": { "preferred": { "ollama": ["llama3.2"], "openai": ["gpt-4o-mini"] } },
120
+ "MEDIUM": { "preferred": { "openai": ["gpt-4o"], "anthropic": ["claude-sonnet-4-20250514"] } },
121
+ "COMPLEX": { "preferred": { "openai": ["o1-mini"], "anthropic": ["claude-sonnet-4-20250514"] } },
122
+ "REASONING": { "preferred": { "openai": ["o1"], "anthropic": ["claude-opus-4-20250514"] } }
123
+ },
124
+ "localProviders": {
125
+ "ollama": { "free": true, "defaultTier": "SIMPLE" },
126
+ "llamacpp": { "free": true, "defaultTier": "SIMPLE" },
127
+ "lmstudio": { "free": true, "defaultTier": "SIMPLE" }
128
+ }
129
+ }
130
+ ```
131
+
132
+ ---
133
+
134
+ ## Complexity Scoring Algorithm
135
+
136
+ The complexity analyzer implements 4 phases to produce a score from 0-100.
137
+
138
+ ### Phase 1: Basic Scoring
139
+
140
+ Three components scored independently:
141
+
142
+ **Token Count (0-20 points):**
143
+
144
+ | Tokens | Score |
145
+ |--------|-------|
146
+ | < 500 | 0 |
147
+ | 500-999 | 4 |
148
+ | 1,000-1,999 | 8 |
149
+ | 2,000-3,999 | 12 |
150
+ | 4,000-7,999 | 16 |
151
+ | 8,000+ | 20 |
152
+
153
+ **Tool Count (0-20 points):**
154
+
155
+ | Tools | Score |
156
+ |-------|-------|
157
+ | 0 | 0 |
158
+ | 1-3 | 4 |
159
+ | 4-6 | 8 |
160
+ | 7-10 | 12 |
161
+ | 11-15 | 16 |
162
+ | 16+ | 20 |
163
+
164
+ **Task Type (0-25 points):**
165
+ - Greetings / yes-no: 0-2
166
+ - Simple questions: 3
167
+ - General non-technical: 5
168
+ - Technical content: 10
169
+ - Refactoring: 16
170
+ - New implementation: 18
171
+ - From scratch: 20
172
+ - Entire codebase scope: 22
173
+ - Force cloud patterns (security audit, architecture review): 25
174
+
175
+ ### Phase 2: Advanced Classification
176
+
177
+ Additional scoring on top of Phase 1:
178
+
179
+ **Code Complexity (0-20 points):**
180
+
181
+ | Pattern | Points |
182
+ |---------|--------|
183
+ | Multi-file operations | +5 |
184
+ | Architecture concerns | +5 |
185
+ | Security | +4 |
186
+ | Concurrency | +3 |
187
+ | Performance | +3 |
188
+ | Database operations | +3 |
189
+ | Testing | +2 |
190
+
191
+ **Reasoning Requirements (0-15 points):**
192
+
193
+ | Pattern | Points |
194
+ |---------|--------|
195
+ | Step-by-step reasoning | +4 |
196
+ | Trade-off analysis | +4 |
197
+ | General analysis | +3 |
198
+ | Planning | +3 |
199
+ | Edge cases | +2 |
200
+
201
+ **Conversation Bonus:**
202
+ - 6-10 messages: +2
203
+ - 11+ messages: +5
204
+
205
+ The standard score is the sum of all components, capped at 100.
206
+
207
+ ### Weighted Scoring Mode (15 Dimensions)
208
+
209
+ When `ROUTING_WEIGHTED_SCORING=true`, the analyzer uses a 15-dimension weighted scoring system instead of the standard additive scoring:
210
+
211
+ ```
212
+ Score = Sum of (dimension_value * weight) for all 15 dimensions
213
+ ```
214
+
215
+ #### Dimension Weights
216
+
217
+ **Content Analysis (35% total):**
218
+
219
+ | Dimension | Weight | Measures |
220
+ |-----------|--------|----------|
221
+ | tokenCount | 0.08 | Request size (token estimate) |
222
+ | promptComplexity | 0.10 | Sentence structure, average length |
223
+ | technicalDepth | 0.10 | Technical keyword density |
224
+ | domainSpecificity | 0.07 | Number of specialized domains (security, ML, distributed, database, frontend, devops) |
225
+
226
+ **Tool Analysis (25% total):**
227
+
228
+ | Dimension | Weight | Measures |
229
+ |-----------|--------|----------|
230
+ | toolCount | 0.08 | Number of tools in request |
231
+ | toolComplexity | 0.10 | Weighted average of tool complexity (Bash=0.9, Write=0.8, Edit=0.7, Read=0.3, Glob/Grep=0.2) |
232
+ | toolChainPotential | 0.07 | Sequential operation indicators ("then", "after", "step 1") |
233
+
234
+ **Reasoning Requirements (25% total):**
235
+
236
+ | Dimension | Weight | Measures |
237
+ |-----------|--------|----------|
238
+ | multiStepReasoning | 0.10 | Step-by-step / planning patterns |
239
+ | codeGeneration | 0.08 | Code creation requests |
240
+ | analysisDepth | 0.07 | Trade-off / analysis patterns |
241
+
242
+ **Context Factors (15% total):**
243
+
244
+ | Dimension | Weight | Measures |
245
+ |-----------|--------|----------|
246
+ | conversationDepth | 0.05 | Message count in conversation |
247
+ | priorToolUsage | 0.05 | Tool results already in conversation |
248
+ | ambiguity | 0.05 | Inverse of request specificity |
249
+
250
+ Each dimension is scored 0-100 independently, then multiplied by its weight. The final score is the rounded sum.
251
+
252
+ ### Phase 3: Metrics Tracking
253
+
254
+ Every routing decision is recorded in-memory (last 1,000 decisions) for analytics:
255
+ - Total decisions, local vs. cloud split
256
+ - Average complexity score
257
+ - Per-provider and per-tier distribution
258
+
259
+ Metrics are exposed via the `/metrics` endpoint and `X-Lynkr-*` response headers.
260
+
261
+ ### Phase 4: Embeddings-Based Similarity (Optional)
262
+
263
+ When an embeddings model is configured (`OLLAMA_EMBEDDINGS_MODEL`), the analyzer can compare request content against reference embeddings of known complex and simple tasks using cosine similarity. This produces a score adjustment of -10 to +10 points.
264
+
265
+ ---
266
+
267
+ ## Agentic Workflow Detection
268
+
269
+ The agentic detector identifies multi-step tool chains and autonomous agent patterns, boosting the complexity tier accordingly.
270
+
271
+ ### Agent Types
272
+
273
+ | Type | Score Boost | Min Tier | Description |
274
+ |------|------------|----------|-------------|
275
+ | **SINGLE_SHOT** | +0 | SIMPLE | Simple request-response, no tool chains |
276
+ | **TOOL_CHAIN** | +15 | MEDIUM | Sequential tool usage (read → edit → test) |
277
+ | **ITERATIVE** | +25 | COMPLEX | Retry loops, debugging cycles, iterative refinement |
278
+ | **AUTONOMOUS** | +35 | REASONING | Open-ended tasks, full autonomy, complex decision making |
279
+
280
+ ### Detection Signals
281
+
282
+ The detector evaluates 6 signal categories:
283
+
284
+ **1. Tool Count**
285
+ - 4-5 tools: +8
286
+ - 6-10 tools: +15
287
+ - 11+ tools: +25
288
+
289
+ **2. Agentic Tools Present** (Bash, Write, Edit, Task, Git, Test)
290
+ - 1 agentic tool: +8
291
+ - 2-3 agentic tools: +15
292
+ - 4+ agentic tools: +25
293
+
294
+ **3. Prior Tool Results** (already in an agentic loop)
295
+ - 1-2 tool results: +10
296
+ - 3-5 tool results: +20
297
+ - 6+ tool results: +30
298
+
299
+ **4. Content Pattern Matching**
300
+ - Autonomous patterns ("figure out", "solve", "make it work"): +25
301
+ - Iterative patterns ("keep trying", "debug", "retry"): +20
302
+ - Tool chain patterns ("then use", "next step", "step 1"): +15
303
+ - Multi-file work: +15
304
+ - Planning required: +10
305
+ - Implementation + testing: +15
306
+
307
+ **5. Conversation Depth**
308
+ - 5-8 messages: +6
309
+ - 9-15 messages: +12
310
+ - 16+ messages: +20
311
+
312
+ **6. Content Length**
313
+ - 2,000+ characters: +10
314
+
315
+ ### Classification Thresholds
316
+
317
+ | Agent Type | Score Threshold | Additional Conditions |
318
+ |------------|----------------|----------------------|
319
+ | AUTONOMOUS | >= 60 | or autonomous pattern + score >= 40 |
320
+ | ITERATIVE | >= 40 | or deep tool loop + score >= 30 |
321
+ | TOOL_CHAIN | >= 20 | or many agentic tools present |
322
+ | SINGLE_SHOT | < 20 | Default |
323
+
324
+ When an agentic workflow is detected (`score >= 25`), the complexity score is boosted by the agent type's `scoreBoost` value, and the tier is upgraded to at least the agent type's `minTier`.
325
+
326
+ ---
327
+
328
+ ## Force Patterns
329
+
330
+ Certain requests bypass the scoring algorithm entirely:
331
+
332
+ ### Force Local (always local model)
333
+ - Greetings: "hi", "hello", "thanks", "bye"
334
+ - Time queries: "what time is it"
335
+ - Confirmations: "yes", "no", "ok", "sure"
336
+ - Help requests: "help", "commands"
337
+
338
+ ### Force Cloud (always cloud model)
339
+ - Security audits/reviews
340
+ - Architecture design/review
341
+ - Complete codebase refactoring
342
+ - Code/PR reviews
343
+ - Complex debugging
344
+ - Production incidents
345
+
346
+ ---
347
+
348
+ ## Cost Optimization
349
+
350
+ When `ROUTING_COST_OPTIMIZATION=true`, the router checks if a cheaper model can handle the determined tier.
351
+
352
+ ### Model Registry
353
+
354
+ Pricing data is fetched from three sources (in priority order):
355
+
356
+ 1. **LiteLLM** (highest priority) - Community-maintained pricing from [BerriAI/litellm](https://github.com/BerriAI/litellm)
357
+ 2. **models.dev** - API pricing aggregator
358
+ 3. **Databricks Fallback** - Hardcoded pricing for common models (Claude, Llama, GPT, Gemini, DBRX)
359
+
360
+ Pricing data is cached locally in `data/model-prices-cache.json` with a 24-hour TTL. Background refresh happens automatically when the cache is stale.
361
+
362
+ ### Cost Tracking
363
+
364
+ The optimizer tracks costs at both session and global levels:
365
+ - Per-request cost recording (input + output tokens)
366
+ - Per-model, per-provider, per-tier breakdowns
367
+ - Savings calculation when routing to cheaper alternatives
368
+
369
+ ### Pricing Lookup
370
+
371
+ The registry supports flexible model name lookup:
372
+ - Direct match: `gpt-4o`
373
+ - Provider prefix stripping: `databricks-claude-sonnet-4-5` → `claude-sonnet-4-5`
374
+ - Fuzzy matching for partial names
375
+
376
+ ---
377
+
378
+ ## Routing Headers
379
+
380
+ Every response includes routing metadata in `X-Lynkr-*` headers:
381
+
382
+ | Header | Description | Example |
383
+ |--------|-------------|---------|
384
+ | `X-Lynkr-Routing-Method` | How the decision was made | `tier_config`, `force`, `tool_threshold`, `agentic`, `cost_optimized` |
385
+ | `X-Lynkr-Provider` | Selected provider | `databricks`, `ollama`, `openrouter` |
386
+ | `X-Lynkr-Complexity-Score` | Complexity score (0-100) | `42` |
387
+ | `X-Lynkr-Complexity-Threshold` | Score threshold for cloud routing | `40` |
388
+ | `X-Lynkr-Routing-Reason` | Human-readable reason | `force_local_pattern`, `autonomous_workflow` |
389
+ | `X-Lynkr-Tier` | Selected model tier | `SIMPLE`, `MEDIUM`, `COMPLEX`, `REASONING` |
390
+ | `X-Lynkr-Model` | Selected model | `llama3.2`, `gpt-4o`, `claude-opus-4-6` |
391
+ | `X-Lynkr-Agentic` | Agentic workflow type (if detected) | `TOOL_CHAIN`, `ITERATIVE`, `AUTONOMOUS` |
392
+ | `X-Lynkr-Cost-Optimized` | Whether cost optimization was applied | `true` |
393
+
394
+ ---
395
+
396
+ ## Configuration Reference
397
+
398
+ ### Environment Variables
399
+
400
+ | Variable | Default | Description |
401
+ |----------|---------|-------------|
402
+ | `TIER_SIMPLE` | *required* | Model for simple tier (`provider:model`) |
403
+ | `TIER_MEDIUM` | *required* | Model for medium tier (`provider:model`) |
404
+ | `TIER_COMPLEX` | *required* | Model for complex tier (`provider:model`) |
405
+ | `TIER_REASONING` | *required* | Model for reasoning tier (`provider:model`) |
406
+ | `SMART_TOOL_SELECTION_MODE` | `heuristic` | Scoring mode: `aggressive` (threshold=60), `heuristic` (threshold=40), `conservative` (threshold=25) |
407
+ | `ROUTING_WEIGHTED_SCORING` | `false` | Enable 15-dimension weighted scoring |
408
+ | `ROUTING_AGENTIC_DETECTION` | `true` | Enable agentic workflow detection |
409
+ | `ROUTING_COST_OPTIMIZATION` | `false` | Enable cost-based model selection |
410
+ | `OLLAMA_MAX_TOOLS_FOR_ROUTING` | `3` | Max tools before routing away from Ollama |
411
+ | `OPENROUTER_MAX_TOOLS_FOR_ROUTING` | `15` | Max tools before routing away from OpenRouter |
412
+ | `OLLAMA_EMBEDDINGS_MODEL` | *(none)* | Embeddings model for Phase 4 similarity |
413
+
414
+ ### Smart Tool Selection Modes
415
+
416
+ | Mode | Threshold | Behavior |
417
+ |------|-----------|----------|
418
+ | `aggressive` | 60 | More requests go to local (saves cost) |
419
+ | `heuristic` | 40 | Balanced local/cloud split |
420
+ | `conservative` | 25 | More requests go to cloud (better quality) |
421
+
422
+ ---
423
+
424
+ ## Routing Decision Flow
425
+
426
+ ```
427
+ 1. Are all 4 TIER_* env vars configured?
428
+ └─ No → Return static provider (MODEL_PROVIDER), skip all routing
429
+
430
+ 2. Does content match FORCE_LOCAL patterns?
431
+ └─ Yes → Route to local provider
432
+
433
+ 3. Does content match FORCE_CLOUD patterns?
434
+ └─ Yes → Route to best cloud provider (requires FALLBACK_ENABLED)
435
+
436
+ 4. Analyze complexity:
437
+ └─ Calculate score 0-100 (standard or weighted mode)
438
+
439
+ 5. Optional: Embeddings adjustment:
440
+ └─ Adjust score by -10 to +10 based on semantic similarity
441
+
442
+ 6. Agentic detection:
443
+ └─ If agentic → Boost score, enforce minimum tier
444
+ └─ If AUTONOMOUS → Force cloud provider
445
+
446
+ 7. Map score to tier (SIMPLE/MEDIUM/COMPLEX/REASONING)
447
+
448
+ 8. Select provider:model from matching TIER_* env var
449
+
450
+ 9. Optional: Cost optimization
451
+ └─ Check for cheaper model that can handle the tier
452
+
453
+ 10. Return { provider, model, tier, score, method }
454
+ ```
455
+
456
+ ---
457
+
458
+ ## Source Files
459
+
460
+ | File | Description |
461
+ |------|-------------|
462
+ | `src/routing/index.js` | Main routing orchestrator (`determineProviderSmart()`) |
463
+ | `src/routing/complexity-analyzer.js` | 4-phase complexity analysis, 15-dimension weighted scoring |
464
+ | `src/routing/agentic-detector.js` | Agentic workflow detection and classification |
465
+ | `src/routing/model-tiers.js` | Tier definitions, model selection from `TIER_*` env vars |
466
+ | `src/routing/model-registry.js` | Multi-source pricing (LiteLLM, models.dev, Databricks fallback) |
467
+ | `src/routing/cost-optimizer.js` | Cost tracking, cheapest model finder, savings calculation |
468
+
469
+ ---
470
+
471
+ ## Next Steps
472
+
473
+ - **[Features Overview](features.md)** - Architecture and request flow
474
+ - **[Token Optimization](token-optimization.md)** - Cost reduction strategies
475
+ - **[Provider Configuration](providers.md)** - Setting up providers
476
+ - **[Production Guide](production.md)** - Deploy with routing enabled
@@ -252,14 +252,16 @@ LOG_LEVEL=info
252
252
  # No configuration needed
253
253
  ```
254
254
 
255
- ### 2. Use Hybrid Routing
255
+ ### 2. Use Tier-Based Routing
256
256
 
257
257
  ```bash
258
- # Route simple requests to free Ollama
259
- PREFER_OLLAMA=true
258
+ # Route simple requests to free Ollama, complex to cloud
259
+ # Set all 4 TIER_* env vars to enable tier-based routing
260
+ TIER_SIMPLE=ollama:llama3.2
261
+ TIER_MEDIUM=openrouter:openai/gpt-4o-mini
262
+ TIER_COMPLEX=azure-openai:gpt-4o
263
+ TIER_REASONING=azure-openai:gpt-4o
260
264
  FALLBACK_ENABLED=true
261
-
262
- # Complex requests automatically go to cloud
263
265
  FALLBACK_PROVIDER=databricks
264
266
  ```
265
267
 
@@ -400,6 +400,75 @@ Error: Cannot find module 'xxx'
400
400
 
401
401
  ---
402
402
 
403
+ ### Moonshot AI (Kimi)
404
+
405
+ **Issue:** Rate limited (429)
406
+
407
+ **Symptoms:**
408
+ - `429 Too Many Requests`
409
+ - `Rate limit exceeded`
410
+ - Responses failing intermittently
411
+
412
+ **Solutions:**
413
+
414
+ 1. **Reduce concurrency:**
415
+ Moonshot has a max concurrency of ~3 requests. Lynkr retries automatically with backoff, but sustained high concurrency will trigger 429s.
416
+
417
+ 2. **Use turbo model:**
418
+ ```bash
419
+ # Turbo has higher rate limits than thinking model
420
+ export MOONSHOT_MODEL=kimi-k2-turbo-preview
421
+ ```
422
+
423
+ 3. **Enable fallback:**
424
+ ```bash
425
+ export FALLBACK_ENABLED=true
426
+ export FALLBACK_PROVIDER=openrouter
427
+ ```
428
+
429
+ **Issue:** Authentication failed
430
+
431
+ **Symptoms:**
432
+ - `401 Unauthorized`
433
+ - `Invalid API key`
434
+
435
+ **Solutions:**
436
+
437
+ 1. **Check API key format:**
438
+ ```bash
439
+ echo $MOONSHOT_API_KEY
440
+ # Should start with: sk-
441
+ ```
442
+
443
+ 2. **Regenerate API key:**
444
+ - Visit [platform.moonshot.ai](https://platform.moonshot.ai)
445
+ - Generate a new key
446
+ - Update environment variable
447
+
448
+ 3. **Check endpoint:**
449
+ ```bash
450
+ echo $MOONSHOT_ENDPOINT
451
+ # Should be: https://api.moonshot.ai/v1/chat/completions
452
+ ```
453
+
454
+ **Issue:** Reasoning content displayed in output
455
+
456
+ **Symptoms:**
457
+ - Response includes chain-of-thought text before the actual answer
458
+ - Long preambles like "The user is asking me to..."
459
+
460
+ **Solutions:**
461
+
462
+ This happens when using `kimi-k2-thinking` model. Lynkr should automatically strip reasoning content and only show the final answer. If you see reasoning in the output:
463
+
464
+ 1. **Update Lynkr** to the latest version
465
+ 2. **Switch to turbo model** if reasoning output is not needed:
466
+ ```bash
467
+ export MOONSHOT_MODEL=kimi-k2-turbo-preview
468
+ ```
469
+
470
+ ---
471
+
403
472
  ### llama.cpp
404
473
 
405
474
  **Issue:** Server not responding
@@ -556,9 +625,13 @@ Error: Cannot find module 'xxx'
556
625
  export OLLAMA_MODEL=llama3.1:8b
557
626
  ```
558
627
 
559
- 3. **Enable hybrid routing:**
628
+ 3. **Enable tier-based routing:**
560
629
  ```bash
561
- export PREFER_OLLAMA=true
630
+ # Set all 4 TIER_* env vars to enable tier-based routing
631
+ export TIER_SIMPLE=ollama:llama3.2
632
+ export TIER_MEDIUM=openrouter:openai/gpt-4o-mini
633
+ export TIER_COMPLEX=azure-openai:gpt-4o
634
+ export TIER_REASONING=azure-openai:gpt-4o
562
635
  export FALLBACK_ENABLED=true
563
636
  ```
564
637
 
@@ -746,10 +819,13 @@ Restart Lynkr after configuration.
746
819
  export LOAD_SHEDDING_ACTIVE_REQUESTS_THRESHOLD=100
747
820
  ```
748
821
 
749
- 2. **Use local provider for simple requests:**
822
+ 2. **Use tier-based routing to send simple requests to local models:**
750
823
  ```bash
751
- export PREFER_OLLAMA=true
752
- export OLLAMA_MODEL=llama3.1:8b
824
+ # Set all 4 TIER_* env vars to enable tier-based routing
825
+ export TIER_SIMPLE=ollama:llama3.2
826
+ export TIER_MEDIUM=openrouter:openai/gpt-4o-mini
827
+ export TIER_COMPLEX=azure-openai:gpt-4o
828
+ export TIER_REASONING=azure-openai:gpt-4o
753
829
  ```
754
830
 
755
831
  3. **Enable circuit breaker:**
package/install.sh CHANGED
@@ -134,10 +134,15 @@ MODEL_PROVIDER=ollama
134
134
  PORT=8080
135
135
 
136
136
  # Ollama Configuration (default for local development)
137
- PREFER_OLLAMA=true
138
137
  OLLAMA_MODEL=qwen2.5-coder:7b
139
138
  OLLAMA_ENDPOINT=http://localhost:11434
140
139
 
140
+ # Tier-based routing (uncomment and configure to enable)
141
+ # TIER_SIMPLE=ollama:qwen2.5-coder:7b
142
+ # TIER_MEDIUM=ollama:qwen2.5-coder:7b
143
+ # TIER_COMPLEX=ollama:qwen2.5-coder:7b
144
+ # TIER_REASONING=ollama:qwen2.5-coder:7b
145
+
141
146
  # Long-Term Memory System (Titans-Inspired) - Enabled by default
142
147
  MEMORY_ENABLED=true
143
148
  MEMORY_RETRIEVAL_LIMIT=5
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "lynkr",
3
- "version": "7.2.4",
3
+ "version": "8.0.0",
4
4
  "description": "Self-hosted Claude Code & Cursor proxy with Databricks,AWS BedRock,Azure adapters, openrouter, Ollama,llamacpp,LM Studio, workspace tooling, and MCP integration.",
5
5
  "main": "index.js",
6
6
  "bin": {
@@ -14,7 +14,7 @@
14
14
  "dev": "nodemon index.js",
15
15
  "lint": "eslint src index.js",
16
16
  "test": "npm run test:unit && npm run test:performance",
17
- "test:unit": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test test/routing.test.js test/hybrid-routing-integration.test.js test/web-tools.test.js test/passthrough-mode.test.js test/openrouter-error-resilience.test.js test/format-conversion.test.js test/azure-openai-config.test.js test/azure-openai-format-conversion.test.js test/azure-openai-routing.test.js test/azure-openai-streaming.test.js test/azure-openai-error-resilience.test.js test/azure-openai-integration.test.js test/openai-integration.test.js test/llamacpp-integration.test.js test/memory/store.test.js test/memory/surprise.test.js test/memory/extractor.test.js test/memory/search.test.js test/memory/retriever.test.js",
17
+ "test:unit": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test test/routing.test.js test/hybrid-routing-integration.test.js test/web-tools.test.js test/passthrough-mode.test.js test/openrouter-error-resilience.test.js test/format-conversion.test.js test/azure-openai-config.test.js test/azure-openai-format-conversion.test.js test/azure-openai-routing.test.js test/azure-openai-streaming.test.js test/azure-openai-error-resilience.test.js test/azure-openai-integration.test.js test/openai-integration.test.js test/toon-compression.test.js test/llamacpp-integration.test.js test/memory/store.test.js test/memory/surprise.test.js test/memory/extractor.test.js test/memory/search.test.js test/memory/retriever.test.js",
18
18
  "test:memory": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test test/memory/store.test.js test/memory/surprise.test.js test/memory/extractor.test.js test/memory/search.test.js test/memory/retriever.test.js",
19
19
  "test:new-features": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node --test test/passthrough-mode.test.js test/openrouter-error-resilience.test.js test/format-conversion.test.js",
20
20
  "test:performance": "DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node test/hybrid-routing-performance.test.js && DATABRICKS_API_KEY=test-key DATABRICKS_API_BASE=http://test.com node test/performance-tests.js",
@@ -47,9 +47,9 @@
47
47
  "@azure/openai": "^2.0.0",
48
48
  "@babel/parser": "^7.29.0",
49
49
  "@babel/traverse": "^7.29.0",
50
+ "@toon-format/toon": "^2.1.0",
50
51
  "compression": "^1.7.4",
51
52
  "diff": "^5.2.0",
52
- "dockerode": "^4.0.2",
53
53
  "dotenv": "^16.4.5",
54
54
  "express": "^5.1.0",
55
55
  "express-rate-limit": "^8.2.1",
@@ -58,10 +58,12 @@
58
58
  "openai": "^6.14.0",
59
59
  "pino": "^8.17.2",
60
60
  "pino-http": "^8.6.0",
61
+ "pino-roll": "^4.0.0",
61
62
  "undici": "^6.22.0"
62
63
  },
63
64
  "optionalDependencies": {
64
65
  "better-sqlite3": "^12.6.2",
66
+ "dockerode": "^4.0.2",
65
67
  "tree-sitter": "^0.21.1",
66
68
  "tree-sitter-javascript": "^0.21.0",
67
69
  "tree-sitter-python": "^0.21.0",
package/scripts/setup.js CHANGED
@@ -251,7 +251,6 @@ async function createEnvFile() {
251
251
  if (ollamaOnly) {
252
252
  let envContent = fs.readFileSync(envPath, "utf-8");
253
253
  envContent = envContent.replace(/^# MODEL_PROVIDER=databricks/m, "MODEL_PROVIDER=ollama");
254
- envContent = envContent.replace(/^PREFER_OLLAMA=true/m, "# PREFER_OLLAMA=true # Not needed when MODEL_PROVIDER=ollama");
255
254
  envContent = envContent.replace(/^FALLBACK_ENABLED=true/m, "FALLBACK_ENABLED=false");
256
255
  fs.writeFileSync(envPath, envContent);
257
256
  }