lynkr 9.3.2 → 9.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.impeccable/live/config.json +8 -0
- package/README.md +136 -17
- package/benchmark-configs/litellm_config.yaml +86 -0
- package/benchmark-configs/lynkr.env +48 -0
- package/benchmark-configs/portkey-config.json +60 -0
- package/benchmark-configs/portkey-docker.sh +23 -0
- package/benchmark-tier-routing.js +449 -0
- package/package.json +30 -8
- package/src/api/router.js +1 -0
- package/src/cache/semantic.js +5 -2
- package/src/context/toon.js +14 -9
package/README.md
CHANGED
|
@@ -1,22 +1,27 @@
|
|
|
1
1
|
# Lynkr
|
|
2
2
|
|
|
3
|
-
###
|
|
3
|
+
### The AI coding proxy that compresses tokens before they hit the model.
|
|
4
|
+
|
|
5
|
+
**87.6% fewer tokens on JSON tool results. 53% fewer tokens on tool-heavy requests. 171ms semantic cache hits. Zero code changes.**
|
|
4
6
|
|
|
5
7
|
[](https://www.npmjs.com/package/lynkr)
|
|
6
8
|
[](https://github.com/Fast-Editor/Lynkr)
|
|
7
9
|
[](LICENSE)
|
|
8
10
|
[](https://nodejs.org)
|
|
9
|
-
[](https://deepwiki.com/
|
|
11
|
+
[](https://deepwiki.com/Fast-Editor/Lynkr)
|
|
10
12
|
|
|
11
13
|
<table>
|
|
12
14
|
<tr>
|
|
13
|
-
<td align="center"><strong>
|
|
14
|
-
<td align="center"><strong>
|
|
15
|
-
<td align="center"><strong>
|
|
15
|
+
<td align="center"><strong>87.6%</strong><br/>JSON Compression</td>
|
|
16
|
+
<td align="center"><strong>53%</strong><br/>Tool Token Reduction</td>
|
|
17
|
+
<td align="center"><strong>171ms</strong><br/>Semantic Cache Hits</td>
|
|
18
|
+
<td align="center"><strong>13+</strong><br/>LLM Providers</td>
|
|
16
19
|
<td align="center"><strong>0</strong><br/>Code Changes Required</td>
|
|
17
20
|
</tr>
|
|
18
21
|
</table>
|
|
19
22
|
|
|
23
|
+
> Numbers from a live benchmark against LiteLLM on identical workloads. [See full report →](BENCHMARK_REPORT.md)
|
|
24
|
+
|
|
20
25
|
---
|
|
21
26
|
|
|
22
27
|
## Quick Start (2 Minutes)
|
|
@@ -121,6 +126,7 @@ POLICY_MAX_TOOL_CALLS=100
|
|
|
121
126
|
```
|
|
122
127
|
|
|
123
128
|
Then start Lynkr:
|
|
129
|
+
|
|
124
130
|
```bash
|
|
125
131
|
lynkr start
|
|
126
132
|
```
|
|
@@ -128,6 +134,15 @@ lynkr start
|
|
|
128
134
|
### 3. Connect Your Tool
|
|
129
135
|
|
|
130
136
|
**Claude Code**
|
|
137
|
+
|
|
138
|
+
**Windows (Command Prompt):**
|
|
139
|
+
```cmd
|
|
140
|
+
set ANTHROPIC_BASE_URL=http://localhost:8081
|
|
141
|
+
set ANTHROPIC_API_KEY=dummy
|
|
142
|
+
claude "write a hello world in python"
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
**Linux/macOS:**
|
|
131
146
|
```bash
|
|
132
147
|
export ANTHROPIC_BASE_URL=http://localhost:8081
|
|
133
148
|
export ANTHROPIC_API_KEY=dummy
|
|
@@ -154,23 +169,87 @@ wire_api = "responses"
|
|
|
154
169
|
|
|
155
170
|
---
|
|
156
171
|
|
|
172
|
+
## Common Startup Errors
|
|
173
|
+
|
|
174
|
+
### Error: `unable to determine transport target for "pino-pretty"`
|
|
175
|
+
|
|
176
|
+
**Problem:** You're running an older version (< 9.3.0).
|
|
177
|
+
|
|
178
|
+
**Solution:** Update to the latest version:
|
|
179
|
+
```bash
|
|
180
|
+
npm install -g lynkr@latest
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
If you must use an older version, set `NODE_ENV=production` before starting.
|
|
184
|
+
|
|
185
|
+
### Warning: `Missing tier configuration: TIER_SIMPLE, TIER_MEDIUM...`
|
|
186
|
+
|
|
187
|
+
**This is just a warning - you can ignore it.** Tier routing is optional.
|
|
188
|
+
|
|
189
|
+
To remove the warning, add to `.env`:
|
|
190
|
+
```bash
|
|
191
|
+
TIER_SIMPLE=ollama:qwen2.5-coder:latest
|
|
192
|
+
TIER_MEDIUM=ollama:qwen2.5-coder:latest
|
|
193
|
+
TIER_COMPLEX=ollama:qwen2.5-coder:latest
|
|
194
|
+
TIER_REASONING=ollama:qwen2.5-coder:latest
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
### Warning: `FALLBACK_PROVIDER='databricks' is enabled but missing credentials`
|
|
198
|
+
|
|
199
|
+
**Solution:** Add to `.env`:
|
|
200
|
+
```bash
|
|
201
|
+
FALLBACK_ENABLED=false
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
### Error: `connect ECONNREFUSED ::1:11434` (Ollama)
|
|
205
|
+
|
|
206
|
+
**Problem:** Ollama is not running.
|
|
207
|
+
|
|
208
|
+
**Solution:**
|
|
209
|
+
```bash
|
|
210
|
+
ollama serve
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
Keep this terminal open, and start Lynkr in a new terminal.
|
|
214
|
+
|
|
215
|
+
### Error: `Connection refused` or `404 Not Found`
|
|
216
|
+
|
|
217
|
+
**Problem:** Lynkr is not running, or your client is pointed at the wrong port.
|
|
218
|
+
|
|
219
|
+
**Solution:** Check Lynkr is running on the correct port:
|
|
220
|
+
```bash
|
|
221
|
+
curl http://localhost:8081/
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
Should return: `{"service":"Lynkr","version":"9.x.x","status":"running"}`
|
|
225
|
+
|
|
226
|
+
---
|
|
227
|
+
|
|
157
228
|
## Why Lynkr?
|
|
158
229
|
|
|
159
|
-
AI coding tools lock you into one provider. Lynkr breaks
|
|
230
|
+
AI coding tools lock you into one provider and send every token raw. Lynkr breaks both locks.
|
|
160
231
|
|
|
161
232
|
```
|
|
162
233
|
Claude Code / Cursor / Codex / Cline / Continue
|
|
163
234
|
↓
|
|
164
235
|
Lynkr
|
|
236
|
+
┌─────────────────────┐
|
|
237
|
+
│ Strip unused tools │ ← 53% fewer tokens on tool calls
|
|
238
|
+
│ Compress JSON blobs │ ← 87.6% on large tool results
|
|
239
|
+
│ Semantic cache │ ← 171ms hits, 0 tokens billed
|
|
240
|
+
│ Route by complexity │ ← cheap model for simple, cloud for hard
|
|
241
|
+
└─────────────────────┘
|
|
165
242
|
↓
|
|
166
|
-
Ollama | Bedrock | Azure | OpenRouter | OpenAI
|
|
243
|
+
Ollama | Bedrock | Azure | Moonshot | OpenRouter | OpenAI
|
|
167
244
|
```
|
|
168
245
|
|
|
169
246
|
**What you get:**
|
|
170
|
-
- ✅
|
|
247
|
+
- ✅ **53% fewer tokens** on tool-heavy requests (Claude Code, Cursor sessions)
|
|
248
|
+
- ✅ **87.6% compression** on large JSON tool results (grep, file reads, test output)
|
|
249
|
+
- ✅ **Semantic cache** serves repeated queries in 171ms with 0 tokens billed
|
|
250
|
+
- ✅ **Automatic tier routing** — simple questions go to cheap models, complex ones escalate
|
|
171
251
|
- ✅ Route through **your company's infrastructure** (Databricks, Azure, Bedrock)
|
|
172
|
-
- ✅
|
|
173
|
-
- ✅ **Zero code changes** - just change one environment variable
|
|
252
|
+
- ✅ **Zero code changes** — just change one environment variable
|
|
174
253
|
|
|
175
254
|
---
|
|
176
255
|
|
|
@@ -532,7 +611,7 @@ curl -fsSL https://raw.githubusercontent.com/Fast-Editor/Lynkr/main/install.sh |
|
|
|
532
611
|
|
|
533
612
|
**Homebrew**
|
|
534
613
|
```bash
|
|
535
|
-
brew tap
|
|
614
|
+
brew tap fast-editor/lynkr
|
|
536
615
|
brew install lynkr
|
|
537
616
|
```
|
|
538
617
|
|
|
@@ -571,6 +650,42 @@ npm start
|
|
|
571
650
|
|
|
572
651
|
---
|
|
573
652
|
|
|
653
|
+
## Benchmark Results
|
|
654
|
+
|
|
655
|
+
Measured on real agentic coding workloads (Claude Code / Cursor sessions) with Ollama, Moonshot, and Azure OpenAI backends. Run with `node benchmark-tier-routing.js`.
|
|
656
|
+
|
|
657
|
+
### Token compression
|
|
658
|
+
|
|
659
|
+
| Scenario | Tokens without Lynkr | Tokens with Lynkr | Reduction |
|
|
660
|
+
|---|---|---|---|
|
|
661
|
+
| 14-tool request (read task) | 1,042 | **547** | **47%** |
|
|
662
|
+
| 14-tool request (write task) | 1,043 | **412** | **60%** |
|
|
663
|
+
| Large JSON grep result (60 items) | 3,458 | **427** | **87.6%** |
|
|
664
|
+
|
|
665
|
+
Lynkr strips irrelevant tool schemas before forwarding (smart tool selection) and re-encodes large JSON tool results in the compact, text-based TOON format — both happen in-process with no added latency.
|
|
666
|
+
|
|
667
|
+
### Semantic cache
|
|
668
|
+
|
|
669
|
+
| | Tokens billed | Response time |
|
|
670
|
+
|---|---|---|
|
|
671
|
+
| First call (cold) | 2,857 | 1,891ms |
|
|
672
|
+
| **Second call — paraphrased, cache hit** | **0** | **171ms** |
|
|
673
|
+
|
|
674
|
+
Near-identical prompts return cached responses in 171ms. Zero tokens billed on a cache hit.
|
|
675
|
+
|
|
676
|
+
### Tier routing
|
|
677
|
+
|
|
678
|
+
| Request | Routed to |
|
|
679
|
+
|---|---|
|
|
680
|
+
| "What does git stash do?" | SIMPLE → local model (free) |
|
|
681
|
+
| JWT vs cookies security analysis | COMPLEX → cloud model (correct) |
|
|
682
|
+
|
|
683
|
+
Lynkr scores each request on 15 dimensions (token count, code complexity, reasoning markers, risk signals, agentic patterns) and routes automatically. No caller changes needed.
|
|
684
|
+
|
|
685
|
+
→ [Full benchmark report with methodology](BENCHMARK_REPORT.md)
|
|
686
|
+
|
|
687
|
+
---
|
|
688
|
+
|
|
574
689
|
## Cost Comparison
|
|
575
690
|
|
|
576
691
|
| Scenario | Direct Anthropic | Lynkr + Ollama | Lynkr + OpenRouter |
|
|
@@ -578,7 +693,7 @@ npm start
|
|
|
578
693
|
| Daily coding (8h) | $10-30/day | **$0 (free)** | $2-8/day |
|
|
579
694
|
| Monthly (heavy use) | $300-900 | **$0** | $60-240 |
|
|
580
695
|
|
|
581
|
-
With tier routing + token optimization: **additional
|
|
696
|
+
With tier routing + token optimization: **additional 50-87% savings** on cloud providers depending on workload.
|
|
582
697
|
|
|
583
698
|
---
|
|
584
699
|
|
|
@@ -589,13 +704,17 @@ With tier routing + token optimization: **additional 60-80% savings** on cloud p
|
|
|
589
704
|
| **Setup** | `npm install -g lynkr` | Python + Docker + Postgres | Account signup | Docker stack |
|
|
590
705
|
| **Claude Code native** | ✅ Drop-in | ⚠️ Requires config | ❌ | ⚠️ Partial |
|
|
591
706
|
| **Cursor native** | ✅ Drop-in | ⚠️ Partial | ❌ | ⚠️ Partial |
|
|
592
|
-
| **Local models** | Ollama, llama.cpp, LM Studio
|
|
593
|
-
| **
|
|
594
|
-
| **
|
|
707
|
+
| **Local models** | Ollama, llama.cpp, LM Studio | Ollama only | ❌ | ❌ |
|
|
708
|
+
| **Automatic tier routing** | ✅ 15-dimension scorer | ⚠️ Cost-only | ❌ | ❌ Manual metadata |
|
|
709
|
+
| **TOON JSON compression** | ✅ up to 87.6% | ❌ | ❌ | ❌ |
|
|
710
|
+
| **Smart tool selection** | ✅ up to 60% token reduction | ❌ | ❌ | ❌ |
|
|
711
|
+
| **Semantic cache** | ✅ 171ms hits, 0 tokens | ❌ | ❌ | ✅ Prompt cache only |
|
|
712
|
+
| **Long-term memory** | ✅ SQLite, per-session | ❌ | ❌ | ❌ |
|
|
713
|
+
| **MCP integration** | ✅ + Code Mode (96% reduction) | ❌ | ❌ | ❌ |
|
|
595
714
|
| **Self-hosted** | ✅ Node.js only | ✅ Python stack | ❌ SaaS | ✅ Docker |
|
|
596
715
|
| **Dependencies** | Node.js 20+ | Python, Prisma, PostgreSQL | None | Docker, Python |
|
|
597
716
|
|
|
598
|
-
**Lynkr's edge:** Purpose-built for AI coding tools. Zero-config for Claude Code, Cursor, and Codex. Installs in one command
|
|
717
|
+
**Lynkr's edge:** Purpose-built for AI coding tools. Compresses tokens before they reach the model — not just after. Zero-config for Claude Code, Cursor, and Codex. Installs in one command.
|
|
599
718
|
|
|
600
719
|
---
|
|
601
720
|
|
|
@@ -604,7 +723,7 @@ With tier routing + token optimization: **additional 60-80% savings** on cloud p
|
|
|
604
723
|
- [GitHub Discussions](https://github.com/Fast-Editor/Lynkr/discussions) — Ask questions
|
|
605
724
|
- [Report Issues](https://github.com/Fast-Editor/Lynkr/issues) — Bug reports
|
|
606
725
|
- [NPM Package](https://www.npmjs.com/package/lynkr) — Official releases
|
|
607
|
-
- [DeepWiki](https://deepwiki.com/
|
|
726
|
+
- [DeepWiki](https://deepwiki.com/Fast-Editor/Lynkr) — AI-powered docs
|
|
608
727
|
|
|
609
728
|
---
|
|
610
729
|
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# ─── LiteLLM Benchmark Config ─────────────────────────────────────────────────
|
|
2
|
+
# Multi-provider tier routing via the LiteLLM router (cost-based routing; see router_settings).
|
|
3
|
+
#
|
|
4
|
+
# Start: litellm --config benchmark-configs/litellm_config.yaml --port 8082
|
|
5
|
+
#
|
|
6
|
+
# Required env vars:
|
|
7
|
+
# AZURE_OPENAI_API_KEY
|
|
8
|
+
# AZURE_OPENAI_ENDPOINT (https://YOUR-RESOURCE.openai.azure.com)
|
|
9
|
+
# MOONSHOT_API_KEY
|
|
10
|
+
# (Ollama needs no key — running locally on :11434)
|
|
11
|
+
#
|
|
12
|
+
# Tier mapping (mirrors the Lynkr benchmark config, except that Lynkr's COMPLEX tier uses moonshot:kimi-k2.6):
|
|
13
|
+
# SIMPLE → ollama:minimax-m2.5:cloud
|
|
14
|
+
# MEDIUM → ollama:minimax-m2.5:cloud
|
|
15
|
+
# COMPLEX → moonshot:moonshot-v1-auto
|
|
16
|
+
# REASONING → azure-openai:gpt-5.2-chat
|
|
17
|
+
|
|
18
|
+
model_list:
|
|
19
|
+
|
|
20
|
+
# ── SIMPLE + MEDIUM → Ollama minimax-m2.5:cloud ───────────────────────────
|
|
21
|
+
# Note: the model tag is "minimax-m2.5:cloud" — the colon is part of the
|
|
22
|
+
# Ollama model name, NOT a provider separator here.
|
|
23
|
+
- model_name: smart-router
|
|
24
|
+
litellm_params:
|
|
25
|
+
model: "ollama/minimax-m2.5:cloud"
|
|
26
|
+
api_base: http://localhost:11434
|
|
27
|
+
- model_name: smart-router
|
|
28
|
+
litellm_params:
|
|
29
|
+
model: "ollama/minimax-m2.5:cloud"
|
|
30
|
+
api_base: http://localhost:11434
|
|
31
|
+
|
|
32
|
+
# ── COMPLEX → Moonshot moonshot-v1-auto (Lynkr TIER_COMPLEX is moonshot:kimi-k2.6 — same provider, different model) ──
|
|
33
|
+
- model_name: smart-router
|
|
34
|
+
litellm_params:
|
|
35
|
+
model: openai/moonshot-v1-auto
|
|
36
|
+
api_base: https://api.moonshot.ai/v1
|
|
37
|
+
api_key: os.environ/MOONSHOT_API_KEY
|
|
38
|
+
|
|
39
|
+
# ── REASONING → Azure OpenAI gpt-5.2-chat ─────────────────────────────────
|
|
40
|
+
- model_name: smart-router
|
|
41
|
+
litellm_params:
|
|
42
|
+
model: azure/gpt-5.2-chat
|
|
43
|
+
api_base: os.environ/AZURE_OPENAI_ENDPOINT
|
|
44
|
+
api_key: os.environ/AZURE_OPENAI_API_KEY
|
|
45
|
+
api_version: "2024-12-01-preview"
|
|
46
|
+
|
|
47
|
+
# ── Direct aliases (for targeted calls outside the benchmark) ─────────────
|
|
48
|
+
- model_name: ollama-minimax
|
|
49
|
+
litellm_params:
|
|
50
|
+
model: "ollama/minimax-m2.5:cloud"
|
|
51
|
+
api_base: http://localhost:11434
|
|
52
|
+
|
|
53
|
+
- model_name: moonshot-kimi-k2
|
|
54
|
+
litellm_params:
|
|
55
|
+
model: openai/moonshot-v1-auto
|
|
56
|
+
api_base: https://api.moonshot.ai/v1
|
|
57
|
+
api_key: os.environ/MOONSHOT_API_KEY
|
|
58
|
+
|
|
59
|
+
- model_name: azure-gpt5
|
|
60
|
+
litellm_params:
|
|
61
|
+
model: azure/gpt-5.2-chat
|
|
62
|
+
api_base: os.environ/AZURE_OPENAI_ENDPOINT
|
|
63
|
+
api_key: os.environ/AZURE_OPENAI_API_KEY
|
|
64
|
+
api_version: "2024-12-01-preview"
|
|
65
|
+
|
|
66
|
+
router_settings:
|
|
67
|
+
routing_strategy: cost-based-routing
|
|
68
|
+
# Fallback: if smart-router fails on one deployment, try the next
|
|
69
|
+
fallbacks:
|
|
70
|
+
- smart-router:
|
|
71
|
+
- ollama-minimax
|
|
72
|
+
- moonshot-kimi-k2
|
|
73
|
+
- azure-gpt5
|
|
74
|
+
num_retries: 2
|
|
75
|
+
timeout: 90
|
|
76
|
+
|
|
77
|
+
litellm_settings:
|
|
78
|
+
drop_params: true
|
|
79
|
+
use_responses_api: false
|
|
80
|
+
return_response_headers: true
|
|
81
|
+
success_callback: []
|
|
82
|
+
failure_callback: []
|
|
83
|
+
|
|
84
|
+
general_settings:
|
|
85
|
+
master_key: sk-1234 # change this
|
|
86
|
+
port: 8082
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# ─── Lynkr Benchmark Config ───────────────────────────────────────────────────
|
|
2
|
+
# Multi-provider tier routing: Ollama → Moonshot → Azure OpenAI
|
|
3
|
+
# Copy to .env and fill in your credentials.
|
|
4
|
+
|
|
5
|
+
PORT=8081
|
|
6
|
+
|
|
7
|
+
# ── Ollama (local, free) ───────────────────────────────────────────────────────
|
|
8
|
+
OLLAMA_ENDPOINT=http://localhost:11434
|
|
9
|
+
OLLAMA_MODEL=qwen2.5-coder:7b
|
|
10
|
+
OLLAMA_TIMEOUT_MS=120000
|
|
11
|
+
OLLAMA_EMBEDDINGS_MODEL=nomic-embed-text
|
|
12
|
+
OLLAMA_EMBEDDINGS_ENDPOINT=http://localhost:11434/api/embeddings
|
|
13
|
+
|
|
14
|
+
# ── Azure OpenAI ───────────────────────────────────────────────────────────────
|
|
15
|
+
AZURE_OPENAI_ENDPOINT=https://YOUR-RESOURCE.openai.azure.com
|
|
16
|
+
AZURE_OPENAI_API_KEY=your-azure-openai-key
|
|
17
|
+
AZURE_OPENAI_DEPLOYMENT=gpt-4o
|
|
18
|
+
AZURE_OPENAI_API_VERSION=2024-08-01-preview
|
|
19
|
+
|
|
20
|
+
# ── Moonshot (Kimi) ────────────────────────────────────────────────────────────
|
|
21
|
+
MOONSHOT_API_KEY=your-moonshot-api-key
|
|
22
|
+
MOONSHOT_ENDPOINT=https://api.moonshot.ai/v1/chat/completions
|
|
23
|
+
MOONSHOT_MODEL=kimi-k2-turbo-preview
|
|
24
|
+
|
|
25
|
+
# ── Primary provider (Lynkr uses this when no tier matches) ───────────────────
|
|
26
|
+
# Set to whichever you want as the default fallback
|
|
27
|
+
MODEL_PROVIDER=azure-openai
|
|
28
|
+
|
|
29
|
+
# ── Tier Routing ───────────────────────────────────────────────────────────────
|
|
30
|
+
# SIMPLE → Ollama minimax-m2.5:cloud
|
|
31
|
+
# MEDIUM → Ollama minimax-m2.5:cloud
|
|
32
|
+
# COMPLEX → Moonshot kimi-k2.6
|
|
33
|
+
# REASONING→ Azure OpenAI gpt-5.2-chat
|
|
34
|
+
TIER_SIMPLE=ollama:minimax-m2.5:cloud
|
|
35
|
+
TIER_MEDIUM=ollama:minimax-m2.5:cloud
|
|
36
|
+
TIER_COMPLEX=moonshot:kimi-k2.6
|
|
37
|
+
TIER_REASONING=azure-openai:gpt-5.2-chat
|
|
38
|
+
|
|
39
|
+
# ── Token Optimisations (these are what LiteLLM/Portkey don't have) ────────────
|
|
40
|
+
SMART_TOOL_SELECTION=true
|
|
41
|
+
PROMPT_CACHE_ENABLED=true
|
|
42
|
+
SEMANTIC_CACHE_ENABLED=true
|
|
43
|
+
SEMANTIC_CACHE_THRESHOLD=0.95
|
|
44
|
+
HISTORY_COMPRESSION_ENABLED=true
|
|
45
|
+
TOOL_INJECTION_ENABLED=false
|
|
46
|
+
|
|
47
|
+
# ── Optional: make routing decisions visible in responses ──────────────────────
|
|
48
|
+
LYNKR_VISIBLE_ROUTING=true
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
{
|
|
2
|
+
"_comment": "Portkey Gateway Config — multi-provider conditional routing",
|
|
3
|
+
"_note": "Portkey has NO automatic complexity detection. This config uses max_tokens as a proxy for complexity. For real tier routing pass x-portkey-metadata: { 'tier': 'simple|medium|complex|reasoning' } from your client.",
|
|
4
|
+
|
|
5
|
+
"strategy": {
|
|
6
|
+
"mode": "conditional"
|
|
7
|
+
},
|
|
8
|
+
|
|
9
|
+
"conditions": [
|
|
10
|
+
{
|
|
11
|
+
"_comment": "SIMPLE — short requests (max_tokens <= 256) → Ollama",
|
|
12
|
+
"condition": {
|
|
13
|
+
"query.max_tokens": { "$lte": 256 }
|
|
14
|
+
},
|
|
15
|
+
"target": {
|
|
16
|
+
"provider": "ollama",
|
|
17
|
+
"customHost": "http://localhost:11434",
|
|
18
|
+
"override_params": {
|
|
19
|
+
"model": "qwen2.5-coder:7b"
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
},
|
|
23
|
+
{
|
|
24
|
+
"_comment": "MEDIUM — metadata tier=medium → Moonshot",
|
|
25
|
+
"condition": {
|
|
26
|
+
"metadata.tier": { "$eq": "medium" }
|
|
27
|
+
},
|
|
28
|
+
"target": {
|
|
29
|
+
"provider": "openai",
|
|
30
|
+
"apiKey": "{{MOONSHOT_API_KEY}}",
|
|
31
|
+
"baseURL": "https://api.moonshot.ai/v1",
|
|
32
|
+
"override_params": {
|
|
33
|
+
"model": "moonshot-v1-8k"
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
"_comment": "REASONING — metadata tier=reasoning → Azure OpenAI o3-mini",
|
|
39
|
+
"condition": {
|
|
40
|
+
"metadata.tier": { "$eq": "reasoning" }
|
|
41
|
+
},
|
|
42
|
+
"target": {
|
|
43
|
+
"provider": "azure-openai",
|
|
44
|
+
"apiKey": "{{AZURE_OPENAI_API_KEY}}",
|
|
45
|
+
"resourceName": "YOUR-RESOURCE",
|
|
46
|
+
"deploymentId": "o3-mini",
|
|
47
|
+
"apiVersion": "2024-12-01-preview"
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
],
|
|
51
|
+
|
|
52
|
+
"default": {
|
|
53
|
+
"_comment": "COMPLEX — everything else → Azure OpenAI GPT-4o",
|
|
54
|
+
"provider": "azure-openai",
|
|
55
|
+
"apiKey": "{{AZURE_OPENAI_API_KEY}}",
|
|
56
|
+
"resourceName": "YOUR-RESOURCE",
|
|
57
|
+
"deploymentId": "gpt-4o",
|
|
58
|
+
"apiVersion": "2024-08-01-preview"
|
|
59
|
+
}
|
|
60
|
+
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
#!/bin/bash
# Run the Portkey local AI Gateway in Docker with provider credentials injected.
#
# Reads AZURE_OPENAI_API_KEY and MOONSHOT_API_KEY from the calling environment
# and passes them into the container. The gateway's port 8787 is published on
# host port 8083.

# Abort on the first failing command so that the "gateway running" message
# below is never printed when `docker run` did not succeed (for example when a
# container named portkey-gateway already exists).
set -e

docker run -d \
  --name portkey-gateway \
  -p 8083:8787 \
  -e AZURE_OPENAI_API_KEY="${AZURE_OPENAI_API_KEY}" \
  -e MOONSHOT_API_KEY="${MOONSHOT_API_KEY}" \
  portkeyai/gateway:latest

echo "Portkey gateway running on http://localhost:8083"
echo ""
echo "To use Azure OpenAI directly (no tier routing):"
echo "  curl http://localhost:8083/v1/chat/completions \\"
echo "    -H 'Content-Type: application/json' \\"
echo "    -H 'x-portkey-provider: azure-openai' \\"
# Double quotes in the printed header so that \$PORTKEY_API_KEY is expanded by
# the user's shell when the example is pasted; single quotes would send the
# literal text "\$PORTKEY_API_KEY".
echo "    -H \"x-portkey-api-key: \$PORTKEY_API_KEY\" \\"
echo "    -H 'x-portkey-azure-resource-name: YOUR-RESOURCE' \\"
echo "    -H 'x-portkey-azure-deployment-id: gpt-4o' \\"
echo "    -H 'x-portkey-azure-api-version: 2024-08-01-preview' \\"
echo "    -d '{\"model\": \"gpt-4o\", \"messages\": [...]}'"
echo ""
echo "To use conditional routing config, pass:"
echo "    -H 'x-portkey-config: <base64-encoded portkey-config.json>'"
|
|
@@ -0,0 +1,449 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Full-Stack Benchmark: Lynkr vs LiteLLM vs Portkey
|
|
4
|
+
*
|
|
5
|
+
* Tests 6 scenarios that cover Lynkr's full optimization stack:
|
|
6
|
+
* 1. Simple Q&A → tier routing only
|
|
7
|
+
* 2. Tool-heavy request → smart tool selection (50-70% token reduction)
|
|
8
|
+
* 3. Long history → history compression
|
|
9
|
+
* 4. Large payload → TOON compression
|
|
10
|
+
* 5. Repeated prompts → semantic cache (2nd call should be ~0 tokens billed)
|
|
11
|
+
* 6. Reasoning request → tier routing to top model
|
|
12
|
+
*
|
|
13
|
+
* LiteLLM and Portkey send tokens as-is. Lynkr compresses before the model sees them.
|
|
14
|
+
* The delta in input_tokens IS the compression saving.
|
|
15
|
+
*
|
|
16
|
+
* Usage:
|
|
17
|
+
* ANTHROPIC_API_KEY=sk-ant-... \
|
|
18
|
+
* LITELLM_MASTER_KEY=sk-1234 \
|
|
19
|
+
* PORTKEY_API_KEY=your-key \
|
|
20
|
+
* node benchmark-tier-routing.js
|
|
21
|
+
*/
|
|
22
|
+
|
|
23
|
+
// ─── Proxy config ─────────────────────────────────────────────────────────────
|
|
24
|
+
|
|
25
|
+
/**
 * Guesses which LiteLLM deployment served a request from the cost LiteLLM
 * reports in the `x-litellm-response-cost-original` response header.
 *
 * The labels follow the tier mapping in benchmark-configs/litellm_config.yaml:
 * SIMPLE/MEDIUM → Ollama, COMPLEX → Moonshot moonshot-v1-auto,
 * REASONING → Azure gpt-5.2-chat.
 *
 * NOTE(review): this is a cost heuristic, not a routing signal. A small Azure
 * request can cost less than $0.01 and would then be labelled as Moonshot —
 * confirm the threshold against real responses.
 *
 * @param {Record<string, string>} headers - Response headers, keyed by lower-case name.
 * @returns {{ tier: string, model: string }} Human-readable tier and model labels.
 */
function describeLitellmDeployment(headers) {
  // A missing header is treated as zero cost, exactly as before.
  const cost = Number.parseFloat(headers['x-litellm-response-cost-original'] ?? '0');

  // A non-numeric header used to fall through every comparison and be reported
  // as Azure; report it as unknown instead of guessing.
  if (Number.isNaN(cost)) return { tier: 'unknown', model: 'unknown' };
  if (cost === 0) return { tier: 'SIMPLE/MEDIUM (Ollama)', model: 'ollama (local/free)' };
  if (cost < 0.01) return { tier: 'COMPLEX (Moonshot)', model: 'moonshot/moonshot-v1-auto' };
  return { tier: 'REASONING (Azure)', model: 'azure/gpt-5.2-chat' };
}

/**
 * Proxies under test. Each entry carries:
 *   name         - display name
 *   url          - base URL (overridable through an environment variable)
 *   apiKey       - credential sent to the proxy
 *   defaultModel - model name placed in the request payload
 *   headers      - extra request headers
 *   getTier      - (body, responseHeaders) => tier label
 *   getModel     - (body, responseHeaders) => model label
 */
const PROXIES = [
  {
    name: 'Lynkr',
    url: process.env.LYNKR_URL ?? 'http://localhost:8081',
    apiKey: process.env.ANTHROPIC_API_KEY,
    defaultModel: 'claude-sonnet-4-5',
    headers: {},
    // Lynkr reports its routing decision directly in response headers.
    getTier: (_b, h) => h['x-lynkr-tier'] ?? 'unknown',
    getModel: (_b, h) => h['x-lynkr-model'] ?? h['x-lynkr-provider'] ?? 'unknown',
  },
  {
    name: 'LiteLLM',
    url: process.env.LITELLM_URL ?? 'http://localhost:8082',
    apiKey: process.env.LITELLM_MASTER_KEY ?? 'sk-1234',
    defaultModel: 'smart-router',
    headers: {},
    // No routing header is read here, so tier and model are inferred from cost.
    getTier: (_b, h) => describeLitellmDeployment(h).tier,
    getModel: (_b, h) => describeLitellmDeployment(h).model,
  },
  {
    name: 'Portkey',
    url: process.env.PORTKEY_URL ?? 'http://localhost:8083',
    apiKey: process.env.ANTHROPIC_API_KEY,
    defaultModel: 'claude-sonnet-4-5',
    headers: {
      'x-portkey-provider': 'anthropic',
      // Only send the Portkey key header when a key is configured.
      ...(process.env.PORTKEY_API_KEY ? { 'x-portkey-api-key': process.env.PORTKEY_API_KEY } : {}),
    },
    getTier: () => 'N/A',
    getModel: (b) => b?.model ?? 'claude-sonnet-4-5',
  },
];
|
|
67
|
+
|
|
68
|
+
// ─── Pricing per 1M tokens [input, output] USD ───────────────────────────────
|
|
69
|
+
|
|
70
|
+
// USD per 1M tokens, as [input, output], keyed by a substring of the model name.
// 'default' is the fallback rate for any model that matches no other key.
// NOTE(review): none of the keys match the models named in the benchmark
// configs (minimax-m2.5, kimi-k2.6, moonshot-v1-auto, gpt-5.2-chat), so those
// are priced at the 'default' rate — confirm that is intended.
const PRICING = {
  'claude-haiku-4-5':  [0.80,  4.00],
  'claude-haiku-3':    [0.25,  1.25],
  'claude-sonnet-4-5': [3.00, 15.00],
  'claude-sonnet-3-5': [3.00, 15.00],
  'claude-opus-4':     [15.00, 75.00],
  'gpt-4o-mini':       [0.15,  0.60],
  'gpt-4o':            [2.50, 10.00],
  'o3-mini':           [1.10,  4.40],
  'default':           [3.00, 15.00],
};

/**
 * Estimates the USD cost of one request from its token counts.
 *
 * The model name is matched case-insensitively against the PRICING keys by
 * substring; a model that matches nothing is priced at the 'default' rate.
 *
 * @param {string} model - Model label; a non-string value is priced at the default rate.
 * @param {number} inputTok - Number of input tokens.
 * @param {number} outputTok - Number of output tokens.
 * @returns {number} Estimated cost in USD.
 */
function costUsd(model, inputTok, outputTok) {
  // Previously a missing or non-string model threw a TypeError on toLowerCase().
  const name = typeof model === 'string' ? model.toLowerCase() : '';

  // Try the longest keys first so a specific key ("gpt-4o-mini") can never be
  // shadowed by a shorter one it contains ("gpt-4o"), whatever order PRICING is
  // declared in. 'default' is only ever a fallback, never a substring match.
  const key = Object.keys(PRICING)
    .filter((k) => k !== 'default')
    .sort((a, b) => b.length - a.length)
    .find((k) => name.includes(k)) ?? 'default';

  const [inputRate, outputRate] = PRICING[key];
  return (inputTok / 1e6) * inputRate + (outputTok / 1e6) * outputRate;
}
|
|
87
|
+
|
|
88
|
+
// Rough token estimator: 1 token ≈ 4 chars (GPT/Claude rule of thumb)
|
|
89
|
+
/**
 * Rough token estimate for a request payload: one token per four characters
 * of its JSON serialisation, rounded up.
 *
 * @param {*} payload - Any JSON-serialisable value.
 * @returns {number} Estimated token count; 0 when the payload has no JSON form.
 */
function estimateTokens(payload) {
  // JSON.stringify returns undefined (not a string) for undefined, functions
  // and symbols; reading .length on that used to throw a TypeError.
  const serialized = JSON.stringify(payload);
  if (serialized === undefined) return 0;
  return Math.ceil(serialized.length / 4);
}
|
|
92
|
+
|
|
93
|
+
// ─── Reusable tool definitions (simulate a real Claude Code session) ──────────
|
|
94
|
+
// 14 tool schemas are sent on every request without smart selection (the README's 14-tool requests measure ≈ 1,040 tokens in total)
|
|
95
|
+
|
|
96
|
+
/**
 * The 14 tool definitions attached to tool-heavy benchmark requests.
 * Each entry has the shape { name, description, input_schema }, where
 * input_schema is an object schema with `properties` and, for every tool
 * except TodoRead, a `required` list.
 */
const TOOL_DEFINITIONS = (() => {
  // Fresh schema objects on every call so that no two properties share a reference.
  const str = () => ({ type: 'string' });
  const num = () => ({ type: 'number' });

  // Builds one tool entry. Key order is fixed (name, description, input_schema;
  // type, properties, required) and `required` is left out entirely when it is
  // not supplied, so the serialised form is unchanged.
  const tool = (name, description, properties, required) => ({
    name,
    description,
    input_schema: {
      type: 'object',
      properties,
      ...(required ? { required } : {}),
    },
  });

  return [
    tool('Read', 'Read a file from disk', { file_path: str(), limit: num() }, ['file_path']),
    tool('Write', 'Write content to a file', { file_path: str(), content: str() }, ['file_path', 'content']),
    tool('Edit', 'Make targeted edits to a file', { file_path: str(), old_string: str(), new_string: str() }, ['file_path', 'old_string', 'new_string']),
    tool('Bash', 'Execute a shell command', { command: str(), timeout: num() }, ['command']),
    tool('Glob', 'Find files matching a pattern', { pattern: str(), path: str() }, ['pattern']),
    tool('Grep', 'Search for patterns in files', { pattern: str(), path: str(), glob: str() }, ['pattern']),
    tool('WebSearch', 'Search the web', { query: str() }, ['query']),
    tool('WebFetch', 'Fetch a URL', { url: str(), prompt: str() }, ['url']),
    tool('TodoWrite', 'Write a todo list', { todos: { type: 'array', items: { type: 'object' } } }, ['todos']),
    tool('TodoRead', 'Read the current todo list', {}),
    tool('Task', 'Spawn a subagent', { description: str(), prompt: str() }, ['description', 'prompt']),
    tool('NotebookRead', 'Read a Jupyter notebook', { notebook_path: str() }, ['notebook_path']),
    tool('NotebookEdit', 'Edit a Jupyter notebook', { notebook_path: str(), cell_index: num(), new_source: str() }, ['notebook_path', 'cell_index', 'new_source']),
    tool('mcp__github__create_pull_request', 'Create a GitHub pull request via MCP', { title: str(), body: str(), base: str(), head: str() }, ['title', 'body']),
  ];
})();
|
|
112
|
+
|
|
113
|
+
// ─── Scenarios ────────────────────────────────────────────────────────────────
|
|
114
|
+
|
|
115
|
+
const SCENARIOS = [
|
|
116
|
+
// ── 1. Simple Q&A ─────────────────────────────────────────────────────────
|
|
117
|
+
{
|
|
118
|
+
id: 'S1', label: 'Simple Q&A',
|
|
119
|
+
feature: 'Tier routing → cheap model',
|
|
120
|
+
buildPayload: (model) => ({
|
|
121
|
+
model, max_tokens: 256,
|
|
122
|
+
messages: [{ role: 'user', content: 'What does git stash do?' }],
|
|
123
|
+
}),
|
|
124
|
+
},
|
|
125
|
+
|
|
126
|
+
// ── 2. Tool-heavy (smart tool selection) ──────────────────────────────────
|
|
127
|
+
// All 14 tools sent — Lynkr strips irrelevant ones before forwarding
|
|
128
|
+
{
|
|
129
|
+
id: 'T1', label: 'Tool-heavy (14 tools)',
|
|
130
|
+
feature: 'Smart tool selection → strips unused tools',
|
|
131
|
+
buildPayload: (model) => ({
|
|
132
|
+
model, max_tokens: 512,
|
|
133
|
+
tools: TOOL_DEFINITIONS,
|
|
134
|
+
messages: [{ role: 'user', content: 'What does the README say about installation?' }],
|
|
135
|
+
}),
|
|
136
|
+
},
|
|
137
|
+
{
|
|
138
|
+
id: 'T2', label: 'Tool-heavy (14 tools) – write task',
|
|
139
|
+
feature: 'Smart tool selection → keeps only write tools',
|
|
140
|
+
buildPayload: (model) => ({
|
|
141
|
+
model, max_tokens: 512,
|
|
142
|
+
tools: TOOL_DEFINITIONS,
|
|
143
|
+
messages: [{ role: 'user', content: 'Edit the config file to set DEBUG=true' }],
|
|
144
|
+
}),
|
|
145
|
+
},
|
|
146
|
+
|
|
147
|
+
// ── 3. Long history (history compression) ─────────────────────────────────
|
|
148
|
+
// 8-turn conversation — Lynkr compresses older turns before forwarding
|
|
149
|
+
{
|
|
150
|
+
id: 'H1', label: 'Long history (8 turns)',
|
|
151
|
+
feature: 'History compression → dedups older turns',
|
|
152
|
+
buildPayload: (model) => ({
|
|
153
|
+
model, max_tokens: 512,
|
|
154
|
+
messages: [
|
|
155
|
+
{ role: 'user', content: 'Can you help me refactor my Express app?' },
|
|
156
|
+
{ role: 'assistant', content: 'Sure! Let\'s start by reviewing your current structure. What does your folder layout look like?' },
|
|
157
|
+
{ role: 'user', content: 'I have routes/, controllers/, models/, middleware/ folders.' },
|
|
158
|
+
{ role: 'assistant', content: 'Good structure. Are you using any ORM, and do you have error handling middleware in place?' },
|
|
159
|
+
{ role: 'user', content: 'I use Sequelize. Error handling is scattered across controllers right now.' },
|
|
160
|
+
{ role: 'assistant', content: 'Let\'s centralise error handling first. Create middleware/errorHandler.js and export an express error middleware with four params (err, req, res, next).' },
|
|
161
|
+
{ role: 'user', content: 'Done. Now I need to add input validation — should I use Joi or express-validator?' },
|
|
162
|
+
{ role: 'assistant', content: 'For Sequelize projects, Joi pairs well. Install it and create a validate() middleware wrapper.' },
|
|
163
|
+
{ role: 'user', content: 'Great, now how do I add rate limiting to specific routes only?' },
|
|
164
|
+
],
|
|
165
|
+
}),
|
|
166
|
+
},
|
|
167
|
+
|
|
168
|
+
// ── 4a. TOON – large JSON tool result (file read) ─────────────────────────
|
|
169
|
+
// Simulates a tool_result block returning a large JSON config file.
|
|
170
|
+
// TOON specifically compresses JSON structures — this is its primary trigger.
|
|
171
|
+
{
|
|
172
|
+
id: 'L1', label: 'TOON – large JSON tool result',
|
|
173
|
+
feature: 'TOON compression → compresses JSON tool_result before forwarding',
|
|
174
|
+
buildPayload: (model) => ({
|
|
175
|
+
model, max_tokens: 512,
|
|
176
|
+
tools: [TOOL_DEFINITIONS[0]], // Read tool only
|
|
177
|
+
messages: [
|
|
178
|
+
{ role: 'user', content: 'Read package.json and tell me the dependencies.' },
|
|
179
|
+
{ role: 'assistant', content: null,
|
|
180
|
+
tool_calls: [{ id: 'tr_001', type: 'function', function: { name: 'Read', arguments: JSON.stringify({ file_path: 'package.json' }) } }] },
|
|
181
|
+
{ role: 'user', content: [
|
|
182
|
+
{ type: 'tool_result', tool_use_id: 'tr_001', content: JSON.stringify(generateFakeLargeJsonResult()) },
|
|
183
|
+
]},
|
|
184
|
+
{ role: 'user', content: 'What are the top-level dependencies?' },
|
|
185
|
+
],
|
|
186
|
+
}),
|
|
187
|
+
},
|
|
188
|
+
|
|
189
|
+
// ── 4b. TOON – large grep/glob JSON result ────────────────────────────────
|
|
190
|
+
// Simulates a Bash tool returning a large JSON array of search results.
|
|
191
|
+
{
|
|
192
|
+
id: 'L2', label: 'TOON – large JSON grep result (~2k tokens)',
|
|
193
|
+
feature: 'TOON compression → compresses JSON array tool_result',
|
|
194
|
+
buildPayload: (model) => ({
|
|
195
|
+
model, max_tokens: 512,
|
|
196
|
+
tools: [TOOL_DEFINITIONS[3]], // Bash tool only
|
|
197
|
+
messages: [
|
|
198
|
+
{ role: 'user', content: 'Find all TODO comments in the codebase.' },
|
|
199
|
+
{ role: 'assistant', content: null,
|
|
200
|
+
tool_calls: [{ id: 'tr_002', type: 'function', function: { name: 'Bash', arguments: JSON.stringify({ command: 'grep -rn "TODO" src/' }) } }] },
|
|
201
|
+
{ role: 'user', content: [
|
|
202
|
+
{ type: 'tool_result', tool_use_id: 'tr_002', content: JSON.stringify(generateFakeGrepResult()) },
|
|
203
|
+
]},
|
|
204
|
+
{ role: 'user', content: 'Summarise the most important TODOs.' },
|
|
205
|
+
],
|
|
206
|
+
}),
|
|
207
|
+
},
|
|
208
|
+
|
|
209
|
+
// ── 5. Semantic cache (send same prompt twice) ─────────────────────────────
|
|
210
|
+
// First call: billed normally. Second call: Lynkr returns cached response (0 LLM tokens).
|
|
211
|
+
{
|
|
212
|
+
id: 'SC1', label: 'Cache – first call',
|
|
213
|
+
feature: 'Semantic cache – populates cache',
|
|
214
|
+
buildPayload: (model) => ({
|
|
215
|
+
model, max_tokens: 256,
|
|
216
|
+
messages: [{ role: 'user', content: 'Explain the difference between TCP and UDP in two sentences.' }],
|
|
217
|
+
}),
|
|
218
|
+
},
|
|
219
|
+
{
|
|
220
|
+
id: 'SC2', label: 'Cache – second call (near-identical)',
|
|
221
|
+
feature: 'Semantic cache – should hit cache → 0 tokens billed',
|
|
222
|
+
buildPayload: (model) => ({
|
|
223
|
+
model, max_tokens: 256,
|
|
224
|
+
// Slightly paraphrased — semantic cache threshold 0.95 should still match
|
|
225
|
+
messages: [{ role: 'user', content: 'What is the difference between TCP and UDP? Keep it brief.' }],
|
|
226
|
+
}),
|
|
227
|
+
},
|
|
228
|
+
|
|
229
|
+
// ── 6. Reasoning ──────────────────────────────────────────────────────────
|
|
230
|
+
{
|
|
231
|
+
id: 'R1', label: 'Reasoning – security analysis',
|
|
232
|
+
feature: 'Tier routing → top model + risk classifier',
|
|
233
|
+
buildPayload: (model) => ({
|
|
234
|
+
model, max_tokens: 1024,
|
|
235
|
+
messages: [{ role: 'user', content: 'Analyse the security trade-offs of storing JWT tokens in localStorage vs httpOnly cookies for a banking application. Step by step.' }],
|
|
236
|
+
}),
|
|
237
|
+
},
|
|
238
|
+
];
|
|
239
|
+
|
|
240
|
+
// ─── JSON payload generators (TOON compresses these, plain text it ignores) ──
|
|
241
|
+
|
|
242
|
+
/**
 * Builds a fake package.json-shaped object used as a large JSON tool result.
 *
 * The output is fully deterministic. sendRequest() calls scenario.buildPayload()
 * once per proxy, so any randomness here would hand each proxy a slightly
 * different payload and skew the token/cost comparison between them.
 *
 * @returns {object} Object with name, version, scripts, dependencies,
 *   devDependencies, engines, keywords, files and exports fields.
 */
function generateFakeLargeJsonResult() {
  // Simulates a package.json with many dependencies — ~1,800 tokens of JSON
  const deps = {};
  const devDeps = {};
  const packages = [
    'express','lodash','axios','react','typescript','webpack','babel','eslint',
    'jest','mocha','chai','sinon','supertest','dotenv','cors','helmet','morgan',
    'winston','pino','joi','yup','zod','mongoose','sequelize','prisma','knex',
    'redis','ioredis','bull','agenda','node-cron','socket.io','ws','graphql',
    'apollo-server','type-graphql','class-transformer','class-validator','reflect-metadata',
  ];
  packages.forEach((p, i) => {
    // Patch number is derived from the index (0-19) instead of Math.random()
    // so repeated calls return identical payloads.
    const ver = `^${Math.floor(i / 10) + 1}.${i % 10}.${(i * 7) % 20}`;
    // Every third package is filed under devDependencies, the rest under dependencies.
    if (i % 3 === 0) devDeps[p] = ver; else deps[p] = ver;
  });
  return {
    name: 'my-app', version: '1.0.0',
    scripts: { start: 'node index.js', test: 'jest', build: 'webpack', lint: 'eslint src/' },
    dependencies: deps,
    devDependencies: devDeps,
    engines: { node: '>=18.0.0' },
    keywords: ['api','backend','nodejs'],
    files: Array.from({ length: 30 }, (_, i) => `src/module${i}.js`),
    exports: Object.fromEntries(packages.map(p => [`./${p}`, `./dist/${p}/index.js`])),
  };
}
|
|
268
|
+
|
|
269
|
+
/**
 * Builds a fake `grep -rn "TODO"` result as an array of 60 match records.
 *
 * The output is fully deterministic. sendRequest() calls scenario.buildPayload()
 * once per proxy, so a random `line` value would give each proxy a different
 * payload and make their token counts incomparable.
 *
 * @returns {Array<{file: string, line: number, match: string, context: string}>}
 */
function generateFakeGrepResult() {
  // Simulates grep -rn "TODO" returning a large JSON array — ~1,200 tokens
  return Array.from({ length: 60 }, (_, i) => ({
    file: `src/${['routes','controllers','models','middleware','utils'][i % 5]}/module${i % 15}.js`,
    // Index-derived line number in 1..500 (replaces Math.random()).
    line: ((i * 37) % 500) + 1,
    match: `TODO: ${['fix error handling','add validation','refactor this','add tests','update docs','remove hardcoded value','add rate limiting','handle edge case'][i % 8]} — assigned to ${['alice','bob','carol','dave'][i % 4]}`,
    context: ` // TODO: ${['fix error handling','add validation','refactor this','add tests'][i % 4]}\n function handler${i}(req, res) { return res.json({ status: 'ok' }); }`,
  }));
}
|
|
278
|
+
|
|
279
|
+
// ─── HTTP request ─────────────────────────────────────────────────────────────
|
|
280
|
+
|
|
281
|
+
/**
 * Sends one scenario's payload to one proxy and collects usage metrics.
 *
 * @param {object} proxy - Proxy descriptor. Reads `url`, `apiKey`, `headers`,
 *   `defaultModel`, and calls `getModel(body, headers)` / `getTier(body, headers)`.
 * @param {object} scenario - Scenario descriptor; `buildPayload(model)` must
 *   return the request body (with `messages` and optionally `tools`).
 * @returns {Promise<object>} On success: `{ ok: true, tier, model, billedInput,
 *   billedOutput, estimatedInputTokens, tokensSaved, compressionPct, cost, latencyMs }`.
 *   On HTTP or network failure: `{ ok: false, error, latencyMs, estimatedInputTokens }`.
 *   Never rejects for request errors; they are reported through `ok: false`.
 */
async function sendRequest(proxy, scenario) {
  const payload = scenario.buildPayload(proxy.defaultModel);
  const estimatedInputTokens = estimateTokens(payload.messages) + estimateTokens(payload.tools ?? []);
  const start = Date.now();

  try {
    const res = await fetch(`${proxy.url}/v1/messages`, {
      method: 'POST',
      headers: {
        'content-type': 'application/json',
        'x-api-key': proxy.apiKey,
        'anthropic-version': '2023-06-01',
        // Proxy-specific headers come last so they can override the defaults above.
        ...proxy.headers,
      },
      body: JSON.stringify(payload),
      signal: AbortSignal.timeout(90_000),
    });

    const headers = Object.fromEntries(res.headers.entries());

    if (!res.ok) {
      const err = await res.text();
      return { ok: false, error: `HTTP ${res.status}: ${err.slice(0, 100)}`, latencyMs: Date.now() - start, estimatedInputTokens };
    }

    const body = await res.json();
    // fetch() resolves as soon as the response headers arrive; stop the clock
    // only after the body has been fully read so latency covers the whole response.
    const latencyMs = Date.now() - start;

    const billedInput = body?.usage?.input_tokens ?? 0;
    const billedOutput = body?.usage?.output_tokens ?? 0;
    const model = proxy.getModel(body, headers);
    const tier = proxy.getTier(body, headers);
    const cost = costUsd(model, billedInput, billedOutput);
    // Clamp at zero: the local estimate can be lower than what the provider bills.
    const tokensSaved = Math.max(0, estimatedInputTokens - billedInput);
    const compressionPct = estimatedInputTokens > 0
      ? ((tokensSaved / estimatedInputTokens) * 100).toFixed(1)
      : '0.0';

    return { ok: true, tier, model, billedInput, billedOutput, estimatedInputTokens, tokensSaved, compressionPct, cost, latencyMs };
  } catch (e) {
    // Network errors, timeouts and JSON parse failures all land here.
    return { ok: false, error: e.message, latencyMs: Date.now() - start, estimatedInputTokens };
  }
}
|
|
323
|
+
|
|
324
|
+
// ─── Formatting helpers ───────────────────────────────────────────────────────
|
|
325
|
+
|
|
326
|
+
// Fixed-width table cell: stringifies `s` (null/undefined become ''), truncates
// to `w` characters and right-pads with spaces to exactly `w`.
const col = (s, w) => String(s ?? '').slice(0, w).padEnd(w);
// Formats a USD amount with six decimals. A missing or non-finite cost renders
// as '$n/a' instead of throwing, so one bad value cannot abort the whole report.
const $ = (n) => (Number.isFinite(n) ? `$${n.toFixed(6)}` : '$n/a');
|
|
328
|
+
|
|
329
|
+
// ─── Main ─────────────────────────────────────────────────────────────────────
|
|
330
|
+
|
|
331
|
+
/**
 * Runs every scenario against every proxy (sequentially), then prints a
 * per-scenario table, a per-feature summary, an overall cost ranking and a
 * 100k-requests/month extrapolation.
 *
 * @param {object} [options]
 * @param {number} [options.delayMs=400] - Pause between consecutive requests.
 * @returns {Promise<{results: object, totals: object[], baselineName: string}>}
 *   `results[proxyName][scenarioId]` holds each sendRequest() outcome; `totals`
 *   is the per-proxy aggregate sorted by ascending cost; `baselineName` is the
 *   proxy the savings are measured against.
 */
async function runBenchmark({ delayMs = 400 } = {}) {
  console.log('\n╔═══════════════════════════════════════════════════════════════════╗');
  console.log('║ Full-Stack Benchmark: Lynkr vs LiteLLM vs Portkey ║');
  console.log('║ Tests: tier routing · tool selection · history · TOON · cache ║');
  console.log('╚═══════════════════════════════════════════════════════════════════╝\n');

  // results[proxyName][scenarioId] = result
  const results = {};
  for (const p of PROXIES) results[p.name] = {};

  for (const scenario of SCENARIOS) {
    process.stdout.write(`\n[${scenario.id}] ${scenario.label.padEnd(35)} `);
    for (const proxy of PROXIES) {
      process.stdout.write(`${proxy.name}… `);
      results[proxy.name][scenario.id] = await sendRequest(proxy, scenario);
      // Requests are deliberately serialized with a short pause between them.
      await new Promise(r => setTimeout(r, delayMs));
    }
    process.stdout.write('✓');
  }

  // ─── Per-Scenario Detail ────────────────────────────────────────────────────

  console.log('\n\n\n━━━ PER-SCENARIO DETAIL ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n');

  for (const scenario of SCENARIOS) {
    console.log(`\n▸ [${scenario.id}] ${scenario.label}`);
    console.log(` Feature under test: ${scenario.feature}`);
    console.log(` ${'Proxy'.padEnd(10)} ${'Tier'.padEnd(14)} ${'Model'.padEnd(26)} ${'Est.Tok'.padEnd(9)} ${'Billed'.padEnd(9)} ${'Saved'.padEnd(8)} ${'Compress%'.padEnd(11)} ${'Cost'.padEnd(12)} Latency`);
    console.log(' ' + '─'.repeat(110));

    for (const proxy of PROXIES) {
      const r = results[proxy.name][scenario.id];
      if (!r.ok) {
        console.log(` ${col(proxy.name,10)} ERROR: ${r.error?.slice(0,80)}`);
        continue;
      }
      console.log(
        ' ' +
        col(proxy.name, 10) +
        col(r.tier, 14) +
        col(r.model, 26) +
        col(r.estimatedInputTokens, 9) +
        col(r.billedInput, 9) +
        col(r.tokensSaved, 8) +
        col(r.compressionPct + '%', 11) +
        col($(r.cost), 12) +
        `${r.latencyMs}ms`
      );
    }
  }

  // ─── Feature-Level Summary ──────────────────────────────────────────────────

  console.log('\n\n━━━ FEATURE SUMMARY ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n');

  const featureGroups = [
    { label: 'Tier Routing (S1, R1)', ids: ['S1', 'R1'] },
    { label: 'Smart Tool Selection (T1, T2)', ids: ['T1', 'T2'] },
    { label: 'History Compression (H1)', ids: ['H1'] },
    { label: 'TOON / JSON Tool Results (L1+L2)', ids: ['L1', 'L2'] },
    { label: 'Semantic Cache (SC1 + SC2)', ids: ['SC1','SC2'] },
  ];

  for (const group of featureGroups) {
    console.log(` ${group.label}`);
    for (const proxy of PROXIES) {
      // Only successful requests contribute; unknown scenario ids are dropped by r?.ok.
      const rs = group.ids.map(id => results[proxy.name][id]).filter(r => r?.ok);
      if (rs.length === 0) { console.log(` ${proxy.name.padEnd(10)} – no data`); continue; }
      const totalCost = rs.reduce((s, r) => s + r.cost, 0);
      const totalSaved = rs.reduce((s, r) => s + r.tokensSaved, 0);
      const totalEst = rs.reduce((s, r) => s + r.estimatedInputTokens, 0);
      const avgCompress = totalEst > 0 ? ((totalSaved / totalEst) * 100).toFixed(1) : '0.0';
      console.log(` ${proxy.name.padEnd(10)} cost: ${$(totalCost).padEnd(14)} tokens saved: ${String(totalSaved).padEnd(8)} compression: ${avgCompress}%`);
    }
    console.log();
  }

  // ─── Overall Cost Summary ───────────────────────────────────────────────────

  console.log('\n━━━ OVERALL COST (all scenarios) ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n');

  const totals = PROXIES.map(proxy => {
    const all = Object.values(results[proxy.name]);
    const rs = all.filter(r => r?.ok);
    return {
      name: proxy.name,
      cost: rs.reduce((s, r) => s + r.cost, 0),
      tokensSaved: rs.reduce((s, r) => s + r.tokensSaved, 0),
      avgLatency: rs.length ? rs.reduce((s, r) => s + r.latencyMs, 0) / rs.length : 0,
      // Failed requests add no cost, so a proxy with failures looks cheaper
      // than it is; track the count so the report can flag it.
      failed: all.length - rs.length,
    };
  }).sort((a, b) => a.cost - b.cost);

  const maxCost = Math.max(...totals.map(t => t.cost), 0.000001);
  // Prefer Portkey as the baseline, but only when it actually produced a cost.
  // A total of 0 (e.g. every Portkey request failed) is not nullish, so `??`
  // alone would keep it and zero out every savings figure.
  const baselineEntry = totals.find(t => t.name === 'Portkey' && t.cost > 0)
    ?? totals.find(t => t.cost > 0);
  const baseline = baselineEntry?.cost ?? maxCost;
  const baselineName = baselineEntry?.name ?? 'baseline';

  for (const t of totals) {
    const pct = baseline > 0 ? ((baseline - t.cost) / baseline * 100).toFixed(1) : '0.0';
    const barLen = maxCost > 0 ? Math.max(1, Math.round((t.cost / maxCost) * 30)) : 1;
    const bar = '█'.repeat(barLen);
    const failedNote = t.failed > 0 ? ` (${t.failed} failed request(s) excluded)` : '';
    console.log(` ${t.name.padEnd(10)} ${$(t.cost).padEnd(14)} ${pct.padStart(5)}% cheaper vs baseline avg ${Math.round(t.avgLatency)}ms ${bar}${failedNote}`);
  }

  // ─── Extrapolated: 100k requests/month ─────────────────────────────────────

  console.log('\n\n━━━ EXTRAPOLATED: 100,000 requests/month ──────────────────────────\n');
  console.log(' (same scenario mix × scale factor)\n');

  const factor = 100_000 / SCENARIOS.length;
  for (const t of totals) {
    const monthly = t.cost * factor;
    const annualSaving = baseline > 0 ? (baseline - t.cost) * factor * 12 : 0;
    // Name the baseline actually used; it is Portkey unless the fallback applied.
    console.log(` ${t.name.padEnd(10)} ~$${monthly.toFixed(2).padStart(10)}/month ~$${(annualSaving).toFixed(0).padStart(10)}/year saved vs ${baselineName}`);
  }

  console.log('\nDone.\n');

  return { results, totals, baselineName };
}
|
|
448
|
+
|
|
449
|
+
// Script entry point: start the benchmark; if the returned promise rejects,
// log the error and exit with status 1.
runBenchmark().catch(e => { console.error(e); process.exit(1); });
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "lynkr",
|
|
3
|
-
"version": "9.
|
|
4
|
-
"description": "Self-hosted Claude Code
|
|
3
|
+
"version": "9.4.0",
|
|
4
|
+
"description": "Self-hosted LLM gateway and tier-routing proxy for Claude Code, Cursor, and Codex. Routes across Ollama, AWS Bedrock, OpenRouter, Databricks, Azure OpenAI, llama.cpp, and LM Studio with prompt caching, MCP tools, and 60-80% cost savings.",
|
|
5
5
|
"main": "index.js",
|
|
6
6
|
"bin": {
|
|
7
7
|
"lynkr": "bin/cli.js",
|
|
@@ -23,23 +23,45 @@
|
|
|
23
23
|
"test:all": "npm run test:unit && npm run test:performance && npm run test:benchmark"
|
|
24
24
|
},
|
|
25
25
|
"keywords": [
|
|
26
|
+
"llm",
|
|
27
|
+
"llm-gateway",
|
|
28
|
+
"llm-proxy",
|
|
29
|
+
"llm-router",
|
|
30
|
+
"ai-gateway",
|
|
31
|
+
"tier-routing",
|
|
26
32
|
"claude",
|
|
33
|
+
"claude-code",
|
|
27
34
|
"anthropic",
|
|
28
|
-
"
|
|
35
|
+
"cursor",
|
|
36
|
+
"codex",
|
|
37
|
+
"openai",
|
|
38
|
+
"openrouter",
|
|
39
|
+
"ollama",
|
|
40
|
+
"llamacpp",
|
|
41
|
+
"lm-studio",
|
|
42
|
+
"bedrock",
|
|
43
|
+
"aws-bedrock",
|
|
44
|
+
"azure-openai",
|
|
29
45
|
"databricks",
|
|
30
|
-
"
|
|
31
|
-
"mcp"
|
|
46
|
+
"gemini",
|
|
47
|
+
"mcp",
|
|
48
|
+
"model-context-protocol",
|
|
49
|
+
"prompt-cache",
|
|
50
|
+
"self-hosted",
|
|
51
|
+
"litellm-alternative",
|
|
52
|
+
"portkey-alternative",
|
|
53
|
+
"copilot-alternative"
|
|
32
54
|
],
|
|
33
55
|
"author": "Vishal Veera Reddy",
|
|
34
56
|
"license": "Apache-2.0",
|
|
35
57
|
"repository": {
|
|
36
58
|
"type": "git",
|
|
37
|
-
"url": "git+https://github.com/
|
|
59
|
+
"url": "git+https://github.com/Fast-Editor/Lynkr.git"
|
|
38
60
|
},
|
|
39
61
|
"bugs": {
|
|
40
|
-
"url": "https://github.com/
|
|
62
|
+
"url": "https://github.com/Fast-Editor/Lynkr/issues"
|
|
41
63
|
},
|
|
42
|
-
"homepage": "https://github.
|
|
64
|
+
"homepage": "https://fast-editor.github.io/Lynkr/",
|
|
43
65
|
"engines": {
|
|
44
66
|
"node": ">=20.0.0"
|
|
45
67
|
},
|
package/src/api/router.js
CHANGED
package/src/cache/semantic.js
CHANGED
|
@@ -132,10 +132,13 @@ class SemanticCache {
|
|
|
132
132
|
|
|
133
133
|
const hash = crypto.createHash('sha256');
|
|
134
134
|
|
|
135
|
-
// Include system prompt
|
|
135
|
+
// Include only a stable prefix of the system prompt (first 200 chars of static
|
|
136
|
+
// instructions). Lynkr appends dynamic memory/context after the static prefix —
|
|
137
|
+
// hashing the full content causes cache misses between near-identical requests
|
|
138
|
+
// when memory retrieval returns slightly different results.
|
|
136
139
|
const systemMsg = messages.find(m => m.role === 'system');
|
|
137
140
|
if (systemMsg && typeof systemMsg.content === 'string') {
|
|
138
|
-
hash.update(systemMsg.content);
|
|
141
|
+
hash.update(systemMsg.content.substring(0, 200));
|
|
139
142
|
}
|
|
140
143
|
|
|
141
144
|
// Include conversation state indicators to prevent tool loop caching
|
package/src/context/toon.js
CHANGED
|
@@ -18,17 +18,22 @@ function normaliseSettings(settings = {}) {
|
|
|
18
18
|
function resolveEncodeFn(overrideEncode) {
|
|
19
19
|
if (typeof overrideEncode === "function") return overrideEncode;
|
|
20
20
|
if (cachedEncode !== undefined) return cachedEncode;
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
cachedLoadError = cachedEncode ? null : new Error("Missing encode() export from @toon-format/toon");
|
|
25
|
-
} catch (err) {
|
|
26
|
-
cachedEncode = null;
|
|
27
|
-
cachedLoadError = err;
|
|
28
|
-
}
|
|
29
|
-
return cachedEncode;
|
|
21
|
+
// cachedEncode is populated asynchronously at module load via dynamic import below.
|
|
22
|
+
// Return null here — the warn-once log will fire on first request if still loading.
|
|
23
|
+
return null;
|
|
30
24
|
}
|
|
31
25
|
|
|
26
|
+
// @toon-format/toon is ESM-only; dynamic import() is available in CommonJS modules.
|
|
27
|
+
// Pre-warm at startup so encode is ready before the first request arrives.
|
|
28
|
+
import("@toon-format/toon").then((mod) => {
|
|
29
|
+
const fn = mod?.encode ?? mod?.default?.encode ?? null;
|
|
30
|
+
cachedEncode = typeof fn === "function" ? fn : null;
|
|
31
|
+
cachedLoadError = cachedEncode ? null : new Error("Missing encode() export from @toon-format/toon");
|
|
32
|
+
}).catch((err) => {
|
|
33
|
+
cachedEncode = null;
|
|
34
|
+
cachedLoadError = err;
|
|
35
|
+
});
|
|
36
|
+
|
|
32
37
|
function looksLikeJsonObjectOrArray(text) {
|
|
33
38
|
if (typeof text !== "string") return false;
|
|
34
39
|
const trimmed = text.trim();
|