krusch-cascade-router 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +158 -0
- package/dist/index.cjs +254 -0
- package/dist/index.d.cts +60 -0
- package/dist/index.d.ts +60 -0
- package/dist/index.js +226 -0
- package/package.json +54 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 kruschdev
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="docs/assets/banner.png" alt="Krusch Cascade Router" width="800" />
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
<p align="center">
|
|
6
|
+
<strong>Latency-aware LLM router that dynamically cascades between edge and cloud models via logprob inspection.</strong>
|
|
7
|
+
</p>
|
|
8
|
+
|
|
9
|
+
<p align="center">
|
|
10
|
+
<a href="https://www.npmjs.com/package/krusch-cascade-router"><img src="https://img.shields.io/github/package-json/v/kruschdev/krusch-cascade-router.svg?style=flat-square" alt="NPM Version"></a>
|
|
11
|
+
<a href="https://github.com/kruschdev/krusch-cascade-router/blob/main/LICENSE"><img src="https://img.shields.io/github/license/kruschdev/krusch-cascade-router.svg?style=flat-square" alt="License"></a>
|
|
12
|
+
<img src="https://img.shields.io/badge/node-%3E%3D18-blue.svg?style=flat-square" alt="Node Version">
|
|
13
|
+
</p>
|
|
14
|
+
|
|
15
|
+
---
|
|
16
|
+
|
|
17
|
+
## ⚡ Why Krusch Cascade Router?
|
|
18
|
+
|
|
19
|
+
**"LLM routing an LLM is a trap."**
|
|
20
|
+
|
|
21
|
+
Using a massive third LLM to decide which LLM to route a query to adds severe TTFT (Time To First Token) latency and API costs. `krusch-cascade-router` solves this by combining a fast predictive heuristic classifier (<50ms latency) with a reactive logprob-based speculative cascade. Designed specifically for agentic developers building with local AI, it allows you to optimize for cost, performance, and reliability without sacrificing capability.
|
|
22
|
+
|
|
23
|
+
### Key Features
|
|
24
|
+
- **🚀 Sub-50ms Heuristic Classifier:** Evaluates prompt complexity instantly.
|
|
25
|
+
- **🧠 Logprob Speculative Execution:** Reactively cascades to heavy cloud models if the edge model's confidence drops.
|
|
26
|
+
- **🔌 Framework Agnostic:** Can be plugged into any Node.js AI architecture.
|
|
27
|
+
- **🛡️ Custom Heuristics:** Support for `customRules` to inject your own prompt complexity detection logic.
|
|
28
|
+
- **🛑 Native AbortSignal Support:** Manage request timeouts natively via `ChatOptions`.
|
|
29
|
+
- **📦 Dual CJS/ESM Support:** Works in modern ECMAScript and legacy environments.
|
|
30
|
+
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## 🧠 Architecture: How It Works
|
|
34
|
+
|
|
35
|
+
1. **Predictive Classifier**: Instantly evaluates the prompt's complexity via string heuristics (length, code blocks, complex cognitive verbs, or your `customRules`). If classified as complex, it routes directly to the heavy cloud model.
|
|
36
|
+
2. **Speculative Cascade**: If classified as simple, it streams the fast local edge model. It buffers and inspects the logprobs of the first N tokens. If the confidence (probability) dips below your configured threshold, it silently aborts the stream and falls back to the heavy cloud model.
|
|
37
|
+
|
|
38
|
+
```mermaid
|
|
39
|
+
graph TD;
|
|
40
|
+
A[Incoming Prompt] --> B{Heuristic Classifier};
|
|
41
|
+
B -- Complex --> C[Heavy Cloud Model];
|
|
42
|
+
B -- Simple --> D[Local Edge Model];
|
|
43
|
+
D --> E{Evaluate Logprobs first N tokens};
|
|
44
|
+
E -- Confidence >= Threshold --> F[Stream Edge Response];
|
|
45
|
+
E -- Confidence < Threshold --> G[Abort Edge];
|
|
46
|
+
G --> C;
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## 📦 Installation
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
npm install krusch-cascade-router
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
> **Note**: Requires Node.js 18+ for native fetch and `AbortSignal` support.
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## 🚀 Quick Start Guide
|
|
62
|
+
|
|
63
|
+
```javascript
|
|
64
|
+
import { CascadeRouter } from 'krusch-cascade-router';
|
|
65
|
+
|
|
66
|
+
// 1. Initialize the router with your edge and cloud models
|
|
67
|
+
const router = new CascadeRouter({
|
|
68
|
+
fastModel: {
|
|
69
|
+
url: 'http://localhost:11434/v1/chat/completions',
|
|
70
|
+
model: 'qwen2.5:3b' // Edge node tag resolution
|
|
71
|
+
},
|
|
72
|
+
heavyModel: {
|
|
73
|
+
apiKey: process.env.GEMINI_API_KEY,
|
|
74
|
+
model: 'gemini-2.5-pro',
|
|
75
|
+
provider: 'gemini'
|
|
76
|
+
},
|
|
77
|
+
cascadeThreshold: 0.85, // Abort if average probability of first 5 tokens is < 85%
|
|
78
|
+
tokensToEvaluate: 5
|
|
79
|
+
});
|
|
80
|
+
|
|
81
|
+
// 2. Send a chat request
|
|
82
|
+
const response = await router.chat("Write a complex architectural plan...");
|
|
83
|
+
|
|
84
|
+
// 3. Check where it was routed
|
|
85
|
+
console.log(`Routed to: ${response.routedTo}`);
|
|
86
|
+
console.log(response.text);
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
## 🛠️ Advanced Usage
|
|
92
|
+
|
|
93
|
+
### Custom Heuristic Rules (`customRules`)
|
|
94
|
+
You can inject your own detection logic to fine-tune what goes directly to the cloud model:
|
|
95
|
+
|
|
96
|
+
```javascript
|
|
97
|
+
const router = new CascadeRouter({
|
|
98
|
+
// ...models config
|
|
99
|
+
customRules: [
|
|
100
|
+
(prompt) => prompt.includes('PostgreSQL'), // Always route DB questions to cloud
|
|
101
|
+
(prompt) => prompt.length > 2000 // Override default length heuristics
|
|
102
|
+
]
|
|
103
|
+
});
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Timeouts and AbortSignals
|
|
107
|
+
Native integration with `AbortSignal` for graceful timeout handling:
|
|
108
|
+
|
|
109
|
+
```javascript
|
|
110
|
+
const controller = new AbortController();
|
|
111
|
+
setTimeout(() => controller.abort(), 10000); // 10s timeout
|
|
112
|
+
|
|
113
|
+
try {
|
|
114
|
+
const response = await router.chat("Analyze this dataset", undefined, {
  signal: controller.signal
});
|
|
117
|
+
} catch (err) {
|
|
118
|
+
if (err.name === 'AbortError') {
|
|
119
|
+
console.log('Request was timed out or aborted manually.');
|
|
120
|
+
}
|
|
121
|
+
}
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## 📚 API Reference
|
|
127
|
+
|
|
128
|
+
### `new CascadeRouter(config)`
|
|
129
|
+
|
|
130
|
+
| Property | Type | Description |
|
|
131
|
+
|---|---|---|
|
|
132
|
+
| `fastModel` | `ModelConfig` | Configuration for your fast, local edge model (e.g. Ollama). |
|
|
133
|
+
| `heavyModel` | `ModelConfig` | Configuration for your heavy cloud fallback (e.g. Gemini, OpenAI). |
|
|
134
|
+
| `cascadeThreshold` | `number` | Confidence probability (0.0 to 1.0). If logprobs dip below this, it cascades. |
|
|
135
|
+
| `tokensToEvaluate` | `number` | How many tokens to buffer before making the speculative decision. |
|
|
136
|
+
| `customRules` | `Array<(prompt: string) => boolean>` | *(Optional)* Array of heuristic functions to override complex prompt detection. |
|
|
137
|
+
|
|
138
|
+
### `router.chat(messages, systemPrompt?, options?)`
|
|
139
|
+
|
|
140
|
+
| Parameter | Type | Description |
|---|---|---|
| `messages` | `Message[] \| string` | The user's input prompt, or a full array of `{ role, content }` messages. |
| `systemPrompt` | `string` | *(Optional)* System prompt, prepended as a `system` message. |
| `options` | `ChatOptions` | *(Optional)* Options like `{ signal: AbortSignal }`. |

**Returns:** `Promise<{ text: string, routedTo: 'fast' | 'heavy', aborted: boolean }>`
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## 🤝 Contributing
|
|
150
|
+
|
|
151
|
+
We welcome contributions! Please follow the established homelab conventions:
|
|
152
|
+
- Library code must NEVER use `console.warn` or `console.log` directly. Route diagnostics through callback options (`onEvent` pattern).
|
|
153
|
+
- Ensure your `AbortSignal` listeners use `{ once: true }` to prevent leaks.
|
|
154
|
+
- Run tests via `npm test` before submitting PRs.
|
|
155
|
+
|
|
156
|
+
## 📄 License
|
|
157
|
+
|
|
158
|
+
MIT License © 2026 kruschdev
|
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
"use strict";
// --- esbuild-generated CommonJS interop helpers ---
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;
// Installs lazy enumerable getters on `target` for every entry in `all`.
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
// Copies own properties from `from` to `to` (skipping `except` and keys `to`
// already has), preserving enumerability by re-reading each descriptor.
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
// Wraps the export map in an `__esModule`-flagged object for CJS consumers.
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);

// src/index.ts
// Public export surface of the package (CJS build).
var index_exports = {};
__export(index_exports, {
  CascadeRouter: () => CascadeRouter,
  isComplexPrompt: () => isComplexPrompt
});
module.exports = __toCommonJS(index_exports);
|
|
27
|
+
|
|
28
|
+
// src/classifier.ts
|
|
29
|
+
/**
 * Fast heuristic classifier: returns true when the prompt looks "complex"
 * and should be routed straight to the heavy cloud model.
 *
 * Signals checked, in order:
 *  1. Total text length above `lengthThreshold` (default 2000 chars).
 *  2. Structural/semantic markers: code fences, XML/HTML tags, JSON-like
 *     bodies, or complex cognitive verbs.
 *  3. Any user-supplied `customRules` — either RegExp markers or predicate
 *     functions `(text) => boolean`, matching the documented API.
 *
 * @param {Array<{role: string, content: string}>|string} messages - Chat messages or a raw prompt string.
 * @param {{lengthThreshold?: number, customRules?: Array<RegExp|((text: string) => boolean)>}} [options]
 * @returns {boolean} true if the prompt is classified as complex.
 */
function isComplexPrompt(messages, options) {
  // `??` (not `||`) so an explicit lengthThreshold of 0 is honored.
  const lengthThreshold = options?.lengthThreshold ?? 2e3;
  const fullText = Array.isArray(messages) ? messages.map((m) => m.content).join("\n") : messages;
  if (fullText.length > lengthThreshold) {
    return true;
  }
  const complexMarkers = [
    /```[a-z]*/i,
    // Contains code blocks
    /<\/?([a-z][a-z0-9]*)\b[^>]*>/i,
    // Contains XML/HTML tags
    /\{[\s\S]*"[\s\S]*\}/,
    // Contains JSON-like structures
    /\b(analyze|evaluate|architect|synthesize|speculate|refactor)\b/i
    // Complex cognitive verbs
  ];
  if (options?.customRules) {
    complexMarkers.push(...options.customRules);
  }
  for (const marker of complexMarkers) {
    // Accept both RegExp markers and predicate functions; the README's
    // `customRules` examples use `(prompt) => boolean` functions, which the
    // previous RegExp-only `.test()` call would have thrown on.
    const matched = typeof marker === "function" ? marker(fullText) : marker.test(fullText);
    if (matched) {
      return true;
    }
  }
  return false;
}
|
|
55
|
+
|
|
56
|
+
// src/cascade.ts
|
|
57
|
+
/**
 * Latency-aware router: tries a fast (edge) model first and cascades to a
 * heavy (cloud) model when the classifier flags the prompt as complex, the
 * fast model's early-token confidence is low, or the fast request fails.
 */
var CascadeRouter = class {
  // RouterConfig with cascadeThreshold / tokensToEvaluate defaults filled in.
  config;
  // fetch implementation used for every HTTP call (injectable for tests).
  fetchFn;
  constructor(config) {
    // Shallow-copy the config and apply numeric defaults with `??` so an
    // explicit 0 from the caller is preserved.
    this.config = {
      ...config,
      cascadeThreshold: config.cascadeThreshold ?? 0.85,
      tokensToEvaluate: config.tokensToEvaluate ?? 5
    };
    // Prefer an injected fetch, else the global one (Node 18+ / browsers).
    this.fetchFn = config.fetch ?? (typeof globalThis !== "undefined" ? globalThis.fetch : fetch);
    if (!this.fetchFn) {
      throw new Error("A global fetch API is required, or a custom fetch implementation must be provided in RouterConfig.");
    }
  }
  /**
   * Complete a chat request, routing automatically.
   *
   * Routing order:
   *  1. Heuristic classifier says "complex" -> heavy model directly.
   *  2. Otherwise stream the fast model; if its early-token confidence is
   *     below the cascade threshold (or it errors), fall back to heavy.
   *
   * @param {Array<{role: string, content: string}>|string} messages - prompt string or message array
   * @param {string} [systemPrompt] - optional system prompt, prepended as a system message
   * @param {{signal?: AbortSignal}} [options] - forwarded to the HTTP requests
   * @returns {Promise<{text: string, routedTo: 'fast'|'heavy', aborted: boolean}>}
   */
  async chat(messages, systemPrompt, options) {
    const formattedMessages = this.formatMessages(messages, systemPrompt);
    const isComplex = isComplexPrompt(formattedMessages, this.config.classifier);
    if (isComplex) {
      // Skip the speculative fast attempt entirely.
      this.config.onEvent?.("route_heavy", { reason: "classifier_heuristic" });
      const text = await this.fetchHeavyModel(formattedMessages, options);
      return { text, routedTo: "heavy", aborted: false };
    }
    try {
      const fastResult = await this.streamAndEvaluateFastModel(formattedMessages, options);
      if (fastResult.aborted) {
        // Low confidence in the first N tokens: discard the fast output.
        this.config.onEvent?.("route_heavy", { reason: "cascade_fallback" });
        const heavyText = await this.fetchHeavyModel(formattedMessages, options);
        return { text: heavyText, routedTo: "heavy", aborted: true };
      }
      this.config.onEvent?.("route_fast", { reason: "high_confidence" });
      return { text: fastResult.text, routedTo: "fast", aborted: false };
    } catch (err) {
      // Any fast-model failure (network, HTTP error, parse) falls back to heavy.
      // NOTE(review): a caller-initiated AbortError is also caught here and
      // triggers a heavy attempt with the same (already aborted) signal —
      // confirm this is intended rather than rethrowing AbortError directly.
      this.config.onEvent?.("route_heavy", { reason: "fast_model_error", error: err.message });
      const heavyText = await this.fetchHeavyModel(formattedMessages, options);
      return { text: heavyText, routedTo: "heavy", aborted: true };
    }
  }
  /**
   * Normalizes the caller's input into an OpenAI-style message array,
   * prepending `systemPrompt` (when given) as a system message.
   */
  formatMessages(messages, systemPrompt) {
    const msgs = [];
    if (systemPrompt) {
      msgs.push({ role: "system", content: systemPrompt });
    }
    if (typeof messages === "string") {
      msgs.push({ role: "user", content: messages });
    } else {
      msgs.push(...messages);
    }
    return msgs;
  }
  /**
   * Streams the fast model, buffering the first N tokens to check logprobs.
   * If confidence is lower than threshold, aborts and returns { aborted: true }.
   *
   * Confidence = mean of exp(logprob) (linear probability) over the first
   * `tokensToEvaluate` tokens that carry a logprob. The decision is made
   * exactly once, when that count is reached.
   */
  async streamAndEvaluateFastModel(messages, options) {
    const { fastModel } = this.config;
    // Default endpoint is a local Ollama OpenAI-compatible server.
    const url = fastModel.url || "http://localhost:11434/v1/chat/completions";
    const headers = { "Content-Type": "application/json" };
    if (fastModel.apiKey) headers["Authorization"] = `Bearer ${fastModel.apiKey}`;
    // Internal controller so the router can abort the stream on cascade;
    // the caller's signal is chained onto it ({ once: true } avoids leaks).
    const controller = new AbortController();
    if (options?.signal) {
      options.signal.addEventListener("abort", () => controller.abort(), { once: true });
      // A listener added to an already-aborted signal never fires, so
      // mirror the aborted state explicitly.
      if (options.signal.aborted) controller.abort();
    }
    const response = await this.fetchFn(url, {
      method: "POST",
      headers,
      signal: controller.signal,
      body: JSON.stringify({
        model: fastModel.model,
        messages,
        stream: true,
        logprobs: true
        // Request logprobs (OpenAI format)
      })
    });
    if (!response.ok) {
      throw new Error(`Fast model HTTP ${response.status}`);
    }
    if (!response.body) throw new Error("No response body");
    const reader = response.body.getReader();
    const decoder = new TextDecoder("utf-8");
    let fullText = "";
    let tokenCount = 0;
    let accumulatedProb = 0;
    // Carries a partial SSE line between reads; only complete lines are parsed.
    let buffer = "";
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split("\n");
      // Last element may be an incomplete line — keep it for the next chunk.
      buffer = lines.pop() || "";
      for (const line of lines) {
        const trimmed = line.trim();
        // Skip blank keep-alives and the SSE terminator.
        if (!trimmed || trimmed === "data: [DONE]") continue;
        if (!trimmed.startsWith("data: ")) continue;
        try {
          const data = JSON.parse(trimmed.slice(6));
          const choice = data.choices?.[0];
          const delta = choice?.delta?.content || "";
          if (delta) fullText += delta;
          // OpenAI streaming format: choices[0].logprobs.content is an array
          // of per-token entries, each with a `logprob` field.
          const logprobsObj = choice?.logprobs?.content;
          if (logprobsObj && Array.isArray(logprobsObj) && logprobsObj.length > 0) {
            for (const lp of logprobsObj) {
              const logprob = lp.logprob;
              if (logprob !== void 0) {
                // Convert log-probability to linear probability in (0, 1].
                const linearProb = Math.exp(logprob);
                accumulatedProb += linearProb;
                tokenCount++;
                if (tokenCount === this.config.tokensToEvaluate) {
                  const avgProb = accumulatedProb / tokenCount;
                  if (avgProb < (this.config.cascadeThreshold || 0.85)) {
                    // Low confidence: silently kill the edge stream and
                    // signal the caller to cascade to the heavy model.
                    this.config.onEvent?.("cascade_triggered", { tokenCount, avgProb, threshold: this.config.cascadeThreshold || 0.85 });
                    controller.abort();
                    return { text: "", aborted: true };
                  }
                }
              }
            }
          }
        } catch (e) {
          // Tolerate malformed/partial JSON frames; rethrow anything else.
          if (!(e instanceof SyntaxError)) throw e;
        }
      }
    }
    // Flush a final frame that arrived without a trailing newline.
    if (buffer.trim().startsWith("data: ")) {
      try {
        const data = JSON.parse(buffer.trim().slice(6));
        const delta = data.choices?.[0]?.delta?.content || "";
        if (delta) fullText += delta;
      } catch (e) {
        if (!(e instanceof SyntaxError)) throw e;
      }
    }
    return { text: fullText, aborted: false };
  }
  /**
   * Fallback to heavy model. Only returns the full string for now.
   * Dispatches to the Gemini REST shape when provider === 'gemini',
   * otherwise uses the OpenAI-compatible non-streaming endpoint.
   */
  async fetchHeavyModel(messages, options) {
    const { heavyModel } = this.config;
    const provider = heavyModel.provider || "openai";
    if (provider === "gemini") {
      return this.fetchGemini(messages, options);
    }
    const url = heavyModel.url || "https://api.openai.com/v1/chat/completions";
    const headers = { "Content-Type": "application/json" };
    if (heavyModel.apiKey) headers["Authorization"] = `Bearer ${heavyModel.apiKey}`;
    const response = await this.fetchFn(url, {
      method: "POST",
      headers,
      // Caller's signal is passed straight through — no internal controller
      // is needed since there is no speculative abort on the heavy path.
      signal: options?.signal,
      body: JSON.stringify({
        model: heavyModel.model,
        messages,
        stream: false
      })
    });
    if (!response.ok) {
      throw new Error(`Heavy model HTTP ${response.status}`);
    }
    const data = await response.json();
    return data.choices?.[0]?.message?.content || "";
  }
  /**
   * Calls the Gemini generateContent REST endpoint, translating OpenAI-style
   * messages into Gemini's contents/system_instruction shape.
   */
  async fetchGemini(messages, options) {
    const { heavyModel } = this.config;
    const apiKey = heavyModel.apiKey;
    if (!apiKey) throw new Error("Gemini requires an API key");
    // NOTE(review): the API key is embedded in the query string; Gemini also
    // accepts an `x-goog-api-key` header, which keeps keys out of URLs/logs.
    const url = `https://generativelanguage.googleapis.com/v1beta/models/${heavyModel.model}:generateContent?key=${apiKey}`;
    const systemPrompt = messages.find((m) => m.role === "system")?.content;
    // Gemini has no 'assistant' role; map it to 'model', everything else to 'user'.
    const contents = messages.filter((m) => m.role !== "system").map((m) => ({
      role: m.role === "assistant" ? "model" : "user",
      parts: [{ text: m.content }]
    }));
    const body = { contents };
    if (systemPrompt) {
      body.system_instruction = { parts: [{ text: systemPrompt }] };
    }
    const response = await this.fetchFn(url, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      signal: options?.signal,
      body: JSON.stringify(body)
    });
    if (!response.ok) {
      throw new Error(`Gemini HTTP ${response.status}: ${await response.text()}`);
    }
    const data = await response.json();
    return data.candidates?.[0]?.content?.parts?.[0]?.text || "";
  }
};
|
|
250
|
+
// Annotate the CommonJS export names for ESM import in node:
// (dead-code pattern emitted by esbuild: `0 && (...)` never executes, but
// Node's static analyzer reads it to expose named exports to ESM importers)
0 && (module.exports = {
  CascadeRouter,
  isComplexPrompt
});
|
package/dist/index.d.cts
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/** A single chat message (OpenAI-style role/content pair). */
interface Message {
    role: 'system' | 'user' | 'assistant';
    content: string;
}
/** Tuning options for the heuristic prompt-complexity classifier. */
interface ClassifierOptions {
    /** Character count above which a prompt is always classified complex (default 2000). */
    lengthThreshold?: number;
    /**
     * Additional markers; any match classifies the prompt as complex.
     * NOTE(review): the README documents predicate functions for customRules —
     * confirm whether this type should also admit `(text: string) => boolean`.
     */
    customRules?: RegExp[];
}
/**
 * A fast, <50ms heuristic classifier to predict if a prompt is "simple" or "complex".
 * Evaluates message length and structural markers (code blocks, XML, JSON).
 */
declare function isComplexPrompt(messages: Message[] | string, options?: ClassifierOptions): boolean;

/** Connection settings for a single model endpoint. */
interface ModelConfig {
    /** Endpoint URL; implementation falls back to a role-specific default. */
    url?: string;
    /** Bearer token / API key, if the endpoint requires one. */
    apiKey?: string;
    /** Model identifier sent to the endpoint. */
    model: string;
    /** Wire protocol for the heavy model; implementation defaults to 'openai'. */
    provider?: 'openai' | 'gemini';
}
/** Telemetry event names emitted via RouterConfig.onEvent. */
type TelemetryEvent = 'route_fast' | 'route_heavy' | 'cascade_triggered';
/** Top-level configuration for CascadeRouter. */
interface RouterConfig {
    /** Fast, local edge model (tried first for simple prompts). */
    fastModel: ModelConfig;
    /** Heavy cloud fallback model. */
    heavyModel: ModelConfig;
    /** Mean early-token probability below which the router cascades (default 0.85). */
    cascadeThreshold?: number;
    /** How many leading tokens to evaluate before deciding (default 5). */
    tokensToEvaluate?: number;
    /** Options forwarded to the complexity classifier. */
    classifier?: ClassifierOptions;
    /** Custom fetch implementation (defaults to the global fetch). */
    fetch?: typeof fetch;
    /** Telemetry callback; diagnostics are routed here instead of the console. */
    onEvent?: (event: TelemetryEvent, metadata?: Record<string, any>) => void;
}
/** Result of a routed chat call. */
interface CascadeResponse {
    /** Final response text. */
    text: string;
    /** Which model produced the text. */
    routedTo: 'fast' | 'heavy';
    /** True when the fast attempt was abandoned (cascade or fast-model error). */
    aborted: boolean;
}
/** Per-call options. */
interface ChatOptions {
    /** Abort/timeout signal applied to the underlying HTTP requests. */
    signal?: AbortSignal;
}
declare class CascadeRouter {
    private config;
    private fetchFn;
    constructor(config: RouterConfig);
    /**
     * Complete a chat request, routing automatically.
     */
    chat(messages: Message[] | string, systemPrompt?: string, options?: ChatOptions): Promise<CascadeResponse>;
    private formatMessages;
    /**
     * Streams the fast model, buffering the first N tokens to check logprobs.
     * If confidence is lower than threshold, aborts and returns { aborted: true }.
     */
    private streamAndEvaluateFastModel;
    /**
     * Fallback to heavy model. Only returns the full string for now.
     */
    private fetchHeavyModel;
    private fetchGemini;
}

export { type CascadeResponse, CascadeRouter, type ChatOptions, type ClassifierOptions, type Message, type ModelConfig, type RouterConfig, type TelemetryEvent, isComplexPrompt };
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/** A single chat message (OpenAI-style role/content pair). */
interface Message {
    role: 'system' | 'user' | 'assistant';
    content: string;
}
/** Tuning options for the heuristic prompt-complexity classifier. */
interface ClassifierOptions {
    /** Character count above which a prompt is always classified complex (default 2000). */
    lengthThreshold?: number;
    /**
     * Additional markers; any match classifies the prompt as complex.
     * NOTE(review): the README documents predicate functions for customRules —
     * confirm whether this type should also admit `(text: string) => boolean`.
     */
    customRules?: RegExp[];
}
/**
 * A fast, <50ms heuristic classifier to predict if a prompt is "simple" or "complex".
 * Evaluates message length and structural markers (code blocks, XML, JSON).
 */
declare function isComplexPrompt(messages: Message[] | string, options?: ClassifierOptions): boolean;

/** Connection settings for a single model endpoint. */
interface ModelConfig {
    /** Endpoint URL; implementation falls back to a role-specific default. */
    url?: string;
    /** Bearer token / API key, if the endpoint requires one. */
    apiKey?: string;
    /** Model identifier sent to the endpoint. */
    model: string;
    /** Wire protocol for the heavy model; implementation defaults to 'openai'. */
    provider?: 'openai' | 'gemini';
}
/** Telemetry event names emitted via RouterConfig.onEvent. */
type TelemetryEvent = 'route_fast' | 'route_heavy' | 'cascade_triggered';
/** Top-level configuration for CascadeRouter. */
interface RouterConfig {
    /** Fast, local edge model (tried first for simple prompts). */
    fastModel: ModelConfig;
    /** Heavy cloud fallback model. */
    heavyModel: ModelConfig;
    /** Mean early-token probability below which the router cascades (default 0.85). */
    cascadeThreshold?: number;
    /** How many leading tokens to evaluate before deciding (default 5). */
    tokensToEvaluate?: number;
    /** Options forwarded to the complexity classifier. */
    classifier?: ClassifierOptions;
    /** Custom fetch implementation (defaults to the global fetch). */
    fetch?: typeof fetch;
    /** Telemetry callback; diagnostics are routed here instead of the console. */
    onEvent?: (event: TelemetryEvent, metadata?: Record<string, any>) => void;
}
/** Result of a routed chat call. */
interface CascadeResponse {
    /** Final response text. */
    text: string;
    /** Which model produced the text. */
    routedTo: 'fast' | 'heavy';
    /** True when the fast attempt was abandoned (cascade or fast-model error). */
    aborted: boolean;
}
/** Per-call options. */
interface ChatOptions {
    /** Abort/timeout signal applied to the underlying HTTP requests. */
    signal?: AbortSignal;
}
declare class CascadeRouter {
    private config;
    private fetchFn;
    constructor(config: RouterConfig);
    /**
     * Complete a chat request, routing automatically.
     */
    chat(messages: Message[] | string, systemPrompt?: string, options?: ChatOptions): Promise<CascadeResponse>;
    private formatMessages;
    /**
     * Streams the fast model, buffering the first N tokens to check logprobs.
     * If confidence is lower than threshold, aborts and returns { aborted: true }.
     */
    private streamAndEvaluateFastModel;
    /**
     * Fallback to heavy model. Only returns the full string for now.
     */
    private fetchHeavyModel;
    private fetchGemini;
}

export { type CascadeResponse, CascadeRouter, type ChatOptions, type ClassifierOptions, type Message, type ModelConfig, type RouterConfig, type TelemetryEvent, isComplexPrompt };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
// src/classifier.ts
|
|
2
|
+
/**
 * Fast heuristic classifier: returns true when the prompt looks "complex"
 * and should be routed straight to the heavy cloud model.
 *
 * Signals checked, in order:
 *  1. Total text length above `lengthThreshold` (default 2000 chars).
 *  2. Structural/semantic markers: code fences, XML/HTML tags, JSON-like
 *     bodies, or complex cognitive verbs.
 *  3. Any user-supplied `customRules` — either RegExp markers or predicate
 *     functions `(text) => boolean`, matching the documented API.
 *
 * @param {Array<{role: string, content: string}>|string} messages - Chat messages or a raw prompt string.
 * @param {{lengthThreshold?: number, customRules?: Array<RegExp|((text: string) => boolean)>}} [options]
 * @returns {boolean} true if the prompt is classified as complex.
 */
function isComplexPrompt(messages, options) {
  // `??` (not `||`) so an explicit lengthThreshold of 0 is honored.
  const lengthThreshold = options?.lengthThreshold ?? 2e3;
  const fullText = Array.isArray(messages) ? messages.map((m) => m.content).join("\n") : messages;
  if (fullText.length > lengthThreshold) {
    return true;
  }
  const complexMarkers = [
    /```[a-z]*/i,
    // Contains code blocks
    /<\/?([a-z][a-z0-9]*)\b[^>]*>/i,
    // Contains XML/HTML tags
    /\{[\s\S]*"[\s\S]*\}/,
    // Contains JSON-like structures
    /\b(analyze|evaluate|architect|synthesize|speculate|refactor)\b/i
    // Complex cognitive verbs
  ];
  if (options?.customRules) {
    complexMarkers.push(...options.customRules);
  }
  for (const marker of complexMarkers) {
    // Accept both RegExp markers and predicate functions; the README's
    // `customRules` examples use `(prompt) => boolean` functions, which the
    // previous RegExp-only `.test()` call would have thrown on.
    const matched = typeof marker === "function" ? marker(fullText) : marker.test(fullText);
    if (matched) {
      return true;
    }
  }
  return false;
}
|
|
28
|
+
|
|
29
|
+
// src/cascade.ts
|
|
30
|
+
var CascadeRouter = class {
|
|
31
|
+
config;
|
|
32
|
+
fetchFn;
|
|
33
|
+
constructor(config) {
|
|
34
|
+
this.config = {
|
|
35
|
+
...config,
|
|
36
|
+
cascadeThreshold: config.cascadeThreshold ?? 0.85,
|
|
37
|
+
tokensToEvaluate: config.tokensToEvaluate ?? 5
|
|
38
|
+
};
|
|
39
|
+
this.fetchFn = config.fetch ?? (typeof globalThis !== "undefined" ? globalThis.fetch : fetch);
|
|
40
|
+
if (!this.fetchFn) {
|
|
41
|
+
throw new Error("A global fetch API is required, or a custom fetch implementation must be provided in RouterConfig.");
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
/**
|
|
45
|
+
* Complete a chat request, routing automatically.
|
|
46
|
+
*/
|
|
47
|
+
async chat(messages, systemPrompt, options) {
|
|
48
|
+
const formattedMessages = this.formatMessages(messages, systemPrompt);
|
|
49
|
+
const isComplex = isComplexPrompt(formattedMessages, this.config.classifier);
|
|
50
|
+
if (isComplex) {
|
|
51
|
+
this.config.onEvent?.("route_heavy", { reason: "classifier_heuristic" });
|
|
52
|
+
const text = await this.fetchHeavyModel(formattedMessages, options);
|
|
53
|
+
return { text, routedTo: "heavy", aborted: false };
|
|
54
|
+
}
|
|
55
|
+
try {
|
|
56
|
+
const fastResult = await this.streamAndEvaluateFastModel(formattedMessages, options);
|
|
57
|
+
if (fastResult.aborted) {
|
|
58
|
+
this.config.onEvent?.("route_heavy", { reason: "cascade_fallback" });
|
|
59
|
+
const heavyText = await this.fetchHeavyModel(formattedMessages, options);
|
|
60
|
+
return { text: heavyText, routedTo: "heavy", aborted: true };
|
|
61
|
+
}
|
|
62
|
+
this.config.onEvent?.("route_fast", { reason: "high_confidence" });
|
|
63
|
+
return { text: fastResult.text, routedTo: "fast", aborted: false };
|
|
64
|
+
} catch (err) {
|
|
65
|
+
this.config.onEvent?.("route_heavy", { reason: "fast_model_error", error: err.message });
|
|
66
|
+
const heavyText = await this.fetchHeavyModel(formattedMessages, options);
|
|
67
|
+
return { text: heavyText, routedTo: "heavy", aborted: true };
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
formatMessages(messages, systemPrompt) {
|
|
71
|
+
const msgs = [];
|
|
72
|
+
if (systemPrompt) {
|
|
73
|
+
msgs.push({ role: "system", content: systemPrompt });
|
|
74
|
+
}
|
|
75
|
+
if (typeof messages === "string") {
|
|
76
|
+
msgs.push({ role: "user", content: messages });
|
|
77
|
+
} else {
|
|
78
|
+
msgs.push(...messages);
|
|
79
|
+
}
|
|
80
|
+
return msgs;
|
|
81
|
+
}
|
|
82
|
+
/**
|
|
83
|
+
* Streams the fast model, buffering the first N tokens to check logprobs.
|
|
84
|
+
* If confidence is lower than threshold, aborts and returns { aborted: true }.
|
|
85
|
+
*/
|
|
86
|
+
async streamAndEvaluateFastModel(messages, options) {
|
|
87
|
+
const { fastModel } = this.config;
|
|
88
|
+
const url = fastModel.url || "http://localhost:11434/v1/chat/completions";
|
|
89
|
+
const headers = { "Content-Type": "application/json" };
|
|
90
|
+
if (fastModel.apiKey) headers["Authorization"] = `Bearer ${fastModel.apiKey}`;
|
|
91
|
+
const controller = new AbortController();
|
|
92
|
+
if (options?.signal) {
|
|
93
|
+
options.signal.addEventListener("abort", () => controller.abort(), { once: true });
|
|
94
|
+
if (options.signal.aborted) controller.abort();
|
|
95
|
+
}
|
|
96
|
+
const response = await this.fetchFn(url, {
|
|
97
|
+
method: "POST",
|
|
98
|
+
headers,
|
|
99
|
+
signal: controller.signal,
|
|
100
|
+
body: JSON.stringify({
|
|
101
|
+
model: fastModel.model,
|
|
102
|
+
messages,
|
|
103
|
+
stream: true,
|
|
104
|
+
logprobs: true
|
|
105
|
+
// Request logprobs (OpenAI format)
|
|
106
|
+
})
|
|
107
|
+
});
|
|
108
|
+
if (!response.ok) {
|
|
109
|
+
throw new Error(`Fast model HTTP ${response.status}`);
|
|
110
|
+
}
|
|
111
|
+
if (!response.body) throw new Error("No response body");
|
|
112
|
+
const reader = response.body.getReader();
|
|
113
|
+
const decoder = new TextDecoder("utf-8");
|
|
114
|
+
let fullText = "";
|
|
115
|
+
let tokenCount = 0;
|
|
116
|
+
let accumulatedProb = 0;
|
|
117
|
+
let buffer = "";
|
|
118
|
+
while (true) {
|
|
119
|
+
const { done, value } = await reader.read();
|
|
120
|
+
if (done) break;
|
|
121
|
+
buffer += decoder.decode(value, { stream: true });
|
|
122
|
+
const lines = buffer.split("\n");
|
|
123
|
+
buffer = lines.pop() || "";
|
|
124
|
+
for (const line of lines) {
|
|
125
|
+
const trimmed = line.trim();
|
|
126
|
+
if (!trimmed || trimmed === "data: [DONE]") continue;
|
|
127
|
+
if (!trimmed.startsWith("data: ")) continue;
|
|
128
|
+
try {
|
|
129
|
+
const data = JSON.parse(trimmed.slice(6));
|
|
130
|
+
const choice = data.choices?.[0];
|
|
131
|
+
const delta = choice?.delta?.content || "";
|
|
132
|
+
if (delta) fullText += delta;
|
|
133
|
+
const logprobsObj = choice?.logprobs?.content;
|
|
134
|
+
if (logprobsObj && Array.isArray(logprobsObj) && logprobsObj.length > 0) {
|
|
135
|
+
for (const lp of logprobsObj) {
|
|
136
|
+
const logprob = lp.logprob;
|
|
137
|
+
if (logprob !== void 0) {
|
|
138
|
+
const linearProb = Math.exp(logprob);
|
|
139
|
+
accumulatedProb += linearProb;
|
|
140
|
+
tokenCount++;
|
|
141
|
+
if (tokenCount === this.config.tokensToEvaluate) {
|
|
142
|
+
const avgProb = accumulatedProb / tokenCount;
|
|
143
|
+
if (avgProb < (this.config.cascadeThreshold || 0.85)) {
|
|
144
|
+
this.config.onEvent?.("cascade_triggered", { tokenCount, avgProb, threshold: this.config.cascadeThreshold || 0.85 });
|
|
145
|
+
controller.abort();
|
|
146
|
+
return { text: "", aborted: true };
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
} catch (e) {
|
|
153
|
+
if (!(e instanceof SyntaxError)) throw e;
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
if (buffer.trim().startsWith("data: ")) {
|
|
158
|
+
try {
|
|
159
|
+
const data = JSON.parse(buffer.trim().slice(6));
|
|
160
|
+
const delta = data.choices?.[0]?.delta?.content || "";
|
|
161
|
+
if (delta) fullText += delta;
|
|
162
|
+
} catch (e) {
|
|
163
|
+
if (!(e instanceof SyntaxError)) throw e;
|
|
164
|
+
}
|
|
165
|
+
}
|
|
166
|
+
return { text: fullText, aborted: false };
|
|
167
|
+
}
|
|
168
|
+
/**
|
|
169
|
+
* Fallback to heavy model. Only returns the full string for now.
|
|
170
|
+
*/
|
|
171
|
+
async fetchHeavyModel(messages, options) {
|
|
172
|
+
const { heavyModel } = this.config;
|
|
173
|
+
const provider = heavyModel.provider || "openai";
|
|
174
|
+
if (provider === "gemini") {
|
|
175
|
+
return this.fetchGemini(messages, options);
|
|
176
|
+
}
|
|
177
|
+
const url = heavyModel.url || "https://api.openai.com/v1/chat/completions";
|
|
178
|
+
const headers = { "Content-Type": "application/json" };
|
|
179
|
+
if (heavyModel.apiKey) headers["Authorization"] = `Bearer ${heavyModel.apiKey}`;
|
|
180
|
+
const response = await this.fetchFn(url, {
|
|
181
|
+
method: "POST",
|
|
182
|
+
headers,
|
|
183
|
+
signal: options?.signal,
|
|
184
|
+
body: JSON.stringify({
|
|
185
|
+
model: heavyModel.model,
|
|
186
|
+
messages,
|
|
187
|
+
stream: false
|
|
188
|
+
})
|
|
189
|
+
});
|
|
190
|
+
if (!response.ok) {
|
|
191
|
+
throw new Error(`Heavy model HTTP ${response.status}`);
|
|
192
|
+
}
|
|
193
|
+
const data = await response.json();
|
|
194
|
+
return data.choices?.[0]?.message?.content || "";
|
|
195
|
+
}
|
|
196
|
+
async fetchGemini(messages, options) {
|
|
197
|
+
const { heavyModel } = this.config;
|
|
198
|
+
const apiKey = heavyModel.apiKey;
|
|
199
|
+
if (!apiKey) throw new Error("Gemini requires an API key");
|
|
200
|
+
const url = `https://generativelanguage.googleapis.com/v1beta/models/${heavyModel.model}:generateContent?key=${apiKey}`;
|
|
201
|
+
const systemPrompt = messages.find((m) => m.role === "system")?.content;
|
|
202
|
+
const contents = messages.filter((m) => m.role !== "system").map((m) => ({
|
|
203
|
+
role: m.role === "assistant" ? "model" : "user",
|
|
204
|
+
parts: [{ text: m.content }]
|
|
205
|
+
}));
|
|
206
|
+
const body = { contents };
|
|
207
|
+
if (systemPrompt) {
|
|
208
|
+
body.system_instruction = { parts: [{ text: systemPrompt }] };
|
|
209
|
+
}
|
|
210
|
+
const response = await this.fetchFn(url, {
|
|
211
|
+
method: "POST",
|
|
212
|
+
headers: { "Content-Type": "application/json" },
|
|
213
|
+
signal: options?.signal,
|
|
214
|
+
body: JSON.stringify(body)
|
|
215
|
+
});
|
|
216
|
+
if (!response.ok) {
|
|
217
|
+
throw new Error(`Gemini HTTP ${response.status}: ${await response.text()}`);
|
|
218
|
+
}
|
|
219
|
+
const data = await response.json();
|
|
220
|
+
return data.candidates?.[0]?.content?.parts?.[0]?.text || "";
|
|
221
|
+
}
|
|
222
|
+
};
|
|
223
|
+
export {
|
|
224
|
+
CascadeRouter,
|
|
225
|
+
isComplexPrompt
|
|
226
|
+
};
|
package/package.json
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "krusch-cascade-router",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Latency-aware LLM router that dynamically cascades between edge and cloud models via logprob inspection.",
|
|
5
|
+
"main": "dist/index.cjs",
|
|
6
|
+
"module": "dist/index.js",
|
|
7
|
+
"types": "dist/index.d.ts",
|
|
8
|
+
"type": "module",
|
|
9
|
+
"exports": {
|
|
10
|
+
".": {
|
|
11
|
+
"import": {
|
|
12
|
+
"types": "./dist/index.d.ts",
|
|
13
|
+
"default": "./dist/index.js"
|
|
14
|
+
},
|
|
15
|
+
"require": {
|
|
16
|
+
"types": "./dist/index.d.cts",
|
|
17
|
+
"default": "./dist/index.cjs"
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
},
|
|
21
|
+
"files": [
|
|
22
|
+
"dist",
|
|
23
|
+
"README.md",
|
|
24
|
+
"LICENSE"
|
|
25
|
+
],
|
|
26
|
+
"scripts": {
|
|
27
|
+
"build": "tsup src/index.ts --format cjs,esm --dts",
|
|
28
|
+
"dev": "tsup src/index.ts --format cjs,esm --watch",
|
|
29
|
+
"test": "node --test"
|
|
30
|
+
},
|
|
31
|
+
"keywords": [
|
|
32
|
+
"llm",
|
|
33
|
+
"router",
|
|
34
|
+
"cascade",
|
|
35
|
+
"logprobs",
|
|
36
|
+
"agentic",
|
|
37
|
+
"local-ai"
|
|
38
|
+
],
|
|
39
|
+
"author": "kruschdev",
|
|
40
|
+
"license": "MIT",
|
|
41
|
+
"repository": {
|
|
42
|
+
"type": "git",
|
|
43
|
+
"url": "git+https://github.com/kruschdev/krusch-cascade-router.git"
|
|
44
|
+
},
|
|
45
|
+
"homepage": "https://github.com/kruschdev/krusch-cascade-router#readme",
|
|
46
|
+
"bugs": {
|
|
47
|
+
"url": "https://github.com/kruschdev/krusch-cascade-router/issues"
|
|
48
|
+
},
|
|
49
|
+
"devDependencies": {
|
|
50
|
+
"@types/node": "^22.0.0",
|
|
51
|
+
"tsup": "^8.0.0",
|
|
52
|
+
"typescript": "^5.0.0"
|
|
53
|
+
}
|
|
54
|
+
}
|