lynkr 9.1.2 → 9.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -10
- package/package.json +3 -1
- package/scripts/build-knn-index.js +130 -0
- package/scripts/calibrate-thresholds.js +197 -0
- package/scripts/compare-policies.js +67 -0
- package/scripts/learn-output-ratios.js +162 -0
- package/scripts/refresh-pricing.js +122 -0
- package/scripts/run-routerarena.js +26 -0
- package/scripts/sample-regret.js +84 -0
- package/scripts/train-risk-classifier.js +191 -0
- package/src/api/middleware/budget-enforcer.js +60 -0
- package/src/api/middleware/load-shedding.js +11 -1
- package/src/api/middleware/tenant.js +21 -0
- package/src/api/router.js +19 -40
- package/src/budget/hierarchical-budget.js +159 -0
- package/src/cache/semantic.js +28 -2
- package/src/clients/databricks.js +59 -5
- package/src/config/index.js +239 -43
- package/src/context/toon.js +5 -4
- package/src/orchestrator/index.js +44 -6
- package/src/prompts/system.js +34 -6
- package/src/routing/bandit.js +246 -0
- package/src/routing/cascade.js +106 -0
- package/src/routing/complexity-analyzer.js +7 -15
- package/src/routing/confidence-scorer.js +121 -0
- package/src/routing/context-validator.js +71 -0
- package/src/routing/cost-optimizer.js +5 -2
- package/src/routing/deadline.js +52 -0
- package/src/routing/drift-monitor.js +113 -0
- package/src/routing/embedding-cache.js +77 -0
- package/src/routing/index.js +314 -5
- package/src/routing/knn-router.js +206 -0
- package/src/routing/latency-tracker.js +113 -71
- package/src/routing/model-tiers.js +156 -6
- package/src/routing/output-ratios.js +57 -0
- package/src/routing/regret-estimator.js +91 -0
- package/src/routing/reward-pipeline.js +62 -0
- package/src/routing/risk-classifier.js +130 -0
- package/src/routing/shadow-mode.js +77 -0
- package/src/routing/tenant-policy.js +96 -0
- package/src/routing/tokenizer.js +162 -0
- package/src/server.js +9 -0
package/src/context/toon.js
CHANGED
|
@@ -15,11 +15,12 @@ function normaliseSettings(settings = {}) {
|
|
|
15
15
|
};
|
|
16
16
|
}
|
|
17
17
|
|
|
18
|
-
function resolveEncodeFn(overrideEncode) {
|
|
18
|
+
async function resolveEncodeFn(overrideEncode) {
|
|
19
19
|
if (typeof overrideEncode === "function") return overrideEncode;
|
|
20
20
|
if (cachedEncode !== undefined) return cachedEncode;
|
|
21
21
|
try {
|
|
22
|
-
|
|
22
|
+
// Use dynamic import for ES module compatibility
|
|
23
|
+
const toon = await import("@toon-format/toon");
|
|
23
24
|
cachedEncode = typeof toon?.encode === "function" ? toon.encode : null;
|
|
24
25
|
cachedLoadError = cachedEncode ? null : new Error("Missing encode() export from @toon-format/toon");
|
|
25
26
|
} catch (err) {
|
|
@@ -89,7 +90,7 @@ function compressStringContent(content, cfg, encodeFn, stats) {
|
|
|
89
90
|
return toonText;
|
|
90
91
|
}
|
|
91
92
|
|
|
92
|
-
function applyToonCompression(payload, settings = {}, options = {}) {
|
|
93
|
+
async function applyToonCompression(payload, settings = {}, options = {}) {
|
|
93
94
|
const cfg = normaliseSettings(settings);
|
|
94
95
|
const stats = {
|
|
95
96
|
enabled: cfg.enabled,
|
|
@@ -109,7 +110,7 @@ function applyToonCompression(payload, settings = {}, options = {}) {
|
|
|
109
110
|
return { payload, stats };
|
|
110
111
|
}
|
|
111
112
|
|
|
112
|
-
const encodeFn = resolveEncodeFn(options.encode);
|
|
113
|
+
const encodeFn = await resolveEncodeFn(options.encode);
|
|
113
114
|
if (typeof encodeFn !== "function") {
|
|
114
115
|
stats.available = false;
|
|
115
116
|
const err = cachedLoadError ?? new Error("TOON encoder unavailable");
|
|
@@ -1101,7 +1101,7 @@ function toAnthropicResponse(openai, requestedModel, wantsThinking) {
|
|
|
1101
1101
|
};
|
|
1102
1102
|
}
|
|
1103
1103
|
|
|
1104
|
-
function sanitizePayload(payload) {
|
|
1104
|
+
async function sanitizePayload(payload) {
|
|
1105
1105
|
const { clonePayloadSmart } = require("../utils/payload");
|
|
1106
1106
|
const providerType = config.modelProvider?.type ?? "databricks";
|
|
1107
1107
|
const willFlatten = providerType !== "azure-anthropic";
|
|
@@ -1418,7 +1418,7 @@ function sanitizePayload(payload) {
|
|
|
1418
1418
|
|
|
1419
1419
|
// Optional TOON conversion for large JSON message payloads (prompt context only).
|
|
1420
1420
|
// Run this BEFORE message coalescing to preserve parseable JSON boundaries.
|
|
1421
|
-
applyToonCompression(clean, config.toon, { logger });
|
|
1421
|
+
await applyToonCompression(clean, config.toon, { logger });
|
|
1422
1422
|
|
|
1423
1423
|
// FIX: Handle consecutive messages with the same role (causes llama.cpp 400 error)
|
|
1424
1424
|
// Strategy: Merge consecutive same-role messages, but NEVER merge messages
|
|
@@ -1529,12 +1529,35 @@ function getToolCallSignature(toolCall) {
|
|
|
1529
1529
|
}
|
|
1530
1530
|
|
|
1531
1531
|
function buildNonJsonResponse(databricksResponse) {
|
|
1532
|
+
// Convert plain text response to Anthropic message format
|
|
1533
|
+
// so SSE handler can properly render it
|
|
1534
|
+
const textContent = databricksResponse.text || "";
|
|
1535
|
+
|
|
1532
1536
|
return {
|
|
1533
1537
|
status: databricksResponse.status,
|
|
1534
1538
|
headers: {
|
|
1535
|
-
"Content-Type":
|
|
1539
|
+
"Content-Type": "application/json", // Changed from text/plain
|
|
1540
|
+
},
|
|
1541
|
+
body: {
|
|
1542
|
+
id: `msg_${Date.now()}`,
|
|
1543
|
+
type: "message",
|
|
1544
|
+
role: "assistant",
|
|
1545
|
+
model: "unknown",
|
|
1546
|
+
content: [
|
|
1547
|
+
{
|
|
1548
|
+
type: "text",
|
|
1549
|
+
text: textContent
|
|
1550
|
+
}
|
|
1551
|
+
],
|
|
1552
|
+
stop_reason: "end_turn",
|
|
1553
|
+
stop_sequence: null,
|
|
1554
|
+
usage: {
|
|
1555
|
+
input_tokens: 0,
|
|
1556
|
+
output_tokens: 0,
|
|
1557
|
+
cache_creation_input_tokens: 0,
|
|
1558
|
+
cache_read_input_tokens: 0,
|
|
1559
|
+
}
|
|
1536
1560
|
},
|
|
1537
|
-
body: databricksResponse.text,
|
|
1538
1561
|
terminationReason: "non_json_response",
|
|
1539
1562
|
};
|
|
1540
1563
|
}
|
|
@@ -1966,6 +1989,17 @@ IMPORTANT TOOL USAGE RULES:
|
|
|
1966
1989
|
cleanPayload._workspace = headers["x-lynkr-workspace"];
|
|
1967
1990
|
}
|
|
1968
1991
|
|
|
1992
|
+
// Phase 6.3 — thread deadline for latency-aware routing.
|
|
1993
|
+
if (headers?.["lynkr-deadline-ms"]) {
|
|
1994
|
+
const dl = parseInt(headers["lynkr-deadline-ms"], 10);
|
|
1995
|
+
if (!isNaN(dl) && dl > 0) cleanPayload._deadlineMs = dl;
|
|
1996
|
+
}
|
|
1997
|
+
|
|
1998
|
+
// Phase 6.1 — thread tenant policy for per-tenant routing overrides.
|
|
1999
|
+
if (options?.tenantPolicy) {
|
|
2000
|
+
cleanPayload._tenantPolicy = options.tenantPolicy;
|
|
2001
|
+
}
|
|
2002
|
+
|
|
1969
2003
|
// RTK-inspired tool result compression: compress large tool_results
|
|
1970
2004
|
// before they reach the model (saves 60-90% on test/git/lint output)
|
|
1971
2005
|
if (config.toolResultCompression?.enabled !== false) {
|
|
@@ -3895,7 +3929,7 @@ async function processMessage({ payload, headers, session, cwd, options = {} })
|
|
|
3895
3929
|
const { createTimer } = require("../utils/perf-timer");
|
|
3896
3930
|
const pTimer = createTimer("processMessage");
|
|
3897
3931
|
|
|
3898
|
-
const cleanPayload = sanitizePayload(payload);
|
|
3932
|
+
const cleanPayload = await sanitizePayload(payload);
|
|
3899
3933
|
pTimer.mark("sanitizePayload");
|
|
3900
3934
|
|
|
3901
3935
|
// Proactively load tools based on prompt content (lazy loading)
|
|
@@ -4033,7 +4067,11 @@ async function processMessage({ payload, headers, session, cwd, options = {} })
|
|
|
4033
4067
|
if (semanticCache.isEnabled() && semanticLookupResult && !semanticLookupResult.hit) {
|
|
4034
4068
|
if (loopResult.response?.status === 200 && loopResult.response?.body) {
|
|
4035
4069
|
try {
|
|
4036
|
-
|
|
4070
|
+
// Only cache valid JSON responses, not HTML error pages
|
|
4071
|
+
const body = loopResult.response.body;
|
|
4072
|
+
if (typeof body === 'object' || (typeof body === 'string' && body.trim().startsWith('{'))) {
|
|
4073
|
+
await semanticCache.store(semanticLookupResult, body);
|
|
4074
|
+
}
|
|
4037
4075
|
} catch (err) {
|
|
4038
4076
|
logger.debug({ error: err.message }, "Semantic cache store failed");
|
|
4039
4077
|
}
|
package/src/prompts/system.js
CHANGED
|
@@ -70,13 +70,41 @@ function compressToolDescriptions(tools, mode = null) {
|
|
|
70
70
|
return tools; // Return unmodified if not in minimal mode
|
|
71
71
|
}
|
|
72
72
|
|
|
73
|
-
|
|
73
|
+
const validTools = tools.filter(tool => {
|
|
74
|
+
// Handle both Anthropic format (name + input_schema) and OpenAI format (function.name)
|
|
75
|
+
const hasAnthropicFormat = tool && tool.name && tool.input_schema;
|
|
76
|
+
const hasOpenAIFormat = tool && tool.function && tool.function.name;
|
|
77
|
+
const isValid = hasAnthropicFormat || hasOpenAIFormat;
|
|
78
|
+
|
|
79
|
+
if (!isValid) {
|
|
80
|
+
logger.debug({
|
|
81
|
+
hasName: !!tool?.name,
|
|
82
|
+
hasSchema: !!tool?.input_schema,
|
|
83
|
+
hasFunctionName: !!tool?.function?.name,
|
|
84
|
+
toolType: typeof tool
|
|
85
|
+
}, 'Filtered out malformed tool');
|
|
86
|
+
}
|
|
87
|
+
return isValid;
|
|
88
|
+
});
|
|
89
|
+
|
|
90
|
+
if (validTools.length === 0 && tools.length > 0) {
|
|
91
|
+
logger.warn({ originalCount: tools.length }, 'All tools filtered out as malformed - returning original');
|
|
92
|
+
return tools;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
return validTools.map(tool => {
|
|
96
|
+
// If already in OpenAI format, return as-is (no compression for OpenAI format)
|
|
97
|
+
if (tool.function && !tool.input_schema) {
|
|
98
|
+
return tool;
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
// Compress Anthropic format
|
|
74
102
|
const compressed = {
|
|
75
103
|
name: tool.name,
|
|
76
104
|
input_schema: {
|
|
77
|
-
type: tool.input_schema
|
|
105
|
+
type: tool.input_schema?.type || "object",
|
|
78
106
|
properties: {},
|
|
79
|
-
required: tool.input_schema
|
|
107
|
+
required: tool.input_schema?.required || [],
|
|
80
108
|
}
|
|
81
109
|
};
|
|
82
110
|
|
|
@@ -190,7 +218,7 @@ function optimizeSystemPrompt(system, context = {}, mode = null) {
|
|
|
190
218
|
|
|
191
219
|
// 2. Remove file operation guidelines if no file tools
|
|
192
220
|
const hasFileTools = context.tools?.some(t =>
|
|
193
|
-
['Read', 'Write', 'Edit', 'Glob', 'Grep'].includes(t.name)
|
|
221
|
+
t?.name && ['Read', 'Write', 'Edit', 'Glob', 'Grep'].includes(t.name)
|
|
194
222
|
);
|
|
195
223
|
if (!hasFileTools) {
|
|
196
224
|
text = removeSection(text, /# File Operations?[\s\S]*?(?=\n#|\n\n[A-Z]|$)/gi, optimizations, 'file operations');
|
|
@@ -198,7 +226,7 @@ function optimizeSystemPrompt(system, context = {}, mode = null) {
|
|
|
198
226
|
|
|
199
227
|
// 3. Remove git guidelines if no git tools
|
|
200
228
|
const hasGitTools = context.tools?.some(t =>
|
|
201
|
-
t.name.toLowerCase().includes('git')
|
|
229
|
+
t?.name && t.name.toLowerCase().includes('git')
|
|
202
230
|
);
|
|
203
231
|
if (!hasGitTools) {
|
|
204
232
|
text = removeSection(text, /# Git.*?[\s\S]*?(?=\n#|\n\n[A-Z]|$)/gi, optimizations, 'git guidelines');
|
|
@@ -207,7 +235,7 @@ function optimizeSystemPrompt(system, context = {}, mode = null) {
|
|
|
207
235
|
|
|
208
236
|
// 4. Remove web search guidelines if no web tools
|
|
209
237
|
const hasWebTools = context.tools?.some(t =>
|
|
210
|
-
['WebSearch', 'WebFetch'].includes(t.name)
|
|
238
|
+
t?.name && ['WebSearch', 'WebFetch'].includes(t.name)
|
|
211
239
|
);
|
|
212
240
|
if (!hasWebTools) {
|
|
213
241
|
text = removeSection(text, /# Web.*?[\s\S]*?(?=\n#|\n\n[A-Z]|$)/gi, optimizations, 'web guidelines');
|
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LinUCB contextual bandit for intra-tier model selection (Phase 4.1).
|
|
3
|
+
*
|
|
4
|
+
* Standard LinUCB-with-disjoint-models algorithm (Li et al. 2010).
|
|
5
|
+
* - One arm per (provider, model) pair in a tier
|
|
6
|
+
* - Context = numerical feature vector for the request
|
|
7
|
+
* - Reward = quality_score - λ·norm_cost - μ·norm_latency
|
|
8
|
+
* - Per-arm A (d×d ridge-regression matrix) and b (d-vector) stored to disk
|
|
9
|
+
*
|
|
10
|
+
* State persists to data/bandit-state.json. Loaded on startup; saved on
|
|
11
|
+
* every 25th `update()` call (to limit disk IO) and on graceful shutdown.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
const fs = require('fs');
|
|
15
|
+
const path = require('path');
|
|
16
|
+
const logger = require('../logger');
|
|
17
|
+
|
|
18
|
+
const STATE_PATH = path.join(__dirname, '../../data/bandit-state.json');
|
|
19
|
+
const DEFAULT_ALPHA = 1.5;
|
|
20
|
+
const DEFAULT_LAMBDA = 0.3; // cost penalty weight
|
|
21
|
+
const DEFAULT_MU = 0.1; // latency penalty weight
|
|
22
|
+
const FEATURE_DIM = 12;
|
|
23
|
+
const EXPLORATION_RATE = 0.05;
|
|
24
|
+
|
|
25
|
+
/**
 * Build a d×d identity matrix as an array of row arrays.
 *
 * @param {number} d - matrix dimension
 * @returns {number[][]} rows with 1 on the diagonal and 0 elsewhere
 */
function _identity(d) {
  const rows = new Array(d).fill(null);
  return rows.map((_, diag) => {
    const row = new Array(d).fill(0);
    row[diag] = 1;
    return row;
  });
}
|
|
33
|
+
|
|
34
|
+
/**
 * Create a length-d vector filled with zeros.
 *
 * @param {number} d - vector length
 * @returns {number[]}
 */
function _zeros(d) {
  const vec = new Array(d);
  vec.fill(0);
  return vec;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Multiply a square matrix by a vector.
 *
 * The vector length sets both the number of rows and columns that are read
 * from M.
 *
 * @param {number[][]} M - matrix, indexed as M[row][col]
 * @param {number[]} v - vector
 * @returns {number[]} the product M·v as a new array
 */
function _matVec(M, v) {
  const size = v.length;
  const product = [];
  for (let row = 0; row < size; row++) {
    let acc = 0;
    for (let col = 0; col < size; col++) acc += M[row][col] * v[col];
    product.push(acc);
  }
  return product;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Dot product of two vectors; iterates over the length of `a`.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number}
 */
function _dot(a, b) {
  let total = 0;
  let idx = 0;
  while (idx < a.length) {
    total += a[idx] * b[idx];
    idx += 1;
  }
  return total;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Outer product a·bᵀ.
 *
 * @param {number[]} a - column vector (one output row per element)
 * @param {number[]} b - row vector (one output column per element)
 * @returns {number[][]} matrix with entry [i][j] equal to a[i] * b[j]
 */
function _outer(a, b) {
  return Array.from(a, (left) => Array.from(b, (right) => left * right));
}
|
|
61
|
+
|
|
62
|
+
/**
 * In-place matrix addition: A += B. Returns nothing; A is mutated.
 *
 * @param {number[][]} A - accumulator matrix (modified)
 * @param {number[][]} B - matrix to add, read at the same indices as A
 */
function _addMat(A, B) {
  let r = 0;
  while (r < A.length) {
    const target = A[r];
    const source = B[r];
    for (let c = 0; c < target.length; c++) target[c] += source[c];
    r += 1;
  }
}
|
|
67
|
+
|
|
68
|
+
/**
 * In-place vector addition: a += b. Returns nothing; a is mutated.
 *
 * @param {number[]} a - accumulator vector (modified)
 * @param {number[]} b - vector to add
 */
function _addVec(a, b) {
  let idx = 0;
  while (idx < a.length) {
    a[idx] += b[idx];
    idx += 1;
  }
}
|
|
71
|
+
|
|
72
|
+
/**
 * Invert a small dense square matrix via Gauss-Jordan elimination with
 * partial pivoting. Avoids a dependency on a linear algebra library.
 *
 * For every column the row with the largest absolute entry (at or below the
 * diagonal) is used as the pivot. Always choosing the largest pivot keeps the
 * elimination numerically stable even when the diagonal entry is tiny but
 * non-zero.
 *
 * @param {number[][]} M - square matrix; it is not modified
 * @returns {number[][]} the inverse as a new matrix
 * @throws {Error} 'matrix singular' when no usable pivot (|p| >= 1e-12) exists
 *   in some column (this also covers NaN pivots); 'matrix must be square'
 *   when a row's length differs from the number of rows
 */
function _inv(M) {
  const d = M.length;

  // Augmented matrix [M | I], built from copies so the caller's rows stay intact.
  const aug = M.map((row, i) => {
    if (!row || row.length !== d) throw new Error('matrix must be square');
    const r = Array.from(row);
    for (let j = 0; j < d; j++) r.push(i === j ? 1 : 0);
    return r;
  });

  for (let col = 0; col < d; col++) {
    // Partial pivoting: pick the largest-magnitude entry in this column.
    let pivotRow = col;
    let pivotMag = Math.abs(aug[col][col]);
    for (let k = col + 1; k < d; k++) {
      const mag = Math.abs(aug[k][col]);
      if (mag > pivotMag) {
        pivotRow = k;
        pivotMag = mag;
      }
    }
    // Negated comparison so a NaN pivot is rejected as well.
    if (!(pivotMag >= 1e-12)) throw new Error('matrix singular');
    if (pivotRow !== col) {
      [aug[col], aug[pivotRow]] = [aug[pivotRow], aug[col]];
    }

    // Normalise the pivot row, then clear this column from every other row.
    const pivot = aug[col][col];
    for (let j = 0; j < 2 * d; j++) aug[col][j] /= pivot;
    for (let k = 0; k < d; k++) {
      if (k === col) continue;
      const factor = aug[k][col];
      if (factor === 0) continue;
      for (let j = 0; j < 2 * d; j++) aug[k][j] -= factor * aug[col][j];
    }
  }

  // The right half of the augmented matrix now holds the inverse.
  return aug.map((row) => row.slice(d));
}
|
|
103
|
+
|
|
104
|
+
/**
 * LinUCB bandit with one disjoint linear model per arm.
 *
 * Each arm keeps a d×d matrix `A` (starting as the identity), a d-vector `b`
 * and an update count. `pick()` scores every candidate with
 * `θᵀx + α·sqrt(xᵀA⁻¹x)` where `θ = A⁻¹b`, and `update()` folds an observed
 * reward into the chosen arm. State is read from STATE_PATH in the
 * constructor and written back every 25th update.
 */
class LinUCBBandit {
  /**
   * @param {object} [opts]
   * @param {number} [opts.alpha] - exploration weight applied to the confidence width
   * @param {number} [opts.lambda] - cost penalty weight; stored and persisted, not used by this class
   * @param {number} [opts.mu] - latency penalty weight; stored and persisted, not used by this class
   * @param {number} [opts.dim] - feature-vector dimension
   */
  constructor({ alpha = DEFAULT_ALPHA, lambda = DEFAULT_LAMBDA, mu = DEFAULT_MU, dim = FEATURE_DIM } = {}) {
    this.alpha = alpha;
    this.lambda = lambda;
    this.mu = mu;
    this.dim = dim;
    /** arms: Map<armKey, { A: number[][], b: number[], count: number }> */
    this.arms = new Map();
    this.steps = 0;
    this._load();
  }

  /** Build the map key identifying one (tier, provider, model) arm. */
  _armKey(tier, provider, model) {
    return `${tier}|${provider}:${model}`;
  }

  /** Return the arm stored under armKey, creating a fresh one (A = I, b = 0) if absent. */
  _ensureArm(armKey) {
    if (!this.arms.has(armKey)) {
      this.arms.set(armKey, { A: _identity(this.dim), b: _zeros(this.dim), count: 0 });
    }
    return this.arms.get(armKey);
  }

  /**
   * Copy a context into a new vector of exactly `this.dim` entries: longer
   * inputs are truncated, shorter ones zero-padded. A missing context becomes
   * the zero vector and non-finite features are replaced by 0 so one bad
   * feature cannot turn an arm's statistics into NaN. The input is never
   * mutated.
   *
   * @param {ArrayLike<number>} context
   * @returns {number[]}
   */
  _fitContext(context) {
    const source = context != null && typeof context.length === 'number' ? context : [];
    const fitted = new Array(this.dim);
    for (let i = 0; i < this.dim; i++) {
      const value = i < source.length ? Number(source[i]) : 0;
      fitted[i] = Number.isFinite(value) ? value : 0;
    }
    return fitted;
  }

  /** True when a persisted arm has a dim×dim finite `A` and a dim-length finite `b`. */
  _isValidArm(arm) {
    const isFiniteVector = (vec) =>
      Array.isArray(vec) && vec.length === this.dim && vec.every((x) => Number.isFinite(x));
    return arm != null
      && Array.isArray(arm.A)
      && arm.A.length === this.dim
      && arm.A.every(isFiniteVector)
      && isFiniteVector(arm.b);
  }

  /**
   * Pick an arm for a given tier and context.
   * @param {string} tier
   * @param {Array<{ provider: string, model: string }>} candidates — qualifying arms
   * @param {number[]} context — feature vector (padded/truncated to `dim`)
   * @returns {{ provider, model, ucb, explored }|null} chosen arm; null when
   *   there are no candidates or no candidate's matrix could be inverted
   */
  pick(tier, candidates, context) {
    if (!candidates || candidates.length === 0) return null;
    const ctx = this._fitContext(context);

    // ε-greedy: with probability EXPLORATION_RATE pick a uniformly random candidate.
    if (Math.random() < EXPLORATION_RATE) {
      const random = candidates[Math.floor(Math.random() * candidates.length)];
      return { ...random, ucb: null, explored: true };
    }

    let best = null;
    let bestUcb = -Infinity;
    for (const c of candidates) {
      const arm = this._ensureArm(this._armKey(tier, c.provider, c.model));
      let Ainv;
      try {
        Ainv = _inv(arm.A);
      } catch (err) {
        // An arm whose matrix cannot be inverted is left out of this decision.
        continue;
      }
      const theta = _matVec(Ainv, arm.b);
      const mean = _dot(theta, ctx);
      const variance = _dot(ctx, _matVec(Ainv, ctx));
      // Clamp at 0: rounding can push the quadratic form slightly negative.
      const ucb = mean + this.alpha * Math.sqrt(Math.max(0, variance));
      if (ucb > bestUcb) {
        bestUcb = ucb;
        best = { ...c, ucb, explored: false };
      }
    }
    return best;
  }

  /**
   * Update the chosen arm with the observed reward.
   *
   * A reward that is not a number (NaN, undefined, ...) is ignored: folding it
   * in would turn the arm's `b` vector into NaN for good.
   *
   * @param {string} tier
   * @param {string} provider
   * @param {string} model
   * @param {number[]} context
   * @param {number} reward — typically in [0, 100]; rescaled and clamped to [0, 1] internally
   */
  update(tier, provider, model, context, reward) {
    const scaled = reward / 100;
    if (Number.isNaN(scaled)) {
      logger.debug({ tier, provider, model }, '[Bandit] Ignoring non-numeric reward');
      return;
    }
    const r = Math.max(0, Math.min(1, scaled));
    const ctx = this._fitContext(context);
    const arm = this._ensureArm(this._armKey(tier, provider, model));
    _addMat(arm.A, _outer(ctx, ctx));
    _addVec(arm.b, ctx.map((x) => x * r));
    arm.count++;
    this.steps++;
    // Save periodically (not every step to limit IO)
    if (this.steps % 25 === 0) this._save();
  }

  /**
   * Persist all arms and hyper-parameters to STATE_PATH. Best effort: failures
   * are logged and swallowed. The JSON goes to a temporary file that is then
   * renamed over the target, so an interrupted write cannot leave a truncated
   * state file behind.
   */
  _save() {
    const tmpPath = `${STATE_PATH}.${process.pid}.tmp`;
    try {
      fs.mkdirSync(path.dirname(STATE_PATH), { recursive: true });
      const serialized = JSON.stringify({
        savedAt: Date.now(),
        steps: this.steps,
        alpha: this.alpha,
        lambda: this.lambda,
        mu: this.mu,
        dim: this.dim,
        arms: Object.fromEntries(this.arms),
      });
      fs.writeFileSync(tmpPath, serialized);
      fs.renameSync(tmpPath, STATE_PATH);
    } catch (err) {
      logger.debug({ err: err.message }, '[Bandit] State save failed');
    }
  }

  /**
   * Load persisted state from STATE_PATH, if present. State saved with a
   * different `dim` is ignored, and individual arms whose matrices do not have
   * the expected shape or contain non-finite values are skipped. Best effort:
   * failures are logged and leave the bandit empty.
   */
  _load() {
    try {
      if (!fs.existsSync(STATE_PATH)) return;
      const raw = JSON.parse(fs.readFileSync(STATE_PATH, 'utf8'));
      if (!raw || !raw.dim || raw.dim !== this.dim) return;

      let skipped = 0;
      for (const [k, v] of Object.entries(raw.arms || {})) {
        if (!this._isValidArm(v)) {
          skipped++;
          continue;
        }
        this.arms.set(k, { A: v.A, b: v.b, count: Number.isFinite(v.count) ? v.count : 0 });
      }
      this.steps = Number.isFinite(raw.steps) ? raw.steps : 0;
      logger.info({ arms: this.arms.size, steps: this.steps, skipped }, '[Bandit] State loaded');
    } catch (err) {
      logger.debug({ err: err.message }, '[Bandit] State load failed');
    }
  }

  /** Summarise the bandit: total steps, per-arm update counts and alpha. */
  getStats() {
    const armStats = {};
    for (const [k, v] of this.arms) {
      armStats[k] = { count: v.count };
    }
    return { steps: this.steps, arms: armStats, alpha: this.alpha };
  }
}
|
|
239
|
+
|
|
240
|
+
// Lazily created, process-wide bandit shared by every caller of getBandit().
let _instance = null;

/**
 * Return the shared LinUCBBandit, constructing it with default options on
 * first use.
 *
 * @returns {LinUCBBandit}
 */
function getBandit() {
  if (_instance) return _instance;
  _instance = new LinUCBBandit();
  return _instance;
}
|
|
245
|
+
|
|
246
|
+
module.exports = { LinUCBBandit, getBandit, FEATURE_DIM };
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Small-first cascade with confidence-based deferral (Phase 3.3).
|
|
3
|
+
*
|
|
4
|
+
* For tier-MEDIUM/COMPLEX requests, optionally try a smaller model first.
|
|
5
|
+
* If the response confidence (from confidence-scorer) ≥ threshold, accept it.
|
|
6
|
+
* Otherwise, escalate to the originally-routed tier model.
|
|
7
|
+
*
|
|
8
|
+
* Off by default for streaming (can't retry mid-stream cleanly).
|
|
9
|
+
* Opt-in via LYNKR_CASCADE_ENABLED=true.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
const logger = require('../logger');
|
|
13
|
+
const confidenceScorer = require('./confidence-scorer');
|
|
14
|
+
|
|
15
|
+
const DEFAULT_THRESHOLD = 0.85;
|
|
16
|
+
const TIERS_ELIGIBLE = ['MEDIUM', 'COMPLEX'];
|
|
17
|
+
|
|
18
|
+
/**
 * Whether the cascade is switched on: true only when the
 * LYNKR_CASCADE_ENABLED environment variable is exactly 'true'.
 *
 * @returns {boolean}
 */
function isEnabled() {
  const { LYNKR_CASCADE_ENABLED: flag } = process.env;
  return flag === 'true';
}
|
|
21
|
+
|
|
22
|
+
/**
 * Decide whether a request may go through the small-first cascade.
 *
 * All of the following must hold: the feature is enabled, the request is not
 * streaming (a streamed response can't be retried cleanly), it carries no
 * tools (tool calls have side effects and must not run twice), and the
 * selected tier is one of TIERS_ELIGIBLE.
 *
 * @param {object} args
 * @param {string} args.tier — the originally selected tier
 * @param {boolean} args.streaming — true if the request is streaming
 * @param {boolean} args.hasTools — true if tools are present
 * @returns {boolean}
 */
function shouldCascade(args) {
  const blocked = !isEnabled() || args.streaming || args.hasTools;
  return blocked ? false : TIERS_ELIGIBLE.includes(args.tier);
}
|
|
36
|
+
|
|
37
|
+
/**
 * Run a small-first cascade.
 *
 * The small model is tried first and its response is scored. A confidence at
 * or above the threshold accepts the small response; otherwise the request is
 * re-run on the big model. The big model is also used when the small model
 * call throws (`reason: 'small_failed'`) or when the confidence scorer itself
 * throws (`reason: 'scorer_failed'`), so a scoring failure does not fail the
 * request. Errors from the big model call propagate to the caller.
 *
 * @param {object} args
 * @param {object} args.payload — the request payload
 * @param {object} args.smallModel — { provider, model }
 * @param {object} args.bigModel — { provider, model }
 * @param {function} args.invoke — async (provider, model, payload) → response
 * @param {string} args.taskType — used by confidence scorer
 * @param {number} args.threshold — confidence threshold, defaults to 0.85
 * @param {function} args.judge — optional judge LLM for reasoning tasks
 * @returns {Promise<{ response, usedModel, cascadeStats }>}
 */
async function run(args) {
  const threshold = args.threshold ?? DEFAULT_THRESHOLD;
  const start = Date.now();

  // Invoke one model and measure how long the call took.
  const timedInvoke = async (target) => {
    const t0 = Date.now();
    const response = await args.invoke(target.provider, target.model, args.payload);
    return { response, latency: Date.now() - t0 };
  };

  // Run the big model and complete the stats gathered so far.
  const escalate = async (partialStats) => {
    const big = await timedInvoke(args.bigModel);
    return {
      response: big.response,
      usedModel: args.bigModel,
      cascadeStats: { ...partialStats, bigLatency: big.latency, totalLatency: Date.now() - start },
    };
  };

  // Try small model
  let small;
  try {
    small = await timedInvoke(args.smallModel);
  } catch (err) {
    logger.debug({ err: err.message }, '[Cascade] Small model failed, escalating');
    return escalate({ accepted: false, reason: 'small_failed', smallLatency: 0 });
  }

  let confidence;
  try {
    const messages = args.payload?.messages;
    confidence = await confidenceScorer.score(small.response, {
      taskType: args.taskType,
      question: messages?.[messages.length - 1]?.content,
      judge: args.judge,
    });
  } catch (err) {
    // Without a score the small response cannot be trusted; fall back to the
    // originally routed model rather than failing the whole request.
    logger.debug({ err: err.message }, '[Cascade] Confidence scoring failed, escalating');
    return escalate({ accepted: false, reason: 'scorer_failed', smallLatency: small.latency });
  }

  if (confidence >= threshold) {
    return {
      response: small.response,
      usedModel: args.smallModel,
      cascadeStats: {
        accepted: true,
        confidence,
        smallLatency: small.latency,
        bigLatency: 0,
        totalLatency: Date.now() - start,
      },
    };
  }

  // Escalate
  return escalate({ accepted: false, confidence, threshold, smallLatency: small.latency });
}
|
|
105
|
+
|
|
106
|
+
module.exports = { run, shouldCascade, isEnabled, DEFAULT_THRESHOLD };
|
|
@@ -395,24 +395,16 @@ function extractContent(payload) {
|
|
|
395
395
|
}
|
|
396
396
|
|
|
397
397
|
/**
|
|
398
|
-
* Estimate token count
|
|
398
|
+
* Estimate token count.
|
|
399
|
+
*
|
|
400
|
+
* Phase 1.1: delegates to the tiktoken-backed tokenizer (graceful fallback to
|
|
401
|
+
* chars/4 if js-tiktoken is unavailable).
|
|
399
402
|
*/
|
|
403
|
+
const { countPayloadTokens } = require('./tokenizer');
|
|
404
|
+
|
|
400
405
|
function estimateTokens(payload) {
|
|
401
406
|
if (!payload?.messages) return 0;
|
|
402
|
-
|
|
403
|
-
let totalChars = 0;
|
|
404
|
-
for (const msg of payload.messages) {
|
|
405
|
-
if (typeof msg.content === 'string') {
|
|
406
|
-
totalChars += msg.content.length;
|
|
407
|
-
} else if (Array.isArray(msg.content)) {
|
|
408
|
-
for (const block of msg.content) {
|
|
409
|
-
if (block?.text) totalChars += block.text.length;
|
|
410
|
-
}
|
|
411
|
-
}
|
|
412
|
-
}
|
|
413
|
-
|
|
414
|
-
// Rough approximation: 4 chars per token
|
|
415
|
-
return Math.ceil(totalChars / 4);
|
|
407
|
+
return countPayloadTokens(payload, payload?.model);
|
|
416
408
|
}
|
|
417
409
|
|
|
418
410
|
/**
|