lynkr 9.1.2 → 9.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/README.md +21 -10
  2. package/package.json +3 -1
  3. package/scripts/build-knn-index.js +130 -0
  4. package/scripts/calibrate-thresholds.js +197 -0
  5. package/scripts/compare-policies.js +67 -0
  6. package/scripts/learn-output-ratios.js +162 -0
  7. package/scripts/refresh-pricing.js +122 -0
  8. package/scripts/run-routerarena.js +26 -0
  9. package/scripts/sample-regret.js +84 -0
  10. package/scripts/train-risk-classifier.js +191 -0
  11. package/src/api/middleware/budget-enforcer.js +60 -0
  12. package/src/api/middleware/load-shedding.js +11 -1
  13. package/src/api/middleware/tenant.js +21 -0
  14. package/src/api/router.js +19 -40
  15. package/src/budget/hierarchical-budget.js +159 -0
  16. package/src/cache/semantic.js +28 -2
  17. package/src/clients/databricks.js +59 -5
  18. package/src/config/index.js +239 -43
  19. package/src/context/toon.js +5 -4
  20. package/src/orchestrator/index.js +44 -6
  21. package/src/prompts/system.js +34 -6
  22. package/src/routing/bandit.js +246 -0
  23. package/src/routing/cascade.js +106 -0
  24. package/src/routing/complexity-analyzer.js +7 -15
  25. package/src/routing/confidence-scorer.js +121 -0
  26. package/src/routing/context-validator.js +71 -0
  27. package/src/routing/cost-optimizer.js +5 -2
  28. package/src/routing/deadline.js +52 -0
  29. package/src/routing/drift-monitor.js +113 -0
  30. package/src/routing/embedding-cache.js +77 -0
  31. package/src/routing/index.js +314 -5
  32. package/src/routing/knn-router.js +206 -0
  33. package/src/routing/latency-tracker.js +113 -71
  34. package/src/routing/model-tiers.js +156 -6
  35. package/src/routing/output-ratios.js +57 -0
  36. package/src/routing/regret-estimator.js +91 -0
  37. package/src/routing/reward-pipeline.js +62 -0
  38. package/src/routing/risk-classifier.js +130 -0
  39. package/src/routing/shadow-mode.js +77 -0
  40. package/src/routing/tenant-policy.js +96 -0
  41. package/src/routing/tokenizer.js +162 -0
  42. package/src/server.js +9 -0
@@ -15,11 +15,12 @@ function normaliseSettings(settings = {}) {
15
15
  };
16
16
  }
17
17
 
18
- function resolveEncodeFn(overrideEncode) {
18
+ async function resolveEncodeFn(overrideEncode) {
19
19
  if (typeof overrideEncode === "function") return overrideEncode;
20
20
  if (cachedEncode !== undefined) return cachedEncode;
21
21
  try {
22
- const toon = require("@toon-format/toon");
22
+ // Use dynamic import for ES module compatibility
23
+ const toon = await import("@toon-format/toon");
23
24
  cachedEncode = typeof toon?.encode === "function" ? toon.encode : null;
24
25
  cachedLoadError = cachedEncode ? null : new Error("Missing encode() export from @toon-format/toon");
25
26
  } catch (err) {
@@ -89,7 +90,7 @@ function compressStringContent(content, cfg, encodeFn, stats) {
89
90
  return toonText;
90
91
  }
91
92
 
92
- function applyToonCompression(payload, settings = {}, options = {}) {
93
+ async function applyToonCompression(payload, settings = {}, options = {}) {
93
94
  const cfg = normaliseSettings(settings);
94
95
  const stats = {
95
96
  enabled: cfg.enabled,
@@ -109,7 +110,7 @@ function applyToonCompression(payload, settings = {}, options = {}) {
109
110
  return { payload, stats };
110
111
  }
111
112
 
112
- const encodeFn = resolveEncodeFn(options.encode);
113
+ const encodeFn = await resolveEncodeFn(options.encode);
113
114
  if (typeof encodeFn !== "function") {
114
115
  stats.available = false;
115
116
  const err = cachedLoadError ?? new Error("TOON encoder unavailable");
@@ -1101,7 +1101,7 @@ function toAnthropicResponse(openai, requestedModel, wantsThinking) {
1101
1101
  };
1102
1102
  }
1103
1103
 
1104
- function sanitizePayload(payload) {
1104
+ async function sanitizePayload(payload) {
1105
1105
  const { clonePayloadSmart } = require("../utils/payload");
1106
1106
  const providerType = config.modelProvider?.type ?? "databricks";
1107
1107
  const willFlatten = providerType !== "azure-anthropic";
@@ -1418,7 +1418,7 @@ function sanitizePayload(payload) {
1418
1418
 
1419
1419
  // Optional TOON conversion for large JSON message payloads (prompt context only).
1420
1420
  // Run this BEFORE message coalescing to preserve parseable JSON boundaries.
1421
- applyToonCompression(clean, config.toon, { logger });
1421
+ await applyToonCompression(clean, config.toon, { logger });
1422
1422
 
1423
1423
  // FIX: Handle consecutive messages with the same role (causes llama.cpp 400 error)
1424
1424
  // Strategy: Merge consecutive same-role messages, but NEVER merge messages
@@ -1529,12 +1529,35 @@ function getToolCallSignature(toolCall) {
1529
1529
  }
1530
1530
 
1531
1531
  function buildNonJsonResponse(databricksResponse) {
1532
+ // Convert plain text response to Anthropic message format
1533
+ // so SSE handler can properly render it
1534
+ const textContent = databricksResponse.text || "";
1535
+
1532
1536
  return {
1533
1537
  status: databricksResponse.status,
1534
1538
  headers: {
1535
- "Content-Type": databricksResponse.contentType ?? "text/plain",
1539
+ "Content-Type": "application/json", // Changed from text/plain
1540
+ },
1541
+ body: {
1542
+ id: `msg_${Date.now()}`,
1543
+ type: "message",
1544
+ role: "assistant",
1545
+ model: "unknown",
1546
+ content: [
1547
+ {
1548
+ type: "text",
1549
+ text: textContent
1550
+ }
1551
+ ],
1552
+ stop_reason: "end_turn",
1553
+ stop_sequence: null,
1554
+ usage: {
1555
+ input_tokens: 0,
1556
+ output_tokens: 0,
1557
+ cache_creation_input_tokens: 0,
1558
+ cache_read_input_tokens: 0,
1559
+ }
1536
1560
  },
1537
- body: databricksResponse.text,
1538
1561
  terminationReason: "non_json_response",
1539
1562
  };
1540
1563
  }
@@ -1966,6 +1989,17 @@ IMPORTANT TOOL USAGE RULES:
1966
1989
  cleanPayload._workspace = headers["x-lynkr-workspace"];
1967
1990
  }
1968
1991
 
1992
+ // Phase 6.3 — thread deadline for latency-aware routing.
1993
+ if (headers?.["lynkr-deadline-ms"]) {
1994
+ const dl = parseInt(headers["lynkr-deadline-ms"], 10);
1995
+ if (!isNaN(dl) && dl > 0) cleanPayload._deadlineMs = dl;
1996
+ }
1997
+
1998
+ // Phase 6.1 — thread tenant policy for per-tenant routing overrides.
1999
+ if (options?.tenantPolicy) {
2000
+ cleanPayload._tenantPolicy = options.tenantPolicy;
2001
+ }
2002
+
1969
2003
  // RTK-inspired tool result compression: compress large tool_results
1970
2004
  // before they reach the model (saves 60-90% on test/git/lint output)
1971
2005
  if (config.toolResultCompression?.enabled !== false) {
@@ -3895,7 +3929,7 @@ async function processMessage({ payload, headers, session, cwd, options = {} })
3895
3929
  const { createTimer } = require("../utils/perf-timer");
3896
3930
  const pTimer = createTimer("processMessage");
3897
3931
 
3898
- const cleanPayload = sanitizePayload(payload);
3932
+ const cleanPayload = await sanitizePayload(payload);
3899
3933
  pTimer.mark("sanitizePayload");
3900
3934
 
3901
3935
  // Proactively load tools based on prompt content (lazy loading)
@@ -4033,7 +4067,11 @@ async function processMessage({ payload, headers, session, cwd, options = {} })
4033
4067
  if (semanticCache.isEnabled() && semanticLookupResult && !semanticLookupResult.hit) {
4034
4068
  if (loopResult.response?.status === 200 && loopResult.response?.body) {
4035
4069
  try {
4036
- await semanticCache.store(semanticLookupResult, loopResult.response.body);
4070
+ // Only cache valid JSON responses, not HTML error pages
4071
+ const body = loopResult.response.body;
4072
+ if (typeof body === 'object' || (typeof body === 'string' && body.trim().startsWith('{'))) {
4073
+ await semanticCache.store(semanticLookupResult, body);
4074
+ }
4037
4075
  } catch (err) {
4038
4076
  logger.debug({ error: err.message }, "Semantic cache store failed");
4039
4077
  }
@@ -70,13 +70,41 @@ function compressToolDescriptions(tools, mode = null) {
70
70
  return tools; // Return unmodified if not in minimal mode
71
71
  }
72
72
 
73
- return tools.map(tool => {
73
+ const validTools = tools.filter(tool => {
74
+ // Handle both Anthropic format (name + input_schema) and OpenAI format (function.name)
75
+ const hasAnthropicFormat = tool && tool.name && tool.input_schema;
76
+ const hasOpenAIFormat = tool && tool.function && tool.function.name;
77
+ const isValid = hasAnthropicFormat || hasOpenAIFormat;
78
+
79
+ if (!isValid) {
80
+ logger.debug({
81
+ hasName: !!tool?.name,
82
+ hasSchema: !!tool?.input_schema,
83
+ hasFunctionName: !!tool?.function?.name,
84
+ toolType: typeof tool
85
+ }, 'Filtered out malformed tool');
86
+ }
87
+ return isValid;
88
+ });
89
+
90
+ if (validTools.length === 0 && tools.length > 0) {
91
+ logger.warn({ originalCount: tools.length }, 'All tools filtered out as malformed - returning original');
92
+ return tools;
93
+ }
94
+
95
+ return validTools.map(tool => {
96
+ // If already in OpenAI format, return as-is (no compression for OpenAI format)
97
+ if (tool.function && !tool.input_schema) {
98
+ return tool;
99
+ }
100
+
101
+ // Compress Anthropic format
74
102
  const compressed = {
75
103
  name: tool.name,
76
104
  input_schema: {
77
- type: tool.input_schema.type,
105
+ type: tool.input_schema?.type || "object",
78
106
  properties: {},
79
- required: tool.input_schema.required || [],
107
+ required: tool.input_schema?.required || [],
80
108
  }
81
109
  };
82
110
 
@@ -190,7 +218,7 @@ function optimizeSystemPrompt(system, context = {}, mode = null) {
190
218
 
191
219
  // 2. Remove file operation guidelines if no file tools
192
220
  const hasFileTools = context.tools?.some(t =>
193
- ['Read', 'Write', 'Edit', 'Glob', 'Grep'].includes(t.name)
221
+ t?.name && ['Read', 'Write', 'Edit', 'Glob', 'Grep'].includes(t.name)
194
222
  );
195
223
  if (!hasFileTools) {
196
224
  text = removeSection(text, /# File Operations?[\s\S]*?(?=\n#|\n\n[A-Z]|$)/gi, optimizations, 'file operations');
@@ -198,7 +226,7 @@ function optimizeSystemPrompt(system, context = {}, mode = null) {
198
226
 
199
227
  // 3. Remove git guidelines if no git tools
200
228
  const hasGitTools = context.tools?.some(t =>
201
- t.name.toLowerCase().includes('git')
229
+ t?.name && t.name.toLowerCase().includes('git')
202
230
  );
203
231
  if (!hasGitTools) {
204
232
  text = removeSection(text, /# Git.*?[\s\S]*?(?=\n#|\n\n[A-Z]|$)/gi, optimizations, 'git guidelines');
@@ -207,7 +235,7 @@ function optimizeSystemPrompt(system, context = {}, mode = null) {
207
235
 
208
236
  // 4. Remove web search guidelines if no web tools
209
237
  const hasWebTools = context.tools?.some(t =>
210
- ['WebSearch', 'WebFetch'].includes(t.name)
238
+ t?.name && ['WebSearch', 'WebFetch'].includes(t.name)
211
239
  );
212
240
  if (!hasWebTools) {
213
241
  text = removeSection(text, /# Web.*?[\s\S]*?(?=\n#|\n\n[A-Z]|$)/gi, optimizations, 'web guidelines');
@@ -0,0 +1,246 @@
1
+ /**
2
+ * LinUCB contextual bandit for intra-tier model selection (Phase 4.1).
3
+ *
4
+ * Standard LinUCB-with-disjoint-models algorithm (Li et al. 2010).
5
+ * - One arm per (provider, model) pair in a tier
6
+ * - Context = numerical feature vector for the request
7
+ * - Reward = quality_score - λ·norm_cost - μ·norm_latency
8
+ * - Per-arm A (d×d ridge-regression matrix) and b (d-vector) stored to disk
9
+ *
10
+ * State persists to data/bandit-state.json. Loaded on startup; saved on
11
+ * every 25th `update()` (to limit disk I/O) and on graceful shutdown.
12
+ */
13
+
14
+ const fs = require('fs');
15
+ const path = require('path');
16
+ const logger = require('../logger');
17
+
18
+ const STATE_PATH = path.join(__dirname, '../../data/bandit-state.json');
19
+ const DEFAULT_ALPHA = 1.5;
20
+ const DEFAULT_LAMBDA = 0.3; // cost penalty weight
21
+ const DEFAULT_MU = 0.1; // latency penalty weight
22
+ const FEATURE_DIM = 12;
23
+ const EXPLORATION_RATE = 0.05;
24
+
25
/**
 * Build a d×d identity matrix.
 *
 * Every row is a freshly allocated array, so rows can be mutated
 * independently of one another.
 *
 * @param {number} d - matrix dimension
 * @returns {number[][]} d rows of d zeros with 1 on the diagonal
 */
function _identity(d) {
  const rows = new Array(d);
  let r = 0;
  while (r < d) {
    const row = new Array(d).fill(0);
    row[r] = 1;
    rows[r] = row;
    r += 1;
  }
  return rows;
}
33
+
34
/**
 * Create a vector of d zeros.
 *
 * @param {number} d - vector length
 * @returns {number[]} a new array holding d zeros
 */
function _zeros(d) {
  const vec = new Array(d);
  vec.fill(0);
  return vec;
}
37
+
38
/**
 * Multiply a square matrix by a vector.
 *
 * The size is taken from the vector: only the leading n×n part of M is read,
 * where n = v.length.
 *
 * @param {number[][]} M - matrix with at least n rows and n columns
 * @param {number[]} v - vector of length n
 * @returns {number[]} a new length-n vector M·v
 */
function _matVec(M, v) {
  const n = v.length;
  const product = [];
  for (let row = 0; row < n; row++) {
    let acc = 0;
    for (let col = 0; col < n; col++) {
      acc += M[row][col] * v[col];
    }
    product.push(acc);
  }
  return product;
}
46
+
47
/**
 * Dot product of two vectors, iterating over the length of `a`.
 *
 * @param {number[]} a - left vector; its length drives the loop
 * @param {number[]} b - right vector, read at the same indices as `a`
 * @returns {number} sum of a[i] * b[i]
 */
function _dot(a, b) {
  let total = 0;
  for (const [idx, left] of a.entries()) {
    total += left * b[idx];
  }
  return total;
}
52
+
53
/**
 * Outer product a·bᵀ.
 *
 * @param {number[]} a - column vector (one output row per element)
 * @param {number[]} b - row vector (one output column per element)
 * @returns {number[][]} new a.length × b.length matrix with entry a[i] * b[j]
 */
function _outer(a, b) {
  return Array.from(a, (left) => Array.from(b, (right) => left * right));
}
61
+
62
/**
 * Add matrix B into matrix A in place (A += B).
 *
 * Iterates over the shape of A; B is read at the same indices and is not
 * modified. Returns nothing.
 *
 * @param {number[][]} A - accumulator matrix, mutated
 * @param {number[][]} B - matrix to add
 */
function _addMat(A, B) {
  for (const [r, row] of A.entries()) {
    for (let c = 0; c < row.length; c++) {
      row[c] += B[r][c];
    }
  }
}
67
+
68
/**
 * Add vector b into vector a in place (a += b). Returns nothing.
 *
 * @param {number[]} a - accumulator vector, mutated
 * @param {number[]} b - vector to add, read at the indices of `a`
 */
function _addVec(a, b) {
  for (const idx of a.keys()) {
    a[idx] += b[idx];
  }
}
71
+
72
/**
 * Invert a small dense square matrix via Gauss-Jordan elimination with
 * partial pivoting. For d=12 this is plenty fast and saves us a dependency
 * on a linear algebra library.
 *
 * At each column the row with the largest-magnitude entry is used as the
 * pivot. Merely checking that the diagonal entry is above the singularity
 * threshold would let tiny-but-nonzero pivots (e.g. 1e-11) through, and
 * dividing by them amplifies rounding error.
 *
 * @param {number[][]} M - square matrix; it is not modified
 * @returns {number[][]} a new matrix holding the inverse of M
 * @throws {Error} 'matrix singular' when a column has no pivot with
 *   magnitude of at least 1e-12 (this also covers NaN entries)
 */
function _inv(M) {
  const SINGULAR_EPS = 1e-12;
  const d = M.length;

  // Augment each row with the matching identity row: [ M | I ].
  const aug = M.map((row, i) => {
    const r = row.slice();
    for (let j = 0; j < d; j++) r.push(i === j ? 1 : 0);
    return r;
  });

  for (let i = 0; i < d; i++) {
    // Partial pivoting: pick the row (at or below i) with the largest |entry|.
    let pivotRow = i;
    for (let k = i + 1; k < d; k++) {
      if (Math.abs(aug[k][i]) > Math.abs(aug[pivotRow][i])) pivotRow = k;
    }
    // Written as !(x >= eps) so that a NaN pivot is rejected as well.
    if (!(Math.abs(aug[pivotRow][i]) >= SINGULAR_EPS)) {
      throw new Error('matrix singular');
    }
    if (pivotRow !== i) {
      [aug[i], aug[pivotRow]] = [aug[pivotRow], aug[i]];
    }

    // Normalise the pivot row, then clear column i from every other row.
    const pivot = aug[i][i];
    for (let j = 0; j < 2 * d; j++) aug[i][j] /= pivot;
    for (let k = 0; k < d; k++) {
      if (k === i) continue;
      const factor = aug[k][i];
      for (let j = 0; j < 2 * d; j++) aug[k][j] -= factor * aug[i][j];
    }
  }

  // The right half of the augmented matrix now holds the inverse.
  return aug.map((row) => row.slice(d));
}
103
+
104
/**
 * LinUCB contextual bandit with one independent linear model per arm.
 *
 * Each arm stores a d×d matrix `A` (initialised to the identity), a d-vector
 * `b`, and an update `count`. Arms are keyed by `tier|provider:model`.
 * State is read from STATE_PATH in the constructor and written back every
 * 25th `update()`.
 */
class LinUCBBandit {
  /**
   * @param {object} [opts]
   * @param {number} [opts.alpha] - multiplier on the uncertainty term in `pick`
   * @param {number} [opts.lambda] - cost penalty weight; stored and persisted, not otherwise read by this class
   * @param {number} [opts.mu] - latency penalty weight; stored and persisted, not otherwise read by this class
   * @param {number} [opts.dim] - length of the feature vectors
   */
  constructor({ alpha = DEFAULT_ALPHA, lambda = DEFAULT_LAMBDA, mu = DEFAULT_MU, dim = FEATURE_DIM } = {}) {
    this.alpha = alpha;
    this.lambda = lambda;
    this.mu = mu;
    this.dim = dim;
    /** arms: Map<armKey, { A: number[][], b: number[], count: number }> */
    this.arms = new Map();
    this.steps = 0;
    this._load();
  }

  /** Build the map key identifying one (tier, provider, model) arm. */
  _armKey(tier, provider, model) {
    return `${tier}|${provider}:${model}`;
  }

  /** Return the arm stored under `armKey`, creating a fresh one if absent. */
  _ensureArm(armKey) {
    if (!this.arms.has(armKey)) {
      this.arms.set(armKey, { A: _identity(this.dim), b: _zeros(this.dim), count: 0 });
    }
    return this.arms.get(armKey);
  }

  /**
   * Return a feature vector of exactly `this.dim` entries.
   * A vector that already has the right length is returned as-is; otherwise a
   * truncated / zero-padded copy is returned, so the caller's array is never
   * modified.
   * @param {number[]} context
   * @returns {number[]}
   */
  _fitContext(context) {
    if (context.length === this.dim) return context;
    const fitted = context.slice(0, this.dim);
    while (fitted.length < this.dim) fitted.push(0);
    return fitted;
  }

  /**
   * Check that a persisted arm has a dim×dim `A` and a length-dim `b`, all
   * made of finite numbers.
   * @param {*} arm - value read from the state file
   * @returns {boolean}
   */
  _isValidArm(arm) {
    const d = this.dim;
    const isVec = (v) => Array.isArray(v) && v.length === d && v.every((x) => Number.isFinite(x));
    return Boolean(arm) && Array.isArray(arm.A) && arm.A.length === d && arm.A.every(isVec) && isVec(arm.b);
  }

  /**
   * Pick an arm for a given tier and context.
   *
   * With probability EXPLORATION_RATE a uniformly random candidate is
   * returned (`ucb: null, explored: true`). Otherwise each candidate is
   * scored as θ·x + alpha·sqrt(xᵀA⁻¹x) with θ = A⁻¹b, and the highest score
   * wins. Candidates whose matrix cannot be inverted are skipped.
   *
   * @param {string} tier
   * @param {Array<{ provider: string, model: string }>} candidates — qualifying arms
   * @param {number[]} context — feature vector (padded/truncated to `dim`)
   * @returns {{ provider, model, ucb, explored }|null} chosen arm, or null
   *   when there are no candidates or no candidate produced a comparable score
   */
  pick(tier, candidates, context) {
    if (!candidates || candidates.length === 0) return null;
    const ctx = this._fitContext(context);

    // ε-greedy: 5% pure exploration
    if (Math.random() < EXPLORATION_RATE) {
      const random = candidates[Math.floor(Math.random() * candidates.length)];
      return { ...random, ucb: null, explored: true };
    }

    let best = null;
    let bestUcb = -Infinity;
    for (const c of candidates) {
      const arm = this._ensureArm(this._armKey(tier, c.provider, c.model));
      let Ainv;
      try {
        Ainv = _inv(arm.A);
      } catch (err) {
        // Non-invertible arm: leave it out of this round.
        continue;
      }
      const theta = _matVec(Ainv, arm.b);
      const mean = _dot(theta, ctx);
      const variance = _dot(ctx, _matVec(Ainv, ctx));
      // Clamp at 0 so rounding noise cannot put a negative under the sqrt.
      const ucb = mean + this.alpha * Math.sqrt(Math.max(0, variance));
      if (ucb > bestUcb) {
        bestUcb = ucb;
        best = { ...c, ucb, explored: false };
      }
    }
    return best;
  }

  /**
   * Update the chosen arm with the observed reward.
   *
   * Updates whose rescaled reward is NaN, or whose context holds a
   * non-finite entry, are skipped: folding them into A/b would leave the arm
   * permanently unusable. No arm is created for a skipped update.
   *
   * @param {string} tier
   * @param {string} provider
   * @param {string} model
   * @param {number[]} context
   * @param {number} reward — typically in [0, 100]; will be rescaled to [0, 1] internally
   */
  update(tier, provider, model, context, reward) {
    const ctx = this._fitContext(context);
    const r = Math.max(0, Math.min(1, reward / 100));
    if (Number.isNaN(r) || !ctx.every((x) => Number.isFinite(x))) {
      logger.debug({ tier, provider, model }, '[Bandit] Skipped update with non-finite reward or context');
      return;
    }
    const arm = this._ensureArm(this._armKey(tier, provider, model));
    _addMat(arm.A, _outer(ctx, ctx));
    _addVec(arm.b, ctx.map(x => x * r));
    arm.count++;
    this.steps++;
    // Save periodically (not every step to limit IO)
    if (this.steps % 25 === 0) this._save();
  }

  /** Write the current state to STATE_PATH as JSON; failures are logged, not thrown. */
  _save() {
    try {
      fs.mkdirSync(path.dirname(STATE_PATH), { recursive: true });
      const arms = {};
      for (const [k, v] of this.arms) arms[k] = v;
      fs.writeFileSync(STATE_PATH, JSON.stringify({
        savedAt: Date.now(),
        steps: this.steps,
        alpha: this.alpha,
        lambda: this.lambda,
        mu: this.mu,
        dim: this.dim,
        arms,
      }, null, 0));
    } catch (err) {
      logger.debug({ err: err.message }, '[Bandit] State save failed');
    }
  }

  /**
   * Restore arms and step count from STATE_PATH when the file exists and was
   * written with the same `dim`. Arms that fail `_isValidArm` are dropped
   * instead of being loaded. Failures are logged, not thrown.
   */
  _load() {
    try {
      if (!fs.existsSync(STATE_PATH)) return;
      const raw = JSON.parse(fs.readFileSync(STATE_PATH, 'utf8'));
      if (!raw.dim || raw.dim !== this.dim) return;

      let skipped = 0;
      for (const [k, v] of Object.entries(raw.arms || {})) {
        if (!this._isValidArm(v)) {
          skipped++;
          continue;
        }
        // A missing/garbled count would turn into NaN on the next `count++`.
        if (!Number.isFinite(v.count)) v.count = 0;
        this.arms.set(k, v);
      }
      this.steps = Number.isFinite(raw.steps) ? raw.steps : 0;
      if (skipped > 0) {
        logger.debug({ skipped }, '[Bandit] Skipped malformed arms in saved state');
      }
      logger.info({ arms: this.arms.size, steps: this.steps }, '[Bandit] State loaded');
    } catch (err) {
      logger.debug({ err: err.message }, '[Bandit] State load failed');
    }
  }

  /**
   * Summarise the bandit for diagnostics.
   * @returns {{ steps: number, arms: Object<string, { count: number }>, alpha: number }}
   */
  getStats() {
    const armStats = {};
    for (const [k, v] of this.arms) {
      armStats[k] = { count: v.count };
    }
    return { steps: this.steps, arms: armStats, alpha: this.alpha };
  }
}
239
+
240
// Shared bandit for this module; stays null until getBandit() is first called.
let _instance = null;

/**
 * Return the module-wide LinUCBBandit, constructing it with default options
 * on the first call and reusing that instance afterwards.
 * @returns {LinUCBBandit}
 */
function getBandit() {
  if (_instance) return _instance;
  _instance = new LinUCBBandit();
  return _instance;
}
245
+
246
+ module.exports = { LinUCBBandit, getBandit, FEATURE_DIM };
@@ -0,0 +1,106 @@
1
+ /**
2
+ * Small-first cascade with confidence-based deferral (Phase 3.3).
3
+ *
4
+ * For tier-MEDIUM/COMPLEX requests, optionally try a smaller model first.
5
+ * If the response confidence (from confidence-scorer) ≥ threshold, accept it.
6
+ * Otherwise, escalate to the originally-routed tier model.
7
+ *
8
+ * Off by default for streaming (can't retry mid-stream cleanly).
9
+ * Opt-in via LYNKR_CASCADE_ENABLED=true.
10
+ */
11
+
12
+ const logger = require('../logger');
13
+ const confidenceScorer = require('./confidence-scorer');
14
+
15
+ const DEFAULT_THRESHOLD = 0.85;
16
+ const TIERS_ELIGIBLE = ['MEDIUM', 'COMPLEX'];
17
+
18
/**
 * Whether the cascade is switched on.
 * @returns {boolean} true only when LYNKR_CASCADE_ENABLED is exactly 'true'
 */
function isEnabled() {
  const flag = process.env.LYNKR_CASCADE_ENABLED;
  return flag === 'true';
}
21
+
22
/**
 * Decide whether a request may go through the small-first cascade.
 *
 * All of the following must hold: the cascade is enabled, the request is not
 * streaming (a streamed response can't be retried cleanly), it carries no
 * tools (tool calls have side effects, so they must not run twice), and its
 * tier is one of TIERS_ELIGIBLE.
 *
 * @param {object} args
 * @param {string} args.tier — the originally selected tier
 * @param {boolean} args.streaming — true if the request is streaming
 * @param {boolean} args.hasTools — true if tools are present
 * @returns {boolean}
 */
function shouldCascade(args) {
  return (
    isEnabled() &&
    !args.streaming &&
    !args.hasTools &&
    TIERS_ELIGIBLE.includes(args.tier)
  );
}
36
+
37
/**
 * Run a small-first cascade.
 *
 * The small model is tried first and its response is scored. The small
 * response is returned when its confidence is at or above the threshold;
 * otherwise the big model is invoked. The big model is also used when the
 * small model throws (`reason: 'small_failed'`) or when scoring throws
 * (`reason: 'scoring_failed'`), so a scorer failure does not fail the request.
 * Errors thrown by the big-model call itself propagate to the caller.
 *
 * @param {object} args
 * @param {object} args.payload — the request payload
 * @param {object} args.smallModel — { provider, model }
 * @param {object} args.bigModel — { provider, model }
 * @param {function} args.invoke — async (provider, model, payload) → response
 * @param {string} args.taskType — used by confidence scorer
 * @param {number} args.threshold — confidence threshold, defaults to 0.85
 * @param {function} args.judge — optional judge LLM for reasoning tasks
 * @returns {Promise<{ response, usedModel, cascadeStats }>}
 */
async function run(args) {
  const threshold = args.threshold ?? DEFAULT_THRESHOLD;
  const start = Date.now();

  // Invoke the big model and wrap its response together with `stats`, which
  // describes why the small model's answer was not used.
  const escalate = async (stats) => {
    const t0 = Date.now();
    const response = await args.invoke(args.bigModel.provider, args.bigModel.model, args.payload);
    const bigLatency = Date.now() - t0;
    return {
      response,
      usedModel: args.bigModel,
      cascadeStats: { accepted: false, ...stats, bigLatency, totalLatency: Date.now() - start },
    };
  };

  // Try small model
  let smallResponse;
  let smallLatency = 0;
  try {
    const t0 = Date.now();
    smallResponse = await args.invoke(args.smallModel.provider, args.smallModel.model, args.payload);
    smallLatency = Date.now() - t0;
  } catch (err) {
    logger.debug({ err: err.message }, '[Cascade] Small model failed, escalating');
    // smallLatency stays 0 here: the small call never completed.
    return escalate({ reason: 'small_failed', smallLatency });
  }

  // The last message's content is handed to the scorer as the question.
  const messages = args.payload?.messages;
  let confidence;
  try {
    confidence = await confidenceScorer.score(smallResponse, {
      taskType: args.taskType,
      question: messages?.[messages.length - 1]?.content,
      judge: args.judge,
    });
  } catch (err) {
    // Without a score the small response can't be trusted; use the big model.
    logger.debug({ err: err.message }, '[Cascade] Confidence scoring failed, escalating');
    return escalate({ reason: 'scoring_failed', smallLatency });
  }

  if (confidence >= threshold) {
    return {
      response: smallResponse,
      usedModel: args.smallModel,
      cascadeStats: { accepted: true, confidence, smallLatency, bigLatency: 0, totalLatency: Date.now() - start },
    };
  }

  // Escalate
  return escalate({ confidence, threshold, smallLatency });
}
105
+
106
+ module.exports = { run, shouldCascade, isEnabled, DEFAULT_THRESHOLD };
@@ -395,24 +395,16 @@ function extractContent(payload) {
395
395
  }
396
396
 
397
397
  /**
398
- * Estimate token count (rough approximation)
398
+ * Estimate token count.
399
+ *
400
+ * Phase 1.1: delegates to the tiktoken-backed tokenizer (graceful fallback to
401
+ * chars/4 if js-tiktoken is unavailable).
399
402
  */
403
+ const { countPayloadTokens } = require('./tokenizer');
404
+
400
405
  function estimateTokens(payload) {
401
406
  if (!payload?.messages) return 0;
402
-
403
- let totalChars = 0;
404
- for (const msg of payload.messages) {
405
- if (typeof msg.content === 'string') {
406
- totalChars += msg.content.length;
407
- } else if (Array.isArray(msg.content)) {
408
- for (const block of msg.content) {
409
- if (block?.text) totalChars += block.text.length;
410
- }
411
- }
412
- }
413
-
414
- // Rough approximation: 4 chars per token
415
- return Math.ceil(totalChars / 4);
407
+ return countPayloadTokens(payload, payload?.model);
416
408
  }
417
409
 
418
410
  /**