lynkr 9.0.2 → 9.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/README.md +21 -10
  2. package/bin/cli.js +18 -1
  3. package/bin/lynkr-trajectory.js +136 -0
  4. package/bin/lynkr-usage.js +219 -0
  5. package/funding.json +110 -0
  6. package/package.json +4 -2
  7. package/public/dashboard.html +665 -0
  8. package/scripts/build-knn-index.js +130 -0
  9. package/scripts/calibrate-thresholds.js +197 -0
  10. package/scripts/compare-policies.js +67 -0
  11. package/scripts/learn-output-ratios.js +162 -0
  12. package/scripts/refresh-pricing.js +122 -0
  13. package/scripts/run-routerarena.js +26 -0
  14. package/scripts/sample-regret.js +84 -0
  15. package/scripts/train-risk-classifier.js +191 -0
  16. package/src/api/files-router.js +6 -6
  17. package/src/api/middleware/budget-enforcer.js +60 -0
  18. package/src/api/middleware/budget.js +19 -1
  19. package/src/api/middleware/load-shedding.js +17 -0
  20. package/src/api/middleware/tenant.js +21 -0
  21. package/src/api/openai-router.js +1 -1
  22. package/src/api/router.js +204 -87
  23. package/src/budget/hierarchical-budget.js +159 -0
  24. package/src/cache/semantic.js +28 -2
  25. package/src/clients/databricks.js +68 -10
  26. package/src/clients/openai-format.js +31 -5
  27. package/src/config/index.js +246 -43
  28. package/src/context/toon.js +5 -4
  29. package/src/dashboard/api.js +170 -0
  30. package/src/dashboard/router.js +13 -0
  31. package/src/headroom/client.js +3 -109
  32. package/src/headroom/index.js +0 -14
  33. package/src/memory/search.js +0 -50
  34. package/src/orchestrator/index.js +106 -11
  35. package/src/orchestrator/preflight.js +188 -0
  36. package/src/prompts/system.js +34 -6
  37. package/src/routing/bandit.js +246 -0
  38. package/src/routing/cascade.js +106 -0
  39. package/src/routing/complexity-analyzer.js +7 -15
  40. package/src/routing/confidence-scorer.js +121 -0
  41. package/src/routing/context-validator.js +71 -0
  42. package/src/routing/cost-optimizer.js +5 -2
  43. package/src/routing/deadline.js +52 -0
  44. package/src/routing/drift-monitor.js +113 -0
  45. package/src/routing/embedding-cache.js +77 -0
  46. package/src/routing/index.js +374 -4
  47. package/src/routing/interaction.js +183 -0
  48. package/src/routing/knn-router.js +206 -0
  49. package/src/routing/latency-tracker.js +113 -71
  50. package/src/routing/model-tiers.js +156 -6
  51. package/src/routing/output-ratios.js +57 -0
  52. package/src/routing/regret-estimator.js +91 -0
  53. package/src/routing/reward-pipeline.js +62 -0
  54. package/src/routing/risk-analyzer.js +194 -0
  55. package/src/routing/risk-classifier.js +130 -0
  56. package/src/routing/shadow-mode.js +77 -0
  57. package/src/routing/telemetry.js +7 -0
  58. package/src/routing/tenant-policy.js +96 -0
  59. package/src/routing/tokenizer.js +162 -0
  60. package/src/server.js +12 -0
  61. package/src/stores/file-store.js +42 -7
  62. package/src/tools/smart-selection.js +11 -2
  63. package/src/training/trajectory-compressor.js +266 -0
  64. package/src/usage/aggregator.js +206 -0
  65. package/src/utils/markdown-ansi.js +146 -0
@@ -0,0 +1,188 @@
1
+ /**
2
+ * Preflight Checks
3
+ *
4
+ * Runs user-supplied commands before invoking the model. If they all
5
+ * exit 0, the work is already done — we skip the LLM call entirely
6
+ * and return a synthetic "preflight_satisfied" response at zero cost.
7
+ *
8
+ * Typical use case: a fix-the-failing-test request that arrives after
9
+ * the test already passes (CI lag, retry-after-fix, idempotent agent
10
+ * retries).
11
+ *
12
+ * The request opts in by including a top-level `preflight_commands`
13
+ * array on the Anthropic-format payload, e.g.:
14
+ *
15
+ * {
16
+ * "model": "...",
17
+ * "messages": [...],
18
+ * "preflight_commands": ["pnpm test -- user-service"]
19
+ * }
20
+ *
21
+ * Disabled by default — gated on LYNKR_PREFLIGHT_ENABLED=true. The
22
+ * commands run with the same permissions as the Lynkr server, so
23
+ * operators should only enable this on workspaces where that is OK.
24
+ *
25
+ * @module orchestrator/preflight
26
+ */
27
+
28
+ const { spawnSync } = require('child_process');
29
+ const path = require('path');
30
+ const config = require('../config');
31
+ const logger = require('../logger');
32
+
33
+ const MAX_COMMANDS = 10;
34
+ const MAX_OUTPUT_BYTES = 4000;
35
+
36
/**
 * Pull the list of preflight commands out of a request payload.
 *
 * The top-level `preflight_commands` field is preferred; when it is
 * absent (or otherwise falsy) `metadata.lynkr_preflight_commands` is
 * consulted instead, for clients that strip unknown top-level fields.
 * Anything that is not an array yields an empty list.
 *
 * @param {object} payload
 * @returns {string[]} at most MAX_COMMANDS non-blank string entries,
 *   in their original order and untrimmed
 */
function extractCommands(payload) {
  if (!payload) return [];

  let candidate = payload.preflight_commands;
  if (!candidate) {
    candidate = payload.metadata?.lynkr_preflight_commands;
  }
  if (!Array.isArray(candidate)) return [];

  const commands = [];
  for (const entry of candidate) {
    // The cap applies to accepted entries, not to raw array positions.
    if (commands.length >= MAX_COMMANDS) break;
    if (typeof entry !== 'string') continue;
    if (entry.trim().length === 0) continue;
    commands.push(entry);
  }
  return commands;
}
56
+
57
/**
 * Validate the workspace directory that preflight commands will run in.
 *
 * No fallback is applied here: the function never substitutes
 * process.cwd(). Anything other than a non-empty, absolute path string
 * is rejected with null, so the caller must pass a workspace explicitly.
 *
 * @param {string|null|undefined} cwd
 * @returns {string|null} the given path when it is absolute, otherwise null
 */
function resolveCwd(cwd) {
  const isNonEmptyString = typeof cwd === 'string' && cwd !== '';
  if (!isNonEmptyString) return null;
  return path.isAbsolute(cwd) ? cwd : null;
}
70
+
71
/**
 * Run a single command synchronously through the shell and return a
 * structured result.
 *
 * `timed_out` is derived from the ETIMEDOUT error that spawnSync reports
 * when its `timeout` elapses. Checking the termination signal alone would
 * also flag a child that received SIGTERM for any other reason.
 *
 * @param {string} command - shell command line (executed with `shell: true`)
 * @param {string} cwd - working directory for the child process
 * @param {number} timeoutMs - maximum run time before the child is killed
 * @returns {{ command: string, exit_code: number|null, stdout: string, stderr: string, timed_out: boolean }}
 *   `exit_code` is null when the child did not exit normally (killed by a
 *   signal or failed to spawn); `stdout`/`stderr` hold only the last
 *   MAX_OUTPUT_BYTES characters of each stream.
 */
function runCommand(command, cwd, timeoutMs) {
  const result = spawnSync(command, {
    cwd,
    shell: true,
    encoding: 'utf8',
    timeout: timeoutMs,
    maxBuffer: 10 * 1024 * 1024,
  });
  return {
    command,
    exit_code: result.status,
    // Keep the tail: the end of the output is where failures are reported.
    stdout: (result.stdout || '').slice(-MAX_OUTPUT_BYTES),
    stderr: (result.stderr || '').slice(-MAX_OUTPUT_BYTES),
    timed_out: result.error?.code === 'ETIMEDOUT',
  };
}
95
+
96
/**
 * Attempt the preflight pass for a request.
 *
 * Yields null whenever preflight does not apply: the feature flag
 * `config.routing.preflightEnabled` is off, the payload carries no
 * commands, or `cwd` is not a usable absolute path. Otherwise the
 * commands run one after another and execution stops at the first
 * command whose exit code is not 0.
 *
 * @param {object} args
 * @param {object} args.payload - Anthropic-format request payload
 * @param {string} [args.cwd] - Workspace cwd (absolute path)
 * @returns {null | {
 *   satisfied: boolean,
 *   results: object[],
 *   failedCommand: string|null,
 *   reason: string,
 * }} `results` holds one entry per command actually executed
 */
function tryPreflight({ payload, cwd }) {
  const enabled = Boolean(config.routing?.preflightEnabled);
  if (!enabled) return null;

  const commands = extractCommands(payload);
  if (commands.length === 0) return null;

  const workspaceCwd = resolveCwd(cwd);
  if (!workspaceCwd) {
    logger.debug({ cwd }, '[Preflight] No valid cwd, skipping');
    return null;
  }

  // Per-command timeout; falls back to two minutes when unset (or 0).
  const timeoutMs = config.routing?.preflightTimeoutMs || 120000;
  const results = [];

  for (const command of commands) {
    const outcome = runCommand(command, workspaceCwd, timeoutMs);
    results.push(outcome);
    if (outcome.exit_code === 0) continue;

    let reason;
    if (outcome.timed_out) {
      reason = `Preflight command timed out: ${command}`;
    } else {
      reason = `Preflight command exited ${outcome.exit_code}: ${command}`;
    }
    return { satisfied: false, results, failedCommand: command, reason };
  }

  return {
    satisfied: true,
    results,
    failedCommand: null,
    reason: 'All preflight commands passed.',
  };
}
144
+
145
/**
 * Assemble the synthetic "preflight satisfied" result that stands in
 * for a model call: an Anthropic-style assistant message with zero
 * token usage, wrapped in the processMessage return envelope.
 *
 * @param {object} args
 * @param {string} args.model - echoed back in the message's `model` field
 * @param {object} args.preflightResult - a satisfied result; its `reason`
 *   and `results` are copied into `lynkr_preflight`
 * @returns {object} The full processMessage return value.
 */
function buildSatisfiedResponse({ model, preflightResult }) {
  const passedCount = preflightResult.results.length;
  const plural = passedCount === 1 ? '' : 's';
  const summary = `Preflight satisfied — work appears already complete (${passedCount} command${plural} passed).`;

  const message = {
    id: `msg_preflight_${Date.now()}`,
    type: 'message',
    role: 'assistant',
    content: [{ type: 'text', text: summary }],
    model,
    stop_reason: 'end_turn',
    stop_sequence: null,
    usage: { input_tokens: 0, output_tokens: 0 },
    lynkr_preflight: {
      satisfied: true,
      reason: preflightResult.reason,
      results: preflightResult.results,
    },
  };

  const response = { json: message, ok: true, status: 200 };
  return {
    response,
    steps: 0,
    durationMs: 0,
    terminationReason: 'preflight_satisfied',
  };
}
181
+
182
+ module.exports = {
183
+ tryPreflight,
184
+ buildSatisfiedResponse,
185
+ extractCommands,
186
+ // Exposed for tests
187
+ resolveCwd,
188
+ };
@@ -70,13 +70,41 @@ function compressToolDescriptions(tools, mode = null) {
70
70
  return tools; // Return unmodified if not in minimal mode
71
71
  }
72
72
 
73
- return tools.map(tool => {
73
+ const validTools = tools.filter(tool => {
74
+ // Handle both Anthropic format (name + input_schema) and OpenAI format (function.name)
75
+ const hasAnthropicFormat = tool && tool.name && tool.input_schema;
76
+ const hasOpenAIFormat = tool && tool.function && tool.function.name;
77
+ const isValid = hasAnthropicFormat || hasOpenAIFormat;
78
+
79
+ if (!isValid) {
80
+ logger.debug({
81
+ hasName: !!tool?.name,
82
+ hasSchema: !!tool?.input_schema,
83
+ hasFunctionName: !!tool?.function?.name,
84
+ toolType: typeof tool
85
+ }, 'Filtered out malformed tool');
86
+ }
87
+ return isValid;
88
+ });
89
+
90
+ if (validTools.length === 0 && tools.length > 0) {
91
+ logger.warn({ originalCount: tools.length }, 'All tools filtered out as malformed - returning original');
92
+ return tools;
93
+ }
94
+
95
+ return validTools.map(tool => {
96
+ // If already in OpenAI format, return as-is (no compression for OpenAI format)
97
+ if (tool.function && !tool.input_schema) {
98
+ return tool;
99
+ }
100
+
101
+ // Compress Anthropic format
74
102
  const compressed = {
75
103
  name: tool.name,
76
104
  input_schema: {
77
- type: tool.input_schema.type,
105
+ type: tool.input_schema?.type || "object",
78
106
  properties: {},
79
- required: tool.input_schema.required || [],
107
+ required: tool.input_schema?.required || [],
80
108
  }
81
109
  };
82
110
 
@@ -190,7 +218,7 @@ function optimizeSystemPrompt(system, context = {}, mode = null) {
190
218
 
191
219
  // 2. Remove file operation guidelines if no file tools
192
220
  const hasFileTools = context.tools?.some(t =>
193
- ['Read', 'Write', 'Edit', 'Glob', 'Grep'].includes(t.name)
221
+ t?.name && ['Read', 'Write', 'Edit', 'Glob', 'Grep'].includes(t.name)
194
222
  );
195
223
  if (!hasFileTools) {
196
224
  text = removeSection(text, /# File Operations?[\s\S]*?(?=\n#|\n\n[A-Z]|$)/gi, optimizations, 'file operations');
@@ -198,7 +226,7 @@ function optimizeSystemPrompt(system, context = {}, mode = null) {
198
226
 
199
227
  // 3. Remove git guidelines if no git tools
200
228
  const hasGitTools = context.tools?.some(t =>
201
- t.name.toLowerCase().includes('git')
229
+ t?.name && t.name.toLowerCase().includes('git')
202
230
  );
203
231
  if (!hasGitTools) {
204
232
  text = removeSection(text, /# Git.*?[\s\S]*?(?=\n#|\n\n[A-Z]|$)/gi, optimizations, 'git guidelines');
@@ -207,7 +235,7 @@ function optimizeSystemPrompt(system, context = {}, mode = null) {
207
235
 
208
236
  // 4. Remove web search guidelines if no web tools
209
237
  const hasWebTools = context.tools?.some(t =>
210
- ['WebSearch', 'WebFetch'].includes(t.name)
238
+ t?.name && ['WebSearch', 'WebFetch'].includes(t.name)
211
239
  );
212
240
  if (!hasWebTools) {
213
241
  text = removeSection(text, /# Web.*?[\s\S]*?(?=\n#|\n\n[A-Z]|$)/gi, optimizations, 'web guidelines');
@@ -0,0 +1,246 @@
1
+ /**
2
+ * LinUCB contextual bandit for intra-tier model selection (Phase 4.1).
3
+ *
4
+ * Standard LinUCB-with-disjoint-models algorithm (Li et al. 2010).
5
+ * - One arm per (provider, model) pair in a tier
6
+ * - Context = numerical feature vector for the request
7
+ * - Reward = quality_score - λ·norm_cost - μ·norm_latency
8
+ * - Per-arm A (d×d ridge-regression matrix) and b (d-vector) stored to disk
9
+ *
10
+ * State persists to data/bandit-state.json. Loaded on startup; saved on
11
+ * every `update()` (cheap — small matrices) and on graceful shutdown.
12
+ */
13
+
14
+ const fs = require('fs');
15
+ const path = require('path');
16
+ const logger = require('../logger');
17
+
18
+ const STATE_PATH = path.join(__dirname, '../../data/bandit-state.json');
19
+ const DEFAULT_ALPHA = 1.5;
20
+ const DEFAULT_LAMBDA = 0.3; // cost penalty weight
21
+ const DEFAULT_MU = 0.1; // latency penalty weight
22
+ const FEATURE_DIM = 12;
23
+ const EXPLORATION_RATE = 0.05;
24
+
25
/**
 * Build a d×d identity matrix as an array of independent row arrays.
 * @param {number} d
 * @returns {number[][]}
 */
function _identity(d) {
  return new Array(d).fill(null).map((_, diag) => {
    const row = new Array(d).fill(0);
    row[diag] = 1;
    return row;
  });
}
33
+
34
/**
 * Build a length-d vector filled with zeros.
 * @param {number} d
 * @returns {number[]}
 */
function _zeros(d) {
  const vec = new Array(d);
  vec.fill(0);
  return vec;
}
37
+
38
/**
 * Multiply a square matrix by a vector. The vector's length determines
 * how many rows and columns of M are read.
 * @param {number[][]} M
 * @param {number[]} v
 * @returns {number[]} a new vector of the same length as v
 */
function _matVec(M, v) {
  const size = v.length;
  const product = [];
  for (let row = 0; row < size; row++) {
    let acc = 0;
    for (let col = 0; col < size; col++) {
      acc += M[row][col] * v[col];
    }
    product.push(acc);
  }
  return product;
}
46
+
47
/**
 * Dot product of two vectors, iterating over the length of `a`.
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number}
 */
function _dot(a, b) {
  let total = 0;
  for (const [idx, value] of a.entries()) {
    total += value * b[idx];
  }
  return total;
}
52
+
53
/**
 * Outer product a·bᵀ: entry [i][j] is a[i] * b[j].
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number[][]} a new a.length × b.length matrix
 */
function _outer(a, b) {
  return Array.from(a, (left) => Array.from(b, (right) => left * right));
}
61
+
62
/**
 * Add matrix B into matrix A element-wise. A is modified in place and
 * nothing is returned.
 * @param {number[][]} A - accumulator (mutated)
 * @param {number[][]} B - addend (read only)
 */
function _addMat(A, B) {
  for (const [r, target] of A.entries()) {
    for (let c = 0; c < target.length; c++) {
      target[c] += B[r][c];
    }
  }
}
67
+
68
/**
 * Add vector b into vector a element-wise. `a` is modified in place and
 * nothing is returned.
 * @param {number[]} a - accumulator (mutated)
 * @param {number[]} b - addend (read only)
 */
function _addVec(a, b) {
  let idx = 0;
  while (idx < a.length) {
    a[idx] += b[idx];
    idx += 1;
  }
}
71
+
72
+ /**
73
+ * Invert a small dense matrix via Gauss-Jordan. For d=12 this is plenty fast
74
+ * and saves us a dependency on a linear algebra library.
75
+ */
76
+ function _inv(M) {
77
+ const d = M.length;
78
+ const aug = M.map((row, i) => {
79
+ const r = row.slice();
80
+ for (let j = 0; j < d; j++) r.push(i === j ? 1 : 0);
81
+ return r;
82
+ });
83
+ for (let i = 0; i < d; i++) {
84
+ let pivot = aug[i][i];
85
+ if (Math.abs(pivot) < 1e-12) {
86
+ let swap = -1;
87
+ for (let k = i + 1; k < d; k++) {
88
+ if (Math.abs(aug[k][i]) > 1e-12) { swap = k; break; }
89
+ }
90
+ if (swap < 0) throw new Error('matrix singular');
91
+ [aug[i], aug[swap]] = [aug[swap], aug[i]];
92
+ pivot = aug[i][i];
93
+ }
94
+ for (let j = 0; j < 2 * d; j++) aug[i][j] /= pivot;
95
+ for (let k = 0; k < d; k++) {
96
+ if (k === i) continue;
97
+ const factor = aug[k][i];
98
+ for (let j = 0; j < 2 * d; j++) aug[k][j] -= factor * aug[i][j];
99
+ }
100
+ }
101
+ return aug.map(row => row.slice(d));
102
+ }
103
+
104
/**
 * LinUCB contextual bandit with one independent ridge-regression model
 * (matrix A, vector b) per arm. An arm is identified by the string
 * `${tier}|${provider}:${model}`.
 *
 * State is read from STATE_PATH when an instance is constructed and is
 * written back every 25th successful `update()`.
 */
class LinUCBBandit {
  /**
   * @param {object} [options]
   * @param {number} [options.alpha] - exploration weight applied to the UCB width
   * @param {number} [options.lambda] - cost penalty weight (stored and persisted; not used by this class)
   * @param {number} [options.mu] - latency penalty weight (stored and persisted; not used by this class)
   * @param {number} [options.dim] - feature vector length
   */
  constructor({ alpha = DEFAULT_ALPHA, lambda = DEFAULT_LAMBDA, mu = DEFAULT_MU, dim = FEATURE_DIM } = {}) {
    this.alpha = alpha;
    this.lambda = lambda;
    this.mu = mu;
    this.dim = dim;
    /** arms: Map<armKey, { A: number[][], b: number[], count: number }> */
    this.arms = new Map();
    this.steps = 0;
    this._load();
  }

  /** Compose the map key for one (tier, provider, model) arm. */
  _armKey(tier, provider, model) {
    return `${tier}|${provider}:${model}`;
  }

  /** Return the arm for `armKey`, creating it (A = I, b = 0) on first use. */
  _ensureArm(armKey) {
    if (!this.arms.has(armKey)) {
      this.arms.set(armKey, { A: _identity(this.dim), b: _zeros(this.dim), count: 0 });
    }
    return this.arms.get(armKey);
  }

  /**
   * Copy `context` into a fresh vector of exactly `this.dim` entries:
   * extra entries are dropped, missing ones are 0, and any entry that is
   * not a finite number becomes 0. The caller's array is never modified.
   *
   * A single NaN feature would otherwise flow into A and b through
   * `update()` and make every later UCB for that arm NaN.
   *
   * @param {ArrayLike<number>|null|undefined} context
   * @returns {number[]}
   */
  _fitContext(context) {
    const fitted = _zeros(this.dim);
    const available = Math.min(context?.length ?? 0, this.dim);
    for (let i = 0; i < available; i++) {
      const value = Number(context[i]);
      if (Number.isFinite(value)) fitted[i] = value;
    }
    return fitted;
  }

  /** True when a persisted arm has a dim×dim `A` and a length-dim `b`. */
  _isValidArm(arm) {
    if (!arm || !Array.isArray(arm.A) || !Array.isArray(arm.b)) return false;
    if (arm.A.length !== this.dim || arm.b.length !== this.dim) return false;
    return arm.A.every((row) => Array.isArray(row) && row.length === this.dim);
  }

  /**
   * Pick an arm for a given tier and context.
   * @param {string} tier
   * @param {Array<{ provider: string, model: string }>} candidates — qualifying arms
   * @param {number[]} context — feature vector
   * @returns {{ provider, model, ucb, explored }|null} chosen arm, or null
   *   when there are no candidates or no arm produced a comparable UCB
   */
  pick(tier, candidates, context) {
    if (!candidates || candidates.length === 0) return null;
    const x = this._fitContext(context);

    // ε-greedy: 5% pure exploration
    if (Math.random() < EXPLORATION_RATE) {
      const random = candidates[Math.floor(Math.random() * candidates.length)];
      return { ...random, ucb: null, explored: true };
    }

    let best = null;
    let bestUcb = -Infinity;
    for (const c of candidates) {
      const key = this._armKey(tier, c.provider, c.model);
      const arm = this._ensureArm(key);
      let Ainv;
      try {
        Ainv = _inv(arm.A);
      } catch (err) {
        // Skip an arm whose matrix cannot be inverted, but leave a trace.
        logger.debug({ arm: key, err: err?.message }, '[Bandit] Arm skipped: matrix inversion failed');
        continue;
      }
      const theta = _matVec(Ainv, arm.b);
      const mean = _dot(theta, x);
      const variance = _dot(x, _matVec(Ainv, x));
      // Clamp at 0: rounding can push the quadratic form slightly negative.
      const ucb = mean + this.alpha * Math.sqrt(Math.max(0, variance));
      if (ucb > bestUcb) {
        bestUcb = ucb;
        best = { ...c, ucb, explored: false };
      }
    }
    return best;
  }

  /**
   * Update the chosen arm with the observed reward.
   *
   * A reward that is NaN after scaling (for example `undefined` or `NaN`)
   * is ignored entirely: the arm is not touched and `steps` does not
   * advance, because folding NaN into `b` would corrupt the arm for good.
   *
   * @param {string} tier
   * @param {string} provider
   * @param {string} model
   * @param {number[]} context
   * @param {number} reward — typically in [0, 100]; will be rescaled to [0, 1] internally
   */
  update(tier, provider, model, context, reward) {
    const key = this._armKey(tier, provider, model);
    const scaled = reward / 100;
    if (Number.isNaN(scaled)) {
      logger.debug({ arm: key, reward }, '[Bandit] Ignoring update with non-numeric reward');
      return;
    }
    const arm = this._ensureArm(key);
    const ctx = this._fitContext(context);
    const r = Math.max(0, Math.min(1, scaled));
    _addMat(arm.A, _outer(ctx, ctx));
    _addVec(arm.b, ctx.map(x => x * r));
    arm.count++;
    this.steps++;
    // Save periodically (not every step to limit IO)
    if (this.steps % 25 === 0) this._save();
  }

  /**
   * Persist the current state to STATE_PATH. The JSON is written to a
   * sibling temp file and then renamed over the target, so an interrupted
   * write cannot leave a truncated state file behind. Failures are logged
   * and otherwise ignored.
   */
  _save() {
    try {
      fs.mkdirSync(path.dirname(STATE_PATH), { recursive: true });
      const serialized = JSON.stringify({
        savedAt: Date.now(),
        steps: this.steps,
        alpha: this.alpha,
        lambda: this.lambda,
        mu: this.mu,
        dim: this.dim,
        arms: Object.fromEntries(this.arms),
      }, null, 0);
      const tmpPath = `${STATE_PATH}.${process.pid}.tmp`;
      fs.writeFileSync(tmpPath, serialized);
      fs.renameSync(tmpPath, STATE_PATH);
    } catch (err) {
      logger.debug({ err: err.message }, '[Bandit] State save failed');
    }
  }

  /**
   * Load persisted state from STATE_PATH if the file exists and was
   * written with the same feature dimension. Arms whose stored `A`/`b`
   * do not have the expected shape are skipped. Failures are logged and
   * otherwise ignored, leaving the instance empty.
   */
  _load() {
    try {
      if (!fs.existsSync(STATE_PATH)) return;
      const raw = JSON.parse(fs.readFileSync(STATE_PATH, 'utf8'));
      if (raw.dim && raw.dim === this.dim) {
        let skipped = 0;
        for (const [k, v] of Object.entries(raw.arms || {})) {
          if (!this._isValidArm(v)) {
            skipped++;
            continue;
          }
          if (!Number.isFinite(v.count)) v.count = 0;
          this.arms.set(k, v);
        }
        this.steps = raw.steps || 0;
        logger.info({ arms: this.arms.size, steps: this.steps }, '[Bandit] State loaded');
        if (skipped > 0) {
          logger.debug({ skipped }, '[Bandit] Skipped malformed arms in saved state');
        }
      }
    } catch (err) {
      logger.debug({ err: err.message }, '[Bandit] State load failed');
    }
  }

  /**
   * Summarize the bandit for diagnostics.
   * @returns {{ steps: number, arms: Object<string, { count: number }>, alpha: number }}
   */
  getStats() {
    const armStats = {};
    for (const [k, v] of this.arms) {
      armStats[k] = { count: v.count };
    }
    return { steps: this.steps, arms: armStats, alpha: this.alpha };
  }
}
239
+
240
// Module-wide bandit; stays null until getBandit() is first called.
let _instance = null;

/**
 * Return the shared LinUCBBandit, constructing it with default options
 * on the first call and handing back the same object afterwards.
 * @returns {LinUCBBandit}
 */
function getBandit() {
  _instance = _instance || new LinUCBBandit();
  return _instance;
}
245
+
246
+ module.exports = { LinUCBBandit, getBandit, FEATURE_DIM };
@@ -0,0 +1,106 @@
1
+ /**
2
+ * Small-first cascade with confidence-based deferral (Phase 3.3).
3
+ *
4
+ * For tier-MEDIUM/COMPLEX requests, optionally try a smaller model first.
5
+ * If the response confidence (from confidence-scorer) ≥ threshold, accept it.
6
+ * Otherwise, escalate to the originally-routed tier model.
7
+ *
8
+ * Off by default for streaming (can't retry mid-stream cleanly).
9
+ * Opt-in via LYNKR_CASCADE_ENABLED=true.
10
+ */
11
+
12
+ const logger = require('../logger');
13
+ const confidenceScorer = require('./confidence-scorer');
14
+
15
+ const DEFAULT_THRESHOLD = 0.85;
16
+ const TIERS_ELIGIBLE = ['MEDIUM', 'COMPLEX'];
17
+
18
/**
 * Report whether the cascade is switched on. Only the exact string
 * 'true' in LYNKR_CASCADE_ENABLED counts; the variable is read on every call.
 * @returns {boolean}
 */
function isEnabled() {
  const flag = process.env.LYNKR_CASCADE_ENABLED;
  return flag === 'true';
}
21
+
22
/**
 * Decide whether a request may go through the small-first cascade.
 *
 * The cascade is skipped when the feature is disabled, when the request
 * streams (a stream cannot be retried cleanly), when tools are present
 * (tool calls have side effects and must not run twice), or when the
 * tier is not one of TIERS_ELIGIBLE.
 *
 * @param {object} args
 * @param {string} args.tier — the originally selected tier
 * @param {boolean} args.streaming — true if the request is streaming
 * @param {boolean} args.hasTools — true if tools are present
 * @returns {boolean}
 */
function shouldCascade(args) {
  // The enabled check comes first so `args` is not read while disabled.
  const blocked = !isEnabled() || args.streaming || args.hasTools;
  if (blocked) return false;
  return TIERS_ELIGIBLE.includes(args.tier);
}
36
+
37
/**
 * Run a small-first cascade.
 *
 * The small model is invoked first and its response is scored by the
 * confidence scorer. A score at or above the threshold accepts the small
 * response; anything else escalates to the big model. Escalation also
 * happens when the small model call throws or when scoring itself
 * throws, so a scorer failure does not fail a request the big model
 * could still answer. An error from the big model call propagates to
 * the caller.
 *
 * @param {object} args
 * @param {object} args.payload — the request payload
 * @param {object} args.smallModel — { provider, model }
 * @param {object} args.bigModel — { provider, model }
 * @param {function} args.invoke — async (provider, model, payload) → response
 * @param {string} args.taskType — used by confidence scorer
 * @param {number} args.threshold — confidence threshold, defaults to 0.85
 * @param {function} args.judge — optional judge LLM for reasoning tasks
 * @returns {Promise<{ response, usedModel, cascadeStats }>} `cascadeStats`
 *   always has `accepted`, `smallLatency`, `bigLatency` and `totalLatency`
 *   (milliseconds). It adds `confidence` when a score was obtained,
 *   `threshold` on a low-confidence escalation, and `reason`
 *   ('small_failed' or 'scoring_failed') when a step threw.
 */
async function run(args) {
  const threshold = args.threshold ?? DEFAULT_THRESHOLD;
  const start = Date.now();
  let smallLatency = 0;

  // Shared escalation path: call the big model and assemble the result.
  const escalate = async (details) => {
    const t0 = Date.now();
    const bigResponse = await args.invoke(args.bigModel.provider, args.bigModel.model, args.payload);
    const bigLatency = Date.now() - t0;
    return {
      response: bigResponse,
      usedModel: args.bigModel,
      cascadeStats: {
        accepted: false,
        ...details,
        smallLatency,
        bigLatency,
        totalLatency: Date.now() - start,
      },
    };
  };

  // Try small model
  let smallResponse;
  try {
    const t0 = Date.now();
    smallResponse = await args.invoke(args.smallModel.provider, args.smallModel.model, args.payload);
    smallLatency = Date.now() - t0;
  } catch (err) {
    logger.debug({ err: err?.message }, '[Cascade] Small model failed, escalating');
    return escalate({ reason: 'small_failed' });
  }

  let confidence;
  try {
    confidence = await confidenceScorer.score(smallResponse, {
      taskType: args.taskType,
      // Content of the last message in the payload, if there is one.
      question: args.payload?.messages?.[args.payload.messages.length - 1]?.content,
      judge: args.judge,
    });
  } catch (err) {
    logger.debug({ err: err?.message }, '[Cascade] Confidence scoring failed, escalating');
    return escalate({ reason: 'scoring_failed' });
  }

  if (confidence >= threshold) {
    return {
      response: smallResponse,
      usedModel: args.smallModel,
      cascadeStats: { accepted: true, confidence, smallLatency, bigLatency: 0, totalLatency: Date.now() - start },
    };
  }

  // Escalate
  return escalate({ confidence, threshold });
}
105
+
106
+ module.exports = { run, shouldCascade, isEnabled, DEFAULT_THRESHOLD };
@@ -395,24 +395,16 @@ function extractContent(payload) {
395
395
  }
396
396
 
397
397
  /**
398
- * Estimate token count (rough approximation)
398
+ * Estimate token count.
399
+ *
400
+ * Phase 1.1: delegates to the tiktoken-backed tokenizer (graceful fallback to
401
+ * chars/4 if js-tiktoken is unavailable).
399
402
  */
403
+ const { countPayloadTokens } = require('./tokenizer');
404
+
400
405
  function estimateTokens(payload) {
401
406
  if (!payload?.messages) return 0;
402
-
403
- let totalChars = 0;
404
- for (const msg of payload.messages) {
405
- if (typeof msg.content === 'string') {
406
- totalChars += msg.content.length;
407
- } else if (Array.isArray(msg.content)) {
408
- for (const block of msg.content) {
409
- if (block?.text) totalChars += block.text.length;
410
- }
411
- }
412
- }
413
-
414
- // Rough approximation: 4 chars per token
415
- return Math.ceil(totalChars / 4);
407
+ return countPayloadTokens(payload, payload?.model);
416
408
  }
417
409
 
418
410
  /**