lynkr 9.1.2 → 9.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -10
- package/package.json +3 -1
- package/scripts/build-knn-index.js +130 -0
- package/scripts/calibrate-thresholds.js +197 -0
- package/scripts/compare-policies.js +67 -0
- package/scripts/learn-output-ratios.js +162 -0
- package/scripts/refresh-pricing.js +122 -0
- package/scripts/run-routerarena.js +26 -0
- package/scripts/sample-regret.js +84 -0
- package/scripts/train-risk-classifier.js +191 -0
- package/src/api/middleware/budget-enforcer.js +60 -0
- package/src/api/middleware/tenant.js +21 -0
- package/src/api/router.js +19 -40
- package/src/budget/hierarchical-budget.js +159 -0
- package/src/cache/semantic.js +28 -2
- package/src/clients/databricks.js +59 -5
- package/src/config/index.js +239 -43
- package/src/context/toon.js +5 -4
- package/src/orchestrator/index.js +44 -6
- package/src/prompts/system.js +34 -6
- package/src/routing/bandit.js +246 -0
- package/src/routing/cascade.js +106 -0
- package/src/routing/complexity-analyzer.js +7 -15
- package/src/routing/confidence-scorer.js +121 -0
- package/src/routing/context-validator.js +71 -0
- package/src/routing/cost-optimizer.js +5 -2
- package/src/routing/deadline.js +52 -0
- package/src/routing/drift-monitor.js +113 -0
- package/src/routing/embedding-cache.js +77 -0
- package/src/routing/index.js +314 -5
- package/src/routing/knn-router.js +206 -0
- package/src/routing/latency-tracker.js +113 -71
- package/src/routing/model-tiers.js +156 -6
- package/src/routing/output-ratios.js +57 -0
- package/src/routing/regret-estimator.js +91 -0
- package/src/routing/reward-pipeline.js +62 -0
- package/src/routing/risk-classifier.js +130 -0
- package/src/routing/shadow-mode.js +77 -0
- package/src/routing/tenant-policy.js +96 -0
- package/src/routing/tokenizer.js +162 -0
- package/src/server.js +9 -0
package/src/routing/index.js
CHANGED
|
@@ -22,16 +22,59 @@ const {
|
|
|
22
22
|
const { getAgenticDetector, AGENT_TYPES } = require('./agentic-detector');
|
|
23
23
|
const { getModelTierSelector, TIER_DEFINITIONS } = require('./model-tiers');
|
|
24
24
|
const { getCostOptimizer } = require('./cost-optimizer');
|
|
25
|
-
const { analyzeRisk } = require('./risk-
|
|
25
|
+
const { analyzeRisk } = require('./risk-classifier');
|
|
26
|
+
|
|
27
|
+
// Phase 3-6 routing modules
|
|
28
|
+
const { getKnnRouter } = require('./knn-router');
|
|
29
|
+
const { getBandit } = require('./bandit');
|
|
30
|
+
const { getShadowPolicy, compareAndLog: shadowCompareAndLog } = require('./shadow-mode');
|
|
31
|
+
const { chooseFastest } = require('./deadline');
|
|
32
|
+
const { applyTenantOverrides } = require('./tenant-policy');
|
|
26
33
|
|
|
27
34
|
// Telemetry modules
|
|
28
35
|
const telemetry = require('./telemetry');
|
|
29
36
|
const { scoreResponseQuality } = require('./quality-scorer');
|
|
30
37
|
const { getLatencyTracker } = require('./latency-tracker');
|
|
31
38
|
|
|
39
|
+
// Phase 1 modules
|
|
40
|
+
const contextValidator = require('./context-validator');
|
|
41
|
+
const { countPayloadTokens } = require('./tokenizer');
|
|
42
|
+
|
|
32
43
|
// Local providers
|
|
33
44
|
const LOCAL_PROVIDERS = ['ollama', 'llamacpp', 'lmstudio'];
|
|
34
45
|
|
|
46
|
+
/**
 * Reports whether a request payload carries at least one image content block.
 *
 * Only structured (array) message content is inspected. A message whose
 * content is a plain string (or anything other than an array) cannot hold an
 * image block and is skipped.
 *
 * @param {object} payload - Request payload; only `payload.messages` is read.
 * @returns {boolean} True when any content block has type `image` or `image_url`.
 */
function _payloadHasImages(payload) {
  const messageList = payload?.messages;
  if (!Array.isArray(messageList)) {
    return false;
  }

  for (const message of messageList) {
    const blocks = message?.content;
    if (!Array.isArray(blocks)) {
      continue;
    }
    for (const part of blocks) {
      const kind = part?.type;
      if (kind === 'image' || kind === 'image_url') {
        return true;
      }
    }
  }

  return false;
}
|
|
59
|
+
|
|
60
|
+
/**
 * Collects the names of providers whose connection settings are present in
 * the module-level `config`.
 *
 * A provider is included when every setting listed for it is truthy. The
 * result keeps the fixed order of the table below. Consumed by the Phase 1.2
 * cost-optimizer override to limit which providers it may pick from.
 *
 * @returns {string[]} Provider names with the required settings configured.
 */
function _enabledProviders() {
  // [provider name, config section, settings that must all be truthy]
  const requirements = [
    ['databricks', 'databricks', ['url', 'apiKey']],
    ['azure-anthropic', 'azureAnthropic', ['endpoint', 'apiKey']],
    ['bedrock', 'bedrock', ['apiKey']],
    ['openrouter', 'openrouter', ['apiKey']],
    ['openai', 'openai', ['apiKey']],
    ['azure-openai', 'azureOpenAI', ['endpoint', 'apiKey']],
    ['ollama', 'ollama', ['endpoint']],
    ['llamacpp', 'llamacpp', ['endpoint']],
    ['lmstudio', 'lmstudio', ['endpoint']],
  ];

  const enabled = [];
  for (const [providerName, section, settings] of requirements) {
    const configured = settings.every((setting) => Boolean(config[section]?.[setting]));
    if (configured) {
      enabled.push(providerName);
    }
  }
  return enabled;
}
|
|
77
|
+
|
|
35
78
|
/**
|
|
36
79
|
* Check if a provider is local
|
|
37
80
|
*/
|
|
@@ -41,15 +84,28 @@ function isLocalProvider(provider) {
|
|
|
41
84
|
|
|
42
85
|
/**
 * Tells whether provider fallback is active.
 *
 * With tier routing switched on (`config.modelTiers.enabled`), fallback is
 * unconditionally on. In static provider mode the
 * `config.modelProvider.fallbackEnabled` flag decides, and only an explicit
 * `false` turns fallback off.
 *
 * @returns {boolean} True when fallback may be used.
 */
function isFallbackEnabled() {
  return config.modelTiers?.enabled
    ? true
    : config.modelProvider?.fallbackEnabled !== false;
}
|
|
48
97
|
|
|
49
98
|
/**
 * Resolves the provider to fall back to.
 *
 * With tier routing on and a REASONING tier configured, the provider is the
 * lowercase/hyphen prefix before the first `:` of `config.modelTiers.REASONING`.
 * When that prefix is absent, or in static provider mode,
 * `config.modelProvider.fallbackProvider` is used, defaulting to `databricks`.
 *
 * @returns {string} Name of the fallback provider.
 */
function getFallbackProvider() {
  const tiers = config.modelTiers;
  const reasoningSpec = tiers?.enabled ? tiers.REASONING : undefined;

  if (reasoningSpec) {
    // Tier routing mode: take the provider prefix of the REASONING tier spec.
    const [, providerName] = reasoningSpec.match(/^([a-z-]+):/) ?? [];
    if (providerName) {
      return providerName;
    }
  }

  // Static provider mode, or a REASONING spec without a provider prefix.
  return config.modelProvider?.fallbackProvider ?? 'databricks';
}
|
|
55
111
|
|
|
@@ -283,9 +339,11 @@ async function determineProviderSmart(payload, options = {}) {
|
|
|
283
339
|
}
|
|
284
340
|
}
|
|
285
341
|
|
|
286
|
-
// Apply routing decision based on tier config (TIER_* env vars
|
|
342
|
+
// Apply routing decision based on tier config (TIER_* env vars take precedence
|
|
343
|
+
// but Phase 1.2 lets the cost-optimizer pick a cheaper qualifying model when safe).
|
|
287
344
|
let provider;
|
|
288
345
|
let method = 'tier_config';
|
|
346
|
+
let costOptimized = false;
|
|
289
347
|
|
|
290
348
|
const selector = getModelTierSelector();
|
|
291
349
|
const modelSelection = selector.selectModel(tier, null);
|
|
@@ -294,8 +352,242 @@ async function determineProviderSmart(payload, options = {}) {
|
|
|
294
352
|
selectedModel = modelSelection.model;
|
|
295
353
|
logger.debug({ tier, provider, model: selectedModel }, '[Routing] Using tier config');
|
|
296
354
|
|
|
297
|
-
//
|
|
298
|
-
//
|
|
355
|
+
// Phase 1.2 — cost-optimizer override.
|
|
356
|
+
// Only kick in when:
|
|
357
|
+
// - feature flag enabled (default true, disable with LYNKR_COST_OPTIMIZE=false)
|
|
358
|
+
// - risk level is not high (high-risk keeps the explicitly-configured model)
|
|
359
|
+
// - the optimizer finds a meaningfully cheaper qualifying model
|
|
360
|
+
const costOptimizeEnabled = process.env.LYNKR_COST_OPTIMIZE !== 'false'
|
|
361
|
+
&& config.routing?.costOptimize !== false;
|
|
362
|
+
if (costOptimizeEnabled && risk?.level !== 'high') {
|
|
363
|
+
try {
|
|
364
|
+
const optimizer = getCostOptimizer();
|
|
365
|
+
const availableProviders = _enabledProviders();
|
|
366
|
+
const cheapest = optimizer.findCheapestForTier(tier, availableProviders);
|
|
367
|
+
if (cheapest && cheapest.model && cheapest.model !== selectedModel) {
|
|
368
|
+
const current = optimizer.estimateCost(selectedModel, 1000);
|
|
369
|
+
const candidate = optimizer.estimateCost(cheapest.model, 1000);
|
|
370
|
+
if (candidate.totalEstimate > 0 && candidate.totalEstimate < current.totalEstimate * 0.75) {
|
|
371
|
+
logger.debug({
|
|
372
|
+
tier,
|
|
373
|
+
from: `${provider}:${selectedModel}`,
|
|
374
|
+
to: `${cheapest.provider}:${cheapest.model}`,
|
|
375
|
+
savedPerK: (current.totalEstimate - candidate.totalEstimate).toFixed(6),
|
|
376
|
+
}, '[Routing] Cost-optimizer override');
|
|
377
|
+
provider = cheapest.provider;
|
|
378
|
+
selectedModel = cheapest.model;
|
|
379
|
+
method = 'tier_config+cost_optimized';
|
|
380
|
+
costOptimized = true;
|
|
381
|
+
}
|
|
382
|
+
}
|
|
383
|
+
} catch (err) {
|
|
384
|
+
logger.debug({ err: err.message }, '[Routing] Cost-optimize failed, keeping tier_config selection');
|
|
385
|
+
}
|
|
386
|
+
}
|
|
387
|
+
|
|
388
|
+
// Phase 1.3 — context window validation. If estimated tokens exceed the
|
|
389
|
+
// selected model's context (with response headroom), escalate to a
|
|
390
|
+
// context-capable model regardless of tier.
|
|
391
|
+
try {
|
|
392
|
+
const estimatedTokens = countPayloadTokens(payload, selectedModel);
|
|
393
|
+
const ctxResult = contextValidator.validate(selectedModel, estimatedTokens);
|
|
394
|
+
if (!ctxResult.ok) {
|
|
395
|
+
const capable = selector.findContextCapable(estimatedTokens, tier);
|
|
396
|
+
if (capable) {
|
|
397
|
+
logger.info({
|
|
398
|
+
from: `${provider}:${selectedModel}`,
|
|
399
|
+
to: `${capable.provider}:${capable.model}`,
|
|
400
|
+
required: estimatedTokens,
|
|
401
|
+
oldContext: ctxResult.context,
|
|
402
|
+
newContext: capable.context,
|
|
403
|
+
}, '[Routing] Context window escalation');
|
|
404
|
+
provider = capable.provider;
|
|
405
|
+
selectedModel = capable.model;
|
|
406
|
+
if (capable.tier) tier = capable.tier;
|
|
407
|
+
method = method + '+context_escalated';
|
|
408
|
+
} else {
|
|
409
|
+
logger.warn({
|
|
410
|
+
model: selectedModel,
|
|
411
|
+
required: estimatedTokens,
|
|
412
|
+
available: ctxResult.context,
|
|
413
|
+
}, '[Routing] No context-capable fallback — request may fail upstream');
|
|
414
|
+
}
|
|
415
|
+
}
|
|
416
|
+
} catch (err) {
|
|
417
|
+
logger.debug({ err: err.message }, '[Routing] Context validation failed, proceeding without check');
|
|
418
|
+
}
|
|
419
|
+
|
|
420
|
+
// Phase 1.4 — vision capability guard.
|
|
421
|
+
// If the payload contains image content blocks but the selected model lacks
|
|
422
|
+
// vision support, silently swap to the cheapest vision-capable model at or
|
|
423
|
+
// above the current tier. Prevents silent upstream failures.
|
|
424
|
+
if (_payloadHasImages(payload)) {
|
|
425
|
+
try {
|
|
426
|
+
const { getModelRegistrySync } = require('./model-registry');
|
|
427
|
+
const registry = getModelRegistrySync();
|
|
428
|
+
const modelInfo = registry.getCost(selectedModel);
|
|
429
|
+
if (!modelInfo?.vision) {
|
|
430
|
+
const visionModel = selector.findVisionCapable(tier);
|
|
431
|
+
if (visionModel) {
|
|
432
|
+
logger.info({
|
|
433
|
+
from: `${provider}:${selectedModel}`,
|
|
434
|
+
to: `${visionModel.provider}:${visionModel.model}`,
|
|
435
|
+
tier: visionModel.tier,
|
|
436
|
+
}, '[Routing] Vision guard — upgrading to vision-capable model');
|
|
437
|
+
provider = visionModel.provider;
|
|
438
|
+
selectedModel = visionModel.model;
|
|
439
|
+
if (visionModel.tier !== tier) tier = visionModel.tier;
|
|
440
|
+
method = method + '+vision_guard';
|
|
441
|
+
} else {
|
|
442
|
+
logger.warn({ model: selectedModel }, '[Routing] Vision guard — no vision-capable model found, request may fail');
|
|
443
|
+
}
|
|
444
|
+
}
|
|
445
|
+
} catch (err) {
|
|
446
|
+
logger.debug({ err: err.message }, '[Routing] Vision guard check failed, proceeding');
|
|
447
|
+
}
|
|
448
|
+
}
|
|
449
|
+
|
|
450
|
+
// Phase 3.1 — kNN routing hint.
|
|
451
|
+
// If the index has enough entries, query it with the last user message.
|
|
452
|
+
// A high-confidence kNN suggestion overrides the heuristic selection.
|
|
453
|
+
let knnResult = null;
|
|
454
|
+
if (config.routing?.knnEnabled !== false) {
|
|
455
|
+
try {
|
|
456
|
+
const msgs = payload?.messages;
|
|
457
|
+
const lastMsg = Array.isArray(msgs) ? msgs[msgs.length - 1]?.content : null;
|
|
458
|
+
const queryText = typeof lastMsg === 'string' ? lastMsg
|
|
459
|
+
: Array.isArray(lastMsg) ? lastMsg.filter(b => b?.type === 'text').map(b => b.text || '').join(' ')
|
|
460
|
+
: null;
|
|
461
|
+
if (queryText) {
|
|
462
|
+
knnResult = await getKnnRouter().query(queryText);
|
|
463
|
+
if (knnResult && knnResult.confidence > 0.7 && knnResult.model && knnResult.model !== selectedModel) {
|
|
464
|
+
// High confidence — trust kNN's model recommendation directly.
|
|
465
|
+
logger.debug({
|
|
466
|
+
from: `${provider}:${selectedModel}`,
|
|
467
|
+
to: `${knnResult.provider}:${knnResult.model}`,
|
|
468
|
+
confidence: knnResult.confidence.toFixed(3),
|
|
469
|
+
}, '[Routing] kNN override');
|
|
470
|
+
provider = knnResult.provider;
|
|
471
|
+
selectedModel = knnResult.model;
|
|
472
|
+
method = method + '+knn';
|
|
473
|
+
} else if (knnResult && knnResult.confidence > 0.4 && knnResult.confidence <= 0.7) {
|
|
474
|
+
// Ambiguous signal — neighbors are split, we can't trust any single model
|
|
475
|
+
// recommendation. Err on quality: bump the current tier one step up so the
|
|
476
|
+
// request gets a more capable model rather than risking a bad answer from
|
|
477
|
+
// a model that was borderline for similar past requests.
|
|
478
|
+
const TIER_ORDER = ['SIMPLE', 'MEDIUM', 'COMPLEX', 'REASONING'];
|
|
479
|
+
const currentIdx = TIER_ORDER.indexOf(tier);
|
|
480
|
+
if (currentIdx >= 0 && currentIdx < TIER_ORDER.length - 1) {
|
|
481
|
+
const upgradedTier = TIER_ORDER[currentIdx + 1];
|
|
482
|
+
try {
|
|
483
|
+
const upgraded = selector.selectModel(upgradedTier, null);
|
|
484
|
+
logger.debug({
|
|
485
|
+
from: `${tier}:${provider}:${selectedModel}`,
|
|
486
|
+
to: `${upgradedTier}:${upgraded.provider}:${upgraded.model}`,
|
|
487
|
+
confidence: knnResult.confidence.toFixed(3),
|
|
488
|
+
}, '[Routing] kNN ambiguous — escalating tier for safety');
|
|
489
|
+
provider = upgraded.provider;
|
|
490
|
+
selectedModel = upgraded.model;
|
|
491
|
+
tier = upgradedTier;
|
|
492
|
+
method = method + '+knn_ambiguous_escalate';
|
|
493
|
+
} catch (err) {
|
|
494
|
+
logger.debug({ err: err.message }, '[Routing] kNN ambiguous escalation failed, keeping current tier');
|
|
495
|
+
}
|
|
496
|
+
}
|
|
497
|
+
}
|
|
498
|
+
}
|
|
499
|
+
} catch (err) {
|
|
500
|
+
logger.debug({ err: err.message }, '[Routing] kNN query failed, ignoring');
|
|
501
|
+
}
|
|
502
|
+
}
|
|
503
|
+
|
|
504
|
+
// Phase 4.1 — LinUCB bandit intra-tier selection.
|
|
505
|
+
// When there are two candidates (heuristic vs kNN), the bandit picks the
|
|
506
|
+
// one with the highest estimated UCB score for the current context.
|
|
507
|
+
if (config.routing?.banditEnabled !== false && knnResult && knnResult.model) {
|
|
508
|
+
try {
|
|
509
|
+
// Build candidates: current selection and kNN alternative if different
|
|
510
|
+
const allCandidates = [{ provider, model: selectedModel }];
|
|
511
|
+
if (knnResult.model !== selectedModel) {
|
|
512
|
+
allCandidates.push({ provider: knnResult.provider, model: knnResult.model });
|
|
513
|
+
}
|
|
514
|
+
|
|
515
|
+
if (allCandidates.length > 1) {
|
|
516
|
+
const bandit = getBandit();
|
|
517
|
+
const TASK_TYPES = ['code_gen', 'summarization', 'reasoning', 'factoid', 'chat', 'other'];
|
|
518
|
+
const inferredTask = (analysis.breakdown?.taskType?.reason || 'other').toLowerCase();
|
|
519
|
+
const taskIdx = Math.max(0, TASK_TYPES.findIndex(t => inferredTask.includes(t)));
|
|
520
|
+
const ctx = [
|
|
521
|
+
(analysis.score || 0) / 100,
|
|
522
|
+
Math.log(Math.max(1, analysis.breakdown?.tokenCount || 0) + 1) / 15,
|
|
523
|
+
((payload?.tools?.length ?? 0) > 0) ? 1 : 0,
|
|
524
|
+
options.streaming ? 1 : 0,
|
|
525
|
+
risk?.level === 'high' ? 1 : risk?.level === 'medium' ? 0.5 : 0,
|
|
526
|
+
agenticResult?.isAgentic ? 1 : 0,
|
|
527
|
+
...TASK_TYPES.map((_, i) => i === taskIdx ? 1 : 0),
|
|
528
|
+
];
|
|
529
|
+
const picked = bandit.pick(tier, allCandidates, ctx);
|
|
530
|
+
if (picked && picked.model !== selectedModel) {
|
|
531
|
+
logger.debug({
|
|
532
|
+
from: `${provider}:${selectedModel}`,
|
|
533
|
+
to: `${picked.provider}:${picked.model}`,
|
|
534
|
+
ucb: picked.ucb?.toFixed(4),
|
|
535
|
+
explored: picked.explored,
|
|
536
|
+
}, '[Routing] Bandit override');
|
|
537
|
+
provider = picked.provider;
|
|
538
|
+
selectedModel = picked.model;
|
|
539
|
+
method = method + (picked.explored ? '+bandit_explore' : '+bandit');
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
} catch (err) {
|
|
543
|
+
logger.debug({ err: err.message }, '[Routing] Bandit pick failed, ignoring');
|
|
544
|
+
}
|
|
545
|
+
}
|
|
546
|
+
|
|
547
|
+
// Phase 6.3 — deadline-aware fastest-model selection.
|
|
548
|
+
// Payload carries _deadlineMs injected by the orchestrator from the
|
|
549
|
+
// LYNKR-Deadline-Ms request header.
|
|
550
|
+
const deadlineMs = payload?._deadlineMs ?? null;
|
|
551
|
+
if (deadlineMs) {
|
|
552
|
+
try {
|
|
553
|
+
const fastest = chooseFastest([{ provider, model: selectedModel }], deadlineMs);
|
|
554
|
+
if (fastest && fastest.model !== selectedModel) {
|
|
555
|
+
logger.debug({
|
|
556
|
+
from: `${provider}:${selectedModel}`,
|
|
557
|
+
to: `${fastest.provider}:${fastest.model}`,
|
|
558
|
+
deadlineMs,
|
|
559
|
+
}, '[Routing] Deadline override');
|
|
560
|
+
provider = fastest.provider;
|
|
561
|
+
selectedModel = fastest.model;
|
|
562
|
+
method = method + '+deadline';
|
|
563
|
+
}
|
|
564
|
+
} catch (err) {
|
|
565
|
+
logger.debug({ err: err.message }, '[Routing] Deadline check failed, ignoring');
|
|
566
|
+
}
|
|
567
|
+
}
|
|
568
|
+
|
|
569
|
+
// Phase 6.1 — per-tenant policy overrides.
|
|
570
|
+
// tenantPolicy comes from options (threaded from Express res.locals via
|
|
571
|
+
// orchestrator → databricks → here).
|
|
572
|
+
if (options.tenantPolicy) {
|
|
573
|
+
try {
|
|
574
|
+
const overridden = applyTenantOverrides(
|
|
575
|
+
{ provider, model: selectedModel, tier, method },
|
|
576
|
+
options.tenantPolicy,
|
|
577
|
+
);
|
|
578
|
+
if (overridden && overridden.model !== selectedModel) {
|
|
579
|
+
logger.debug({
|
|
580
|
+
from: `${provider}:${selectedModel}`,
|
|
581
|
+
to: `${overridden.provider}:${overridden.model}`,
|
|
582
|
+
}, '[Routing] Tenant override');
|
|
583
|
+
provider = overridden.provider;
|
|
584
|
+
selectedModel = overridden.model;
|
|
585
|
+
method = overridden.method;
|
|
586
|
+
}
|
|
587
|
+
} catch (err) {
|
|
588
|
+
logger.debug({ err: err.message }, '[Routing] Tenant override failed, ignoring');
|
|
589
|
+
}
|
|
590
|
+
}
|
|
299
591
|
|
|
300
592
|
const decision = {
|
|
301
593
|
provider,
|
|
@@ -309,10 +601,19 @@ async function determineProviderSmart(payload, options = {}) {
|
|
|
309
601
|
analysis,
|
|
310
602
|
embeddingsResult,
|
|
311
603
|
agenticResult,
|
|
312
|
-
costOptimized
|
|
604
|
+
costOptimized,
|
|
313
605
|
risk,
|
|
606
|
+
knnResult,
|
|
314
607
|
};
|
|
315
608
|
|
|
609
|
+
// Phase 4.4 — shadow-mode policy comparison (fire-and-forget).
|
|
610
|
+
const shadowFn = getShadowPolicy();
|
|
611
|
+
if (shadowFn) {
|
|
612
|
+
setImmediate(() =>
|
|
613
|
+
shadowCompareAndLog({ payload, activeDecision: decision, shadowFn }).catch(() => {})
|
|
614
|
+
);
|
|
615
|
+
}
|
|
616
|
+
|
|
316
617
|
// Phase 3: Record metrics
|
|
317
618
|
routingMetrics.record(decision);
|
|
318
619
|
|
|
@@ -419,6 +720,14 @@ module.exports = {
|
|
|
419
720
|
AGENT_TYPES,
|
|
420
721
|
TIER_DEFINITIONS,
|
|
421
722
|
|
|
723
|
+
// Phase 3-6 modules
|
|
724
|
+
getKnnRouter,
|
|
725
|
+
getBandit,
|
|
726
|
+
getShadowPolicy,
|
|
727
|
+
shadowCompareAndLog,
|
|
728
|
+
chooseFastest,
|
|
729
|
+
applyTenantOverrides,
|
|
730
|
+
|
|
422
731
|
// Telemetry
|
|
423
732
|
telemetry,
|
|
424
733
|
scoreResponseQuality,
|
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* kNN-based routing decision (Phase 3.1).
|
|
3
|
+
*
|
|
4
|
+
* Embeds the incoming query, finds the K nearest historical queries from the
|
|
5
|
+
* hnswlib-node index, and returns a confidence-weighted recommendation
|
|
6
|
+
* (model, expected quality, expected cost) based on those neighbors' actual
|
|
7
|
+
* outcomes from telemetry.
|
|
8
|
+
*
|
|
9
|
+
* Behavior:
|
|
10
|
+
* - Empty index → returns null. Caller falls back to heuristic router.
|
|
11
|
+
* - Sparse index (N < MIN_INDEX_SIZE) → returns null. Heuristic wins until
|
|
12
|
+
* we have enough data to be confident.
|
|
13
|
+
* - Embedder unavailable → returns null. Same fallback path.
|
|
14
|
+
*
|
|
15
|
+
* Bootstrap: scripts/build-knn-index.js (also accepts optional RouterBench
|
|
16
|
+
* corpus path to seed the index).
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
const fs = require('fs');
|
|
20
|
+
const path = require('path');
|
|
21
|
+
const logger = require('../logger');
|
|
22
|
+
const { generateEmbedding } = require('../cache/embeddings');
|
|
23
|
+
const { getEmbeddingCache } = require('./embedding-cache');
|
|
24
|
+
|
|
25
|
+
const INDEX_DIR = path.join(__dirname, '../../data/knn');
|
|
26
|
+
const INDEX_FILE = path.join(INDEX_DIR, 'index.hnsw');
|
|
27
|
+
const META_FILE = path.join(INDEX_DIR, 'meta.json');
|
|
28
|
+
|
|
29
|
+
const MAX_ELEMENTS = 50000;
|
|
30
|
+
const DIM = 768; // nomic-embed-text default
|
|
31
|
+
const K = 10;
|
|
32
|
+
const MIN_INDEX_SIZE = 1000;
|
|
33
|
+
|
|
34
|
+
// Memoized result of resolving the optional `hnswlib-node` module: `_hnsw`
// holds the module (or null when it could not be required) and `_hnswLoaded`
// records that resolution has already been attempted.
let _hnsw = null;
let _hnswLoaded = false;

/**
 * Resolves the optional `hnswlib-node` dependency, attempting the require at
 * most once per process. A failed require is logged at debug level and
 * remembered, so later calls return null without retrying.
 *
 * @returns {object|null} The `hnswlib-node` module, or null when unavailable.
 */
function _loadHnsw() {
  if (!_hnswLoaded) {
    _hnswLoaded = true;
    try {
      _hnsw = require('hnswlib-node');
    } catch (loadError) {
      logger.debug({ err: loadError.message }, '[KnnRouter] hnswlib-node not available');
      _hnsw = null;
    }
  }
  return _hnsw;
}
|
|
47
|
+
|
|
48
|
+
/**
 * Nearest-neighbour routing index.
 *
 * Wraps an hnswlib-node cosine index plus a parallel `meta` array of recorded
 * outcomes. `query()` embeds a text, looks up its K nearest stored entries and
 * returns a similarity-weighted recommendation for one provider/model, or
 * null whenever no recommendation can be made (index unavailable or smaller
 * than MIN_INDEX_SIZE, embedding failure, dimension mismatch, search error).
 */
class KnnRouter {
  constructor() {
    this.index = null;
    // Parallel to the index: meta[id] is the outcome stored under label `id`.
    // Fields read by query(): provider, model, tier, quality, cost, latency.
    this.meta = [];
    this.size = 0;
    this.dim = DIM;
    this.ready = false;
  }

  /**
   * Loads the persisted index and metadata, or initialises an empty index
   * when either file is missing.
   *
   * Instance state is only replaced once loading has fully succeeded, so a
   * failed load leaves the router exactly as it was (not ready).
   *
   * @returns {boolean} True when the router is ready afterwards.
   */
  load() {
    const hnsw = _loadHnsw();
    if (!hnsw) return false;
    try {
      if (!fs.existsSync(INDEX_FILE) || !fs.existsSync(META_FILE)) {
        // Nothing persisted yet: start empty (callers can add() later).
        const emptyIndex = new hnsw.HierarchicalNSW('cosine', this.dim);
        emptyIndex.initIndex(MAX_ELEMENTS);
        this.index = emptyIndex;
        this.meta = [];
        this.size = 0;
        this.ready = true;
        return true;
      }

      const metaData = JSON.parse(fs.readFileSync(META_FILE, 'utf8'));
      const dim = metaData.dim || DIM;
      const entries = metaData.entries || [];

      const index = new hnsw.HierarchicalNSW('cosine', dim);
      // The second parameter of hnswlib-node's readIndexSync is the boolean
      // `allowReplaceDeleted`, not a capacity, so only the path is passed and
      // capacity is raised explicitly afterwards.
      index.readIndexSync(INDEX_FILE);
      if (typeof index.getMaxElements === 'function'
        && typeof index.resizeIndex === 'function'
        && index.getMaxElements() < MAX_ELEMENTS) {
        index.resizeIndex(MAX_ELEMENTS);
      }

      this.dim = dim;
      this.meta = entries;
      this.size = entries.length;
      this.index = index;
      this.ready = true;
      logger.info({ size: this.size, dim: this.dim }, '[KnnRouter] Index loaded');
      return true;
    } catch (err) {
      logger.warn({ err: err.message }, '[KnnRouter] Index load failed');
      return false;
    }
  }

  /**
   * Persists the index and its metadata to INDEX_DIR. Failures are logged as
   * warnings and not rethrown. Does nothing when the router is not ready.
   */
  save() {
    if (!this.ready || !this.index) return;
    try {
      fs.mkdirSync(INDEX_DIR, { recursive: true });
      this.index.writeIndexSync(INDEX_FILE);
      fs.writeFileSync(META_FILE, JSON.stringify({ dim: this.dim, entries: this.meta }, null, 0));
    } catch (err) {
      logger.warn({ err: err.message }, '[KnnRouter] Index save failed');
    }
  }

  /**
   * Appends one embedding and its outcome to the index.
   *
   * Silently ignored when the router is not ready, when `embedding` is not a
   * plain array, or when the index already holds MAX_ELEMENTS entries.
   *
   * @param {number[]} embedding - Vector for the query being recorded.
   * @param {object} outcome - Outcome stored in `meta` under the new label.
   */
  add(embedding, outcome) {
    if (!this.ready || !this.index || !Array.isArray(embedding)) return;
    if (this.size >= MAX_ELEMENTS) {
      // No eviction is implemented: once the index is full, new points are
      // dropped rather than replacing older ones.
      return;
    }
    this.index.addPoint(embedding, this.size);
    this.meta.push(outcome);
    this.size++;
  }

  /**
   * Recommends a provider/model for `text` from its K nearest stored entries.
   *
   * Each neighbour is weighted by `max(0, 1 - distance)`. Per provider:model
   * the weighted mean quality and cost are computed, and the candidate with
   * the highest `quality / log(cost * 1000 + 2)` wins. `confidence` is the
   * winner's summed weight divided by K, capped at 1.
   *
   * @param {string} text - Query text to embed and look up.
   * @returns {Promise<object|null>} Recommendation
   *   `{ provider, model, tier, expectedQuality, expectedCost,
   *   expectedLatency, confidence, neighborCount }`, or null when none can
   *   be made.
   */
  async query(text) {
    if (!this.ready) this.load();
    if (!this.ready || !this.index || this.size < MIN_INDEX_SIZE) return null;
    if (!text || typeof text !== 'string') return null;

    const cache = getEmbeddingCache();
    let embedding = cache.get(text);
    if (!embedding) {
      try {
        embedding = await generateEmbedding(text);
        if (!embedding || embedding.length !== this.dim) {
          // Embedder produced nothing or a vector of a different dimension.
          return null;
        }
        cache.set(text, embedding);
      } catch (err) {
        logger.debug({ err: err.message }, '[KnnRouter] Embedding failed, skipping');
        return null;
      }
    }

    let result;
    try {
      result = this.index.searchKnn(embedding, K);
    } catch (err) {
      logger.debug({ err: err.message }, '[KnnRouter] Search failed');
      return null;
    }

    // Drop labels with no metadata entry.
    const neighbors = (result.neighbors || []).map((id, i) => ({
      id,
      distance: result.distances?.[i] ?? 1,
      outcome: this.meta[id],
    })).filter((n) => n.outcome);

    if (neighbors.length === 0) return null;

    // Similarity-weighted aggregation per candidate provider:model.
    const byModel = new Map();
    for (const n of neighbors) {
      const w = Math.max(0, 1 - n.distance);
      const key = `${n.outcome.provider}:${n.outcome.model}`;
      if (!byModel.has(key)) {
        byModel.set(key, { weight: 0, quality: 0, cost: 0, latency: 0, count: 0, sample: n.outcome });
      }
      const agg = byModel.get(key);
      agg.weight += w;
      // `??` rather than `||`: a recorded quality of 0 is a real (worst)
      // score and must not be replaced by the neutral default of 50.
      agg.quality += w * (n.outcome.quality ?? 50);
      agg.cost += w * (n.outcome.cost ?? 0);
      agg.latency += w * (n.outcome.latency ?? 0);
      agg.count++;
    }

    let best = null;
    let bestScore = -Infinity;
    for (const agg of byModel.values()) {
      // A candidate with no positive similarity has no meaningful average.
      if (!(agg.weight > 0)) continue;
      const avgQ = agg.quality / agg.weight;
      const avgC = agg.cost / agg.weight;
      // Reward quality, penalise cost gently; the +2 keeps the log positive.
      const score = avgQ / Math.log(avgC * 1000 + 2);
      if (score > bestScore) {
        bestScore = score;
        best = {
          provider: agg.sample.provider,
          model: agg.sample.model,
          tier: agg.sample.tier,
          expectedQuality: avgQ,
          expectedCost: avgC,
          expectedLatency: agg.latency / agg.weight,
          confidence: Math.min(1, agg.weight / K),
          neighborCount: agg.count,
        };
      }
    }

    return best;
  }

  /**
   * @returns {{size: number, maxElements: number, ready: boolean, dim: number}}
   *   Snapshot of the router's current size, capacity, readiness and dimension.
   */
  getStats() {
    return {
      size: this.size,
      maxElements: MAX_ELEMENTS,
      ready: this.ready,
      dim: this.dim,
    };
  }
}
|
|
196
|
+
|
|
197
|
+
// Process-wide KnnRouter, created lazily by getKnnRouter().
let _instance = null;

/**
 * Returns the shared KnnRouter. The first call constructs it and triggers an
 * initial `load()`; every later call returns that same object.
 *
 * @returns {KnnRouter} The shared router instance.
 */
function getKnnRouter() {
  if (_instance) {
    return _instance;
  }
  const created = new KnnRouter();
  _instance = created;
  created.load();
  return created;
}
|
|
205
|
+
|
|
206
|
+
module.exports = { KnnRouter, getKnnRouter };
|