@jsonstudio/llms 0.6.1164 → 0.6.1172
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/router/virtual-router/bootstrap.js +17 -1
- package/dist/router/virtual-router/context-advisor.d.ts +4 -0
- package/dist/router/virtual-router/context-advisor.js +3 -0
- package/dist/router/virtual-router/context-weighted.d.ts +31 -0
- package/dist/router/virtual-router/context-weighted.js +54 -0
- package/dist/router/virtual-router/engine-selection.js +162 -15
- package/dist/router/virtual-router/types.d.ts +28 -0
- package/package.json +1 -1
|
@@ -1177,10 +1177,26 @@ function normalizeLoadBalancing(input) {
|
|
|
1177
1177
|
: {})
|
|
1178
1178
|
}
|
|
1179
1179
|
: undefined;
|
|
1180
|
+
const contextWeightedRaw = asRecord(record.contextWeighted);
|
|
1181
|
+
const contextWeighted = Object.keys(contextWeightedRaw).length > 0
|
|
1182
|
+
? {
|
|
1183
|
+
...(typeof contextWeightedRaw.enabled === 'boolean' ? { enabled: contextWeightedRaw.enabled } : {}),
|
|
1184
|
+
...(typeof contextWeightedRaw.clientCapTokens === 'number' && Number.isFinite(contextWeightedRaw.clientCapTokens)
|
|
1185
|
+
? { clientCapTokens: contextWeightedRaw.clientCapTokens }
|
|
1186
|
+
: {}),
|
|
1187
|
+
...(typeof contextWeightedRaw.gamma === 'number' && Number.isFinite(contextWeightedRaw.gamma)
|
|
1188
|
+
? { gamma: contextWeightedRaw.gamma }
|
|
1189
|
+
: {}),
|
|
1190
|
+
...(typeof contextWeightedRaw.maxMultiplier === 'number' && Number.isFinite(contextWeightedRaw.maxMultiplier)
|
|
1191
|
+
? { maxMultiplier: contextWeightedRaw.maxMultiplier }
|
|
1192
|
+
: {})
|
|
1193
|
+
}
|
|
1194
|
+
: undefined;
|
|
1180
1195
|
return {
|
|
1181
1196
|
strategy,
|
|
1182
1197
|
...(Object.keys(weightsEntries).length ? { weights: weightsEntries } : {}),
|
|
1183
|
-
...(healthWeighted ? { healthWeighted } : {})
|
|
1198
|
+
...(healthWeighted ? { healthWeighted } : {}),
|
|
1199
|
+
...(contextWeighted ? { contextWeighted } : {})
|
|
1184
1200
|
};
|
|
1185
1201
|
}
|
|
1186
1202
|
function coerceRatio(value) {
|
|
@@ -16,4 +16,8 @@ export declare class ContextAdvisor {
|
|
|
16
16
|
private hardLimit;
|
|
17
17
|
configure(config?: VirtualRouterContextRoutingConfig | null): void;
|
|
18
18
|
classify(pool: string[], estimatedTokens: number, resolveProfile: (key: string) => ProviderProfile): ContextAdvisorResult;
|
|
19
|
+
getConfig(): {
|
|
20
|
+
warnRatio: number;
|
|
21
|
+
hardLimit: boolean;
|
|
22
|
+
};
|
|
19
23
|
}
|
|
@@ -55,6 +55,9 @@ export class ContextAdvisor {
|
|
|
55
55
|
allOverflow: safe.length === 0 && risky.length === 0 && overflow.length > 0
|
|
56
56
|
};
|
|
57
57
|
}
|
|
58
|
+
getConfig() {
|
|
59
|
+
return { warnRatio: this.warnRatio, hardLimit: this.hardLimit };
|
|
60
|
+
}
|
|
58
61
|
}
|
|
59
62
|
function clampWarnRatio(value) {
|
|
60
63
|
if (!Number.isFinite(value)) {
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import type { ContextWeightedLoadBalancingConfig } from './types.js';
|
|
2
|
+
export type ResolvedContextWeightedConfig = Required<{
|
|
3
|
+
enabled: boolean;
|
|
4
|
+
clientCapTokens: number;
|
|
5
|
+
gamma: number;
|
|
6
|
+
maxMultiplier: number;
|
|
7
|
+
}>;
|
|
8
|
+
/**
|
|
9
|
+
* Context-weighted constant table (defaults).
|
|
10
|
+
*
|
|
11
|
+
* Intended behavior:
|
|
12
|
+
* - Prefer smaller effective safe context windows early, so that larger windows remain available later.
|
|
13
|
+
* - Compensation is proportional by default (`gamma=1`), but capped by `maxMultiplier`.
|
|
14
|
+
*
|
|
15
|
+
* Notes:
|
|
16
|
+
* - `clientCapTokens` is the maximum effective context the client can consume, even if the model supports more.
|
|
17
|
+
* - The effective safe window is computed using ContextAdvisor's `warnRatio` and model "slack" above the client cap.
|
|
18
|
+
* - If a model has slack >= the reserved margin, it effectively gets the full client cap as safe window.
|
|
19
|
+
*/
|
|
20
|
+
export declare const DEFAULT_CONTEXT_WEIGHTED_CONFIG: ResolvedContextWeightedConfig;
|
|
21
|
+
export declare function resolveContextWeightedConfig(raw?: ContextWeightedLoadBalancingConfig | null): ResolvedContextWeightedConfig;
|
|
22
|
+
export declare function computeEffectiveSafeWindowTokens(options: {
|
|
23
|
+
modelMaxTokens: number;
|
|
24
|
+
warnRatio: number;
|
|
25
|
+
clientCapTokens: number;
|
|
26
|
+
}): number;
|
|
27
|
+
export declare function computeContextMultiplier(options: {
|
|
28
|
+
effectiveSafeRefTokens: number;
|
|
29
|
+
effectiveSafeTokens: number;
|
|
30
|
+
cfg: ResolvedContextWeightedConfig;
|
|
31
|
+
}): number;
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Context-weighted constant table (defaults).
|
|
3
|
+
*
|
|
4
|
+
* Intended behavior:
|
|
5
|
+
* - Prefer smaller effective safe context windows early, so that larger windows remain available later.
|
|
6
|
+
* - Compensation is proportional by default (`gamma=1`), but capped by `maxMultiplier`.
|
|
7
|
+
*
|
|
8
|
+
* Notes:
|
|
9
|
+
* - `clientCapTokens` is the maximum effective context the client can consume, even if the model supports more.
|
|
10
|
+
* - The effective safe window is computed using ContextAdvisor's `warnRatio` and model "slack" above the client cap.
|
|
11
|
+
* - If a model has slack >= the reserved margin, it effectively gets the full client cap as safe window.
|
|
12
|
+
*/
|
|
13
|
+
export const DEFAULT_CONTEXT_WEIGHTED_CONFIG = {
|
|
14
|
+
enabled: false,
|
|
15
|
+
clientCapTokens: 200_000,
|
|
16
|
+
gamma: 1,
|
|
17
|
+
maxMultiplier: 2
|
|
18
|
+
};
|
|
19
|
+
export function resolveContextWeightedConfig(raw) {
|
|
20
|
+
const enabled = raw?.enabled ?? DEFAULT_CONTEXT_WEIGHTED_CONFIG.enabled;
|
|
21
|
+
const clientCapTokens = typeof raw?.clientCapTokens === 'number' && Number.isFinite(raw.clientCapTokens) && raw.clientCapTokens > 0
|
|
22
|
+
? Math.floor(raw.clientCapTokens)
|
|
23
|
+
: DEFAULT_CONTEXT_WEIGHTED_CONFIG.clientCapTokens;
|
|
24
|
+
const gamma = typeof raw?.gamma === 'number' && Number.isFinite(raw.gamma) && raw.gamma > 0
|
|
25
|
+
? raw.gamma
|
|
26
|
+
: DEFAULT_CONTEXT_WEIGHTED_CONFIG.gamma;
|
|
27
|
+
const maxMultiplier = typeof raw?.maxMultiplier === 'number' && Number.isFinite(raw.maxMultiplier) && raw.maxMultiplier >= 1
|
|
28
|
+
? raw.maxMultiplier
|
|
29
|
+
: DEFAULT_CONTEXT_WEIGHTED_CONFIG.maxMultiplier;
|
|
30
|
+
return { enabled, clientCapTokens, gamma, maxMultiplier };
|
|
31
|
+
}
|
|
32
|
+
export function computeEffectiveSafeWindowTokens(options) {
|
|
33
|
+
const modelMaxTokens = typeof options.modelMaxTokens === 'number' && Number.isFinite(options.modelMaxTokens) && options.modelMaxTokens > 0
|
|
34
|
+
? Math.floor(options.modelMaxTokens)
|
|
35
|
+
: 1;
|
|
36
|
+
const clientCapTokens = typeof options.clientCapTokens === 'number' && Number.isFinite(options.clientCapTokens) && options.clientCapTokens > 0
|
|
37
|
+
? Math.floor(options.clientCapTokens)
|
|
38
|
+
: DEFAULT_CONTEXT_WEIGHTED_CONFIG.clientCapTokens;
|
|
39
|
+
const warnRatio = typeof options.warnRatio === 'number' && Number.isFinite(options.warnRatio) && options.warnRatio > 0 && options.warnRatio < 1
|
|
40
|
+
? options.warnRatio
|
|
41
|
+
: 0.9;
|
|
42
|
+
const effectiveMax = Math.min(modelMaxTokens, clientCapTokens);
|
|
43
|
+
const reserve = Math.ceil(effectiveMax * (1 - warnRatio));
|
|
44
|
+
const slack = Math.max(0, modelMaxTokens - clientCapTokens);
|
|
45
|
+
const reserveEff = Math.max(0, reserve - slack);
|
|
46
|
+
return Math.max(1, effectiveMax - reserveEff);
|
|
47
|
+
}
|
|
48
|
+
export function computeContextMultiplier(options) {
|
|
49
|
+
const ref = Math.max(1, Math.floor(options.effectiveSafeRefTokens));
|
|
50
|
+
const cur = Math.max(1, Math.floor(options.effectiveSafeTokens));
|
|
51
|
+
const ratio = ref / cur;
|
|
52
|
+
const raw = Math.pow(Math.max(1, ratio), options.cfg.gamma);
|
|
53
|
+
return Math.min(options.cfg.maxMultiplier, raw);
|
|
54
|
+
}
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { computeContextMultiplier, computeEffectiveSafeWindowTokens, resolveContextWeightedConfig } from './context-weighted.js';
|
|
1
2
|
import { computeHealthWeight, resolveHealthWeightedConfig } from './health-weighted.js';
|
|
2
3
|
import { DEFAULT_ROUTE, ROUTE_PRIORITY, VirtualRouterError, VirtualRouterErrorCode } from './types.js';
|
|
3
4
|
export function selectProviderImpl(requestedRoute, metadata, classification, features, activeState, deps, options = {}) {
|
|
@@ -402,6 +403,9 @@ function trySelectFromTier(routeName, tier, stickyKey, estimatedTokens, features
|
|
|
402
403
|
const quotaView = deps.quotaView;
|
|
403
404
|
const now = quotaView ? Date.now() : 0;
|
|
404
405
|
const healthWeightedCfg = resolveHealthWeightedConfig(deps.loadBalancer.getPolicy().healthWeighted);
|
|
406
|
+
const contextWeightedCfg = resolveContextWeightedConfig(deps.loadBalancer.getPolicy().contextWeighted);
|
|
407
|
+
const warnRatio = deps.contextAdvisor.getConfig().warnRatio;
|
|
408
|
+
const nowForWeights = Date.now();
|
|
405
409
|
const selectFirstAvailable = (candidates) => {
|
|
406
410
|
for (const key of candidates) {
|
|
407
411
|
if (deps.healthManager.isAvailable(key)) {
|
|
@@ -410,23 +414,137 @@ function trySelectFromTier(routeName, tier, stickyKey, estimatedTokens, features
|
|
|
410
414
|
}
|
|
411
415
|
return null;
|
|
412
416
|
};
|
|
413
|
-
const selectWithQuota = (candidates) => {
|
|
417
|
+
const resolvePriorityMeta = (orderedTargets) => {
|
|
418
|
+
// Priority mode semantics (strict group priority + alias-level balancing):
|
|
419
|
+
// - Targets are interpreted as ordered (providerId, modelId) groups.
|
|
420
|
+
// - Group base priorities: 100, 90, 80, ... (step=10) by appearance order.
|
|
421
|
+
// - Within a group (different auth aliases), base scores: 100, 99, 98, ... (step=1).
|
|
422
|
+
//
|
|
423
|
+
// Group selection is strict: always use the best group until it is unavailable.
|
|
424
|
+
// Alias selection is balanced within the chosen group (RR / health-weighted / context-weighted).
|
|
425
|
+
const meta = new Map();
|
|
426
|
+
if (!Array.isArray(orderedTargets) || orderedTargets.length === 0) {
|
|
427
|
+
return meta;
|
|
428
|
+
}
|
|
429
|
+
let groupIndex = -1;
|
|
430
|
+
let aliasOffset = 0;
|
|
431
|
+
let lastGroupKey = '';
|
|
432
|
+
for (const key of orderedTargets) {
|
|
433
|
+
const providerId = extractProviderId(key) ?? '';
|
|
434
|
+
const modelId = getProviderModelId(key, deps.providerRegistry) ?? '';
|
|
435
|
+
const groupKey = `${providerId}::${modelId}`;
|
|
436
|
+
if (groupKey !== lastGroupKey) {
|
|
437
|
+
groupIndex += 1;
|
|
438
|
+
aliasOffset = 0;
|
|
439
|
+
lastGroupKey = groupKey;
|
|
440
|
+
}
|
|
441
|
+
const groupBase = 100 - groupIndex * 10;
|
|
442
|
+
const base = groupBase - aliasOffset;
|
|
443
|
+
meta.set(key, { groupId: `${providerId}.${modelId}`, groupBase, base });
|
|
444
|
+
aliasOffset += 1;
|
|
445
|
+
}
|
|
446
|
+
return meta;
|
|
447
|
+
};
|
|
448
|
+
const pickPriorityGroup = (candidates, orderedTargets, penalties) => {
|
|
449
|
+
const meta = resolvePriorityMeta(orderedTargets);
|
|
450
|
+
let bestGroupId = null;
|
|
451
|
+
let bestScore = Number.NEGATIVE_INFINITY;
|
|
452
|
+
for (const key of candidates) {
|
|
453
|
+
if (!deps.healthManager.isAvailable(key))
|
|
454
|
+
continue;
|
|
455
|
+
const m = meta.get(key);
|
|
456
|
+
if (!m)
|
|
457
|
+
continue;
|
|
458
|
+
const penalty = penalties ? Math.max(0, Math.floor(penalties[key] ?? 0)) : 0;
|
|
459
|
+
const score = m.base - penalty;
|
|
460
|
+
if (score > bestScore) {
|
|
461
|
+
bestScore = score;
|
|
462
|
+
bestGroupId = m.groupId;
|
|
463
|
+
}
|
|
464
|
+
}
|
|
465
|
+
if (!bestGroupId)
|
|
466
|
+
return null;
|
|
467
|
+
const groupCandidates = candidates.filter((key) => meta.get(key)?.groupId === bestGroupId);
|
|
468
|
+
return groupCandidates.length ? { groupId: bestGroupId, groupCandidates } : null;
|
|
469
|
+
};
|
|
470
|
+
const computeContextWeightMultipliers = (candidates) => {
|
|
471
|
+
if (!contextWeightedCfg.enabled) {
|
|
472
|
+
return null;
|
|
473
|
+
}
|
|
474
|
+
const eff = {};
|
|
475
|
+
let ref = 1;
|
|
476
|
+
for (const key of candidates) {
|
|
477
|
+
const usage = contextResult.usage?.[key];
|
|
478
|
+
const limit = usage && typeof usage.limit === 'number' && Number.isFinite(usage.limit) ? Math.floor(usage.limit) : 0;
|
|
479
|
+
const safeEff = computeEffectiveSafeWindowTokens({
|
|
480
|
+
modelMaxTokens: Math.max(1, limit),
|
|
481
|
+
warnRatio,
|
|
482
|
+
clientCapTokens: contextWeightedCfg.clientCapTokens
|
|
483
|
+
});
|
|
484
|
+
eff[key] = safeEff;
|
|
485
|
+
if (safeEff > ref) {
|
|
486
|
+
ref = safeEff;
|
|
487
|
+
}
|
|
488
|
+
}
|
|
489
|
+
return { ref, eff };
|
|
490
|
+
};
|
|
491
|
+
const selectWithQuota = (candidates, isSafePool) => {
|
|
414
492
|
if (!quotaView) {
|
|
415
493
|
if (tier.mode === 'priority') {
|
|
416
494
|
if (isRecoveryAttempt) {
|
|
417
495
|
return selectFirstAvailable(candidates);
|
|
418
496
|
}
|
|
497
|
+
const group = pickPriorityGroup(candidates, tier.targets);
|
|
498
|
+
if (!group) {
|
|
499
|
+
return null;
|
|
500
|
+
}
|
|
501
|
+
const weights = (() => {
|
|
502
|
+
if (!isSafePool)
|
|
503
|
+
return undefined;
|
|
504
|
+
const ctx = computeContextWeightMultipliers(group.groupCandidates);
|
|
505
|
+
if (!ctx)
|
|
506
|
+
return undefined;
|
|
507
|
+
const out = {};
|
|
508
|
+
for (const key of group.groupCandidates) {
|
|
509
|
+
const m = computeContextMultiplier({
|
|
510
|
+
effectiveSafeRefTokens: ctx.ref,
|
|
511
|
+
effectiveSafeTokens: ctx.eff[key] ?? 1,
|
|
512
|
+
cfg: contextWeightedCfg
|
|
513
|
+
});
|
|
514
|
+
out[key] = Math.max(1, Math.round(100 * m));
|
|
515
|
+
}
|
|
516
|
+
return out;
|
|
517
|
+
})();
|
|
419
518
|
return deps.loadBalancer.select({
|
|
420
|
-
routeName: `${routeName}:${tier.id}:priority`,
|
|
421
|
-
candidates,
|
|
519
|
+
routeName: `${routeName}:${tier.id}:priority:group:${group.groupId}`,
|
|
520
|
+
candidates: group.groupCandidates,
|
|
422
521
|
stickyKey: options.allowAliasRotation ? undefined : stickyKey,
|
|
522
|
+
weights,
|
|
423
523
|
availabilityCheck: (key) => deps.healthManager.isAvailable(key)
|
|
424
524
|
}, 'round-robin');
|
|
425
525
|
}
|
|
526
|
+
const weights = (() => {
|
|
527
|
+
if (!isSafePool || !contextWeightedCfg.enabled)
|
|
528
|
+
return undefined;
|
|
529
|
+
const ctx = computeContextWeightMultipliers(candidates);
|
|
530
|
+
if (!ctx)
|
|
531
|
+
return undefined;
|
|
532
|
+
const out = {};
|
|
533
|
+
for (const key of candidates) {
|
|
534
|
+
const m = computeContextMultiplier({
|
|
535
|
+
effectiveSafeRefTokens: ctx.ref,
|
|
536
|
+
effectiveSafeTokens: ctx.eff[key] ?? 1,
|
|
537
|
+
cfg: contextWeightedCfg
|
|
538
|
+
});
|
|
539
|
+
out[key] = Math.max(1, Math.round(100 * m));
|
|
540
|
+
}
|
|
541
|
+
return out;
|
|
542
|
+
})();
|
|
426
543
|
const selected = deps.loadBalancer.select({
|
|
427
544
|
routeName: `${routeName}:${tier.id}`,
|
|
428
545
|
candidates,
|
|
429
546
|
stickyKey: options.allowAliasRotation ? undefined : stickyKey,
|
|
547
|
+
weights,
|
|
430
548
|
availabilityCheck: (key) => deps.healthManager.isAvailable(key)
|
|
431
549
|
}, tier.mode === 'round-robin' ? 'round-robin' : undefined);
|
|
432
550
|
return selected;
|
|
@@ -508,12 +626,16 @@ function trySelectFromTier(routeName, tier, stickyKey, estimatedTokens, features
|
|
|
508
626
|
return pinned;
|
|
509
627
|
}
|
|
510
628
|
}
|
|
629
|
+
const bucketPenaltyMap = {};
|
|
630
|
+
for (const item of bucket) {
|
|
631
|
+
bucketPenaltyMap[item.key] = item.penalty;
|
|
632
|
+
}
|
|
511
633
|
const bucketWeights = {};
|
|
512
634
|
const bucketMultipliers = {};
|
|
513
635
|
for (const item of bucket) {
|
|
514
636
|
if (healthWeightedCfg.enabled) {
|
|
515
637
|
const entry = quotaView(item.key);
|
|
516
|
-
const { weight, multiplier } = computeHealthWeight(entry,
|
|
638
|
+
const { weight, multiplier } = computeHealthWeight(entry, nowForWeights, healthWeightedCfg);
|
|
517
639
|
bucketWeights[item.key] = weight;
|
|
518
640
|
bucketMultipliers[item.key] = multiplier;
|
|
519
641
|
}
|
|
@@ -523,7 +645,41 @@ function trySelectFromTier(routeName, tier, stickyKey, estimatedTokens, features
|
|
|
523
645
|
bucketMultipliers[item.key] = 1;
|
|
524
646
|
}
|
|
525
647
|
}
|
|
648
|
+
if (isSafePool && contextWeightedCfg.enabled) {
|
|
649
|
+
const ctx = computeContextWeightMultipliers(bucketCandidates);
|
|
650
|
+
if (ctx) {
|
|
651
|
+
for (const key of bucketCandidates) {
|
|
652
|
+
const m = computeContextMultiplier({
|
|
653
|
+
effectiveSafeRefTokens: ctx.ref,
|
|
654
|
+
effectiveSafeTokens: ctx.eff[key] ?? 1,
|
|
655
|
+
cfg: contextWeightedCfg
|
|
656
|
+
});
|
|
657
|
+
bucketWeights[key] = Math.max(1, Math.round((bucketWeights[key] ?? 1) * m));
|
|
658
|
+
}
|
|
659
|
+
}
|
|
660
|
+
}
|
|
526
661
|
if (tier.mode === 'priority') {
|
|
662
|
+
if (!isRecoveryAttempt) {
|
|
663
|
+
const group = pickPriorityGroup(bucketCandidates, tier.targets, bucketPenaltyMap);
|
|
664
|
+
if (!group) {
|
|
665
|
+
continue;
|
|
666
|
+
}
|
|
667
|
+
const groupWeights = {};
|
|
668
|
+
for (const key of group.groupCandidates) {
|
|
669
|
+
groupWeights[key] = bucketWeights[key] ?? 1;
|
|
670
|
+
}
|
|
671
|
+
const selected = deps.loadBalancer.select({
|
|
672
|
+
routeName: `${routeName}:${tier.id}:priority:${priority}:group:${group.groupId}`,
|
|
673
|
+
candidates: group.groupCandidates,
|
|
674
|
+
stickyKey: options.allowAliasRotation ? undefined : stickyKey,
|
|
675
|
+
weights: groupWeights,
|
|
676
|
+
availabilityCheck: (key) => deps.healthManager.isAvailable(key)
|
|
677
|
+
}, 'round-robin');
|
|
678
|
+
if (selected) {
|
|
679
|
+
return selected;
|
|
680
|
+
}
|
|
681
|
+
continue;
|
|
682
|
+
}
|
|
527
683
|
if (isRecoveryAttempt && healthWeightedCfg.enabled && healthWeightedCfg.recoverToBestOnRetry) {
|
|
528
684
|
let best = null;
|
|
529
685
|
let bestM = Number.NEGATIVE_INFINITY;
|
|
@@ -547,16 +703,7 @@ function trySelectFromTier(routeName, tier, stickyKey, estimatedTokens, features
|
|
|
547
703
|
return recovered;
|
|
548
704
|
continue;
|
|
549
705
|
}
|
|
550
|
-
|
|
551
|
-
routeName: `${routeName}:${tier.id}:priority:${priority}`,
|
|
552
|
-
candidates: bucketCandidates,
|
|
553
|
-
stickyKey: options.allowAliasRotation ? undefined : stickyKey,
|
|
554
|
-
weights: bucketWeights,
|
|
555
|
-
availabilityCheck: (key) => deps.healthManager.isAvailable(key)
|
|
556
|
-
}, 'round-robin');
|
|
557
|
-
if (selected) {
|
|
558
|
-
return selected;
|
|
559
|
-
}
|
|
706
|
+
// (unreachable) recovery handled above
|
|
560
707
|
}
|
|
561
708
|
else {
|
|
562
709
|
if (isRecoveryAttempt && healthWeightedCfg.enabled && healthWeightedCfg.recoverToBestOnRetry) {
|
|
@@ -597,7 +744,7 @@ function trySelectFromTier(routeName, tier, stickyKey, estimatedTokens, features
|
|
|
597
744
|
return null;
|
|
598
745
|
};
|
|
599
746
|
for (const candidatePool of prioritizedPools) {
|
|
600
|
-
const providerKey = selectWithQuota(candidatePool);
|
|
747
|
+
const providerKey = selectWithQuota(candidatePool, candidatePool === contextResult.safe);
|
|
601
748
|
if (providerKey) {
|
|
602
749
|
return { providerKey, poolTargets: tier.targets, tierId: tier.id };
|
|
603
750
|
}
|
|
@@ -107,6 +107,13 @@ export interface LoadBalancingPolicy {
|
|
|
107
107
|
* - Gradually recovers weights as time passes without errors
|
|
108
108
|
*/
|
|
109
109
|
healthWeighted?: HealthWeightedLoadBalancingConfig;
|
|
110
|
+
/**
|
|
111
|
+
* Context-aware weighting (best-fit under safe window):
|
|
112
|
+
* - Prefer smaller effective context windows early, to preserve larger windows for later.
|
|
113
|
+
* - Uses ContextAdvisor's warnRatio to compute an "effective safe window" per model.
|
|
114
|
+
* - Caps comparisons by client context (e.g. 200k).
|
|
115
|
+
*/
|
|
116
|
+
contextWeighted?: ContextWeightedLoadBalancingConfig;
|
|
110
117
|
}
|
|
111
118
|
export interface HealthWeightedLoadBalancingConfig {
|
|
112
119
|
/**
|
|
@@ -136,6 +143,27 @@ export interface HealthWeightedLoadBalancingConfig {
|
|
|
136
143
|
*/
|
|
137
144
|
recoverToBestOnRetry?: boolean;
|
|
138
145
|
}
|
|
146
|
+
export interface ContextWeightedLoadBalancingConfig {
|
|
147
|
+
/**
|
|
148
|
+
* When false or omitted, context-weighted logic is disabled (the resolved default is `enabled: false`).
|
|
149
|
+
* When true, context-weighted logic applies within the same pool bucket,
|
|
150
|
+
* and only for candidates that are considered "safe" by ContextAdvisor.
|
|
151
|
+
*/
|
|
152
|
+
enabled?: boolean;
|
|
153
|
+
/**
|
|
154
|
+
* Client-side maximum usable context (tokens). Models above this are capped.
|
|
155
|
+
* Example: 200000 for Codex/Claude Code style clients.
|
|
156
|
+
*/
|
|
157
|
+
clientCapTokens?: number;
|
|
158
|
+
/**
|
|
159
|
+
* Exponent for the compensation ratio. Use 1 for proportional compensation.
|
|
160
|
+
*/
|
|
161
|
+
gamma?: number;
|
|
162
|
+
/**
|
|
163
|
+
* Upper bound for the multiplier to avoid extreme skew.
|
|
164
|
+
*/
|
|
165
|
+
maxMultiplier?: number;
|
|
166
|
+
}
|
|
139
167
|
export interface ProviderHealthConfig {
|
|
140
168
|
failureThreshold: number;
|
|
141
169
|
cooldownMs: number;
|