@jsonstudio/llms 0.6.1164 → 0.6.1172

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1177,10 +1177,26 @@ function normalizeLoadBalancing(input) {
1177
1177
  : {})
1178
1178
  }
1179
1179
  : undefined;
1180
+ const contextWeightedRaw = asRecord(record.contextWeighted);
1181
+ const contextWeighted = Object.keys(contextWeightedRaw).length > 0
1182
+ ? {
1183
+ ...(typeof contextWeightedRaw.enabled === 'boolean' ? { enabled: contextWeightedRaw.enabled } : {}),
1184
+ ...(typeof contextWeightedRaw.clientCapTokens === 'number' && Number.isFinite(contextWeightedRaw.clientCapTokens)
1185
+ ? { clientCapTokens: contextWeightedRaw.clientCapTokens }
1186
+ : {}),
1187
+ ...(typeof contextWeightedRaw.gamma === 'number' && Number.isFinite(contextWeightedRaw.gamma)
1188
+ ? { gamma: contextWeightedRaw.gamma }
1189
+ : {}),
1190
+ ...(typeof contextWeightedRaw.maxMultiplier === 'number' && Number.isFinite(contextWeightedRaw.maxMultiplier)
1191
+ ? { maxMultiplier: contextWeightedRaw.maxMultiplier }
1192
+ : {})
1193
+ }
1194
+ : undefined;
1180
1195
  return {
1181
1196
  strategy,
1182
1197
  ...(Object.keys(weightsEntries).length ? { weights: weightsEntries } : {}),
1183
- ...(healthWeighted ? { healthWeighted } : {})
1198
+ ...(healthWeighted ? { healthWeighted } : {}),
1199
+ ...(contextWeighted ? { contextWeighted } : {})
1184
1200
  };
1185
1201
  }
1186
1202
  function coerceRatio(value) {
@@ -16,4 +16,8 @@ export declare class ContextAdvisor {
16
16
  private hardLimit;
17
17
  configure(config?: VirtualRouterContextRoutingConfig | null): void;
18
18
  classify(pool: string[], estimatedTokens: number, resolveProfile: (key: string) => ProviderProfile): ContextAdvisorResult;
19
+ getConfig(): {
20
+ warnRatio: number;
21
+ hardLimit: boolean;
22
+ };
19
23
  }
@@ -55,6 +55,9 @@ export class ContextAdvisor {
55
55
  allOverflow: safe.length === 0 && risky.length === 0 && overflow.length > 0
56
56
  };
57
57
  }
58
+ getConfig() { // Returns the advisor's current warnRatio and hardLimit as a new plain object.
59
+ return { warnRatio: this.warnRatio, hardLimit: this.hardLimit };
60
+ }
58
61
  }
59
62
  function clampWarnRatio(value) {
60
63
  if (!Number.isFinite(value)) {
@@ -0,0 +1,31 @@
1
+ import type { ContextWeightedLoadBalancingConfig } from './types.js';
2
+ export type ResolvedContextWeightedConfig = Required<{
3
+ enabled: boolean;
4
+ clientCapTokens: number;
5
+ gamma: number;
6
+ maxMultiplier: number;
7
+ }>;
8
+ /**
9
+ * Context-weighted constant table (defaults).
10
+ *
11
+ * Intended behavior:
12
+ * - Prefer smaller effective safe context windows early, so that larger windows remain available later.
13
+ * - Compensation is proportional by default (`gamma=1`), but capped by `maxMultiplier`.
14
+ *
15
+ * Notes:
16
+ * - `clientCapTokens` is the maximum effective context the client can consume, even if the model supports more.
17
+ * - The effective safe window is computed using ContextAdvisor's `warnRatio` and model "slack" above the client cap.
18
+ * - If a model has slack >= the reserved margin, it effectively gets the full client cap as safe window.
19
+ */
20
+ export declare const DEFAULT_CONTEXT_WEIGHTED_CONFIG: ResolvedContextWeightedConfig;
21
+ export declare function resolveContextWeightedConfig(raw?: ContextWeightedLoadBalancingConfig | null): ResolvedContextWeightedConfig;
22
+ export declare function computeEffectiveSafeWindowTokens(options: {
23
+ modelMaxTokens: number;
24
+ warnRatio: number;
25
+ clientCapTokens: number;
26
+ }): number;
27
+ export declare function computeContextMultiplier(options: {
28
+ effectiveSafeRefTokens: number;
29
+ effectiveSafeTokens: number;
30
+ cfg: ResolvedContextWeightedConfig;
31
+ }): number;
@@ -0,0 +1,54 @@
1
/**
 * Context-weighted constant table (defaults).
 *
 * Intended behavior:
 * - Prefer smaller effective safe context windows early, so that larger windows remain available later.
 * - Compensation is proportional by default (`gamma=1`), but capped by `maxMultiplier`.
 *
 * Notes:
 * - `clientCapTokens` is the maximum effective context the client can consume, even if the model supports more.
 * - The effective safe window is computed using ContextAdvisor's `warnRatio` and model "slack" above the client cap.
 * - If a model has slack >= the reserved margin, it effectively gets the full client cap as safe window.
 * - The feature is opt-in: `enabled` defaults to `false`.
 */
export const DEFAULT_CONTEXT_WEIGHTED_CONFIG = {
    enabled: false,
    clientCapTokens: 200_000,
    gamma: 1,
    maxMultiplier: 2
};
/**
 * Resolve a raw (possibly partial, null, or malformed) context-weighted config
 * into a complete config, substituting the default for every invalid field.
 *
 * Validation rules:
 * - `enabled`: must be a boolean; anything else uses the default.
 * - `clientCapTokens`: finite number >= 1, floored to an integer.
 * - `gamma`: finite number > 0.
 * - `maxMultiplier`: finite number >= 1.
 *
 * @param raw User-supplied config, or null/undefined to get the defaults.
 * @returns A config object with all four fields populated.
 */
export function resolveContextWeightedConfig(raw) {
    const defaults = DEFAULT_CONTEXT_WEIGHTED_CONFIG;
    const isFiniteNumber = (value) => typeof value === 'number' && Number.isFinite(value);
    // Only a real boolean may override the default; `??` alone would let
    // truthy garbage such as the string 'false' switch the feature on.
    const enabled = typeof raw?.enabled === 'boolean' ? raw.enabled : defaults.enabled;
    // Require >= 1 (not just > 0) so that flooring can never produce a cap of 0.
    const clientCapTokens = isFiniteNumber(raw?.clientCapTokens) && raw.clientCapTokens >= 1
        ? Math.floor(raw.clientCapTokens)
        : defaults.clientCapTokens;
    const gamma = isFiniteNumber(raw?.gamma) && raw.gamma > 0
        ? raw.gamma
        : defaults.gamma;
    const maxMultiplier = isFiniteNumber(raw?.maxMultiplier) && raw.maxMultiplier >= 1
        ? raw.maxMultiplier
        : defaults.maxMultiplier;
    return { enabled, clientCapTokens, gamma, maxMultiplier };
}
32
+ export function computeEffectiveSafeWindowTokens(options) {
33
+ const modelMaxTokens = typeof options.modelMaxTokens === 'number' && Number.isFinite(options.modelMaxTokens) && options.modelMaxTokens > 0
34
+ ? Math.floor(options.modelMaxTokens)
35
+ : 1;
36
+ const clientCapTokens = typeof options.clientCapTokens === 'number' && Number.isFinite(options.clientCapTokens) && options.clientCapTokens > 0
37
+ ? Math.floor(options.clientCapTokens)
38
+ : DEFAULT_CONTEXT_WEIGHTED_CONFIG.clientCapTokens;
39
+ const warnRatio = typeof options.warnRatio === 'number' && Number.isFinite(options.warnRatio) && options.warnRatio > 0 && options.warnRatio < 1
40
+ ? options.warnRatio
41
+ : 0.9;
42
+ const effectiveMax = Math.min(modelMaxTokens, clientCapTokens);
43
+ const reserve = Math.ceil(effectiveMax * (1 - warnRatio));
44
+ const slack = Math.max(0, modelMaxTokens - clientCapTokens);
45
+ const reserveEff = Math.max(0, reserve - slack);
46
+ return Math.max(1, effectiveMax - reserveEff);
47
+ }
48
/**
 * Compute the weight multiplier for a candidate from its effective safe window
 * relative to the reference (largest) window in the candidate set.
 *
 * The multiplier is `max(1, ref / cur) ** cfg.gamma`, capped at
 * `cfg.maxMultiplier`. Candidates with a window at or above the reference get 1;
 * smaller windows get proportionally more weight.
 *
 * @param options Object with `effectiveSafeRefTokens`, `effectiveSafeTokens`
 *   and the resolved config `cfg` (`gamma`, `maxMultiplier` are read).
 * @returns The multiplier; 1 when the inputs are not comparable (NaN).
 */
export function computeContextMultiplier(options) {
    const ref = Math.max(1, Math.floor(options.effectiveSafeRefTokens));
    const cur = Math.max(1, Math.floor(options.effectiveSafeTokens));
    const ratio = Math.max(1, ref / cur);
    const capped = Math.min(options.cfg.maxMultiplier, Math.pow(ratio, options.cfg.gamma));
    // Math.max/Math.min/Math.pow all propagate NaN; fall back to the neutral
    // multiplier so callers never derive a NaN weight from it.
    return Number.isNaN(capped) ? 1 : capped;
}
@@ -1,3 +1,4 @@
1
+ import { computeContextMultiplier, computeEffectiveSafeWindowTokens, resolveContextWeightedConfig } from './context-weighted.js';
1
2
  import { computeHealthWeight, resolveHealthWeightedConfig } from './health-weighted.js';
2
3
  import { DEFAULT_ROUTE, ROUTE_PRIORITY, VirtualRouterError, VirtualRouterErrorCode } from './types.js';
3
4
  export function selectProviderImpl(requestedRoute, metadata, classification, features, activeState, deps, options = {}) {
@@ -402,6 +403,9 @@ function trySelectFromTier(routeName, tier, stickyKey, estimatedTokens, features
402
403
  const quotaView = deps.quotaView;
403
404
  const now = quotaView ? Date.now() : 0;
404
405
  const healthWeightedCfg = resolveHealthWeightedConfig(deps.loadBalancer.getPolicy().healthWeighted);
406
+ const contextWeightedCfg = resolveContextWeightedConfig(deps.loadBalancer.getPolicy().contextWeighted);
407
+ const warnRatio = deps.contextAdvisor.getConfig().warnRatio;
408
+ const nowForWeights = Date.now();
405
409
  const selectFirstAvailable = (candidates) => {
406
410
  for (const key of candidates) {
407
411
  if (deps.healthManager.isAvailable(key)) {
@@ -410,23 +414,137 @@ function trySelectFromTier(routeName, tier, stickyKey, estimatedTokens, features
410
414
  }
411
415
  return null;
412
416
  };
413
- const selectWithQuota = (candidates) => {
417
+ const resolvePriorityMeta = (orderedTargets) => {
418
+ // Priority mode semantics (strict group priority + alias-level balancing):
419
+ // - Targets are interpreted as ordered (providerId, modelId) groups.
420
+ // - Group base priorities: 100, 90, 80, ... (step=10) by appearance order.
421
+ // - Within a group (different auth aliases), base scores: 100, 99, 98, ... (step=1).
422
+ //
423
+ // Group selection is strict: always use the best group until it is unavailable.
424
+ // Alias selection is balanced within the chosen group (RR / health-weighted / context-weighted).
425
+ const meta = new Map();
426
+ if (!Array.isArray(orderedTargets) || orderedTargets.length === 0) {
427
+ return meta;
428
+ }
429
+ let groupIndex = -1;
430
+ let aliasOffset = 0;
431
+ let lastGroupKey = '';
432
+ for (const key of orderedTargets) {
433
+ const providerId = extractProviderId(key) ?? '';
434
+ const modelId = getProviderModelId(key, deps.providerRegistry) ?? '';
435
+ const groupKey = `${providerId}::${modelId}`;
436
+ if (groupKey !== lastGroupKey) {
437
+ groupIndex += 1;
438
+ aliasOffset = 0;
439
+ lastGroupKey = groupKey;
440
+ }
441
+ const groupBase = 100 - groupIndex * 10;
442
+ const base = groupBase - aliasOffset;
443
+ meta.set(key, { groupId: `${providerId}.${modelId}`, groupBase, base });
444
+ aliasOffset += 1;
445
+ }
446
+ return meta;
447
+ };
448
+ const pickPriorityGroup = (candidates, orderedTargets, penalties) => {
449
+ const meta = resolvePriorityMeta(orderedTargets);
450
+ let bestGroupId = null;
451
+ let bestScore = Number.NEGATIVE_INFINITY;
452
+ for (const key of candidates) {
453
+ if (!deps.healthManager.isAvailable(key))
454
+ continue;
455
+ const m = meta.get(key);
456
+ if (!m)
457
+ continue;
458
+ const penalty = penalties ? Math.max(0, Math.floor(penalties[key] ?? 0)) : 0;
459
+ const score = m.base - penalty;
460
+ if (score > bestScore) {
461
+ bestScore = score;
462
+ bestGroupId = m.groupId;
463
+ }
464
+ }
465
+ if (!bestGroupId)
466
+ return null;
467
+ const groupCandidates = candidates.filter((key) => meta.get(key)?.groupId === bestGroupId);
468
+ return groupCandidates.length ? { groupId: bestGroupId, groupCandidates } : null;
469
+ };
470
+ const computeContextWeightMultipliers = (candidates) => {
471
+ if (!contextWeightedCfg.enabled) {
472
+ return null;
473
+ }
474
+ const eff = {};
475
+ let ref = 1;
476
+ for (const key of candidates) {
477
+ const usage = contextResult.usage?.[key];
478
+ const limit = usage && typeof usage.limit === 'number' && Number.isFinite(usage.limit) ? Math.floor(usage.limit) : 0;
479
+ const safeEff = computeEffectiveSafeWindowTokens({
480
+ modelMaxTokens: Math.max(1, limit),
481
+ warnRatio,
482
+ clientCapTokens: contextWeightedCfg.clientCapTokens
483
+ });
484
+ eff[key] = safeEff;
485
+ if (safeEff > ref) {
486
+ ref = safeEff;
487
+ }
488
+ }
489
+ return { ref, eff };
490
+ };
491
+ const selectWithQuota = (candidates, isSafePool) => {
414
492
  if (!quotaView) {
415
493
  if (tier.mode === 'priority') {
416
494
  if (isRecoveryAttempt) {
417
495
  return selectFirstAvailable(candidates);
418
496
  }
497
+ const group = pickPriorityGroup(candidates, tier.targets);
498
+ if (!group) {
499
+ return null;
500
+ }
501
+ const weights = (() => {
502
+ if (!isSafePool)
503
+ return undefined;
504
+ const ctx = computeContextWeightMultipliers(group.groupCandidates);
505
+ if (!ctx)
506
+ return undefined;
507
+ const out = {};
508
+ for (const key of group.groupCandidates) {
509
+ const m = computeContextMultiplier({
510
+ effectiveSafeRefTokens: ctx.ref,
511
+ effectiveSafeTokens: ctx.eff[key] ?? 1,
512
+ cfg: contextWeightedCfg
513
+ });
514
+ out[key] = Math.max(1, Math.round(100 * m));
515
+ }
516
+ return out;
517
+ })();
419
518
  return deps.loadBalancer.select({
420
- routeName: `${routeName}:${tier.id}:priority`,
421
- candidates,
519
+ routeName: `${routeName}:${tier.id}:priority:group:${group.groupId}`,
520
+ candidates: group.groupCandidates,
422
521
  stickyKey: options.allowAliasRotation ? undefined : stickyKey,
522
+ weights,
423
523
  availabilityCheck: (key) => deps.healthManager.isAvailable(key)
424
524
  }, 'round-robin');
425
525
  }
526
+ const weights = (() => {
527
+ if (!isSafePool || !contextWeightedCfg.enabled)
528
+ return undefined;
529
+ const ctx = computeContextWeightMultipliers(candidates);
530
+ if (!ctx)
531
+ return undefined;
532
+ const out = {};
533
+ for (const key of candidates) {
534
+ const m = computeContextMultiplier({
535
+ effectiveSafeRefTokens: ctx.ref,
536
+ effectiveSafeTokens: ctx.eff[key] ?? 1,
537
+ cfg: contextWeightedCfg
538
+ });
539
+ out[key] = Math.max(1, Math.round(100 * m));
540
+ }
541
+ return out;
542
+ })();
426
543
  const selected = deps.loadBalancer.select({
427
544
  routeName: `${routeName}:${tier.id}`,
428
545
  candidates,
429
546
  stickyKey: options.allowAliasRotation ? undefined : stickyKey,
547
+ weights,
430
548
  availabilityCheck: (key) => deps.healthManager.isAvailable(key)
431
549
  }, tier.mode === 'round-robin' ? 'round-robin' : undefined);
432
550
  return selected;
@@ -508,12 +626,16 @@ function trySelectFromTier(routeName, tier, stickyKey, estimatedTokens, features
508
626
  return pinned;
509
627
  }
510
628
  }
629
+ const bucketPenaltyMap = {};
630
+ for (const item of bucket) {
631
+ bucketPenaltyMap[item.key] = item.penalty;
632
+ }
511
633
  const bucketWeights = {};
512
634
  const bucketMultipliers = {};
513
635
  for (const item of bucket) {
514
636
  if (healthWeightedCfg.enabled) {
515
637
  const entry = quotaView(item.key);
516
- const { weight, multiplier } = computeHealthWeight(entry, now, healthWeightedCfg);
638
+ const { weight, multiplier } = computeHealthWeight(entry, nowForWeights, healthWeightedCfg);
517
639
  bucketWeights[item.key] = weight;
518
640
  bucketMultipliers[item.key] = multiplier;
519
641
  }
@@ -523,7 +645,41 @@ function trySelectFromTier(routeName, tier, stickyKey, estimatedTokens, features
523
645
  bucketMultipliers[item.key] = 1;
524
646
  }
525
647
  }
648
+ if (isSafePool && contextWeightedCfg.enabled) {
649
+ const ctx = computeContextWeightMultipliers(bucketCandidates);
650
+ if (ctx) {
651
+ for (const key of bucketCandidates) {
652
+ const m = computeContextMultiplier({
653
+ effectiveSafeRefTokens: ctx.ref,
654
+ effectiveSafeTokens: ctx.eff[key] ?? 1,
655
+ cfg: contextWeightedCfg
656
+ });
657
+ bucketWeights[key] = Math.max(1, Math.round((bucketWeights[key] ?? 1) * m));
658
+ }
659
+ }
660
+ }
526
661
  if (tier.mode === 'priority') {
662
+ if (!isRecoveryAttempt) {
663
+ const group = pickPriorityGroup(bucketCandidates, tier.targets, bucketPenaltyMap);
664
+ if (!group) {
665
+ continue;
666
+ }
667
+ const groupWeights = {};
668
+ for (const key of group.groupCandidates) {
669
+ groupWeights[key] = bucketWeights[key] ?? 1;
670
+ }
671
+ const selected = deps.loadBalancer.select({
672
+ routeName: `${routeName}:${tier.id}:priority:${priority}:group:${group.groupId}`,
673
+ candidates: group.groupCandidates,
674
+ stickyKey: options.allowAliasRotation ? undefined : stickyKey,
675
+ weights: groupWeights,
676
+ availabilityCheck: (key) => deps.healthManager.isAvailable(key)
677
+ }, 'round-robin');
678
+ if (selected) {
679
+ return selected;
680
+ }
681
+ continue;
682
+ }
527
683
  if (isRecoveryAttempt && healthWeightedCfg.enabled && healthWeightedCfg.recoverToBestOnRetry) {
528
684
  let best = null;
529
685
  let bestM = Number.NEGATIVE_INFINITY;
@@ -547,16 +703,7 @@ function trySelectFromTier(routeName, tier, stickyKey, estimatedTokens, features
547
703
  return recovered;
548
704
  continue;
549
705
  }
550
- const selected = deps.loadBalancer.select({
551
- routeName: `${routeName}:${tier.id}:priority:${priority}`,
552
- candidates: bucketCandidates,
553
- stickyKey: options.allowAliasRotation ? undefined : stickyKey,
554
- weights: bucketWeights,
555
- availabilityCheck: (key) => deps.healthManager.isAvailable(key)
556
- }, 'round-robin');
557
- if (selected) {
558
- return selected;
559
- }
706
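+ // NOTE(review): this point is reachable on a recovery attempt when health-weighted recoverToBestOnRetry is not active; nothing is selected from this bucket here. Confirm whether a fallback select is needed.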
560
707
  }
561
708
  else {
562
709
  if (isRecoveryAttempt && healthWeightedCfg.enabled && healthWeightedCfg.recoverToBestOnRetry) {
@@ -597,7 +744,7 @@ function trySelectFromTier(routeName, tier, stickyKey, estimatedTokens, features
597
744
  return null;
598
745
  };
599
746
  for (const candidatePool of prioritizedPools) {
600
- const providerKey = selectWithQuota(candidatePool);
747
+ const providerKey = selectWithQuota(candidatePool, candidatePool === contextResult.safe);
601
748
  if (providerKey) {
602
749
  return { providerKey, poolTargets: tier.targets, tierId: tier.id };
603
750
  }
@@ -107,6 +107,13 @@ export interface LoadBalancingPolicy {
107
107
  * - Gradually recovers weights as time passes without errors
108
108
  */
109
109
  healthWeighted?: HealthWeightedLoadBalancingConfig;
110
+ /**
111
+ * Context-aware weighting (best-fit under safe window):
112
+ * - Prefer smaller effective context windows early, to preserve larger windows for later.
113
+ * - Uses ContextAdvisor's warnRatio to compute an "effective safe window" per model.
114
+ * - Caps comparisons by client context (e.g. 200k).
115
+ */
116
+ contextWeighted?: ContextWeightedLoadBalancingConfig;
110
117
  }
111
118
  export interface HealthWeightedLoadBalancingConfig {
112
119
  /**
@@ -136,6 +143,27 @@ export interface HealthWeightedLoadBalancingConfig {
136
143
  */
137
144
  recoverToBestOnRetry?: boolean;
138
145
  }
146
+ export interface ContextWeightedLoadBalancingConfig {
147
+ /**
148
+ * When false or omitted, context-weighted logic is disabled (the resolved default is `enabled: false`).
149
+ * When true, context-weighted logic applies within the same pool bucket,
150
+ * and only for candidates that are considered "safe" by ContextAdvisor.
151
+ */
152
+ enabled?: boolean;
153
+ /**
154
+ * Client-side maximum usable context (tokens). Models above this are capped.
155
+ * Example: 200000 for Codex/Claude Code style clients.
+ * Fractional values are floored; non-finite or non-positive values fall back to the default (200000).
156
+ */
157
+ clientCapTokens?: number;
158
+ /**
159
+ * Exponent for the compensation ratio. Use 1 for proportional compensation.
+ * Must be a finite number greater than 0; otherwise the default (1) is used.
160
+ */
161
+ gamma?: number;
162
+ /**
163
+ * Upper bound for the multiplier to avoid extreme skew.
+ * Must be a finite number of at least 1; otherwise the default (2) is used.
164
+ */
165
+ maxMultiplier?: number;
166
+ }
139
167
  export interface ProviderHealthConfig {
140
168
  failureThreshold: number;
141
169
  cooldownMs: number;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@jsonstudio/llms",
3
- "version": "0.6.1164",
3
+ "version": "0.6.1172",
4
4
  "type": "module",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.js",