ai-lcr 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -5
- package/README.zh-CN.md +1 -5
- package/dist/index.cjs +63 -14
- package/dist/index.d.cts +40 -5
- package/dist/index.d.ts +40 -5
- package/dist/index.js +63 -14
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
</p>
|
|
16
16
|
|
|
17
17
|
<p align="center">
|
|
18
|
-
<img src="assets/ai-lcr-hero.svg" alt="ai-lcr
|
|
18
|
+
<img src="assets/ai-lcr-hero.svg" alt="ai-lcr keeps a cheapest-first list of providers per model — serves the cheapest (saving ~40%), fails over to the next on error, and snaps back to the cheapest after ~60s" width="720">
|
|
19
19
|
</p>
|
|
20
20
|
|
|
21
21
|
The same model costs different amounts on different providers — and no single provider is cheapest for everything. `ai-lcr` keeps a cheapest-first list per model, routes to the cheapest healthy one (⭐ below), and falls through on failure — the way phone carriers have done [Least Cost Routing](https://en.wikipedia.org/wiki/Least-cost_routing) for decades.
|
|
@@ -144,10 +144,6 @@ DeepInfra carries open weights only — no first-party Claude / GPT / Gemini. Fo
|
|
|
144
144
|
2. **Fall through on failure.** On a retryable error — rate limit, 5xx, timeout, or a **billing cap** (402 / out-of-credit / quota) — it advances to the next provider, streaming-safe. A caller's own bad request (e.g. 400, 422) passes through immediately.
|
|
145
145
|
3. **Recover.** After an idle window (`resetIntervalMs`, default 60s) it snaps back to the cheapest provider.
|
|
146
146
|
|
|
147
|
-
<p align="center">
|
|
148
|
-
<img src="assets/ai-lcr-routing.svg" alt="routing diagram: cheapest first, fallback on failure, recover after idle" width="820">
|
|
149
|
-
</p>
|
|
150
|
-
|
|
151
147
|
## See what happened (`onCall`)
|
|
152
148
|
|
|
153
149
|
`onError`/`onCost` fire separately and uncorrelated, so a failover is hard to read after the fact. `onCall` gives you **one record per request** — the full chain, the winner, the reason for each failed hop, latency, and cost — and `formatCallRecord` turns it into a one-liner you can scan:
|
package/README.zh-CN.md
CHANGED
|
@@ -15,7 +15,7 @@
|
|
|
15
15
|
</p>
|
|
16
16
|
|
|
17
17
|
<p align="center">
|
|
18
|
-
<img src="assets/ai-lcr-hero.svg" alt="ai-lcr
|
|
18
|
+
<img src="assets/ai-lcr-hero.svg" alt="ai-lcr 为每个模型维护一份「最便宜优先」的 provider 列表——默认走最便宜的(省约 40%),出错时切到下一个,约 60 秒后自动切回最便宜" width="720">
|
|
19
19
|
</p>
|
|
20
20
|
|
|
21
21
|
同一个模型在不同 provider 上的价格不同——而且没有任何单一 provider 在所有模型上都最便宜。`ai-lcr` 为每个模型维护一份「最便宜优先」的列表,路由到其中最便宜且健康的 provider(下表中的 ⭐),失败时向下穿透——这正是电话运营商几十年来一直在做的 [最低成本路由(Least Cost Routing)](https://en.wikipedia.org/wiki/Least-cost_routing)。
|
|
@@ -144,10 +144,6 @@ DeepInfra 只承载开源权重——没有第一方 Claude / GPT / Gemini。那
|
|
|
144
144
|
2. **失败时向下穿透。** 遇到可重试的错误(限流、5xx、超时,或**账单上限**:402 / 余额不足 / 配额耗尽)时,前进到下一个 provider,且对流式安全。调用方自身的错误请求(例如 400、422)会直接透传,不做重试。
|
|
145
145
|
3. **恢复。** 在一段空闲窗口(`resetIntervalMs`,默认 60s)之后,自动回到最便宜的 provider。
|
|
146
146
|
|
|
147
|
-
<p align="center">
|
|
148
|
-
<img src="assets/ai-lcr-routing.svg" alt="路由示意图:最便宜优先、失败时 fallback、空闲后恢复" width="820">
|
|
149
|
-
</p>
|
|
150
|
-
|
|
151
147
|
## 支持的 provider
|
|
152
148
|
|
|
153
149
|
任何 OpenAI 兼容的 endpoint 都可用——任何 AI SDK 的 provider 包也都可用,包括模型厂商自己的官方 API。
|
package/dist/index.cjs
CHANGED
|
@@ -41,6 +41,12 @@ __export(index_exports, {
|
|
|
41
41
|
module.exports = __toCommonJS(index_exports);
|
|
42
42
|
|
|
43
43
|
// src/fallback.ts
|
|
44
|
+
var EmptyCompletionError = class extends Error {
|
|
45
|
+
constructor(provider) {
|
|
46
|
+
super(`ai-lcr: provider "${provider}" returned an empty completion (0 output tokens, no content)`);
|
|
47
|
+
this.name = "EmptyCompletionError";
|
|
48
|
+
}
|
|
49
|
+
};
|
|
44
50
|
var RETRYABLE_STATUS = /* @__PURE__ */ new Set([401, 402, 403, 408, 409, 413, 429, 498, 500]);
|
|
45
51
|
var RETRYABLE_PATTERNS = [
|
|
46
52
|
"overloaded",
|
|
@@ -153,6 +159,7 @@ function isRetryableError(error) {
|
|
|
153
159
|
return RETRYABLE_PATTERNS.some((p) => text.includes(p));
|
|
154
160
|
}
|
|
155
161
|
function classifyError(error) {
|
|
162
|
+
if (error instanceof EmptyCompletionError) return "empty_completion";
|
|
156
163
|
const e = error;
|
|
157
164
|
const status = e?.statusCode ?? e?.status;
|
|
158
165
|
if (typeof status === "number") return String(status);
|
|
@@ -175,6 +182,7 @@ var BILLING_PATTERNS = [
|
|
|
175
182
|
"\u6263\u6B3E"
|
|
176
183
|
];
|
|
177
184
|
function classifyErrorKind(error) {
|
|
185
|
+
if (error instanceof EmptyCompletionError) return "empty";
|
|
178
186
|
const e = error;
|
|
179
187
|
const status = e?.statusCode ?? e?.status;
|
|
180
188
|
const { text } = errorSignals(error);
|
|
@@ -194,10 +202,27 @@ function costForUsage(cost, inputTokens, outputTokens, cacheReadTokens) {
|
|
|
194
202
|
const cachedRate = cost.cacheRead ?? cost.input;
|
|
195
203
|
return fullInput / 1e6 * cost.input + cached / 1e6 * cachedRate + outputTokens / 1e6 * cost.output;
|
|
196
204
|
}
|
|
205
|
+
function cacheSavingForUsage(cost, inputTokens, cacheReadTokens) {
|
|
206
|
+
if (cost.cacheRead === void 0) return 0;
|
|
207
|
+
const cached = Math.min(Math.max(cacheReadTokens, 0), inputTokens);
|
|
208
|
+
return cached / 1e6 * (cost.input - cost.cacheRead);
|
|
209
|
+
}
|
|
197
210
|
function requestIdFrom(options) {
|
|
198
211
|
const raw = options.providerOptions?.lcr?.requestId;
|
|
199
212
|
return typeof raw === "string" && raw.length > 0 ? raw : void 0;
|
|
200
213
|
}
|
|
214
|
+
var CONTENT_PART_TYPES = /* @__PURE__ */ new Set([
|
|
215
|
+
"text-delta",
|
|
216
|
+
"reasoning-delta",
|
|
217
|
+
"tool-call",
|
|
218
|
+
"tool-input-start",
|
|
219
|
+
"tool-input-delta",
|
|
220
|
+
"tool-input-end",
|
|
221
|
+
"file",
|
|
222
|
+
"source",
|
|
223
|
+
"tool-result",
|
|
224
|
+
"raw"
|
|
225
|
+
]);
|
|
201
226
|
var LcrFallbackModel = class {
|
|
202
227
|
constructor(opts) {
|
|
203
228
|
this.opts = opts;
|
|
@@ -298,19 +323,22 @@ var LcrFallbackModel = class {
|
|
|
298
323
|
});
|
|
299
324
|
}
|
|
300
325
|
/**
|
|
301
|
-
* Baseline = what this same usage would have cost on the
|
|
302
|
-
*
|
|
303
|
-
*
|
|
304
|
-
* the
|
|
326
|
+
* Baseline = what this same usage would have cost on the always-on fallback:
|
|
327
|
+
* the LAST priced leg of the chain (by convention the list-price provider you'd
|
|
328
|
+
* use without routing — e.g. OpenRouter, always last). The winner's saving is
|
|
329
|
+
* `baselineUsd - costUsd`. We take the last priced leg, NOT the most expensive
|
|
330
|
+
* one: prompt caching can make a sticker-cheaper provider (no `cacheRead`) cost
|
|
331
|
+
* MORE on a cache-heavy call, and a max-of-chain baseline would then fabricate a
|
|
332
|
+
* "saving" even on calls the fallback itself served. Undefined when no provider
|
|
333
|
+
* in the chain carries a price (nothing to compare against).
|
|
305
334
|
*/
|
|
306
335
|
baselineUsd(inputTokens, outputTokens, cacheReadTokens) {
|
|
307
|
-
let max;
|
|
336
|
+
let baseline;
|
|
308
337
|
for (const p of this.opts.providers) {
|
|
309
338
|
if (!p.cost) continue;
|
|
310
|
-
const c = costForUsage(p.cost, inputTokens, outputTokens, cacheReadTokens);
|
|
311
|
-
if (max === void 0 || c > max) max = c;
|
|
339
|
+
baseline = costForUsage(p.cost, inputTokens, outputTokens, cacheReadTokens);
|
|
312
340
|
}
|
|
313
|
-
return max;
|
|
341
|
+
return baseline;
|
|
314
342
|
}
|
|
315
343
|
/** Winner settled: record the attempt, fire `onCost` (compat) + `onCall`. */
|
|
316
344
|
finalizeOk(ctx, provider, attemptStart, usage, ttftMs) {
|
|
@@ -319,7 +347,9 @@ var LcrFallbackModel = class {
|
|
|
319
347
|
const outputTokens = usage?.outputTokens?.total ?? 0;
|
|
320
348
|
const cacheReadTokens = usage?.inputTokens?.cacheRead ?? 0;
|
|
321
349
|
const costUsd = provider.cost ? costForUsage(provider.cost, inputTokens, outputTokens, cacheReadTokens) : 0;
|
|
350
|
+
const cachedSavingUsd = provider.cost ? cacheSavingForUsage(provider.cost, inputTokens, cacheReadTokens) : 0;
|
|
322
351
|
const usageMissing = inputTokens === 0 && outputTokens === 0;
|
|
352
|
+
const emptyCompletion = inputTokens > 0 && outputTokens === 0;
|
|
323
353
|
this.emitCost({
|
|
324
354
|
model: this.opts.modelName,
|
|
325
355
|
provider: provider.label,
|
|
@@ -341,8 +371,10 @@ var LcrFallbackModel = class {
|
|
|
341
371
|
...cacheReadTokens > 0 ? { cachedInputTokens: cacheReadTokens } : {},
|
|
342
372
|
costUsd,
|
|
343
373
|
baselineUsd: this.baselineUsd(inputTokens, outputTokens, cacheReadTokens),
|
|
374
|
+
...cachedSavingUsd > 0 ? { cachedSavingUsd } : {},
|
|
344
375
|
...ctx.requestId ? { requestId: ctx.requestId } : {},
|
|
345
|
-
...usageMissing ? { usageMissing: true } : {}
|
|
376
|
+
...usageMissing ? { usageMissing: true } : {},
|
|
377
|
+
...emptyCompletion ? { emptyCompletion: true } : {}
|
|
346
378
|
});
|
|
347
379
|
}
|
|
348
380
|
/** Every provider failed: fire `onCall` with no winner. */
|
|
@@ -373,6 +405,15 @@ var LcrFallbackModel = class {
|
|
|
373
405
|
const attemptStart = Date.now();
|
|
374
406
|
try {
|
|
375
407
|
const result = await provider.model.doGenerate(options);
|
|
408
|
+
const out = result.usage?.outputTokens?.total ?? 0;
|
|
409
|
+
const inp = result.usage?.inputTokens?.total ?? 0;
|
|
410
|
+
if (inp > 0 && out === 0 && tried < n - 1) {
|
|
411
|
+
const emptyErr = new EmptyCompletionError(provider.label);
|
|
412
|
+
lastError = emptyErr;
|
|
413
|
+
this.emitError(emptyErr, provider.label);
|
|
414
|
+
this.recordFail(ctx, provider, attemptStart, emptyErr);
|
|
415
|
+
continue;
|
|
416
|
+
}
|
|
376
417
|
this.settleSticky(idx);
|
|
377
418
|
this.finalizeOk(ctx, provider, attemptStart, result.usage);
|
|
378
419
|
return result;
|
|
@@ -434,7 +475,7 @@ var LcrFallbackModel = class {
|
|
|
434
475
|
const servingIdx = idx;
|
|
435
476
|
const triedBeforeServing = tried;
|
|
436
477
|
let usage;
|
|
437
|
-
let
|
|
478
|
+
let contentStreamed = false;
|
|
438
479
|
let ttftMs;
|
|
439
480
|
const stream = new ReadableStream({
|
|
440
481
|
async start(controller) {
|
|
@@ -443,17 +484,24 @@ var LcrFallbackModel = class {
|
|
|
443
484
|
reader = result.stream.getReader();
|
|
444
485
|
for (; ; ) {
|
|
445
486
|
const { done, value } = await reader.read();
|
|
446
|
-
if (!
|
|
487
|
+
if (!contentStreamed && value && typeof value === "object" && "error" in value) {
|
|
447
488
|
const err = value.error;
|
|
448
489
|
if (self.shouldRetry(err)) throw err;
|
|
449
490
|
}
|
|
450
491
|
if (done) break;
|
|
451
|
-
if (value.type === "finish") usage = value.usage;
|
|
492
|
+
if (value.type === "finish") {
|
|
493
|
+
usage = value.usage;
|
|
494
|
+
const out = value.usage?.outputTokens?.total ?? 0;
|
|
495
|
+
const inp = value.usage?.inputTokens?.total ?? 0;
|
|
496
|
+
if (inp > 0 && out === 0 && !contentStreamed && triedBeforeServing + 1 < n) {
|
|
497
|
+
throw new EmptyCompletionError(servingProvider.label);
|
|
498
|
+
}
|
|
499
|
+
}
|
|
452
500
|
if (ttftMs === void 0 && (value.type === "text-delta" || value.type === "reasoning-delta")) {
|
|
453
501
|
ttftMs = Date.now() - servingAttemptStart;
|
|
454
502
|
}
|
|
455
503
|
controller.enqueue(value);
|
|
456
|
-
if (value.type
|
|
504
|
+
if (CONTENT_PART_TYPES.has(value.type)) contentStreamed = true;
|
|
457
505
|
}
|
|
458
506
|
self.settleSticky(servingIdx);
|
|
459
507
|
self.finalizeOk(ctx, servingProvider, servingAttemptStart, usage, ttftMs);
|
|
@@ -461,7 +509,7 @@ var LcrFallbackModel = class {
|
|
|
461
509
|
} catch (error) {
|
|
462
510
|
self.emitError(error, servingProvider.label);
|
|
463
511
|
self.recordFail(ctx, servingProvider, servingAttemptStart, error);
|
|
464
|
-
if (!
|
|
512
|
+
if (!contentStreamed) {
|
|
465
513
|
const nextTried = triedBeforeServing + 1;
|
|
466
514
|
if (nextTried >= n) {
|
|
467
515
|
self.finalizeFail(ctx);
|
|
@@ -524,6 +572,7 @@ function formatCallRecord(record, opts = {}) {
|
|
|
524
572
|
line += ` (saved $${(record.baselineUsd - record.costUsd).toFixed(4)})`;
|
|
525
573
|
}
|
|
526
574
|
if (record.usageMissing) line += ` \u26A0no-usage`;
|
|
575
|
+
if (record.emptyCompletion) line += ` \u26A0empty`;
|
|
527
576
|
const failed = record.attempts.filter((a) => !a.ok);
|
|
528
577
|
if (failed.length > 0) {
|
|
529
578
|
const reasons = failed.map((a) => `${a.provider} ${a.errorClass ?? "error"}`).join(", ");
|
package/dist/index.d.cts
CHANGED
|
@@ -49,8 +49,15 @@ interface CostEvent {
|
|
|
49
49
|
* - "auth": 401 / 403 — a misconfigured or revoked key.
|
|
50
50
|
* - "billing": 402 / out-of-credit / quota — account needs topping up.
|
|
51
51
|
* - "client": a non-retryable caller error (e.g. 400 bad request).
|
|
52
|
+
* - "empty": provider returned a clean 200 but generated nothing
|
|
53
|
+
* (zero output tokens, no content) — a *content*-integrity
|
|
54
|
+
* failure, not a transport one. The provider looks healthy to
|
|
55
|
+
* every status/network check yet hands the user a blank. We
|
|
56
|
+
* fail over on it like a transient error, but tag it separately
|
|
57
|
+
* so a run of `"empty"` attempts (a quietly degraded model)
|
|
58
|
+
* doesn't hide inside the transient noise.
|
|
52
59
|
*/
|
|
53
|
-
type ErrorKind = "transient" | "auth" | "billing" | "client";
|
|
60
|
+
type ErrorKind = "transient" | "auth" | "billing" | "client" | "empty";
|
|
54
61
|
/** One provider attempt within a single request. */
|
|
55
62
|
interface RouteAttempt {
|
|
56
63
|
/** Provider label that was tried (e.g. "tokenmart"). */
|
|
@@ -109,12 +116,25 @@ interface CallRecord {
|
|
|
109
116
|
/** Computed from the winner's `cost`; 0 if no price was given or the call failed. */
|
|
110
117
|
costUsd: number;
|
|
111
118
|
/**
|
|
112
|
-
* What
|
|
113
|
-
*
|
|
114
|
-
*
|
|
115
|
-
* provider
|
|
119
|
+
* What this same usage would have cost on the savings baseline, so
|
|
120
|
+
* `baselineUsd - costUsd` is what routing actually saved. Text router: the
|
|
121
|
+
* always-on fallback leg — the LAST priced provider in the chain, i.e. the
|
|
122
|
+
* list-price provider you'd fall back to without routing (e.g. OpenRouter).
|
|
123
|
+
* Media router: the model-maker's official direct price. NOT the most
|
|
124
|
+
* expensive leg of the chain: prompt caching can make a sticker-cheaper
|
|
125
|
+
* provider cost more on a cache-heavy call, and a max-of-chain baseline would
|
|
126
|
+
* fabricate a "saving" on calls the fallback itself served. Undefined only
|
|
127
|
+
* when no provider was priced.
|
|
116
128
|
*/
|
|
117
129
|
baselineUsd?: number;
|
|
130
|
+
/**
|
|
131
|
+
* The slice of `costUsd` that prompt-cache reads saved versus paying the full
|
|
132
|
+
* input rate for those same tokens (`cachedTokens × (input − cacheRead)`).
|
|
133
|
+
* Present only when > 0. This is the serving provider's own caching benefit —
|
|
134
|
+
* it happens with or without routing — so it is NOT a routing saving and must
|
|
135
|
+
* be surfaced separately, never folded into `baselineUsd - costUsd`.
|
|
136
|
+
*/
|
|
137
|
+
cachedSavingUsd?: number;
|
|
118
138
|
/**
|
|
119
139
|
* Caller-supplied correlation id, read from `providerOptions.lcr.requestId`
|
|
120
140
|
* on the call. Multi-step tool loops emit one record per `doStream`/
|
|
@@ -129,6 +149,21 @@ interface CallRecord {
|
|
|
129
149
|
* other signal. Treat a flagged record as "cost unknown", not "free".
|
|
130
150
|
*/
|
|
131
151
|
usageMissing?: boolean;
|
|
152
|
+
/**
|
|
153
|
+
* True when the winner served a clean, error-free response that nonetheless
|
|
154
|
+
* generated **nothing**: zero output tokens with a non-empty prompt (and, for
|
|
155
|
+
* streams, not one content part). The user asked and got a blank. Distinct
|
|
156
|
+
* from {@link usageMissing} (which is input *and* output both zero — usage not
|
|
157
|
+
* reported); here the prompt was billed but the model produced no output.
|
|
158
|
+
*
|
|
159
|
+
* Set only when this empty response is what the caller actually received —
|
|
160
|
+
* i.e. the last provider left to try came back empty, so failover couldn't
|
|
161
|
+
* rescue it. (When an earlier provider returns empty but a later one produces
|
|
162
|
+
* content, that earlier attempt is recorded as a failed `empty_completion` hop
|
|
163
|
+
* and this flag stays unset, because the winner did produce output.) Alert on
|
|
164
|
+
* it: a provider that quietly returns blanks passes every health check.
|
|
165
|
+
*/
|
|
166
|
+
emptyCompletion?: boolean;
|
|
132
167
|
}
|
|
133
168
|
/**
|
|
134
169
|
* Normalize an error into a short, log-friendly class for {@link CallRecord}.
|
package/dist/index.d.ts
CHANGED
|
@@ -49,8 +49,15 @@ interface CostEvent {
|
|
|
49
49
|
* - "auth": 401 / 403 — a misconfigured or revoked key.
|
|
50
50
|
* - "billing": 402 / out-of-credit / quota — account needs topping up.
|
|
51
51
|
* - "client": a non-retryable caller error (e.g. 400 bad request).
|
|
52
|
+
* - "empty": provider returned a clean 200 but generated nothing
|
|
53
|
+
* (zero output tokens, no content) — a *content*-integrity
|
|
54
|
+
* failure, not a transport one. The provider looks healthy to
|
|
55
|
+
* every status/network check yet hands the user a blank. We
|
|
56
|
+
* fail over on it like a transient error, but tag it separately
|
|
57
|
+
* so a run of `"empty"` attempts (a quietly degraded model)
|
|
58
|
+
* doesn't hide inside the transient noise.
|
|
52
59
|
*/
|
|
53
|
-
type ErrorKind = "transient" | "auth" | "billing" | "client";
|
|
60
|
+
type ErrorKind = "transient" | "auth" | "billing" | "client" | "empty";
|
|
54
61
|
/** One provider attempt within a single request. */
|
|
55
62
|
interface RouteAttempt {
|
|
56
63
|
/** Provider label that was tried (e.g. "tokenmart"). */
|
|
@@ -109,12 +116,25 @@ interface CallRecord {
|
|
|
109
116
|
/** Computed from the winner's `cost`; 0 if no price was given or the call failed. */
|
|
110
117
|
costUsd: number;
|
|
111
118
|
/**
|
|
112
|
-
* What
|
|
113
|
-
*
|
|
114
|
-
*
|
|
115
|
-
* provider
|
|
119
|
+
* What this same usage would have cost on the savings baseline, so
|
|
120
|
+
* `baselineUsd - costUsd` is what routing actually saved. Text router: the
|
|
121
|
+
* always-on fallback leg — the LAST priced provider in the chain, i.e. the
|
|
122
|
+
* list-price provider you'd fall back to without routing (e.g. OpenRouter).
|
|
123
|
+
* Media router: the model-maker's official direct price. NOT the most
|
|
124
|
+
* expensive leg of the chain: prompt caching can make a sticker-cheaper
|
|
125
|
+
* provider cost more on a cache-heavy call, and a max-of-chain baseline would
|
|
126
|
+
* fabricate a "saving" on calls the fallback itself served. Undefined only
|
|
127
|
+
* when no provider was priced.
|
|
116
128
|
*/
|
|
117
129
|
baselineUsd?: number;
|
|
130
|
+
/**
|
|
131
|
+
* The slice of `costUsd` that prompt-cache reads saved versus paying the full
|
|
132
|
+
* input rate for those same tokens (`cachedTokens × (input − cacheRead)`).
|
|
133
|
+
* Present only when > 0. This is the serving provider's own caching benefit —
|
|
134
|
+
* it happens with or without routing — so it is NOT a routing saving and must
|
|
135
|
+
* be surfaced separately, never folded into `baselineUsd - costUsd`.
|
|
136
|
+
*/
|
|
137
|
+
cachedSavingUsd?: number;
|
|
118
138
|
/**
|
|
119
139
|
* Caller-supplied correlation id, read from `providerOptions.lcr.requestId`
|
|
120
140
|
* on the call. Multi-step tool loops emit one record per `doStream`/
|
|
@@ -129,6 +149,21 @@ interface CallRecord {
|
|
|
129
149
|
* other signal. Treat a flagged record as "cost unknown", not "free".
|
|
130
150
|
*/
|
|
131
151
|
usageMissing?: boolean;
|
|
152
|
+
/**
|
|
153
|
+
* True when the winner served a clean, error-free response that nonetheless
|
|
154
|
+
* generated **nothing**: zero output tokens with a non-empty prompt (and, for
|
|
155
|
+
* streams, not one content part). The user asked and got a blank. Distinct
|
|
156
|
+
* from {@link usageMissing} (which is input *and* output both zero — usage not
|
|
157
|
+
* reported); here the prompt was billed but the model produced no output.
|
|
158
|
+
*
|
|
159
|
+
* Set only when this empty response is what the caller actually received —
|
|
160
|
+
* i.e. the last provider left to try came back empty, so failover couldn't
|
|
161
|
+
* rescue it. (When an earlier provider returns empty but a later one produces
|
|
162
|
+
* content, that earlier attempt is recorded as a failed `empty_completion` hop
|
|
163
|
+
* and this flag stays unset, because the winner did produce output.) Alert on
|
|
164
|
+
* it: a provider that quietly returns blanks passes every health check.
|
|
165
|
+
*/
|
|
166
|
+
emptyCompletion?: boolean;
|
|
132
167
|
}
|
|
133
168
|
/**
|
|
134
169
|
* Normalize an error into a short, log-friendly class for {@link CallRecord}.
|
package/dist/index.js
CHANGED
|
@@ -1,4 +1,10 @@
|
|
|
1
1
|
// src/fallback.ts
|
|
2
|
+
var EmptyCompletionError = class extends Error {
|
|
3
|
+
constructor(provider) {
|
|
4
|
+
super(`ai-lcr: provider "${provider}" returned an empty completion (0 output tokens, no content)`);
|
|
5
|
+
this.name = "EmptyCompletionError";
|
|
6
|
+
}
|
|
7
|
+
};
|
|
2
8
|
var RETRYABLE_STATUS = /* @__PURE__ */ new Set([401, 402, 403, 408, 409, 413, 429, 498, 500]);
|
|
3
9
|
var RETRYABLE_PATTERNS = [
|
|
4
10
|
"overloaded",
|
|
@@ -111,6 +117,7 @@ function isRetryableError(error) {
|
|
|
111
117
|
return RETRYABLE_PATTERNS.some((p) => text.includes(p));
|
|
112
118
|
}
|
|
113
119
|
function classifyError(error) {
|
|
120
|
+
if (error instanceof EmptyCompletionError) return "empty_completion";
|
|
114
121
|
const e = error;
|
|
115
122
|
const status = e?.statusCode ?? e?.status;
|
|
116
123
|
if (typeof status === "number") return String(status);
|
|
@@ -133,6 +140,7 @@ var BILLING_PATTERNS = [
|
|
|
133
140
|
"\u6263\u6B3E"
|
|
134
141
|
];
|
|
135
142
|
function classifyErrorKind(error) {
|
|
143
|
+
if (error instanceof EmptyCompletionError) return "empty";
|
|
136
144
|
const e = error;
|
|
137
145
|
const status = e?.statusCode ?? e?.status;
|
|
138
146
|
const { text } = errorSignals(error);
|
|
@@ -152,10 +160,27 @@ function costForUsage(cost, inputTokens, outputTokens, cacheReadTokens) {
|
|
|
152
160
|
const cachedRate = cost.cacheRead ?? cost.input;
|
|
153
161
|
return fullInput / 1e6 * cost.input + cached / 1e6 * cachedRate + outputTokens / 1e6 * cost.output;
|
|
154
162
|
}
|
|
163
|
+
function cacheSavingForUsage(cost, inputTokens, cacheReadTokens) {
|
|
164
|
+
if (cost.cacheRead === void 0) return 0;
|
|
165
|
+
const cached = Math.min(Math.max(cacheReadTokens, 0), inputTokens);
|
|
166
|
+
return cached / 1e6 * (cost.input - cost.cacheRead);
|
|
167
|
+
}
|
|
155
168
|
function requestIdFrom(options) {
|
|
156
169
|
const raw = options.providerOptions?.lcr?.requestId;
|
|
157
170
|
return typeof raw === "string" && raw.length > 0 ? raw : void 0;
|
|
158
171
|
}
|
|
172
|
+
var CONTENT_PART_TYPES = /* @__PURE__ */ new Set([
|
|
173
|
+
"text-delta",
|
|
174
|
+
"reasoning-delta",
|
|
175
|
+
"tool-call",
|
|
176
|
+
"tool-input-start",
|
|
177
|
+
"tool-input-delta",
|
|
178
|
+
"tool-input-end",
|
|
179
|
+
"file",
|
|
180
|
+
"source",
|
|
181
|
+
"tool-result",
|
|
182
|
+
"raw"
|
|
183
|
+
]);
|
|
159
184
|
var LcrFallbackModel = class {
|
|
160
185
|
constructor(opts) {
|
|
161
186
|
this.opts = opts;
|
|
@@ -256,19 +281,22 @@ var LcrFallbackModel = class {
|
|
|
256
281
|
});
|
|
257
282
|
}
|
|
258
283
|
/**
|
|
259
|
-
* Baseline = what this same usage would have cost on the
|
|
260
|
-
*
|
|
261
|
-
*
|
|
262
|
-
* the
|
|
284
|
+
* Baseline = what this same usage would have cost on the always-on fallback:
|
|
285
|
+
* the LAST priced leg of the chain (by convention the list-price provider you'd
|
|
286
|
+
* use without routing — e.g. OpenRouter, always last). The winner's saving is
|
|
287
|
+
* `baselineUsd - costUsd`. We take the last priced leg, NOT the most expensive
|
|
288
|
+
* one: prompt caching can make a sticker-cheaper provider (no `cacheRead`) cost
|
|
289
|
+
* MORE on a cache-heavy call, and a max-of-chain baseline would then fabricate a
|
|
290
|
+
* "saving" even on calls the fallback itself served. Undefined when no provider
|
|
291
|
+
* in the chain carries a price (nothing to compare against).
|
|
263
292
|
*/
|
|
264
293
|
baselineUsd(inputTokens, outputTokens, cacheReadTokens) {
|
|
265
|
-
let max;
|
|
294
|
+
let baseline;
|
|
266
295
|
for (const p of this.opts.providers) {
|
|
267
296
|
if (!p.cost) continue;
|
|
268
|
-
const c = costForUsage(p.cost, inputTokens, outputTokens, cacheReadTokens);
|
|
269
|
-
if (max === void 0 || c > max) max = c;
|
|
297
|
+
baseline = costForUsage(p.cost, inputTokens, outputTokens, cacheReadTokens);
|
|
270
298
|
}
|
|
271
|
-
return max;
|
|
299
|
+
return baseline;
|
|
272
300
|
}
|
|
273
301
|
/** Winner settled: record the attempt, fire `onCost` (compat) + `onCall`. */
|
|
274
302
|
finalizeOk(ctx, provider, attemptStart, usage, ttftMs) {
|
|
@@ -277,7 +305,9 @@ var LcrFallbackModel = class {
|
|
|
277
305
|
const outputTokens = usage?.outputTokens?.total ?? 0;
|
|
278
306
|
const cacheReadTokens = usage?.inputTokens?.cacheRead ?? 0;
|
|
279
307
|
const costUsd = provider.cost ? costForUsage(provider.cost, inputTokens, outputTokens, cacheReadTokens) : 0;
|
|
308
|
+
const cachedSavingUsd = provider.cost ? cacheSavingForUsage(provider.cost, inputTokens, cacheReadTokens) : 0;
|
|
280
309
|
const usageMissing = inputTokens === 0 && outputTokens === 0;
|
|
310
|
+
const emptyCompletion = inputTokens > 0 && outputTokens === 0;
|
|
281
311
|
this.emitCost({
|
|
282
312
|
model: this.opts.modelName,
|
|
283
313
|
provider: provider.label,
|
|
@@ -299,8 +329,10 @@ var LcrFallbackModel = class {
|
|
|
299
329
|
...cacheReadTokens > 0 ? { cachedInputTokens: cacheReadTokens } : {},
|
|
300
330
|
costUsd,
|
|
301
331
|
baselineUsd: this.baselineUsd(inputTokens, outputTokens, cacheReadTokens),
|
|
332
|
+
...cachedSavingUsd > 0 ? { cachedSavingUsd } : {},
|
|
302
333
|
...ctx.requestId ? { requestId: ctx.requestId } : {},
|
|
303
|
-
...usageMissing ? { usageMissing: true } : {}
|
|
334
|
+
...usageMissing ? { usageMissing: true } : {},
|
|
335
|
+
...emptyCompletion ? { emptyCompletion: true } : {}
|
|
304
336
|
});
|
|
305
337
|
}
|
|
306
338
|
/** Every provider failed: fire `onCall` with no winner. */
|
|
@@ -331,6 +363,15 @@ var LcrFallbackModel = class {
|
|
|
331
363
|
const attemptStart = Date.now();
|
|
332
364
|
try {
|
|
333
365
|
const result = await provider.model.doGenerate(options);
|
|
366
|
+
const out = result.usage?.outputTokens?.total ?? 0;
|
|
367
|
+
const inp = result.usage?.inputTokens?.total ?? 0;
|
|
368
|
+
if (inp > 0 && out === 0 && tried < n - 1) {
|
|
369
|
+
const emptyErr = new EmptyCompletionError(provider.label);
|
|
370
|
+
lastError = emptyErr;
|
|
371
|
+
this.emitError(emptyErr, provider.label);
|
|
372
|
+
this.recordFail(ctx, provider, attemptStart, emptyErr);
|
|
373
|
+
continue;
|
|
374
|
+
}
|
|
334
375
|
this.settleSticky(idx);
|
|
335
376
|
this.finalizeOk(ctx, provider, attemptStart, result.usage);
|
|
336
377
|
return result;
|
|
@@ -392,7 +433,7 @@ var LcrFallbackModel = class {
|
|
|
392
433
|
const servingIdx = idx;
|
|
393
434
|
const triedBeforeServing = tried;
|
|
394
435
|
let usage;
|
|
395
|
-
let
|
|
436
|
+
let contentStreamed = false;
|
|
396
437
|
let ttftMs;
|
|
397
438
|
const stream = new ReadableStream({
|
|
398
439
|
async start(controller) {
|
|
@@ -401,17 +442,24 @@ var LcrFallbackModel = class {
|
|
|
401
442
|
reader = result.stream.getReader();
|
|
402
443
|
for (; ; ) {
|
|
403
444
|
const { done, value } = await reader.read();
|
|
404
|
-
if (!
|
|
445
|
+
if (!contentStreamed && value && typeof value === "object" && "error" in value) {
|
|
405
446
|
const err = value.error;
|
|
406
447
|
if (self.shouldRetry(err)) throw err;
|
|
407
448
|
}
|
|
408
449
|
if (done) break;
|
|
409
|
-
if (value.type === "finish") usage = value.usage;
|
|
450
|
+
if (value.type === "finish") {
|
|
451
|
+
usage = value.usage;
|
|
452
|
+
const out = value.usage?.outputTokens?.total ?? 0;
|
|
453
|
+
const inp = value.usage?.inputTokens?.total ?? 0;
|
|
454
|
+
if (inp > 0 && out === 0 && !contentStreamed && triedBeforeServing + 1 < n) {
|
|
455
|
+
throw new EmptyCompletionError(servingProvider.label);
|
|
456
|
+
}
|
|
457
|
+
}
|
|
410
458
|
if (ttftMs === void 0 && (value.type === "text-delta" || value.type === "reasoning-delta")) {
|
|
411
459
|
ttftMs = Date.now() - servingAttemptStart;
|
|
412
460
|
}
|
|
413
461
|
controller.enqueue(value);
|
|
414
|
-
if (value.type
|
|
462
|
+
if (CONTENT_PART_TYPES.has(value.type)) contentStreamed = true;
|
|
415
463
|
}
|
|
416
464
|
self.settleSticky(servingIdx);
|
|
417
465
|
self.finalizeOk(ctx, servingProvider, servingAttemptStart, usage, ttftMs);
|
|
@@ -419,7 +467,7 @@ var LcrFallbackModel = class {
|
|
|
419
467
|
} catch (error) {
|
|
420
468
|
self.emitError(error, servingProvider.label);
|
|
421
469
|
self.recordFail(ctx, servingProvider, servingAttemptStart, error);
|
|
422
|
-
if (!
|
|
470
|
+
if (!contentStreamed) {
|
|
423
471
|
const nextTried = triedBeforeServing + 1;
|
|
424
472
|
if (nextTried >= n) {
|
|
425
473
|
self.finalizeFail(ctx);
|
|
@@ -482,6 +530,7 @@ function formatCallRecord(record, opts = {}) {
|
|
|
482
530
|
line += ` (saved $${(record.baselineUsd - record.costUsd).toFixed(4)})`;
|
|
483
531
|
}
|
|
484
532
|
if (record.usageMissing) line += ` \u26A0no-usage`;
|
|
533
|
+
if (record.emptyCompletion) line += ` \u26A0empty`;
|
|
485
534
|
const failed = record.attempts.filter((a) => !a.ok);
|
|
486
535
|
if (failed.length > 0) {
|
|
487
536
|
const reasons = failed.map((a) => `${a.provider} ${a.errorClass ?? "error"}`).join(", ");
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "ai-lcr",
|
|
3
|
-
"version": "0.5.0",
|
|
3
|
+
"version": "0.5.2",
|
|
4
4
|
"description": "Least Cost Routing for LLMs — route every model call to the cheapest available provider, fall back automatically, and track real cost. Built for the Vercel AI SDK.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"ai",
|