@elizaos/plugin-elizacloud 2.0.0-beta.1 → 2.0.11-beta.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -44
- package/auto-enable.ts +10 -5
- package/dist/browser/index.browser.js +2 -2
- package/dist/browser/index.browser.js.map +4 -4
- package/dist/cjs/index.node.cjs +2874 -5915
- package/dist/cjs/index.node.js.map +47 -116
- package/dist/cloud/auth-service-types.d.ts +8 -0
- package/dist/cloud/auth-service-types.d.ts.map +1 -0
- package/dist/cloud/auth-service-types.js +36 -0
- package/dist/cloud/auth-service-types.js.map +10 -0
- package/dist/cloud/auth.js +4 -51
- package/dist/cloud/auth.js.map +4 -4
- package/dist/cloud/base-url.d.ts +6 -2
- package/dist/cloud/base-url.d.ts.map +1 -1
- package/dist/cloud/base-url.js +3 -51
- package/dist/cloud/base-url.js.map +3 -3
- package/dist/cloud/bridge-client.d.ts +3 -3
- package/dist/cloud/bridge-client.d.ts.map +1 -1
- package/dist/cloud/bridge-client.js +3 -51
- package/dist/cloud/bridge-client.js.map +3 -3
- package/dist/cloud/clack-observer.d.ts +35 -0
- package/dist/cloud/clack-observer.d.ts.map +1 -0
- package/dist/cloud/clack-observer.js +143 -0
- package/dist/cloud/clack-observer.js.map +10 -0
- package/dist/cloud/cloud-manager.js +45 -92
- package/dist/cloud/cloud-manager.js.map +6 -6
- package/dist/cloud/cloud-wallet.js +2 -4835
- package/dist/cloud/cloud-wallet.js.map +3 -82
- package/dist/cloud/duffel-client.d.ts +181 -0
- package/dist/cloud/duffel-client.d.ts.map +1 -0
- package/dist/cloud/duffel-client.js +506 -0
- package/dist/cloud/duffel-client.js.map +11 -0
- package/dist/cloud/index.d.ts +6 -0
- package/dist/cloud/index.d.ts.map +1 -1
- package/dist/cloud/index.js +1782 -1
- package/dist/cloud/index.js.map +18 -3
- package/dist/cloud/lifeops-schedule-sync-client.d.ts +43 -0
- package/dist/cloud/lifeops-schedule-sync-client.d.ts.map +1 -0
- package/dist/cloud/lifeops-schedule-sync-client.js +180 -0
- package/dist/cloud/lifeops-schedule-sync-client.js.map +11 -0
- package/dist/cloud/lifeops-schedule-sync-contracts.d.ts +89 -0
- package/dist/cloud/lifeops-schedule-sync-contracts.d.ts.map +1 -0
- package/dist/cloud/lifeops-schedule-sync-contracts.js +39 -0
- package/dist/cloud/lifeops-schedule-sync-contracts.js.map +10 -0
- package/dist/cloud/managed-payment-clients.d.ts +166 -0
- package/dist/cloud/managed-payment-clients.d.ts.map +1 -0
- package/dist/cloud/managed-payment-clients.js +238 -0
- package/dist/cloud/managed-payment-clients.js.map +11 -0
- package/dist/cloud/null-observer.d.ts +35 -0
- package/dist/cloud/null-observer.d.ts.map +1 -0
- package/dist/cloud/null-observer.js +45 -0
- package/dist/cloud/null-observer.js.map +10 -0
- package/dist/cloud/setup-observer.d.ts +98 -0
- package/dist/cloud/setup-observer.d.ts.map +1 -0
- package/dist/cloud/setup-observer.js +2 -0
- package/dist/cloud/setup-observer.js.map +9 -0
- package/dist/cloud/validate-url.d.ts.map +1 -1
- package/dist/cloud/validate-url.js +2 -1
- package/dist/cloud/validate-url.js.map +3 -3
- package/dist/cloud/x402-payment-handler.d.ts +85 -0
- package/dist/cloud/x402-payment-handler.d.ts.map +1 -0
- package/dist/cloud/x402-payment-handler.js +119 -0
- package/dist/cloud/x402-payment-handler.js.map +10 -0
- package/dist/cloud-setup.d.ts +36 -0
- package/dist/cloud-setup.d.ts.map +1 -0
- package/dist/{onboarding.js → cloud-setup.js} +139 -139
- package/dist/cloud-setup.js.map +14 -0
- package/dist/cloud-voice-catalog.d.ts +65 -0
- package/dist/cloud-voice-catalog.d.ts.map +1 -0
- package/dist/cloud-voice-catalog.js +278 -0
- package/dist/cloud-voice-catalog.js.map +12 -0
- package/dist/index.browser.d.ts +11 -0
- package/dist/index.browser.d.ts.map +1 -1
- package/dist/index.d.ts +7 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +5416 -8405
- package/dist/index.js.map +48 -116
- package/dist/index.node.d.ts +8 -1
- package/dist/index.node.d.ts.map +1 -1
- package/dist/init.js +17 -4
- package/dist/init.js.map +4 -4
- package/dist/lib/cloud-connection.d.ts +0 -1
- package/dist/lib/cloud-connection.d.ts.map +1 -1
- package/dist/lib/cloud-connection.js +14 -91
- package/dist/lib/cloud-connection.js.map +7 -7
- package/dist/lib/cloud-secrets.d.ts +5 -18
- package/dist/lib/cloud-secrets.d.ts.map +1 -1
- package/dist/lib/cloud-secrets.js +8 -36
- package/dist/lib/cloud-secrets.js.map +3 -3
- package/dist/lib/config-like.d.ts +1 -1
- package/dist/lib/config-like.d.ts.map +1 -1
- package/dist/lib/config-like.js +3 -3
- package/dist/lib/config-like.js.map +3 -3
- package/dist/lib/credential-type-map.d.ts +1 -1
- package/dist/lib/credential-type-map.js.map +1 -1
- package/dist/lib/http.d.ts +0 -11
- package/dist/lib/http.d.ts.map +1 -1
- package/dist/lib/http.js.map +2 -2
- package/dist/lib/server-cloud-tts.d.ts +12 -25
- package/dist/lib/server-cloud-tts.d.ts.map +1 -1
- package/dist/lib/server-cloud-tts.js +31 -329
- package/dist/lib/server-cloud-tts.js.map +4 -7
- package/dist/lib/tts-debug.d.ts +5 -3
- package/dist/lib/tts-debug.d.ts.map +1 -1
- package/dist/lib/tts-debug.js +1 -34
- package/dist/lib/tts-debug.js.map +3 -4
- package/dist/models/embeddings.d.ts.map +1 -1
- package/dist/models/embeddings.js +79 -69
- package/dist/models/embeddings.js.map +6 -6
- package/dist/models/image.d.ts.map +1 -1
- package/dist/models/image.js +42 -15
- package/dist/models/image.js.map +6 -6
- package/dist/models/index.js +676 -166
- package/dist/models/index.js.map +11 -12
- package/dist/models/research.d.ts.map +1 -1
- package/dist/models/research.js +24 -7
- package/dist/models/research.js.map +6 -6
- package/dist/models/speech.d.ts +61 -3
- package/dist/models/speech.d.ts.map +1 -1
- package/dist/models/speech.js +173 -17
- package/dist/models/speech.js.map +5 -5
- package/dist/models/text.d.ts +106 -1
- package/dist/models/text.d.ts.map +1 -1
- package/dist/models/text.js +452 -82
- package/dist/models/text.js.map +7 -8
- package/dist/models/tokenization.d.ts.map +1 -1
- package/dist/models/tokenization.js.map +2 -2
- package/dist/models/transcription.d.ts.map +1 -1
- package/dist/models/transcription.js +20 -6
- package/dist/models/transcription.js.map +5 -5
- package/dist/node/index.node.js +2828 -5838
- package/dist/node/index.node.js.map +47 -116
- package/dist/plugin.d.ts.map +1 -1
- package/dist/plugin.js +376 -5050
- package/dist/plugin.js.map +16 -92
- package/dist/providers/openai.js +11 -2
- package/dist/providers/openai.js.map +3 -3
- package/dist/register-routes.js +376 -5050
- package/dist/register-routes.js.map +16 -92
- package/dist/routes/cloud-billing-routes.d.ts.map +1 -1
- package/dist/routes/cloud-billing-routes.js +17 -60
- package/dist/routes/cloud-billing-routes.js.map +8 -7
- package/dist/routes/cloud-coding-container-routes.d.ts +8 -0
- package/dist/routes/cloud-coding-container-routes.d.ts.map +1 -0
- package/dist/routes/cloud-coding-container-routes.js +214 -0
- package/dist/routes/cloud-coding-container-routes.js.map +11 -0
- package/dist/routes/cloud-compat-routes.d.ts.map +1 -1
- package/dist/routes/cloud-compat-routes.js +17 -60
- package/dist/routes/cloud-compat-routes.js.map +8 -7
- package/dist/routes/cloud-features-routes.js +2 -2
- package/dist/routes/cloud-features-routes.js.map +4 -4
- package/dist/routes/cloud-relay-routes.d.ts +2 -1
- package/dist/routes/cloud-relay-routes.d.ts.map +1 -1
- package/dist/routes/cloud-relay-routes.js +84 -2
- package/dist/routes/cloud-relay-routes.js.map +5 -4
- package/dist/routes/cloud-routes-autonomous.d.ts +3 -4
- package/dist/routes/cloud-routes-autonomous.d.ts.map +1 -1
- package/dist/routes/cloud-routes-autonomous.js +11 -4893
- package/dist/routes/cloud-routes-autonomous.js.map +8 -87
- package/dist/routes/cloud-routes.d.ts +2 -2
- package/dist/routes/cloud-routes.d.ts.map +1 -1
- package/dist/routes/cloud-routes.js +343 -5058
- package/dist/routes/cloud-routes.js.map +13 -90
- package/dist/routes/cloud-status-routes-autonomous.d.ts +1 -2
- package/dist/routes/cloud-status-routes-autonomous.d.ts.map +1 -1
- package/dist/routes/cloud-status-routes-autonomous.js +4 -51
- package/dist/routes/cloud-status-routes-autonomous.js.map +5 -5
- package/dist/routes/cloud-status-routes.js +14 -90
- package/dist/routes/cloud-status-routes.js.map +7 -7
- package/dist/routes/home-remote-runner-access-url.d.ts +16 -0
- package/dist/routes/home-remote-runner-access-url.d.ts.map +1 -0
- package/dist/routes/home-remote-runner-access-url.js +91 -0
- package/dist/routes/home-remote-runner-access-url.js.map +10 -0
- package/dist/routes/travel-provider-relay-routes.d.ts +9 -0
- package/dist/routes/travel-provider-relay-routes.d.ts.map +1 -0
- package/dist/routes/travel-provider-relay-routes.js +358 -0
- package/dist/routes/travel-provider-relay-routes.js.map +14 -0
- package/dist/services/cloud-auth.d.ts +1 -1
- package/dist/services/cloud-auth.d.ts.map +1 -1
- package/dist/services/cloud-auth.js +7 -2
- package/dist/services/cloud-auth.js.map +4 -4
- package/dist/services/cloud-backup.js.map +2 -2
- package/dist/services/cloud-bootstrap.d.ts.map +1 -1
- package/dist/services/cloud-bootstrap.js.map +2 -2
- package/dist/services/cloud-bridge.js.map +3 -3
- package/dist/services/cloud-container.d.ts +5 -1
- package/dist/services/cloud-container.d.ts.map +1 -1
- package/dist/services/cloud-container.js +52 -1
- package/dist/services/cloud-container.js.map +4 -4
- package/dist/services/cloud-credential-provider.js.map +2 -2
- package/dist/services/cloud-model-registry.js.map +2 -2
- package/dist/types/cloud.d.ts +1 -0
- package/dist/types/cloud.d.ts.map +1 -1
- package/dist/types/cloud.js.map +2 -2
- package/dist/types/index.d.ts +1 -1
- package/dist/types/index.d.ts.map +1 -1
- package/dist/utils/cloud-sdk/client.d.ts.map +1 -1
- package/dist/utils/cloud-sdk/client.js +136 -4
- package/dist/utils/cloud-sdk/client.js.map +5 -5
- package/dist/utils/cloud-sdk/http.js.map +1 -1
- package/dist/utils/cloud-sdk/public-routes.d.ts +186 -0
- package/dist/utils/cloud-sdk/public-routes.d.ts.map +1 -1
- package/dist/utils/cloud-sdk/public-routes.js +99 -1
- package/dist/utils/cloud-sdk/public-routes.js.map +3 -3
- package/dist/utils/cloud-sdk/types.d.ts +0 -2
- package/dist/utils/cloud-sdk/types.d.ts.map +1 -1
- package/dist/utils/cloud-sdk/types.js.map +1 -1
- package/dist/utils/config.d.ts +10 -1
- package/dist/utils/config.d.ts.map +1 -1
- package/dist/utils/config.js +12 -2
- package/dist/utils/config.js.map +3 -3
- package/dist/utils/events.d.ts +23 -2
- package/dist/utils/events.d.ts.map +1 -1
- package/dist/utils/events.js +5 -3
- package/dist/utils/events.js.map +3 -3
- package/dist/utils/sdk-client.d.ts.map +1 -1
- package/dist/utils/sdk-client.js +17 -4
- package/dist/utils/sdk-client.js.map +4 -4
- package/dist/utils/waifu-metering.d.ts +108 -0
- package/dist/utils/waifu-metering.d.ts.map +1 -0
- package/dist/utils/waifu-metering.js +166 -0
- package/dist/utils/waifu-metering.js.map +10 -0
- package/package.json +51 -22
- package/src/cloud/auth-service-types.ts +24 -0
- package/src/cloud/base-url.ts +6 -62
- package/src/cloud/clack-observer.ts +189 -0
- package/src/cloud/duffel-client.ts +847 -0
- package/src/cloud/index.ts +10 -0
- package/src/cloud/lifeops-schedule-sync-client.ts +245 -0
- package/src/cloud/lifeops-schedule-sync-contracts.ts +124 -0
- package/src/cloud/managed-payment-clients.ts +374 -0
- package/src/cloud/null-observer.ts +45 -0
- package/src/cloud/setup-observer.ts +125 -0
- package/src/cloud/validate-url.ts +7 -1
- package/src/cloud/x402-payment-handler.ts +215 -0
- package/src/cloud-setup.ts +531 -0
- package/src/cloud-voice-catalog.test.ts +254 -0
- package/src/cloud-voice-catalog.ts +246 -0
- package/src/index.browser.ts +29 -0
- package/src/index.node.ts +31 -1
- package/src/index.ts +76 -4
- package/src/lib/cloud-connection.ts +2 -4
- package/src/lib/cloud-secrets.ts +10 -54
- package/src/lib/config-like.ts +1 -1
- package/src/lib/credential-type-map.ts +2 -2
- package/src/lib/http.ts +0 -17
- package/src/lib/server-cloud-tts.ts +33 -341
- package/src/lib/tts-debug.ts +5 -34
- package/src/models/embeddings.ts +140 -76
- package/src/models/image.ts +29 -14
- package/src/models/research.ts +11 -1
- package/src/models/speech.ts +269 -23
- package/src/models/text.ts +704 -110
- package/src/models/tokenization.ts +2 -2
- package/src/models/transcription.ts +7 -3
- package/src/plugin.ts +38 -0
- package/src/routes/cloud-billing-routes.ts +4 -14
- package/src/routes/cloud-coding-container-routes.ts +198 -0
- package/src/routes/cloud-compat-routes.ts +4 -14
- package/src/routes/cloud-features-routes.ts +1 -1
- package/src/routes/cloud-relay-routes.ts +47 -1
- package/src/routes/cloud-routes-autonomous.ts +7 -10
- package/src/routes/cloud-routes.ts +68 -7
- package/src/routes/cloud-status-routes-autonomous.ts +6 -2
- package/src/routes/home-remote-runner-access-url.ts +83 -0
- package/src/routes/travel-provider-relay-routes.ts +193 -0
- package/src/services/cloud-auth.ts +9 -2
- package/src/services/cloud-bootstrap.ts +1 -3
- package/src/services/cloud-bridge.ts +1 -1
- package/src/services/cloud-container.ts +93 -0
- package/src/services/cloud-credential-provider.ts +1 -1
- package/src/services/cloud-model-registry.ts +1 -1
- package/src/types/cloud.ts +22 -0
- package/src/types/index.ts +19 -0
- package/src/utils/cloud-sdk/client.ts +42 -3
- package/src/utils/cloud-sdk/public-routes.ts +168 -0
- package/src/utils/cloud-sdk/types.ts +0 -2
- package/src/utils/config.ts +20 -1
- package/src/utils/events.ts +30 -2
- package/src/utils/sdk-client.ts +5 -1
- package/src/utils/waifu-metering.ts +302 -0
- package/dist/onboarding.d.ts +0 -35
- package/dist/onboarding.d.ts.map +0 -1
- package/dist/onboarding.js.map +0 -14
- package/src/onboarding.ts +0 -396
package/src/models/text.ts
CHANGED
|
@@ -3,19 +3,20 @@ import type {
|
|
|
3
3
|
IAgentRuntime,
|
|
4
4
|
ModelTypeName,
|
|
5
5
|
TextStreamResult,
|
|
6
|
+
TokenUsage,
|
|
6
7
|
} from "@elizaos/core";
|
|
7
8
|
import {
|
|
8
9
|
buildCanonicalSystemPrompt,
|
|
9
10
|
logger,
|
|
10
11
|
ModelType,
|
|
12
|
+
recordInferenceSpan,
|
|
11
13
|
renderChatMessagesForPrompt,
|
|
12
14
|
resolveEffectiveSystemPrompt,
|
|
15
|
+
Semaphore,
|
|
16
|
+
timeInferenceSpan,
|
|
13
17
|
} from "@elizaos/core";
|
|
14
|
-
import type { LanguageModel } from "ai";
|
|
15
|
-
import { createOpenAIClient } from "../providers/openai";
|
|
16
18
|
import {
|
|
17
19
|
getActionPlannerModel,
|
|
18
|
-
getExperimentalTelemetry,
|
|
19
20
|
getLargeModel,
|
|
20
21
|
getMediumModel,
|
|
21
22
|
getMegaModel,
|
|
@@ -36,6 +37,150 @@ const RESPONSE_HANDLER_MODEL_TYPE = (ModelType.RESPONSE_HANDLER ??
|
|
|
36
37
|
"RESPONSE_HANDLER") as ModelTypeName;
|
|
37
38
|
const ACTION_PLANNER_MODEL_TYPE = (ModelType.ACTION_PLANNER ?? "ACTION_PLANNER") as ModelTypeName;
|
|
38
39
|
|
|
40
|
+
/**
|
|
41
|
+
* Per-process cap on CONCURRENT native cloud text calls.
|
|
42
|
+
*
|
|
43
|
+
* Covers BOTH native cloud text routes that share the one cerebras key:
|
|
44
|
+
* the `/chat/completions` round-trip (native-transport callers) AND the
|
|
45
|
+
* `/responses` round-trip (bare-`{ prompt }` callers, incl. the primary reply
|
|
46
|
+
* action). Same model name -> same shared key -> same concurrency budget, so
|
|
47
|
+
* both routes must funnel through this one semaphore or a bare-prompt call can
|
|
48
|
+
* still push the key over its limit.
|
|
49
|
+
*
|
|
50
|
+
* The per-turn burst that triggers the 429 comes from the prompt BATCHER
|
|
51
|
+
* (`dynamicPromptExecFromState`, which always sets providerOptions -> native
|
|
52
|
+
* `/chat/completions`) and the merged evaluator call — NOT from composeState
|
|
53
|
+
* providers (no provider calls `useModel` during composeState). Firing those
|
|
54
|
+
* at once overruns the ONE shared cerebras key's concurrent-request limit
|
|
55
|
+
* -> 429 -> 3 retries x backoff -> 30-63s of latency. Capping in-flight calls
|
|
56
|
+
* through a small semaphore keeps each call ~3s with no 429, without needing
|
|
57
|
+
* more keys or backend changes.
|
|
58
|
+
*
|
|
59
|
+
* Default is a SAFETY CEILING, not full serialization: the paid cerebras key
|
|
60
|
+
* (1000 req/min) and leaner per-turn call counts make the 429 risk small, so
|
|
61
|
+
* the default of 8 leaves the typical 1-3 concurrent calls/turn untouched while
|
|
62
|
+
* still bounding a pathological burst. The limiter is process-global and keys
|
|
63
|
+
* on native transport, not the model, so it also bounds non-cerebras native
|
|
64
|
+
* calls (e.g. zai-glm-4.7) — a high default avoids serializing those. Set
|
|
65
|
+
* `ELIZAOS_CLOUD_NATIVE_CONCURRENCY` (positive integer) to tighten it (1 = fully
|
|
66
|
+
* serialize) on a cerebras-bottlenecked single-key deployment, or raise it for
|
|
67
|
+
* more parallelism. Embeddings use a SEPARATE `/embeddings` route
|
|
68
|
+
* (embeddings.ts) and are intentionally NOT gated here.
|
|
69
|
+
*/
|
|
70
|
+
const NATIVE_CONCURRENCY_ENV = "ELIZAOS_CLOUD_NATIVE_CONCURRENCY";
|
|
71
|
+
const DEFAULT_NATIVE_CONCURRENCY = 8;
|
|
72
|
+
|
|
73
|
+
/**
 * Client-side timeout for cloud text round-trips. Without this the handler
 * passes no `timeoutMs`/`signal` to `requestRaw`, so a hung/slow gateway holds
 * the concurrency permit AND stalls the whole turn until fetch's own (very
 * long) default. `ELIZAOS_CLOUD_TEXT_TIMEOUT_MS` overrides; `0`/negative opts
 * out (no client-side timeout).
 */
const TEXT_TIMEOUT_ENV = "ELIZAOS_CLOUD_TEXT_TIMEOUT_MS";
const DEFAULT_TEXT_TIMEOUT_MS = 120_000;

/**
 * Resolve the client-side text timeout from `ELIZAOS_CLOUD_TEXT_TIMEOUT_MS`.
 *
 * @returns
 *  - `DEFAULT_TEXT_TIMEOUT_MS` when the variable is unset, blank, or not a
 *    finite number;
 *  - `undefined` (no client-side timeout) when the value truncates to `0` or
 *    is negative;
 *  - otherwise the value truncated to whole milliseconds.
 */
export function resolveTextTimeoutMs(): number | undefined {
  const raw =
    typeof process !== "undefined" ? process.env[TEXT_TIMEOUT_ENV] : undefined;
  const trimmed = raw?.trim() ?? "";
  if (trimmed === "") return DEFAULT_TEXT_TIMEOUT_MS;

  // Parse the WHOLE string. `parseInt` stops at the first non-digit, which
  // silently turned `1e3` into a 1 ms timeout and `30s` into 30 ms; a malformed
  // value now falls back to the default instead.
  const parsed = Number(trimmed);
  if (!Number.isFinite(parsed)) return DEFAULT_TEXT_TIMEOUT_MS;

  const wholeMs = Math.trunc(parsed);
  return wholeMs <= 0 ? undefined : wholeMs;
}
|
|
91
|
+
|
|
92
|
+
/**
 * Token-by-token streaming of the native `/chat/completions` round-trip. On by
 * default so the user-visible reply renders from the first token instead of
 * waiting for the whole generation. Setting `ELIZAOS_CLOUD_STREAMING` to
 * `0`/`false`/`off` forces the buffered path (kill-switch). Streaming only
 * engages when the runtime actually requests it (`params.stream`), so
 * non-streaming callers (connectors with no UI stream) are unaffected.
 */
const STREAMING_ENV = "ELIZAOS_CLOUD_STREAMING";

/**
 * Report whether the streaming kill-switch leaves streaming enabled.
 *
 * @returns `false` only when the variable, trimmed and lower-cased, is exactly
 * `0`, `false`, or `off`; `true` when it is unset or holds any other value.
 */
export function resolveStreamingEnabled(): boolean {
  if (typeof process === "undefined") return true;

  const setting = process.env[STREAMING_ENV];
  if (setting === undefined) return true;

  switch (setting.trim().toLowerCase()) {
    case "0":
    case "false":
    case "off":
      return false;
    default:
      return true;
  }
}
|
|
108
|
+
|
|
109
|
+
/**
 * Merge the runtime's abort signal and the client-side timeout into the single
 * signal `requestRaw` accepts. A stream is long-lived, so it must abort on
 * EITHER a caller cancel OR the timeout.
 *
 * @param abortSignal - Caller-supplied cancellation signal, if any.
 * @param timeoutMs - Timeout in milliseconds; ignored unless it is a number
 * greater than zero.
 * @returns The combined signal, whichever single signal exists, or `undefined`
 * when there is neither a caller signal nor a usable timeout.
 */
function buildStreamAbortSignal(
  abortSignal: AbortSignal | undefined,
  timeoutMs: number | undefined
): AbortSignal | undefined {
  // `!(x > 0)` rather than `x <= 0` so that NaN is also treated as "no timeout".
  if (typeof timeoutMs !== "number" || !(timeoutMs > 0)) {
    return abortSignal;
  }

  const deadline = AbortSignal.timeout(timeoutMs);
  return abortSignal ? AbortSignal.any([abortSignal, deadline]) : deadline;
}
|
|
126
|
+
|
|
127
|
+
let nativeChatLimiter: Semaphore | null = null;
|
|
128
|
+
|
|
129
|
+
/**
 * Read the permit count for the shared native-text limiter from the
 * environment variable named by `NATIVE_CONCURRENCY_ENV`.
 *
 * @returns The configured value truncated to a whole number when it is a
 * finite number of at least 1; otherwise `DEFAULT_NATIVE_CONCURRENCY` (unset,
 * blank, malformed, zero, or negative).
 */
function resolveNativeConcurrency(): number {
  const raw =
    typeof process !== "undefined" ? process.env[NATIVE_CONCURRENCY_ENV] : undefined;
  const trimmed = raw?.trim() ?? "";
  if (trimmed === "") return DEFAULT_NATIVE_CONCURRENCY;

  // Parse the WHOLE string: `parseInt` would read `1e2` as 1 and silently
  // serialize every call; a malformed value now falls back to the default.
  const permits = Math.trunc(Number(trimmed));
  return Number.isFinite(permits) && permits > 0 ? permits : DEFAULT_NATIVE_CONCURRENCY;
}
|
|
135
|
+
|
|
136
|
+
/**
 * Return the process-wide limiter, constructing it on first use with the
 * permit count from `resolveNativeConcurrency()`. Later calls reuse the cached
 * instance, so the environment is not re-read until the cache is cleared.
 */
function getNativeChatLimiter(): Semaphore {
  nativeChatLimiter ??= new Semaphore(resolveNativeConcurrency());
  return nativeChatLimiter;
}
|
|
142
|
+
|
|
143
|
+
/**
 * Run a single cerebras-bound network round-trip under the shared per-process
 * concurrency cap. The permit is held only across `fn` (the `requestRaw` call)
 * and released the instant it settles, so response-body parsing done by the
 * caller runs unguarded. Used by BOTH native text routes (`/chat/completions`
 * and `/responses`) so every cerebras text call shares one budget.
 *
 * Two latency spans are recorded on the active per-turn inference timer:
 * `cloud.semaphore-wait` (time spent queued for a permit; non-zero means the
 * cap is serializing) and `cloud.http:<label>` (the network round-trip). Both
 * are no-ops when no turn timer is active.
 *
 * Exported for unit tests that drive the shared cap directly.
 *
 * @param fn - The round-trip to run while holding a permit.
 * @param label - Route tag for the spans (e.g. `responses` /
 * `chat/completions`); defaults to `native`.
 * @returns Whatever `fn` resolves to. A rejection from `fn` propagates after
 * the permit has been released.
 */
export async function withNativeChatLimit<T>(
  fn: () => Promise<T>,
  label = "native"
): Promise<T> {
  const limiter = getNativeChatLimiter();
  const waitStartedAt = Date.now();
  await limiter.acquire();
  try {
    // Recorded INSIDE the try: the permit is already held here, so a throw from
    // the span recorder must still reach `finally` or the permit would leak and
    // shrink the budget for the rest of the process.
    recordInferenceSpan("cloud.semaphore-wait", Date.now() - waitStartedAt, {
      route: label,
    });
    return await timeInferenceSpan(`cloud.http:${label}`, fn, { route: label });
  } finally {
    // Free the permit even on throw so a failed call never starves the queue.
    limiter.release();
  }
}
|
|
175
|
+
|
|
176
|
+
/**
 * Test-only hook: drop the cached limiter so the next limited call builds a
 * fresh one and re-reads the concurrency env knob. Production code never needs
 * this, since the knob is read once per process.
 */
export function __resetNativeChatLimiterForTests(): void {
  // Clearing the cache is enough; the limiter is rebuilt lazily on next use.
  nativeChatLimiter = null;
}
|
|
183
|
+
|
|
39
184
|
type ResponsesApiResponse = Record<string, unknown> & {
|
|
40
185
|
error?: {
|
|
41
186
|
message?: string;
|
|
@@ -61,7 +206,6 @@ const REASONING_MODEL_PATTERNS = [
|
|
|
61
206
|
"claude-opus-4-7",
|
|
62
207
|
"gpt-5",
|
|
63
208
|
] as const;
|
|
64
|
-
const RESPONSES_ROUTED_PREFIXES = ["openai/", "anthropic/"] as const;
|
|
65
209
|
type ChatAttachment = {
|
|
66
210
|
data: string | Uint8Array | URL;
|
|
67
211
|
mediaType: string;
|
|
@@ -121,37 +265,87 @@ type ChatCompletionsResponse = Record<string, unknown> & {
|
|
|
121
265
|
usage?: Record<string, unknown>;
|
|
122
266
|
};
|
|
123
267
|
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
...(attachment.filename ? { filename: attachment.filename } : {}),
|
|
141
|
-
});
|
|
142
|
-
}
|
|
268
|
+
/**
|
|
269
|
+
* Eliza-Cloud-hosted `eliza-1` model ids that run a fork of llama-server (or
|
|
270
|
+
* vLLM with the eliza1 parsers) capable of honoring the `x-eliza-span-samplers`
|
|
271
|
+
* header. Other upstreams (OpenAI / Anthropic / generic OpenRouter) strip
|
|
272
|
+
* unknown headers safely, but to keep the wire surface narrow we only attach
|
|
273
|
+
* the per-span sampler plan when the resolved model is one we know honors it.
|
|
274
|
+
*
|
|
275
|
+
* The "we know" bound is conservative — extend the prefix list when a new
|
|
276
|
+
* fork-built deployment lands. The fallback is "do not send the header" which
|
|
277
|
+
* preserves today's behavior on every other provider.
|
|
278
|
+
*/
|
|
279
|
+
const SPAN_SAMPLER_HONORING_MODEL_PREFIXES = [
  "vast/eliza-1-",
  "elizaos/eliza-1-",
  "eliza-1-",
] as const;

/**
 * Report whether a resolved model id is one known to honor the per-span
 * sampler header.
 *
 * @param modelName - Resolved model id; compared case-insensitively.
 * @returns `true` when the lower-cased id starts with one of
 * `SPAN_SAMPLER_HONORING_MODEL_PREFIXES`, otherwise `false`.
 */
function isSpanSamplerHonoringModel(modelName: string): boolean {
  const normalized = modelName.toLowerCase();
  for (const prefix of SPAN_SAMPLER_HONORING_MODEL_PREFIXES) {
    if (normalized.startsWith(prefix)) {
      return true;
    }
  }
  return false;
}
|
|
146
291
|
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
292
|
+
/**
 * Build the `x-eliza-span-samplers` HTTP header value from a {@link SpanSamplerPlan}.
 * Returns `undefined` when there is no plan or no overrides, narrowing the wire
 * surface so non-eliza providers never see a stray fork-extension header.
 *
 * Wire schema (snake_case):
 *   { overrides: [{ span_index, temperature, top_k?, top_p? }, ...], strict?: boolean }
 *
 * @param plan - The per-span sampler plan, or `undefined`.
 * @returns The JSON-encoded header value, or `undefined` when nothing should be sent.
 */
function buildSpanSamplerHeader(
  plan: GenerateTextParams["spanSamplerPlan"],
): string | undefined {
  if (!plan || plan.overrides.length === 0) return undefined;

  const overrides = plan.overrides.map((o) => {
    const wire: Record<string, unknown> = {
      span_index: o.spanIndex,
      temperature: o.temperature,
    };
    // Optional fields are emitted only for finite numbers. `JSON.stringify`
    // turns NaN/Infinity into `null`, which would put `top_k: null` on the wire
    // instead of omitting the field as the schema above describes.
    if (typeof o.topK === "number" && Number.isFinite(o.topK)) wire.top_k = o.topK;
    if (typeof o.topP === "number" && Number.isFinite(o.topP)) wire.top_p = o.topP;
    return wire;
  });

  const body: Record<string, unknown> = { overrides };
  // `strict` is sent only when explicitly true; false/absent are both omitted.
  if (plan.strict === true) body.strict = true;
  return JSON.stringify(body);
}
|
|
151
317
|
|
|
152
|
-
|
|
318
|
+
/**
 * Extract the authoritative USD cost the metered cloud gateway charged for a
 * request, when it surfaces one. The gateway is the only honest source of USD
 * (it owns the model-pricing table + platform markup); we prefer it over any
 * client-side token estimate.
 *
 * Lookup order: the response-body usage fields `cost_usd`, `costUsd`, `cost`,
 * then the `X-Eliza-Cost-Usd` response header.
 *
 * @param usage - The `usage` object from the response body (any shape).
 * @param response - Optional response whose `headers.get` is consulted as a fallback.
 * @returns A finite cost, or `undefined` when neither source yields one so
 * consumers fall back to a token-based estimate.
 */
function extractCostUsd(
  usage: unknown,
  response?: { headers?: { get?: (name: string) => string | null } }
): number | undefined {
  const usageRecord = asRecord(usage);
  const fromBody = firstNumber(
    usageRecord.cost_usd,
    usageRecord.costUsd,
    usageRecord.cost
  );
  if (typeof fromBody === "number" && Number.isFinite(fromBody)) {
    return fromBody;
  }

  // Trim before parsing: a whitespace-only header is truthy and `Number("  ")`
  // is 0, which would be reported as an authoritative cost of $0.
  const header = response?.headers?.get?.("X-Eliza-Cost-Usd")?.trim();
  if (header) {
    const parsed = Number(header);
    if (Number.isFinite(parsed)) return parsed;
  }
  return undefined;
}
|
|
345
|
+
|
|
346
|
+
/**
 * Report whether a model id matches a known reasoning-model pattern.
 *
 * @param modelName - Model id; compared case-insensitively.
 * @returns `true` when the lower-cased id contains any entry of
 * `REASONING_MODEL_PATTERNS` as a substring, otherwise `false`.
 */
function isReasoningModel(modelName: string): boolean {
  const normalized = modelName.toLowerCase();
  for (const pattern of REASONING_MODEL_PATTERNS) {
    if (normalized.includes(pattern)) {
      return true;
    }
  }
  return false;
}
|
|
156
350
|
|
|
157
351
|
function isRecord(value: unknown): value is Record<string, unknown> {
|
|
@@ -256,30 +450,60 @@ function unwrapJsonSchema(value: unknown): unknown {
|
|
|
256
450
|
return record.schema ?? record.jsonSchema ?? value;
|
|
257
451
|
}
|
|
258
452
|
|
|
259
|
-
|
|
453
|
+
/**
 * Normalize one tool entry into the OpenAI `{ type, function }` wire shape.
 *
 * Accepts BOTH the already-nested form
 * (`{ type: "function", function: { name, parameters } }`) and core's FLAT
 * `ToolDefinition` envelope (`{ name, type: "function", parameters }`, e.g.
 * createHandleResponseTool / the action planner). Passing the flat form through
 * verbatim made the cloud gateway read `tool.function.name` on an undefined
 * `function`, failing with "Cannot read properties of undefined (reading
 * 'name')".
 *
 * @param rawTool - The tool entry in either shape.
 * @param fallbackName - Name to use when the entry itself carries none.
 * @returns The nested wire entry, or `undefined` when no name can be resolved,
 * so the entry is dropped rather than crashing downstream.
 */
function normalizeNativeToolEntry(
  rawTool: unknown,
  fallbackName?: string
): Record<string, unknown> | undefined {
  const flat = asRecord(rawTool);
  const nestedFn = asRecord(flat.function);

  // Nested fields win over flat ones; the caller's fallback name is last.
  const resolvedName = firstString(nestedFn.name, flat.name, fallbackName);
  if (!resolvedName) return undefined;

  const wireFunction: Record<string, unknown> = { name: resolvedName };

  const resolvedDescription = firstString(nestedFn.description, flat.description);
  if (resolvedDescription) {
    wireFunction.description = resolvedDescription;
  }

  // First schema-like field present wins; default to an empty object schema.
  wireFunction.parameters = unwrapJsonSchema(
    nestedFn.parameters ??
      flat.inputSchema ??
      flat.parameters ??
      flat.schema ?? { type: "object" }
  );

  return { type: "function", function: wireFunction };
}
|
|
487
|
+
|
|
488
|
+
export function normalizeNativeTools(tools: unknown): unknown[] | undefined {
|
|
260
489
|
if (!tools) {
|
|
261
490
|
return undefined;
|
|
262
491
|
}
|
|
263
492
|
|
|
264
493
|
if (Array.isArray(tools)) {
|
|
265
|
-
|
|
494
|
+
const normalized = tools
|
|
495
|
+
.map((tool) => normalizeNativeToolEntry(tool))
|
|
496
|
+
.filter((tool): tool is Record<string, unknown> => tool !== undefined);
|
|
497
|
+
return normalized.length > 0 ? normalized : undefined;
|
|
266
498
|
}
|
|
267
499
|
|
|
268
500
|
const toolSet = asRecord(tools);
|
|
269
501
|
const normalized: unknown[] = [];
|
|
270
502
|
for (const [name, rawTool] of Object.entries(toolSet)) {
|
|
271
|
-
const
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
normalized.push({
|
|
276
|
-
type: "function",
|
|
277
|
-
function: {
|
|
278
|
-
name,
|
|
279
|
-
...(typeof tool.description === "string" ? { description: tool.description } : {}),
|
|
280
|
-
parameters: inputSchema,
|
|
281
|
-
},
|
|
282
|
-
});
|
|
503
|
+
const entry = normalizeNativeToolEntry(rawTool, name);
|
|
504
|
+
if (entry) {
|
|
505
|
+
normalized.push(entry);
|
|
506
|
+
}
|
|
283
507
|
}
|
|
284
508
|
|
|
285
509
|
return normalized.length > 0 ? normalized : undefined;
|
|
@@ -604,40 +828,24 @@ function getModelNameForType(runtime: IAgentRuntime, modelType: TextModelType):
|
|
|
604
828
|
}
|
|
605
829
|
}
|
|
606
830
|
|
|
831
|
+
/**
|
|
832
|
+
* Resolve the model name, rendered prompt, and effective system prompt for a
|
|
833
|
+
* cloud text call.
|
|
834
|
+
*
|
|
835
|
+
* This used to also construct a Vercel AI-SDK `LanguageModel` (`openai.chat()`)
|
|
836
|
+
* plus a full `generateParams` object — but the handlers below call the cloud
|
|
837
|
+
* HTTP API directly (`requestRaw` → `/responses` / `/chat/completions`), so that
|
|
838
|
+
* AI-SDK client + params object was built and immediately discarded on every
|
|
839
|
+
* single text generation. Removed: it was pure per-call overhead and a
|
|
840
|
+
* misleading code path when reasoning about which transport actually runs.
|
|
841
|
+
*/
|
|
607
842
|
function buildGenerateParams(
|
|
608
843
|
runtime: IAgentRuntime,
|
|
609
844
|
modelType: TextModelType,
|
|
610
845
|
params: GenerateTextParams
|
|
611
846
|
) {
|
|
612
|
-
const paramsWithAttachments = params as GenerateTextParamsWithAttachments;
|
|
613
847
|
const prompt = params.prompt ?? "";
|
|
614
|
-
const maxTokens = params.maxTokens ?? 8192;
|
|
615
|
-
|
|
616
|
-
const openai = createOpenAIClient(runtime);
|
|
617
848
|
const modelName = getModelNameForType(runtime, modelType);
|
|
618
|
-
const experimentalTelemetry = getExperimentalTelemetry(runtime);
|
|
619
|
-
const userContent =
|
|
620
|
-
(paramsWithAttachments.attachments?.length ?? 0) > 0
|
|
621
|
-
? buildUserContent(paramsWithAttachments)
|
|
622
|
-
: undefined;
|
|
623
|
-
|
|
624
|
-
// Use openai.chat() (Chat Completions API) instead of openai.languageModel()
|
|
625
|
-
// (Responses API). The Responses API unconditionally rejects presencePenalty,
|
|
626
|
-
// frequencyPenalty, and stopSequences for ALL models, emitting noisy warnings.
|
|
627
|
-
// The Chat Completions API supports these features natively and handles
|
|
628
|
-
// reasoning models gracefully when the params are omitted.
|
|
629
|
-
const model = openai.chat(modelName) as LanguageModel;
|
|
630
|
-
|
|
631
|
-
// Reasoning models don't support temperature, frequency/presence penalties,
|
|
632
|
-
// or stopSequences. Detect via model name patterns.
|
|
633
|
-
const reasoning = isReasoningModel(modelName);
|
|
634
|
-
const stopSequences =
|
|
635
|
-
!reasoning &&
|
|
636
|
-
supportsStopSequences(modelName) &&
|
|
637
|
-
Array.isArray(params.stopSequences) &&
|
|
638
|
-
params.stopSequences.length > 0
|
|
639
|
-
? params.stopSequences
|
|
640
|
-
: undefined;
|
|
641
849
|
const systemPrompt = resolveEffectiveSystemPrompt({
|
|
642
850
|
params,
|
|
643
851
|
fallback: buildCanonicalSystemPrompt({ character: runtime.character }),
|
|
@@ -647,20 +855,7 @@ function buildGenerateParams(
|
|
|
647
855
|
omitDuplicateSystem: systemPrompt,
|
|
648
856
|
}) ?? prompt;
|
|
649
857
|
|
|
650
|
-
|
|
651
|
-
model,
|
|
652
|
-
...(userContent
|
|
653
|
-
? { messages: [{ role: "user" as const, content: userContent }] }
|
|
654
|
-
: { prompt: promptText }),
|
|
655
|
-
system: systemPrompt,
|
|
656
|
-
...(stopSequences ? { stopSequences } : {}),
|
|
657
|
-
maxOutputTokens: maxTokens,
|
|
658
|
-
experimental_telemetry: {
|
|
659
|
-
isEnabled: experimentalTelemetry,
|
|
660
|
-
},
|
|
661
|
-
};
|
|
662
|
-
|
|
663
|
-
return { generateParams, modelName, modelType, prompt: promptText, systemPrompt };
|
|
858
|
+
return { modelName, modelType, prompt: promptText, systemPrompt };
|
|
664
859
|
}
|
|
665
860
|
|
|
666
861
|
async function generateTextWithModel(
|
|
@@ -673,16 +868,33 @@ async function generateTextWithModel(
|
|
|
673
868
|
|
|
674
869
|
logger.debug(`[ELIZAOS_CLOUD] Generating text with ${modelType} model: ${modelName}`);
|
|
675
870
|
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
871
|
+
// Stream the user-visible reply token-by-token. Gated to the structured
|
|
872
|
+
// reply path (`streamStructured`, set only by the RESPONSE_HANDLER stage-1
|
|
873
|
+
// call): that call carries a responseSkeleton, so the runtime's field
|
|
874
|
+
// extractor surfaces `replyText` incrementally to the UI. Planner/other
|
|
875
|
+
// native calls (no responseSkeleton) stay buffered — streaming their raw
|
|
876
|
+
// envelope would leak internals to the UI stream. The bare `/responses`
|
|
877
|
+
// route stays buffered too (different SSE schema, not on the reply path).
|
|
878
|
+
const paramsStreaming = params as {
|
|
879
|
+
stream?: boolean;
|
|
880
|
+
streamStructured?: boolean;
|
|
881
|
+
};
|
|
882
|
+
const wantsStream =
|
|
883
|
+
Boolean(paramsStreaming.stream) &&
|
|
884
|
+
paramsStreaming.streamStructured === true &&
|
|
885
|
+
resolveStreamingEnabled();
|
|
681
886
|
|
|
682
887
|
logger.log(`[ELIZAOS_CLOUD] Using ${modelType} model: ${modelName}`);
|
|
683
888
|
logger.log(prompt);
|
|
684
889
|
|
|
685
890
|
if (hasNativeTransportOptions(paramsWithNative)) {
|
|
891
|
+
if (wantsStream) {
|
|
892
|
+
return streamNativeChatCompletion(runtime, modelType, paramsWithNative, {
|
|
893
|
+
modelName,
|
|
894
|
+
prompt,
|
|
895
|
+
systemPrompt,
|
|
896
|
+
});
|
|
897
|
+
}
|
|
686
898
|
const nativeResult = await generateNativeChatCompletion(runtime, modelType, paramsWithNative, {
|
|
687
899
|
modelName,
|
|
688
900
|
prompt,
|
|
@@ -718,13 +930,27 @@ async function generateTextWithModel(
|
|
|
718
930
|
requestBody.temperature = params.temperature;
|
|
719
931
|
}
|
|
720
932
|
|
|
721
|
-
const
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
933
|
+
const responsesHeaders: Record<string, string> = {
|
|
934
|
+
"X-Eliza-Llm-Purpose": getPurposeForModelType(modelType),
|
|
935
|
+
"X-Eliza-Model-Type": modelType,
|
|
936
|
+
};
|
|
937
|
+
if (isSpanSamplerHonoringModel(modelName)) {
|
|
938
|
+
const samplerHeader = buildSpanSamplerHeader(params.spanSamplerPlan);
|
|
939
|
+
if (samplerHeader) {
|
|
940
|
+
responsesHeaders["x-eliza-span-samplers"] = samplerHeader;
|
|
941
|
+
}
|
|
942
|
+
}
|
|
943
|
+
// Same shared cerebras key as the /chat/completions route, so gate this
|
|
944
|
+
// bare-prompt round-trip through the SAME limiter (parsing stays unguarded).
|
|
945
|
+
const response = await withNativeChatLimit(
|
|
946
|
+
() =>
|
|
947
|
+
createCloudApiClient(runtime).requestRaw("POST", "/responses", {
|
|
948
|
+
headers: responsesHeaders,
|
|
949
|
+
json: requestBody,
|
|
950
|
+
timeoutMs: resolveTextTimeoutMs(),
|
|
951
|
+
}),
|
|
952
|
+
"responses"
|
|
953
|
+
);
|
|
728
954
|
const responseText = await response.text();
|
|
729
955
|
let data: ResponsesApiResponse = {};
|
|
730
956
|
if (responseText) {
|
|
@@ -757,11 +983,23 @@ async function generateTextWithModel(
|
|
|
757
983
|
}
|
|
758
984
|
|
|
759
985
|
if (data.usage) {
|
|
760
|
-
emitModelUsageEvent(
|
|
761
|
-
|
|
762
|
-
|
|
763
|
-
|
|
764
|
-
|
|
986
|
+
emitModelUsageEvent(
|
|
987
|
+
runtime,
|
|
988
|
+
modelType,
|
|
989
|
+
prompt,
|
|
990
|
+
{
|
|
991
|
+
inputTokens: data.usage.input_tokens ?? 0,
|
|
992
|
+
outputTokens: data.usage.output_tokens ?? 0,
|
|
993
|
+
totalTokens: data.usage.total_tokens ?? 0,
|
|
994
|
+
},
|
|
995
|
+
{
|
|
996
|
+
modelName: getModelNameForType(runtime, modelType),
|
|
997
|
+
...(() => {
|
|
998
|
+
const costUsd = extractCostUsd(data.usage, response);
|
|
999
|
+
return typeof costUsd === "number" ? { costUsd } : {};
|
|
1000
|
+
})(),
|
|
1001
|
+
}
|
|
1002
|
+
);
|
|
765
1003
|
}
|
|
766
1004
|
|
|
767
1005
|
const text = extractResponsesOutputText(data);
|
|
@@ -772,7 +1010,9 @@ async function generateTextWithModel(
|
|
|
772
1010
|
return text;
|
|
773
1011
|
}
|
|
774
1012
|
|
|
775
|
-
|
|
1013
|
+
// Exported for unit tests (the concurrency limiter wrapper). Not part of the
|
|
1014
|
+
// plugin's public model-handler surface.
|
|
1015
|
+
export async function generateNativeChatCompletion(
|
|
776
1016
|
runtime: IAgentRuntime,
|
|
777
1017
|
modelType: TextModelType,
|
|
778
1018
|
params: GenerateTextParamsWithNativeOptions,
|
|
@@ -788,13 +1028,35 @@ async function generateNativeChatCompletion(
|
|
|
788
1028
|
context.prompt,
|
|
789
1029
|
context.systemPrompt
|
|
790
1030
|
);
|
|
791
|
-
const
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
1031
|
+
const headers: Record<string, string> = {
|
|
1032
|
+
"X-Eliza-Llm-Purpose": getPurposeForModelType(modelType),
|
|
1033
|
+
"X-Eliza-Model-Type": modelType,
|
|
1034
|
+
};
|
|
1035
|
+
// Per-span sampler overrides only ride along when the resolved model is a
|
|
1036
|
+
// fork-built eliza-1 deployment that knows how to honor the header. Other
|
|
1037
|
+
// upstreams (OpenAI / Anthropic / generic OpenRouter) strip unknown headers
|
|
1038
|
+
// safely, but we keep the wire surface narrow until the cloud honor path
|
|
1039
|
+
// lands in Wave 3.
|
|
1040
|
+
if (isSpanSamplerHonoringModel(context.modelName)) {
|
|
1041
|
+
const samplerHeader = buildSpanSamplerHeader(params.spanSamplerPlan);
|
|
1042
|
+
if (samplerHeader) {
|
|
1043
|
+
headers["x-eliza-span-samplers"] = samplerHeader;
|
|
1044
|
+
}
|
|
1045
|
+
}
|
|
1046
|
+
// Serialize the per-turn batcher/evaluator burst through the SAME shared
|
|
1047
|
+
// semaphore the /responses route uses, so N simultaneous native cloud text
|
|
1048
|
+
// calls don't overrun the one shared cerebras key's concurrent limit (-> 429
|
|
1049
|
+
// -> retries -> 30-63s). The permit is held only across the network
|
|
1050
|
+
// round-trip; the text()/JSON parse below runs unguarded.
|
|
1051
|
+
const response = await withNativeChatLimit(
|
|
1052
|
+
() =>
|
|
1053
|
+
createCloudApiClient(runtime).requestRaw("POST", "/chat/completions", {
|
|
1054
|
+
headers,
|
|
1055
|
+
json: requestBody,
|
|
1056
|
+
timeoutMs: resolveTextTimeoutMs(),
|
|
1057
|
+
}),
|
|
1058
|
+
"chat/completions"
|
|
1059
|
+
);
|
|
798
1060
|
const responseText = await response.text();
|
|
799
1061
|
let data: ChatCompletionsResponse = {};
|
|
800
1062
|
if (responseText) {
|
|
@@ -828,7 +1090,13 @@ async function generateNativeChatCompletion(
|
|
|
828
1090
|
|
|
829
1091
|
const usage = convertNativeUsage(data.usage);
|
|
830
1092
|
if (usage) {
|
|
831
|
-
emitModelUsageEvent(runtime, modelType, context.prompt, usage
|
|
1093
|
+
emitModelUsageEvent(runtime, modelType, context.prompt, usage, {
|
|
1094
|
+
modelName: context.modelName,
|
|
1095
|
+
...(() => {
|
|
1096
|
+
const costUsd = extractCostUsd(data.usage, response);
|
|
1097
|
+
return typeof costUsd === "number" ? { costUsd } : {};
|
|
1098
|
+
})(),
|
|
1099
|
+
});
|
|
832
1100
|
}
|
|
833
1101
|
|
|
834
1102
|
const text = extractChatCompletionText(data);
|
|
@@ -849,6 +1117,332 @@ async function generateNativeChatCompletion(
|
|
|
849
1117
|
};
|
|
850
1118
|
}
|
|
851
1119
|
|
|
1120
|
+
// ---------------------------------------------------------------------------
|
|
1121
|
+
// Streaming native /chat/completions (token-by-token, OpenAI-compatible SSE)
|
|
1122
|
+
// ---------------------------------------------------------------------------
|
|
1123
|
+
|
|
1124
|
+
interface Deferred<T> {
|
|
1125
|
+
promise: Promise<T>;
|
|
1126
|
+
resolve: (value: T) => void;
|
|
1127
|
+
}
|
|
1128
|
+
|
|
1129
|
+
function deferred<T>(): Deferred<T> {
|
|
1130
|
+
let resolve!: (value: T) => void;
|
|
1131
|
+
const promise = new Promise<T>((r) => {
|
|
1132
|
+
resolve = r;
|
|
1133
|
+
});
|
|
1134
|
+
return { promise, resolve };
|
|
1135
|
+
}
|
|
1136
|
+
|
|
1137
|
+
/**
|
|
1138
|
+
* Parse an OpenAI-compatible SSE byte stream into the decoded JSON frame of
|
|
1139
|
+
* each `data:` line. Yields one object per frame; stops at `data: [DONE]`.
|
|
1140
|
+
* Tolerates partial reads (buffers across chunk boundaries) and ignores
|
|
1141
|
+
* non-`data:` lines (comments, blank separators). Exported for unit tests.
|
|
1142
|
+
*/
|
|
1143
|
+
export async function* parseOpenAiSseStream(
|
|
1144
|
+
body: ReadableStream<Uint8Array>
|
|
1145
|
+
): AsyncGenerator<Record<string, unknown>> {
|
|
1146
|
+
const reader = body.getReader();
|
|
1147
|
+
const decoder = new TextDecoder();
|
|
1148
|
+
let buffer = "";
|
|
1149
|
+
const handle = (line: string): Record<string, unknown> | "DONE" | null => {
|
|
1150
|
+
const trimmed = line.trimStart();
|
|
1151
|
+
if (!trimmed.startsWith("data:")) return null;
|
|
1152
|
+
const payload = trimmed.slice(5).trim();
|
|
1153
|
+
if (payload === "") return null;
|
|
1154
|
+
if (payload === "[DONE]") return "DONE";
|
|
1155
|
+
try {
|
|
1156
|
+
return JSON.parse(payload) as Record<string, unknown>;
|
|
1157
|
+
} catch {
|
|
1158
|
+
return null;
|
|
1159
|
+
}
|
|
1160
|
+
};
|
|
1161
|
+
try {
|
|
1162
|
+
for (;;) {
|
|
1163
|
+
const { value, done } = await reader.read();
|
|
1164
|
+
if (done) break;
|
|
1165
|
+
buffer += decoder.decode(value, { stream: true });
|
|
1166
|
+
let nl: number;
|
|
1167
|
+
while ((nl = buffer.indexOf("\n")) >= 0) {
|
|
1168
|
+
const line = buffer.slice(0, nl);
|
|
1169
|
+
buffer = buffer.slice(nl + 1);
|
|
1170
|
+
const frame = handle(line);
|
|
1171
|
+
if (frame === "DONE") return;
|
|
1172
|
+
if (frame) yield frame;
|
|
1173
|
+
}
|
|
1174
|
+
}
|
|
1175
|
+
const tail = handle(buffer);
|
|
1176
|
+
if (tail && tail !== "DONE") yield tail;
|
|
1177
|
+
} finally {
|
|
1178
|
+
// cancel() (not just releaseLock()) tears down the underlying connection,
|
|
1179
|
+
// so an EARLY consumer break (runtime abort / turn-supersede / a downstream
|
|
1180
|
+
// throw closes this generator via .return()) stops the upstream generation
|
|
1181
|
+
// instead of letting it run to its natural end and bill tokens nobody reads.
|
|
1182
|
+
// On natural completion the stream is already done, so this is a no-op; it
|
|
1183
|
+
// also releases the lock. Not threading the abort signal into the fetch on
|
|
1184
|
+
// purpose — cancel() gets the teardown without rejecting an in-flight read
|
|
1185
|
+
// with AbortError and changing the runtime's quiet-stop semantics.
|
|
1186
|
+
try {
|
|
1187
|
+
await reader.cancel();
|
|
1188
|
+
} catch {
|
|
1189
|
+
// Reader already cancelled/released by an upstream abort — nothing to do.
|
|
1190
|
+
}
|
|
1191
|
+
}
|
|
1192
|
+
}
|
|
1193
|
+
|
|
1194
|
+
interface StreamingToolCallAcc {
|
|
1195
|
+
id?: string;
|
|
1196
|
+
name?: string;
|
|
1197
|
+
args: string;
|
|
1198
|
+
}
|
|
1199
|
+
|
|
1200
|
+
/** Fold one SSE `delta.tool_calls[]` array into the per-index accumulator. */
|
|
1201
|
+
export function accumulateToolCallDeltas(
|
|
1202
|
+
acc: Map<number, StreamingToolCallAcc>,
|
|
1203
|
+
deltas: unknown
|
|
1204
|
+
): void {
|
|
1205
|
+
if (!Array.isArray(deltas)) return;
|
|
1206
|
+
for (const raw of deltas) {
|
|
1207
|
+
const d = asRecord(raw);
|
|
1208
|
+
const index = typeof d.index === "number" ? d.index : 0;
|
|
1209
|
+
const cur = acc.get(index) ?? { args: "" };
|
|
1210
|
+
const id = firstString(d.id);
|
|
1211
|
+
if (id) cur.id = id;
|
|
1212
|
+
const fn = recordAt(d, "function");
|
|
1213
|
+
const name = firstString(fn.name);
|
|
1214
|
+
if (name) cur.name = name;
|
|
1215
|
+
if (typeof fn.arguments === "string") cur.args += fn.arguments;
|
|
1216
|
+
acc.set(index, cur);
|
|
1217
|
+
}
|
|
1218
|
+
}
|
|
1219
|
+
|
|
1220
|
+
/** Materialize accumulated tool-call deltas into the buffered-path shape. */
|
|
1221
|
+
export function finalizeStreamedToolCalls(
|
|
1222
|
+
acc: Map<number, StreamingToolCallAcc>
|
|
1223
|
+
): NativeToolCall[] {
|
|
1224
|
+
const out: NativeToolCall[] = [];
|
|
1225
|
+
for (const [index, c] of [...acc.entries()].sort((a, b) => a[0] - b[0])) {
|
|
1226
|
+
if (!c.name) continue;
|
|
1227
|
+
out.push({
|
|
1228
|
+
type: "tool-call",
|
|
1229
|
+
toolCallId: c.id ?? `call_${c.name}_${index}`,
|
|
1230
|
+
toolName: c.name,
|
|
1231
|
+
input: parseJsonIfPossible(c.args.trim() === "" ? "{}" : c.args),
|
|
1232
|
+
});
|
|
1233
|
+
}
|
|
1234
|
+
return out;
|
|
1235
|
+
}
|
|
1236
|
+
|
|
1237
|
+
/**
|
|
1238
|
+
* Streaming variant of {@link generateNativeChatCompletion}: returns a
|
|
1239
|
+
* {@link TextStreamResult} whose `textStream` yields `delta.content` as it
|
|
1240
|
+
* arrives, so `useModel`'s for-await loop streams it to the UI from the first
|
|
1241
|
+
* token. Falls back to a single-chunk buffered result if the gateway answers
|
|
1242
|
+
* non-SSE (self-healing). The shared concurrency permit is held for the whole
|
|
1243
|
+
* stream lifetime (released in the generator's `finally`), not just until
|
|
1244
|
+
* headers arrive — otherwise the cap would under-count in-flight requests.
|
|
1245
|
+
*/
|
|
1246
|
+
export async function streamNativeChatCompletion(
|
|
1247
|
+
runtime: IAgentRuntime,
|
|
1248
|
+
modelType: TextModelType,
|
|
1249
|
+
params: GenerateTextParamsWithNativeOptions,
|
|
1250
|
+
context: { modelName: string; prompt: string; systemPrompt?: string }
|
|
1251
|
+
): Promise<TextStreamResult> {
|
|
1252
|
+
const requestBody = buildNativeRequestBody(
|
|
1253
|
+
params,
|
|
1254
|
+
context.modelName,
|
|
1255
|
+
context.prompt,
|
|
1256
|
+
context.systemPrompt
|
|
1257
|
+
);
|
|
1258
|
+
requestBody.stream = true;
|
|
1259
|
+
// OpenAI-compatible: ask the server to include a final usage-only frame so we
|
|
1260
|
+
// can meter the streamed call accurately.
|
|
1261
|
+
requestBody.stream_options = { include_usage: true };
|
|
1262
|
+
|
|
1263
|
+
const headers: Record<string, string> = {
|
|
1264
|
+
"X-Eliza-Llm-Purpose": getPurposeForModelType(modelType),
|
|
1265
|
+
"X-Eliza-Model-Type": modelType,
|
|
1266
|
+
};
|
|
1267
|
+
if (isSpanSamplerHonoringModel(context.modelName)) {
|
|
1268
|
+
const samplerHeader = buildSpanSamplerHeader(params.spanSamplerPlan);
|
|
1269
|
+
if (samplerHeader) {
|
|
1270
|
+
headers["x-eliza-span-samplers"] = samplerHeader;
|
|
1271
|
+
}
|
|
1272
|
+
}
|
|
1273
|
+
|
|
1274
|
+
const abortSignal = (params as { signal?: AbortSignal }).signal;
|
|
1275
|
+
const signal = buildStreamAbortSignal(abortSignal, resolveTextTimeoutMs());
|
|
1276
|
+
|
|
1277
|
+
const limiter = getNativeChatLimiter();
|
|
1278
|
+
const waitStartedAt = Date.now();
|
|
1279
|
+
await limiter.acquire();
|
|
1280
|
+
recordInferenceSpan("cloud.semaphore-wait", Date.now() - waitStartedAt, {
|
|
1281
|
+
route: "chat/completions:stream",
|
|
1282
|
+
});
|
|
1283
|
+
let permitReleased = false;
|
|
1284
|
+
const releasePermit = (): void => {
|
|
1285
|
+
if (!permitReleased) {
|
|
1286
|
+
permitReleased = true;
|
|
1287
|
+
limiter.release();
|
|
1288
|
+
}
|
|
1289
|
+
};
|
|
1290
|
+
|
|
1291
|
+
let response: Response;
|
|
1292
|
+
try {
|
|
1293
|
+
response = await createCloudApiClient(runtime).requestRaw("POST", "/chat/completions", {
|
|
1294
|
+
headers,
|
|
1295
|
+
json: requestBody,
|
|
1296
|
+
...(signal ? { signal } : {}),
|
|
1297
|
+
});
|
|
1298
|
+
} catch (err) {
|
|
1299
|
+
releasePermit();
|
|
1300
|
+
throw err;
|
|
1301
|
+
}
|
|
1302
|
+
|
|
1303
|
+
if (!response.ok) {
|
|
1304
|
+
let errorBody: { message?: string } | undefined;
|
|
1305
|
+
try {
|
|
1306
|
+
const errText = await response.text();
|
|
1307
|
+
if (errText) {
|
|
1308
|
+
errorBody = (JSON.parse(errText) as ChatCompletionsResponse).error;
|
|
1309
|
+
}
|
|
1310
|
+
} catch {
|
|
1311
|
+
// Non-JSON error body — fall through to the status-coded message.
|
|
1312
|
+
}
|
|
1313
|
+
releasePermit();
|
|
1314
|
+
const message =
|
|
1315
|
+
typeof errorBody?.message === "string" && errorBody.message.trim()
|
|
1316
|
+
? errorBody.message.trim()
|
|
1317
|
+
: `elizaOS Cloud error ${response.status}`;
|
|
1318
|
+
const requestError = new Error(message) as Error & {
|
|
1319
|
+
status?: number;
|
|
1320
|
+
error?: unknown;
|
|
1321
|
+
};
|
|
1322
|
+
requestError.status = response.status;
|
|
1323
|
+
if (errorBody) requestError.error = errorBody;
|
|
1324
|
+
throw requestError;
|
|
1325
|
+
}
|
|
1326
|
+
|
|
1327
|
+
const contentType = response.headers.get("content-type") ?? "";
|
|
1328
|
+
const isSse = contentType.includes("text/event-stream") && response.body !== null;
|
|
1329
|
+
|
|
1330
|
+
// Self-healing fallback: gateway answered with a buffered JSON body despite
|
|
1331
|
+
// the stream request. Yield it as a single chunk so the streaming contract
|
|
1332
|
+
// (and the structured-field extractor downstream) still works.
|
|
1333
|
+
if (!isSse) {
|
|
1334
|
+
const bufferedText = await response.text();
|
|
1335
|
+
releasePermit();
|
|
1336
|
+
let data: ChatCompletionsResponse = {};
|
|
1337
|
+
if (bufferedText) {
|
|
1338
|
+
try {
|
|
1339
|
+
data = JSON.parse(bufferedText) as ChatCompletionsResponse;
|
|
1340
|
+
} catch (parseErr) {
|
|
1341
|
+
logger.error(
|
|
1342
|
+
`[ELIZAOS_CLOUD] Failed to parse buffered chat completions JSON: ${
|
|
1343
|
+
parseErr instanceof Error ? parseErr.message : String(parseErr)
|
|
1344
|
+
}`
|
|
1345
|
+
);
|
|
1346
|
+
}
|
|
1347
|
+
}
|
|
1348
|
+
const text = extractChatCompletionText(data);
|
|
1349
|
+
const toolCalls = extractNativeToolCalls(data);
|
|
1350
|
+
const usage = convertNativeUsage(data.usage);
|
|
1351
|
+
if (usage) {
|
|
1352
|
+
emitModelUsageEvent(runtime, modelType, context.prompt, usage, {
|
|
1353
|
+
modelName: context.modelName,
|
|
1354
|
+
...(() => {
|
|
1355
|
+
const costUsd = extractCostUsd(data.usage, response);
|
|
1356
|
+
return typeof costUsd === "number" ? { costUsd } : {};
|
|
1357
|
+
})(),
|
|
1358
|
+
});
|
|
1359
|
+
}
|
|
1360
|
+
if (!text.trim() && toolCalls.length === 0) {
|
|
1361
|
+
throw new Error("elizaOS Cloud returned no text or tool calls");
|
|
1362
|
+
}
|
|
1363
|
+
async function* single(): AsyncGenerator<string> {
|
|
1364
|
+
if (text) yield text;
|
|
1365
|
+
}
|
|
1366
|
+
return {
|
|
1367
|
+
textStream: single(),
|
|
1368
|
+
text: Promise.resolve(text),
|
|
1369
|
+
usage: Promise.resolve(usage),
|
|
1370
|
+
finishReason: Promise.resolve(data.choices?.[0]?.finish_reason),
|
|
1371
|
+
toolCalls: Promise.resolve(toolCalls),
|
|
1372
|
+
providerMetadata: { modelName: context.modelName, usage: data.usage },
|
|
1373
|
+
};
|
|
1374
|
+
}
|
|
1375
|
+
|
|
1376
|
+
const body = response.body as ReadableStream<Uint8Array>;
|
|
1377
|
+
const toolAcc = new Map<number, StreamingToolCallAcc>();
|
|
1378
|
+
let accumulated = "";
|
|
1379
|
+
let nativeUsage: NativeTokenUsage | undefined;
|
|
1380
|
+
let rawUsage: unknown;
|
|
1381
|
+
let finishReason: string | undefined;
|
|
1382
|
+
|
|
1383
|
+
const textD = deferred<string>();
|
|
1384
|
+
const usageD = deferred<TokenUsage | undefined>();
|
|
1385
|
+
const finishD = deferred<string | undefined>();
|
|
1386
|
+
const toolCallsD = deferred<NativeToolCall[]>();
|
|
1387
|
+
|
|
1388
|
+
async function* generate(): AsyncGenerator<string> {
|
|
1389
|
+
try {
|
|
1390
|
+
for await (const frame of parseOpenAiSseStream(body)) {
|
|
1391
|
+
if (frame.error) {
|
|
1392
|
+
const message = asRecord(frame.error).message;
|
|
1393
|
+
throw new Error(
|
|
1394
|
+
typeof message === "string" && message.trim()
|
|
1395
|
+
? message.trim()
|
|
1396
|
+
: "elizaOS Cloud stream error"
|
|
1397
|
+
);
|
|
1398
|
+
}
|
|
1399
|
+
const choices = Array.isArray(frame.choices) ? frame.choices : [];
|
|
1400
|
+
const choice = asRecord(choices[0]);
|
|
1401
|
+
const delta = recordAt(choice, "delta");
|
|
1402
|
+
// Raw (un-trimmed) content — inter-token whitespace is significant.
|
|
1403
|
+
if (typeof delta.content === "string" && delta.content.length > 0) {
|
|
1404
|
+
accumulated += delta.content;
|
|
1405
|
+
yield delta.content;
|
|
1406
|
+
}
|
|
1407
|
+
if (delta.tool_calls) {
|
|
1408
|
+
accumulateToolCallDeltas(toolAcc, delta.tool_calls);
|
|
1409
|
+
}
|
|
1410
|
+
const fr = firstString(choice.finish_reason);
|
|
1411
|
+
if (fr) finishReason = fr;
|
|
1412
|
+
if (frame.usage) {
|
|
1413
|
+
rawUsage = frame.usage;
|
|
1414
|
+
nativeUsage = convertNativeUsage(frame.usage);
|
|
1415
|
+
}
|
|
1416
|
+
}
|
|
1417
|
+
} finally {
|
|
1418
|
+
releasePermit();
|
|
1419
|
+
const toolCalls = finalizeStreamedToolCalls(toolAcc);
|
|
1420
|
+
textD.resolve(accumulated);
|
|
1421
|
+
usageD.resolve(nativeUsage);
|
|
1422
|
+
finishD.resolve(finishReason);
|
|
1423
|
+
toolCallsD.resolve(toolCalls);
|
|
1424
|
+
if (nativeUsage) {
|
|
1425
|
+
emitModelUsageEvent(runtime, modelType, context.prompt, nativeUsage, {
|
|
1426
|
+
modelName: context.modelName,
|
|
1427
|
+
...(() => {
|
|
1428
|
+
const costUsd = extractCostUsd(rawUsage, response);
|
|
1429
|
+
return typeof costUsd === "number" ? { costUsd } : {};
|
|
1430
|
+
})(),
|
|
1431
|
+
});
|
|
1432
|
+
}
|
|
1433
|
+
}
|
|
1434
|
+
}
|
|
1435
|
+
|
|
1436
|
+
return {
|
|
1437
|
+
textStream: generate(),
|
|
1438
|
+
text: textD.promise,
|
|
1439
|
+
usage: usageD.promise,
|
|
1440
|
+
finishReason: finishD.promise,
|
|
1441
|
+
toolCalls: toolCallsD.promise,
|
|
1442
|
+
providerMetadata: { modelName: context.modelName },
|
|
1443
|
+
};
|
|
1444
|
+
}
|
|
1445
|
+
|
|
852
1446
|
export async function handleTextSmall(
|
|
853
1447
|
runtime: IAgentRuntime,
|
|
854
1448
|
params: GenerateTextParams
|