pi-sap-aicore 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/stream.ts ADDED
@@ -0,0 +1,1051 @@
1
+ import { randomUUID } from "node:crypto";
2
+ import { appendFileSync } from "node:fs";
3
+ import { tmpdir } from "node:os";
4
+ import { join } from "node:path";
5
+
6
+ import {
7
+ type Api,
8
+ type AssistantMessage,
9
+ type AssistantMessageEventStream,
10
+ calculateCost,
11
+ type Context,
12
+ createAssistantMessageEventStream,
13
+ type Model,
14
+ type SimpleStreamOptions,
15
+ type Usage,
16
+ } from "@earendil-works/pi-ai";
17
+ import { AuthStorage } from "@earendil-works/pi-coding-agent";
18
+ import type {
19
+ ChatModel,
20
+ LlmModelParams,
21
+ } from "@sap-ai-sdk/orchestration";
22
+ import type { TokenUsage } from "@sap-ai-sdk/orchestration/internal.js";
23
+
24
+ import { parseAndValidateServiceKey, type ValidatedKey } from "./auth.ts";
25
+ import { mapFinishReason, piContextToOrchestration } from "./translate.ts";
26
+
27
+ // `@sap-ai-sdk/orchestration` is loaded dynamically (not at module load) so a
28
+ // missing dependency surfaces as an actionable, in-stream error instead of a
29
+ // raw `ERR_MODULE_NOT_FOUND` crash at pi startup. Only the `OrchestrationClient`
30
+ // value needs a runtime import — every other SAP symbol used here is `import
31
+ // type` and erased at compile time, so importing this module is side-effect
32
+ // free until the first actual stream. Keeping the import here (rather than an
33
+ // `async` wrapper in index.ts) lets `streamSimple` stay synchronous, which is
34
+ // the shape pi's provider contract requires.
35
/**
 * Loads `@sap-ai-sdk/orchestration` on demand.
 *
 * A failed import is rethrown untouched unless it is specifically an
 * `ERR_MODULE_NOT_FOUND` whose message names the SAP SDK; in that case it is
 * replaced by an error that tells the user how to finish the installation.
 *
 * @returns The loaded SDK module namespace.
 * @throws The original import error, or the actionable "isn't installed" error.
 */
async function importOrchestration(): Promise<
  typeof import("@sap-ai-sdk/orchestration")
> {
  let failure: unknown;
  try {
    return await import("@sap-ai-sdk/orchestration");
  } catch (err) {
    failure = err;
  }

  const errorCode = (failure as NodeJS.ErrnoException)?.code;
  const errorText = (failure as Error)?.message ?? "";
  // Only a missing SAP SDK gets the friendly rewrite; anything else (a broken
  // transitive dependency, a syntax error in the package, ...) passes through.
  if (
    errorCode !== "ERR_MODULE_NOT_FOUND" ||
    !errorText.includes("@sap-ai-sdk/orchestration")
  ) {
    throw failure;
  }

  throw new Error(
    "The SAP AI Core SDK (@sap-ai-sdk/orchestration) isn't installed, so " +
      "this provider can't make requests. pi loaded the extension but its " +
      "dependencies didn't finish installing. Fix: run `npm install` in the " +
      "pi-sap-aicore directory (where pi installed it, e.g. under " +
      "~/.pi/agent/), then restart pi. See the pi-sap-aicore README " +
      "(Installation) for details.",
  );
}
58
+
59
+ // Opt-in request logging for diagnosing server-side failures whose error body
60
+ // doesn't echo back what we sent (e.g. SAP's "Internal server error" 500s,
61
+ // which only return the templating result, not the params/messages). Set
62
+ // PI_SAP_AICORE_DEBUG_PAYLOAD to a file path to append one JSON line per
63
+ // request and one per error — both keyed by the same `requestId`, so you can
64
+ // grep `"kind":"error"` and look up the request that triggered it. Set it to
65
+ // "1"/"true" to use <tmpdir>/pi-sap-aicore-payloads.jsonl. WARNING: logs full
66
+ // message bodies, so leave it off unless actively debugging — the file will
67
+ // contain prompt content.
68
/**
 * Resolves where debug payloads should be appended, based on the
 * `PI_SAP_AICORE_DEBUG_PAYLOAD` environment variable.
 *
 * @returns `undefined` when the variable is unset or blank, the default file
 *   under the OS temp directory when it is "1" or "true" (any casing), and
 *   the trimmed value itself otherwise.
 */
function debugPayloadPath(): string | undefined {
  const configured = process.env.PI_SAP_AICORE_DEBUG_PAYLOAD?.trim();
  if (configured === undefined || configured === "") return undefined;

  const wantsDefaultFile =
    configured === "1" || configured.toLowerCase() === "true";
  return wantsDefaultFile
    ? join(tmpdir(), "pi-sap-aicore-payloads.jsonl")
    : configured;
}
76
+
77
/**
 * Appends one JSON line (the entry plus an ISO `ts` timestamp) to the debug
 * payload file, if debug logging is enabled.
 *
 * Best-effort by design: serialization and file-system failures are swallowed
 * so diagnostics can never break a real request.
 *
 * @param entry Arbitrary fields to record; an explicit `ts` key in the entry
 *   overrides the generated timestamp.
 */
export function debugLog(entry: Record<string, unknown>): void {
  const target = debugPayloadPath();
  if (!target) return;

  try {
    const record = { ts: new Date().toISOString(), ...entry };
    appendFileSync(target, JSON.stringify(record) + "\n");
  } catch {
    // Deliberately ignored — logging is diagnostic only.
  }
}
87
+
88
+ // SAP SDK wraps server-side errors as `Error while iterating over SSE stream`
89
+ // with the real error attached via `.cause`. Walk the chain so the user sees
90
+ // what SAP/Anthropic actually complained about.
91
+ //
92
+ // SAP's http-client.js wraps axios errors as
93
+ // `ErrorWithCause("Request failed with status code N.", axiosError)`. The
94
+ // wrapper .message and the axios .message are IDENTICAL, and the real
95
+ // server explanation lives on `axiosError.response.data` (already parsed
96
+ // from the SSE error frame by handleStreamError). Without extracting it,
97
+ // the surface is just "400 → 400" with no actionable info.
98
/** Default cap on how many characters of server detail are surfaced. */
const MAX_DETAIL_CHARS = 2000;

/**
 * Shortens `s` to at most `max` characters, appending a marker that states
 * how many characters were dropped. Strings within the limit are returned
 * unchanged.
 *
 * @param s Text to shorten.
 * @param max Maximum number of characters to keep.
 */
function truncate(s: string, max = MAX_DETAIL_CHARS): string {
  if (s.length <= max) return s;
  const dropped = s.length - max;
  return `${s.slice(0, max)}…[+${dropped} chars]`;
}
103
+
104
+ // Try known server-error shapes first (SAP, Anthropic-via-orchestration),
105
+ // fall back to JSON. Returns undefined when there's nothing meaningful to
106
+ // say beyond what .message already conveyed.
107
/**
 * Turns a server response body into a short human-readable detail string.
 *
 * Handles, in order: nullish bodies (nothing to say), strings (trimmed),
 * other primitives (stringified), objects carrying a `message` either at the
 * top level or under `error`, and finally any other object as JSON.
 *
 * @param data The response body, already parsed when it was JSON.
 * @returns The truncated detail, or `undefined` when there is nothing to add.
 */
function extractServerDetail(data: unknown): string | undefined {
  if (data === null || data === undefined) return undefined;

  if (typeof data === "string") {
    const text = data.trim();
    if (text.length === 0) return undefined;
    return truncate(text);
  }
  if (typeof data !== "object") return truncate(String(data));

  const payload = data as Record<string, unknown>;

  // The message may sit directly on the body or one level down under `error`.
  const carrier = (payload.error ?? payload) as Record<string, unknown>;
  const message =
    typeof carrier.message === "string" ? carrier.message : undefined;
  const kind = typeof carrier.type === "string" ? carrier.type : undefined;
  if (message) {
    const where =
      typeof carrier.location === "string" ? carrier.location : undefined;
    const lead = kind ? `${kind}: ` : "";
    const trail = where ? ` (at ${where})` : "";
    return truncate(`${lead}${message}${trail}`);
  }

  // Unknown shape: show the raw JSON and let the reader interpret it.
  try {
    return truncate(JSON.stringify(payload));
  } catch {
    return truncate(String(payload));
  }
}
137
+
138
+ // SAP SDK upstream bug: @sap-ai-sdk/core's `handleStreamError` (in
139
+ // http-client.js) does an unconditional `JSON.parse` on the response
140
+ // body via `node:stream/consumers`'s `json()`. When SAP AI Core's
141
+ // gateway (Envoy/Istio) returns a plain-text error like
142
+ // `upstream connect error and disconnect/reset before headers...`
143
+ // — typical for transient backend unreachability, gateway timeouts,
144
+ // or some rate-limit responses — `JSON.parse` throws a raw V8
145
+ // SyntaxError that escapes BEFORE the SDK's `throw new ErrorWithCause`
146
+ // wrapper, so we lose the status code and any structured context.
147
+ //
148
+ // Detect that exact shape so the user sees something actionable
149
+ // instead of `Unexpected token 'u', "upstream c"... is not valid JSON`.
150
+ // We also try to recover the original gateway body text from the
151
+ // SyntaxError message itself (V8 includes the first ~chars of the
152
+ // offending input as `"upstream c"...`), so the user can tell
153
+ // envoy-from-anything-else apart at a glance.
154
/**
 * Reports whether `error` is the raw `SyntaxError` that `JSON.parse` throws
 * when it is handed a non-JSON (or empty) response body.
 *
 * Recognised V8 message shapes:
 *   - `Unexpected token 'X', "<snippet>"... is not valid JSON`
 *   - `Unexpected token X in JSON at position N` (older V8)
 *   - `Unexpected non-whitespace character after JSON at position N`
 *   - `Unexpected end of JSON input` (empty or truncated body)
 *
 * @param error Any thrown value.
 * @returns `true` only for a `SyntaxError` whose message matches one of the
 *   shapes above.
 */
function looksLikeSapGatewayJsonParseFailure(error: unknown): boolean {
  if (!(error instanceof SyntaxError)) return false;
  const msg = error.message ?? "";
  // The last two alternatives were previously missed: they contain neither
  // "is not valid JSON" nor "Unexpected token", so an empty gateway body or
  // trailing garbage surfaced as a bare SyntaxError with no hint.
  return /is not valid JSON|Unexpected token|Unexpected non-whitespace character|Unexpected end of JSON input/.test(
    msg,
  );
}
161
+
162
/**
 * Builds the user-facing explanation for a JSON-parse failure on a gateway
 * response.
 *
 * If the `SyntaxError` message quotes the start of the offending body
 * (`"<snippet>"...`), that snippet is echoed back, and a snippet beginning
 * with "upstream" selects the Envoy-specific diagnosis.
 *
 * @param error The `SyntaxError` raised while parsing the response body.
 * @returns A single-paragraph hint with diagnosis, optional snippet and
 *   retry advice.
 */
function sapGatewayHint(error: SyntaxError): string {
  const quoted = /"([^"]+)"\.\.\./.exec(error.message);
  const snippet = quoted === null ? undefined : quoted[1];

  let diagnosis: string;
  if (snippet !== undefined && /^upstream\b/i.test(snippet)) {
    diagnosis =
      "SAP AI Core's gateway (Envoy) returned a plain-text error instead of JSON. " +
      "This is almost always transient — upstream connect failure, gateway " +
      "timeout on a long reasoning turn, or a non-JSON 429/503 from the proxy.";
  } else {
    diagnosis =
      "SAP AI Core returned a non-JSON response body. Likely a transient " +
      "gateway/proxy error (timeout, upstream unreachable, or non-JSON 5xx).";
  }

  const bodyNote = snippet ? ` Body started with: "${snippet}...".` : "";

  return [
    diagnosis,
    bodyNote,
    " Retry usually works; if it persists, check the SAP AI ",
    "Core service status and that your deployment + resource group are healthy. ",
    "(Underlying SDK bug: @sap-ai-sdk/core's handleStreamError JSON.parses the ",
    "error body unconditionally; see axios#6468.)",
  ].join("");
}
180
+
181
+ // SAP's SSE iterator (@sap-ai-sdk/core/dist/stream/sse-stream.js) throws
182
+ // `new Error("Error received from the server.\n" + JSON.stringify(data.error))`
183
+ // when the orchestration server emits a mid-stream error frame (e.g. a
184
+ // 500 from the LLM Module after templating succeeded). The JSON body
185
+ // includes `intermediate_results.templating`, which echoes our entire
186
+ // system prompt back at the user — useless noise that drowns the
187
+ // actionable bits ({code, message, location, request_id}). Detect this
188
+ // shape, extract the signal, and drop the echo.
189
/**
 * Reports whether `error` is an `Error` whose message begins with the fixed
 * "Error received from the server." prefix used for mid-stream server errors.
 *
 * @param error Any thrown value.
 */
function looksLikeSapServerSseError(error: unknown): boolean {
  if (!(error instanceof Error)) return false;
  return error.message.startsWith("Error received from the server.");
}
195
+
196
/** The fields of a mid-stream SAP error frame that are worth showing. */
type SapSseErrorBody = {
  code?: number;
  message?: string;
  location?: string;
  request_id?: string;
};

/**
 * Extracts the actionable parts of a mid-stream SAP server error.
 *
 * The error message is expected to be a header line followed by a newline
 * and a JSON body. Only `code`, `location`, `message` and `request_id` are
 * surfaced; everything else in the body is dropped.
 *
 * @param error An error whose message has the "header\n<json>" shape.
 * @returns A compact `SAP <code> at <location>: <message> (request_id: …)`
 *   string; the text itself when the body is a bare JSON string; or
 *   `undefined` when there is no body, it is not valid JSON, or it is JSON
 *   of an unusable shape (null, number, boolean, array, blank string).
 */
function extractSapSseErrorDetail(error: Error): string | undefined {
  const newline = error.message.indexOf("\n");
  if (newline < 0) return undefined;
  const body = error.message.slice(newline + 1).trim();

  let parsed: unknown;
  try {
    parsed = JSON.parse(body);
  } catch {
    return undefined;
  }

  // A string-valued error frame carries its explanation directly. Previously
  // this fell through the field probes and was reported as "(no message)".
  if (typeof parsed === "string") {
    const text = parsed.trim();
    return text.length > 0 ? text : undefined;
  }
  // Anything else that is not a plain object has no fields to read; returning
  // undefined lets the caller fall back to the header line.
  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
    return undefined;
  }

  const d = parsed as SapSseErrorBody;
  const parts: string[] = [];
  if (typeof d.code === "number") parts.push(`SAP ${d.code}`);
  if (typeof d.location === "string" && d.location.length > 0)
    parts.push(`at ${d.location}`);
  const head = parts.length > 0 ? `${parts.join(" ")}: ` : "";
  const tail =
    typeof d.request_id === "string" && d.request_id.length > 0
      ? ` (request_id: ${d.request_id})`
      : "";
  const msg = typeof d.message === "string" ? d.message : "(no message)";
  return `${head}${msg}${tail}`;
}
224
+
225
/**
 * Flattens an error and its `.cause` chain into one readable line.
 *
 * For each `Error` in the chain this emits, in order: a cleaned-up form of
 * its message (SAP mid-stream errors are reduced to their actionable fields,
 * JSON-parse failures get a gateway hint ahead of the raw message) and any
 * detail found on `response.data`. Duplicate fragments are emitted once.
 * Fragments are joined with " → ".
 *
 * @param error Any thrown value.
 * @returns The joined description, or `String(error)` when nothing could be
 *   extracted.
 */
export function formatError(error: unknown): string {
  const parts: string[] = [];
  const seen = new Set<string>();
  const push = (s: string | undefined) => {
    if (!s) return;
    if (seen.has(s)) return;
    seen.add(s);
    parts.push(s);
  };

  // Guards against circular `.cause` chains (a.cause === a, or a → b → a),
  // which would otherwise keep this loop spinning forever.
  const visited = new Set<Error>();

  let current: unknown = error;
  while (current instanceof Error && !visited.has(current)) {
    visited.add(current);
    if (looksLikeSapServerSseError(current)) {
      const detail = extractSapSseErrorDetail(current);
      if (detail) {
        push(detail);
      } else {
        // Fallback: keep first line only so we don't dump the
        // echoed system prompt on a parse failure.
        push(current.message.split("\n", 1)[0]);
      }
    } else if (looksLikeSapGatewayJsonParseFailure(current)) {
      push(sapGatewayHint(current as SyntaxError));
      push(current.message);
    } else {
      push(current.message);
    }
    const response = (current as Error & { response?: { data?: unknown } })
      .response;
    push(extractServerDetail(response?.data));
    current = (current as Error & { cause?: unknown }).cause;
  }

  // Whatever ended the walk: an already-visited Error (cycle — nothing new to
  // say), a nullish cause, or a non-Error cause. Plain objects go through
  // extractServerDetail so they render as their message/JSON rather than
  // "[object Object]".
  if (
    !(current instanceof Error) &&
    current !== undefined &&
    current !== null
  ) {
    push(
      typeof current === "object"
        ? extractServerDetail(current)
        : String(current),
    );
  }
  return parts.length > 0 ? parts.join(" → ") : String(error);
}
260
+
261
+ // SAP orchestration keeps its OWN per-model streaming allow-list, and it lags
262
+ // behind direct LLM-access support: a freshly-added model (e.g. gpt-5.5) can
263
+ // advertise "Streaming Support: Yes" on its Model Library card — that flag
264
+ // describes direct /chat/completions — while the orchestration service still
265
+ // rejects `client.stream()` with a 400 "Streaming is not supported for this
266
+ // model". We can't flip that server-side, but the SDK's non-streaming
267
+ // `chatCompletion()` DOES work for these models, so we fall back to it and
268
+ // replay the single response through pi's streaming events. This set records
269
+ // which model.ids hit the wall so later turns skip the wasted streaming probe
270
+ // for the rest of the process; restart pi once SAP enables orchestration
271
+ // streaming and the model returns to the streaming path.
272
/** Model ids for which the streaming path has been found unusable in this process. */
const STREAMING_UNSUPPORTED: Set<string> = new Set();

/**
 * Reports whether `error` is SAP's "streaming is not supported" rejection.
 *
 * Matching is done on the fully flattened error text (message, cause chain
 * and server detail) so the phrase is found wherever in the chain it lives.
 *
 * @param error Any thrown value.
 */
function isStreamingUnsupportedError(error: unknown): boolean {
  const flattened = formatError(error);
  return flattened.search(/streaming is not supported/i) !== -1;
}
280
+
281
+ // EMPIRICAL FINDING (2026-05-16, verified across gpt-5-mini and
282
+ // claude-4.5-sonnet): SAP orchestration strips all detail fields from
283
+ // the TokenUsage response. We receive ONLY {prompt_tokens,
284
+ // completion_tokens, total_tokens} — no `prompt_tokens_details`, no
285
+ // `completion_tokens_details`, no Anthropic-style top-level
286
+ // `cache_read_input_tokens`/`cache_creation_input_tokens`. So pi's
287
+ // `cacheRead`/`cacheWrite` will always be 0 on SAP-routed turns,
288
+ // regardless of whether the backend (OpenAI/Anthropic) actually cached.
289
+ // SAP's own contract billing may give you a cache discount that isn't
290
+ // visible to this client.
291
+ //
292
+ // We KEEP the detail-field probes below for defense: if SAP ever flips
293
+ // a switch to expose detail fields, the math is already correct. Pi's
294
+ // convention is also the OPPOSITE of OpenAI's: `usage.input` is
295
+ // non-cached prompt tokens only, with cached tokens accounted for
296
+ // separately on `cacheRead`/`cacheWrite`. Don't "simplify" by setting
297
+ // `input = prompt_tokens` — that would double-count cache hits if/when
298
+ // SAP starts exposing them and inflate cost reporting by ~10× (cacheRead
299
+ // is priced at 10% of input on Anthropic).
300
+ // Accepts any OpenAI-shaped usage object — orchestration's `TokenUsage` OR the
301
+ // foundation provider's `AzureOpenAiCompletionUsage`; both carry the same
302
+ // prompt/completion fields. Kept structural so stream-foundation.ts can reuse
303
+ // this without importing orchestration's `TokenUsage` type.
304
/**
 * Structural shape of an OpenAI-style usage object, including the optional
 * cache-related fields that the OpenAI and Anthropic routes may carry.
 */
export type RawTokenUsage = {
  prompt_tokens?: number;
  completion_tokens?: number;
  prompt_tokens_details?: { cached_tokens?: number } | null;
  cache_read_input_tokens?: number;
  cache_creation_input_tokens?: number;
};

/**
 * Converts a raw usage object into pi's `Usage` shape.
 *
 * `input` is the prompt count minus cache reads and cache writes (never below
 * zero); cache reads take the larger of the OpenAI-style and Anthropic-style
 * figures. Every cost field is initialised to zero; pricing is applied by the
 * caller.
 *
 * @param usage Raw token counts; any missing field counts as 0.
 * @returns The mapped usage with zeroed cost.
 */
export function mapUsage(usage: RawTokenUsage): Usage {
  // Either route may report cache reads; they are not expected together, so
  // taking the larger is purely defensive.
  const cacheRead = Math.max(
    usage.prompt_tokens_details?.cached_tokens ?? 0,
    usage.cache_read_input_tokens ?? 0,
  );
  const cacheWrite = usage.cache_creation_input_tokens ?? 0;

  const promptTotal = usage.prompt_tokens ?? 0;
  const input = Math.max(0, promptTotal - cacheRead - cacheWrite);
  const output = usage.completion_tokens ?? 0;

  const totalTokens = input + output + cacheRead + cacheWrite;
  const cost = { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 };
  return { input, output, cacheRead, cacheWrite, totalTokens, cost };
}
330
+
331
+ // Anthropic adaptive thinking is only supported on Opus 4.6+, Sonnet 4.6+,
332
+ // and Opus 4.7+. Older models (4.0–4.5) reject `thinking: {type: "adaptive"}`
333
+ // with "adaptive thinking is not supported on this model" and need the
334
+ // classic budget-tokens shape instead. Versioned check rather than a
335
+ // hard-coded id list so future tenant extras (5.x etc.) just work.
336
/**
 * Decides from the model id alone whether an Anthropic model is version 4.6
 * or newer.
 *
 * The id must start with `anthropic--claude-<major>[.<minor>]`; a missing
 * minor counts as 0. Any id that does not match that pattern yields `false`.
 *
 * @param modelId The model identifier, e.g. `anthropic--claude-4.5-sonnet`.
 */
function anthropicSupportsAdaptive(modelId: string): boolean {
  const version = /^anthropic--claude-(\d+)(?:\.(\d+))?/.exec(modelId);
  if (version === null) return false;

  const [, majorText, minorText] = version;
  const major = Number.parseInt(majorText, 10);
  const minor = minorText ? Number.parseInt(minorText, 10) : 0;

  if (major !== 4) return major > 4;
  return minor >= 6;
}
344
+
345
+ // Pi cycles five reasoning levels; Anthropic's budget-tokens API takes a
346
+ // raw token count. These defaults mirror what README:108 documents (1k /
347
+ // 4k / 8k / 16k / 32k) — and they're what pi-ai's own anthropic provider
348
+ // uses for older models. budget_tokens MUST be ≥1024; max_tokens MUST be
349
+ // strictly greater than budget_tokens.
350
/** Thinking-token budget for each reasoning level, keyed by level name. */
const ANTHROPIC_BUDGET_TOKENS: Record<string, number> = {
  minimal: 1 * 1024,
  low: 4 * 1024,
  medium: 8 * 1024,
  high: 16 * 1024,
  xhigh: 32 * 1024,
};

/**
 * Caps a thinking budget so that at least 1024 tokens of `maxTokens` remain
 * for the response itself.
 *
 * @param intended The budget the reasoning level asks for.
 * @param maxTokens The max_tokens value that will actually be sent.
 * @returns `intended`, or the available ceiling if that is smaller; the
 *   ceiling is 0 when `maxTokens` cannot spare 1024 tokens.
 */
function clampBudget(intended: number, maxTokens: number): number {
  const reservedForResponse = 1024;
  const ceiling = Math.max(0, maxTokens - reservedForResponse);
  return intended < ceiling ? intended : Math.min(intended, ceiling);
}
364
+
365
/**
 * Translates pi's reasoning level into the provider-native model params.
 *
 * - `anthropic--` models that support adaptive thinking get
 *   `thinking: { type: "adaptive" }` plus `output_config.effort` from the
 *   model's `thinkingLevelMap`.
 * - Other `anthropic--` models get `thinking: { type: "enabled",
 *   budget_tokens }`, with the budget clamped against `effectiveMaxTokens`.
 * - `gpt-` models get `reasoning_effort` from `thinkingLevelMap`.
 * - Everything else, and any level that has no mapping, gets no params.
 *
 * @param model The model being called.
 * @param reasoning pi's reasoning level; unset, empty or "off" disables it.
 * @param effectiveMaxTokens The max_tokens value that will actually be sent.
 * @returns The params to merge into the request (possibly empty).
 */
function reasoningParams(
  model: Model<Api>,
  reasoning: string | undefined,
  effectiveMaxTokens: number,
): Partial<LlmModelParams> {
  if (!reasoning || reasoning === "off") return {};
  const level: string = reasoning;

  const mappedEffort = () =>
    model.thinkingLevelMap?.[
      level as keyof NonNullable<typeof model.thinkingLevelMap>
    ];

  const isAnthropic = model.id.startsWith("anthropic--");

  if (isAnthropic && anthropicSupportsAdaptive(model.id)) {
    const effort = mappedEffort();
    if (!effort) return {};
    return {
      thinking: { type: "adaptive" },
      output_config: { effort },
    };
  }

  if (isAnthropic) {
    const intended = ANTHROPIC_BUDGET_TOKENS[level];
    if (!intended) return {};
    // The budget must stay below the max_tokens actually being sent, so it
    // is clamped against that value rather than the model's hard cap.
    const budget = clampBudget(intended, effectiveMaxTokens);
    if (budget < 1024) return {}; // below the minimum thinking budget
    return {
      thinking: { type: "enabled", budget_tokens: budget },
    };
  }

  if (!model.id.startsWith("gpt-")) return {};

  const effort = mappedEffort();
  if (!effort) return {};
  return { reasoning_effort: effort };
}
417
+
418
+ // gpt-5* on SAP orchestration rejects `temperature` ("Unsupported parameter").
419
+ // Mirrors the `temperature: false` flag in models-snapshot.json without forcing
420
+ // stream.ts to import the snapshot just for capability lookup.
421
/**
 * Reports whether a `temperature` param may be sent for this model: every id
 * is allowed except those starting with `gpt-`.
 *
 * @param modelId The model identifier.
 */
function modelSupportsTemperature(modelId: string): boolean {
  const isGptFamily = modelId.startsWith("gpt-");
  return isGptFamily === false;
}
424
+
425
/**
 * Assembles the model params for one request.
 *
 * `max_tokens` is the caller's `maxTokens` when given, else the model's own.
 * `temperature` is forwarded only when the caller set one, the model accepts
 * it, and no `thinking` block is being sent (the two must not go out
 * together). Reasoning params are merged last.
 *
 * @param model The model being called.
 * @param options Per-request options from pi, if any.
 * @returns The params object to hand to the orchestration client.
 */
function buildLlmParams(
  model: Model<Api>,
  options: SimpleStreamOptions | undefined,
): LlmModelParams {
  const maxTokens = options?.maxTokens ?? model.maxTokens;
  const reasoning = reasoningParams(model, options?.reasoning, maxTokens);

  const base: LlmModelParams = { max_tokens: maxTokens };

  const temperature = options?.temperature;
  if (
    temperature !== undefined &&
    modelSupportsTemperature(model.id) &&
    !("thinking" in reasoning)
  ) {
    base.temperature = temperature;
  }

  return { ...base, ...reasoning };
}
455
+
456
/** Accumulator for one tool call while its arguments are still arriving. */
export type ToolCallSlot = {
  /** JSON argument text collected so far. */
  partialJson: string;
  /** Index of this tool call within the assistant message's content array. */
  contentIndex: number;
};

/**
 * What both the streaming and non-streaming paths hand to the shared
 * finalizer.
 */
type TurnResult = {
  /** Raw token usage as reported by SAP, if any. */
  usage: TokenUsage | undefined;
  /** Accumulated refusal text; empty when the model did not refuse. */
  refusalText: string;
  /** The resolved finish reason, if one was reported. */
  finishReason: string | undefined;
};
469
+
470
+ // SAP's `ChatDelta` schema is `{role?, content, refusal?, tool_calls?} & Record<string, any>`.
471
+ // The Record<string,any> is a deliberate passthrough for vendor-native
472
+ // streaming fields. The SDK only exposes `getDeltaContent()` and
473
+ // `getDeltaToolCalls()`; everything else we have to dig out of
474
+ // `findChoiceByIndex(0)?.delta` ourselves.
475
+ //
476
+ // EMPIRICAL FINDING (2026-05-16, opus 4.6 + gpt-5-mini): SAP orchestration
477
+ // does NOT pass reasoning/thinking content through. Deltas contain only
478
+ // `role` and `content`. The model genuinely reasons (token usage reflects
479
+ // it, and step-by-step structure leaks into the visible text), but the
480
+ // structured thinking block pi expects to render in its UI panel never
481
+ // arrives. Refusals also weren't observed; OpenAI moderation may inline
482
+ // them into `content` rather than `refusal`.
483
+ //
484
+ // We keep the `pickReasoning` / refusal machinery below in place anyway:
485
+ // (a) it's a few function calls per chunk, (b) if SAP ever flips a switch
486
+ // to expose reasoning text, our extension picks it up with no further
487
+ // changes. Don't be tempted to delete it as "dead code".
488
/**
 * A streaming delta plus the vendor-specific fields that may carry reasoning
 * or refusal text. All extra fields are optional.
 */
export type ExtendedDelta = {
  content?: string | null;
  refusal?: string | null;
  // OpenAI-compatible reasoning passthrough fields.
  reasoning_content?: string | null;
  reasoning?: string | null;
  reasoning_text?: string | null;
  // Native thinking, either as plain text or as an array of content blocks.
  thinking?:
    | string
    | Array<{ type?: string; thinking?: string; text?: string }>
    | null;
  [key: string]: unknown;
};

/** OpenAI-compatible reasoning field names, in lookup order. */
const REASONING_FIELDS = [
  "reasoning_content",
  "reasoning",
  "reasoning_text",
] as const;

/**
 * Finds the first non-empty reasoning chunk on a delta.
 *
 * Lookup order: the previously used field (`preferredField`), then the
 * remaining `REASONING_FIELDS` in order, then `thinking` as a string, then
 * `thinking` as a block array (the `thinking` text of "thinking" blocks and
 * the `text` of all others, concatenated). Passing the field returned by the
 * previous call keeps one field in use across chunks, so a provider that
 * emits the same text under two names is not counted twice.
 *
 * @param delta The delta to inspect.
 * @param preferredField The field returned by an earlier call, if any.
 * @returns The text and the field it came from, or `undefined` if the delta
 *   carries no reasoning.
 */
export function pickReasoning(
  delta: ExtendedDelta,
  preferredField: string | undefined,
): { text: string; field: string } | undefined {
  const hasText = (value: unknown): value is string =>
    typeof value === "string" && value.length > 0;

  if (preferredField) {
    const latched = delta[preferredField];
    if (hasText(latched)) return { text: latched, field: preferredField };
  }

  const fallback = REASONING_FIELDS.find(
    (name) => name !== preferredField && hasText(delta[name]),
  );
  if (fallback !== undefined) {
    return { text: delta[fallback] as string, field: fallback };
  }

  const native: unknown = delta.thinking;
  if (hasText(native)) return { text: native, field: "thinking" };
  if (!Array.isArray(native)) return undefined;

  const blocks = native as Array<
    { type?: string; thinking?: string; text?: string } | null | undefined
  >;
  const joined = blocks
    .map((block) => {
      const piece = block?.type === "thinking" ? block.thinking : block?.text;
      return piece ?? "";
    })
    .join("");
  return joined.length > 0 ? { text: joined, field: "thinking" } : undefined;
}
541
+
542
+ // Latch finish reasons across chunks. SAP can emit a real reason (e.g.
543
+ // "tool_calls") on chunk N and then a later "stop" on chunk N+1 — taking
544
+ // the last value loses the meaningful one. Latch the first non-empty;
545
+ // also bias toward "tool_calls" so toolUse always wins over a trailing
546
+ // "stop" (which happens after the tool args complete).
547
/**
 * Combines the finish reason seen so far with the one on the latest chunk.
 *
 * An empty `next` changes nothing. A tool-call reason ("tool_calls" or
 * "function_call") always wins, whether it arrives now or was already held.
 * Otherwise the first reason ever seen is kept.
 *
 * @param current The reason held so far.
 * @param next The reason reported by the latest chunk.
 * @returns The reason to hold from now on.
 */
export function latchFinishReason(
  current: string | undefined,
  next: string | undefined,
): string | undefined {
  if (!next) return current;

  const toolReasons = ["tool_calls", "function_call"];
  if (toolReasons.includes(next)) return next;
  if (current !== undefined && toolReasons.includes(current)) return current;

  return current ?? next;
}
556
+
557
/** Most recently validated key, reused while the raw key text is unchanged. */
let lastValidatedKey: ValidatedKey | undefined;

/**
 * Looks through pi's auth store for an oauth credential that carries a
 * service-key JSON and returns the first one found.
 *
 * pi stores oauth credentials per provider, so a login made under one
 * provider is not handed to a sibling provider sharing the same oauth object.
 * Reading the store directly is what lets a single `/login` serve both.
 *
 * @returns The stored service-key text, or `undefined` when no oauth
 *   credential has one or the store cannot be read.
 */
function readSharedServiceKeyFromStore(): string | undefined {
  try {
    const store = AuthStorage.create();
    for (const providerName of store.list()) {
      const credential = store.get(providerName);
      if (credential?.type === "oauth") {
        const candidate = (credential as { serviceKey?: unknown }).serviceKey;
        if (typeof candidate !== "string") continue;
        // Only JSON-looking values count; anything else is not a service key.
        if (candidate.trimStart().startsWith("{")) return candidate;
      }
    }
  } catch {
    // Store unreadable — behave as if nothing was stored so the caller can
    // report its own "no key configured" error.
  }
  return undefined;
}
581
+
582
/**
 * Resolves and validates the SAP AI Core service key for a request.
 *
 * Resolution order:
 *   1. `apiKey` from pi, but only when it looks like JSON (starts with `{`);
 *      anything else is treated as "no key from pi".
 *   2. The `AICORE_SERVICE_KEY` environment variable, when set to something
 *      other than blank.
 *   3. The shared login found in pi's auth store.
 *
 * The validated result is cached and reused as long as the raw key text is
 * unchanged between calls.
 *
 * @param apiKey The credential pi passed for this provider, if any.
 * @returns The parsed and validated key.
 * @throws Error when no key can be found from any source; validation errors
 *   from `parseAndValidateServiceKey` propagate unchanged.
 */
export function ensureServiceKey(apiKey: string | undefined): ValidatedKey {
  const fromPi = apiKey?.trimStart().startsWith("{") ? apiKey : undefined;

  // A blank env var must count as unset. `??` alone would stop at "" and the
  // shared-login fallback below would never be consulted.
  const envValue = process.env.AICORE_SERVICE_KEY;
  const fromEnv =
    envValue !== undefined && envValue.trim().length > 0
      ? envValue
      : undefined;

  const raw = fromPi ?? fromEnv ?? readSharedServiceKeyFromStore();
  if (!raw) {
    throw new Error(
      "No SAP AI Core service key configured. Run `/login` in pi, " +
        "pick 'Use a subscription' → 'SAP AI Core', and paste your BTP " +
        "service-key JSON. Or set AICORE_SERVICE_KEY in your shell.",
    );
  }

  if (lastValidatedKey?.raw === raw) return lastValidatedKey;

  const validated = parseAndValidateServiceKey(raw);
  lastValidatedKey = validated;
  return validated;
}
611
+
612
+ // Resolve the SAP AI Core resource group with this precedence:
613
+ // 1. AICORE_RESOURCE_GROUP env var (per-shell override).
614
+ // 2. `resourceGroup` field on the service-key JSON (per-tenant default).
615
+ // 3. undefined — SAP server-side defaults to "default".
616
/**
 * Picks the resource group for a request: a non-blank
 * `AICORE_RESOURCE_GROUP` environment variable (trimmed) wins, otherwise the
 * key's own `resourceGroup`, which may be `undefined`.
 *
 * @param key The validated service key.
 */
export function resolveResourceGroup(key: ValidatedKey): string | undefined {
  const override = (process.env.AICORE_RESOURCE_GROUP ?? "").trim();
  return override.length > 0 ? override : key.resourceGroup;
}
621
+
622
+ export function streamSapAiCore(
623
+ model: Model<Api>,
624
+ context: Context,
625
+ options?: SimpleStreamOptions,
626
+ ): AssistantMessageEventStream {
627
+ const stream = createAssistantMessageEventStream();
628
+
629
+ const output: AssistantMessage = {
630
+ role: "assistant",
631
+ content: [],
632
+ api: model.api,
633
+ provider: model.provider,
634
+ model: model.id,
635
+ usage: {
636
+ input: 0,
637
+ output: 0,
638
+ cacheRead: 0,
639
+ cacheWrite: 0,
640
+ totalTokens: 0,
641
+ cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
642
+ },
643
+ stopReason: "stop",
644
+ timestamp: Date.now(),
645
+ };
646
+
647
+ (async () => {
648
+ const requestId = randomUUID();
649
+ try {
650
+ stream.push({ type: "start", partial: output });
651
+
652
+ const serviceKey = ensureServiceKey(options?.apiKey);
653
+ process.env.AICORE_SERVICE_KEY = serviceKey.raw;
654
+ const resourceGroup = resolveResourceGroup(serviceKey);
655
+
656
+ const { messages, tools } = piContextToOrchestration(context);
657
+
658
+ const { OrchestrationClient } = await importOrchestration();
659
+ const llmParams = buildLlmParams(model, options);
660
+
661
+ debugLog({
662
+ requestId,
663
+ kind: "request",
664
+ model: model.id,
665
+ resourceGroup,
666
+ params: llmParams,
667
+ messageRoles: messages.map((m) => m.role),
668
+ messages,
669
+ });
670
+
671
+ const client = new OrchestrationClient(
672
+ {
673
+ promptTemplating: {
674
+ model: {
675
+ name: model.id as ChatModel,
676
+ params: llmParams,
677
+ },
678
+ prompt: {
679
+ template: [],
680
+ ...(tools.length > 0 ? { tools } : {}),
681
+ },
682
+ },
683
+ },
684
+ // SAP's typings reject AI-Resource-Group as a header
685
+ // (`'AI-Resource-Group'?: never`); the only supported path is
686
+ // the deploymentConfig constructor arg. Omit when undefined
687
+ // so SAP falls back to its server-side default ("default").
688
+ resourceGroup ? { resourceGroup } : undefined,
689
+ );
690
+
691
// Shared finalizer for both the streaming and the blocking path. Maps and
// costs the token usage once, then terminates the stream with exactly one
// terminal event:
//   - `error` / "aborted" when the caller's signal has been aborted,
//   - `error` / "error" for a refusal or a non-success stop reason,
//   - `done` for "stop", "length" and "toolUse".
const finishTurn = (result: TurnResult) => {
  if (result.usage) {
    output.usage = mapUsage(result.usage);
    calculateCost(model, output.usage);
  }

  // Ends the turn with an `error` event; `output` carries the details.
  const fail = (reason: "error" | "aborted", message: string) => {
    output.stopReason = reason;
    output.errorMessage = message;
    stream.push({ type: "error", reason, error: output });
    stream.end();
  };

  // Runtime check replacing an unchecked cast: only these three reasons are
  // valid on a `done` event.
  const isSuccessReason = (
    reason: string,
  ): reason is "stop" | "length" | "toolUse" =>
    reason === "stop" || reason === "length" || reason === "toolUse";

  // The streaming loop leaves early (`break`) once the signal is aborted
  // and still ends up here. Report that as an aborted turn instead of
  // presenting the truncated output as a completed one.
  if (options?.signal?.aborted) {
    fail("aborted", "Request was aborted");
    return;
  }

  // A refusal terminates the turn with no real content. Promote it to
  // errorMessage and emit an error event so pi surfaces it visibly
  // instead of showing an empty assistant turn.
  if (result.refusalText) {
    fail("error", `Model refused: ${result.refusalText}`);
    return;
  }

  const stopReason = mapFinishReason(result.finishReason);
  if (!isSuccessReason(stopReason)) {
    // Whatever else mapFinishReason may return must not travel on a `done`
    // event; surface it as an error with the raw finish reason attached.
    fail(
      stopReason === "aborted" ? "aborted" : "error",
      `Model stopped without a usable result (finish reason: ${result.finishReason ?? "unknown"})`,
    );
    return;
  }

  output.stopReason = stopReason;
  stream.push({ type: "done", reason: stopReason, message: output });
  stream.end();
};
718
+
719
// Non-streaming fallback for models SAP orchestration refuses to stream
// (see STREAMING_UNSUPPORTED). Performs one blocking chatCompletion and
// replays the result through pi's streaming events: at most one text block
// (start/delta/end) followed by one start/delta/end triple per tool call.
// Returns the finish reason, refusal text ("" when there is none) and token
// usage for the caller to finalize the turn with.
const runBlocking = async (): Promise<TurnResult> => {
  const blocking = await client.chatCompletion(
    { messages },
    options?.signal ? { signal: options.signal } : undefined,
  );

  // Builds the turn result from the response for a given refusal text.
  const toResult = (refusalText: string): TurnResult => ({
    finishReason: blocking.getFinishReason(),
    refusalText,
    usage: blocking.getTokenUsage(),
  });

  // Parses a tool call's raw argument string. Only a JSON object is
  // accepted; invalid JSON, `null`, arrays and scalars all yield `{}` so
  // that `arguments` always honours its declared record shape.
  const parseToolArguments = (
    raw: string | undefined,
  ): Record<string, unknown> => {
    if (!raw) return {};
    try {
      const parsed: unknown = JSON.parse(raw);
      const isPlainObject =
        typeof parsed === "object" && parsed !== null && !Array.isArray(parsed);
      return isPlainObject ? (parsed as Record<string, unknown>) : {};
    } catch {
      // Model emitted invalid JSON — leave args empty rather than crash.
      return {};
    }
  };

  // getRefusal() first: getContent() throws on a filtered turn, and a
  // refusal is exactly that case.
  const refusal = blocking.getRefusal();
  if (refusal) {
    return toResult(refusal);
  }

  const content = blocking.getContent();
  if (content) {
    output.content.push({ type: "text", text: content });
    const idx = output.content.length - 1;
    stream.push({ type: "text_start", contentIndex: idx, partial: output });
    stream.push({
      type: "text_delta",
      contentIndex: idx,
      delta: content,
      partial: output,
    });
    stream.push({
      type: "text_end",
      contentIndex: idx,
      content,
      partial: output,
    });
  }

  for (const tc of blocking.getToolCalls() ?? []) {
    const rawArgs = tc.function.arguments;
    const toolCall = {
      type: "toolCall" as const,
      id: tc.id,
      name: tc.function.name,
      arguments: parseToolArguments(rawArgs),
    };
    output.content.push(toolCall);
    const idx = output.content.length - 1;

    stream.push({ type: "toolcall_start", contentIndex: idx, partial: output });
    if (rawArgs) {
      // The delta carries the raw argument text exactly as received.
      stream.push({
        type: "toolcall_delta",
        contentIndex: idx,
        delta: rawArgs,
        partial: output,
      });
    }
    stream.push({
      type: "toolcall_end",
      contentIndex: idx,
      // Separate object so the event payload is not aliased to the block
      // stored in output.content.
      toolCall: { ...toolCall },
      partial: output,
    });
  }

  return toResult("");
};
807
+
808
+ // Stream by default; on SAP's "Streaming is not supported" 400 —
809
+ // and only before any chunk has been emitted — remember the model
810
+ // and fall back to the blocking path so the turn still completes.
811
+ let response: Awaited<ReturnType<typeof client.stream>> | undefined =
812
+ undefined;
813
+ if (!STREAMING_UNSUPPORTED.has(model.id)) {
814
+ try {
815
+ response = await client.stream({ messages }, options?.signal, {
816
+ promptTemplating: { include_usage: true },
817
+ });
818
+ } catch (error) {
819
+ if (
820
+ !isStreamingUnsupportedError(error) ||
821
+ output.content.length > 0
822
+ ) {
823
+ throw error;
824
+ }
825
+ STREAMING_UNSUPPORTED.add(model.id);
826
+ debugLog({
827
+ requestId,
828
+ kind: "stream-fallback",
829
+ model: model.id,
830
+ reason: "orchestration-streaming-unsupported",
831
+ });
832
+ }
833
+ }
834
+
835
+ if (!response) {
836
+ finishTurn(await runBlocking());
837
+ return;
838
+ }
839
+
840
+ let textIndex = -1;
841
+ let thinkingIndex = -1;
842
+ let reasoningField: string | undefined;
843
+ let refusalText = "";
844
+ const toolSlots = new Map<number, ToolCallSlot>();
845
+ let finishReason: string | undefined;
846
+
847
// Closes the currently open text block, if there is one: emits `text_end`
// carrying the block's accumulated text, then marks no text block as open
// by resetting textIndex to -1.
const closeText = () => {
  if (textIndex >= 0) {
    const openBlock = output.content[textIndex];
    // Only announce the end when the slot really holds a text block.
    if (openBlock?.type === "text") {
      stream.push({
        type: "text_end",
        contentIndex: textIndex,
        content: openBlock.text,
        partial: output,
      });
    }
    textIndex = -1;
  }
};
860
+
861
// Closes the currently open thinking block, if there is one: emits
// `thinking_end` carrying the block's accumulated reasoning text, then
// marks no thinking block as open by resetting thinkingIndex to -1.
const closeThinking = () => {
  if (thinkingIndex >= 0) {
    const openBlock = output.content[thinkingIndex];
    // Only announce the end when the slot really holds a thinking block.
    if (openBlock?.type === "thinking") {
      stream.push({
        type: "thinking_end",
        contentIndex: thinkingIndex,
        content: openBlock.thinking,
        partial: output,
      });
    }
    thinkingIndex = -1;
  }
};
874
+
875
+ for await (const chunk of response.stream) {
876
+ if (options?.signal?.aborted) break;
877
+
878
+ const choice = chunk.findChoiceByIndex(0);
879
+ const rawDelta = (choice?.delta ?? {}) as ExtendedDelta;
880
+
881
+ // Reasoning first — most providers emit reasoning chunks
882
+ // before the visible text, and pi's UI expects a
883
+ // thinking block to precede the text block in
884
+ // output.content ordering.
885
+ const reasoning = pickReasoning(rawDelta, reasoningField);
886
+ if (reasoning) {
887
+ reasoningField = reasoning.field;
888
+ if (thinkingIndex < 0) {
889
+ closeText();
890
+ output.content.push({ type: "thinking", thinking: "" });
891
+ thinkingIndex = output.content.length - 1;
892
+ stream.push({
893
+ type: "thinking_start",
894
+ contentIndex: thinkingIndex,
895
+ partial: output,
896
+ });
897
+ }
898
+ const block = output.content[thinkingIndex];
899
+ if (block?.type === "thinking") {
900
+ block.thinking += reasoning.text;
901
+ stream.push({
902
+ type: "thinking_delta",
903
+ contentIndex: thinkingIndex,
904
+ delta: reasoning.text,
905
+ partial: output,
906
+ });
907
+ }
908
+ }
909
+
910
+ const delta = chunk.getDeltaContent();
911
+ if (delta) {
912
+ if (textIndex < 0) {
913
+ closeThinking();
914
+ output.content.push({ type: "text", text: "" });
915
+ textIndex = output.content.length - 1;
916
+ stream.push({
917
+ type: "text_start",
918
+ contentIndex: textIndex,
919
+ partial: output,
920
+ });
921
+ }
922
+ const block = output.content[textIndex];
923
+ if (block?.type === "text") {
924
+ block.text += delta;
925
+ stream.push({
926
+ type: "text_delta",
927
+ contentIndex: textIndex,
928
+ delta,
929
+ partial: output,
930
+ });
931
+ }
932
+ }
933
+
934
+ // Refusals from SAP's content filter or the underlying
935
+ // provider (OpenAI moderation, etc.). Accumulate
936
+ // across chunks; surface as the final error message so
937
+ // the user sees something instead of an empty turn.
938
+ if (
939
+ typeof rawDelta.refusal === "string" &&
940
+ rawDelta.refusal.length > 0
941
+ ) {
942
+ refusalText += rawDelta.refusal;
943
+ }
944
+
945
+ const toolDeltas = chunk.getDeltaToolCalls();
946
+ if (toolDeltas && toolDeltas.length > 0) {
947
+ closeText();
948
+ closeThinking();
949
+
950
+ for (const td of toolDeltas) {
951
+ let slot = toolSlots.get(td.index);
952
+ if (!slot) {
953
+ output.content.push({
954
+ type: "toolCall",
955
+ id: td.id ?? "",
956
+ name: td.function?.name ?? "",
957
+ arguments: {},
958
+ });
959
+ slot = {
960
+ contentIndex: output.content.length - 1,
961
+ partialJson: "",
962
+ };
963
+ toolSlots.set(td.index, slot);
964
+ stream.push({
965
+ type: "toolcall_start",
966
+ contentIndex: slot.contentIndex,
967
+ partial: output,
968
+ });
969
+ }
970
+
971
+ const block = output.content[slot.contentIndex];
972
+ if (block?.type === "toolCall") {
973
+ if (td.id && !block.id) block.id = td.id;
974
+ if (td.function?.name && !block.name)
975
+ block.name = td.function.name;
976
+
977
+ const fragment = td.function?.arguments ?? "";
978
+ if (fragment) {
979
+ slot.partialJson += fragment;
980
+ try {
981
+ block.arguments = JSON.parse(slot.partialJson);
982
+ } catch {
983
+ // Partial JSON — keep accumulating until valid
984
+ }
985
+ stream.push({
986
+ type: "toolcall_delta",
987
+ contentIndex: slot.contentIndex,
988
+ delta: fragment,
989
+ partial: output,
990
+ });
991
+ }
992
+ }
993
+ }
994
+ }
995
+
996
+ finishReason = latchFinishReason(finishReason, chunk.getFinishReason());
997
+ }
998
+
999
+ closeText();
1000
+ closeThinking();
1001
+
1002
+ for (const slot of toolSlots.values()) {
1003
+ const block = output.content[slot.contentIndex];
1004
+ if (block?.type === "toolCall") {
1005
+ if (slot.partialJson) {
1006
+ try {
1007
+ block.arguments = JSON.parse(slot.partialJson);
1008
+ } catch {
1009
+ // Leave arguments as last successfully-parsed value
1010
+ }
1011
+ }
1012
+ stream.push({
1013
+ type: "toolcall_end",
1014
+ contentIndex: slot.contentIndex,
1015
+ toolCall: {
1016
+ type: "toolCall",
1017
+ id: block.id,
1018
+ name: block.name,
1019
+ arguments: block.arguments,
1020
+ },
1021
+ partial: output,
1022
+ });
1023
+ }
1024
+ }
1025
+
1026
+ finishTurn({
1027
+ finishReason: finishReason ?? response.getFinishReason(),
1028
+ refusalText,
1029
+ usage: response.getTokenUsage(),
1030
+ });
1031
+ } catch (error) {
1032
+ output.stopReason = options?.signal?.aborted ? "aborted" : "error";
1033
+ output.errorMessage = formatError(error);
1034
+ debugLog({
1035
+ requestId,
1036
+ kind: "error",
1037
+ model: model.id,
1038
+ stopReason: output.stopReason,
1039
+ error: output.errorMessage,
1040
+ });
1041
+ stream.push({
1042
+ type: "error",
1043
+ reason: output.stopReason as "error" | "aborted",
1044
+ error: output,
1045
+ });
1046
+ stream.end();
1047
+ }
1048
+ })();
1049
+
1050
+ return stream;
1051
+ }