@ssweens/pi-vertex 1.1.4 → 1.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,47 @@
2
2
 
3
3
  All notable changes to this project will be documented in this file.
4
4
 
5
+ ## [1.1.7] - 2026-05-16
6
+ ### Added
7
+ - **Regional pricing for Claude models** — non-global Vertex endpoints (us-east5, europe-west1, asia-southeast1, us/eu multi-region) carry a 10% price premium per GCP's published rates. The streaming layer now automatically selects the correct cost tier based on the resolved endpoint at call time. No config change required — if your `GOOGLE_CLOUD_LOCATION` or config resolves to any non-`global` location, cost tracking reflects the regional rate.
8
+ - Claude Opus 4.7/4.6/4.5: global $5.00/$25.00 → regional $5.50/$27.50
9
+ - Claude Sonnet 4.6/4.5: global $3.00/$15.00 → regional $3.30/$16.50
10
+ - Claude Haiku 4.5: global $1.00/$5.00 → regional $1.10/$5.50
11
+ - Claude Opus 4.1, Opus 4, Sonnet 4: uniform pricing (no regional variant on GCP)
12
+ - **`costRegional?: ModelCost` field on `VertexModelConfig`** — optional cost tier used when the resolved GCP location is non-global. Models without this field use `cost` for all regions.
13
+
14
+ ### Fixed
15
+ - **Grok cache read pricing** — previously 0 for both xAI models; corrected to GCP official rates:
16
+ - `grok-4.20-reasoning`: cacheRead $0.20/1M
17
+ - `grok-4.1-fast-reasoning`: cacheRead $0.05/1M
18
+
19
+ ## [1.1.6] - 2026-05-16
20
+ ### Fixed
21
+ - **`maxTokens / 2` halving removed** — the Gemini, Anthropic, and OpenAI-compat MaaS streaming paths were silently capping requests at half the model's stated `maxTokens` whenever the caller did not pass an explicit limit. Requests now default to the full `maxTokens` value unless the caller explicitly overrides it.
22
+ - **Gemini cached token double-counting** — `promptTokenCount` includes cached tokens, so input cost was inflated. Input usage is now `promptTokenCount − cachedTokenCount`, matching the actual billable amount.
23
+ - **`sanitizeText` corrupted emoji** — the previous regex replaced all surrogate code units including valid pairs (emoji are encoded as two surrogates). Now only unpaired/lone surrogates are stripped.
24
+ - **Gemini 3 Pro can't use `MINIMAL` thinking level** — `ThinkingLevel.MINIMAL` is only valid for Gemini 3 Flash models. Gemini 3 Pro requests with `minimal`/`low` effort are now mapped to `ThinkingLevel.LOW`, the lowest level Pro accepts.
25
+ - **Reasoning models always get a minimum thinking config** — previously thinking was only configured when an explicit `reasoning` effort was passed. For reasoning-capable Gemini models, a minimum config (lowest budget/level) is now always set, matching pi-mono behavior and preventing silent thought suppression.
26
+ - **`convertToGeminiMessages`: missing tool results injected** — if an assistant turn with tool calls has no matching `toolResult` message, a synthetic error result (`"No result provided"`) is flushed before the next turn. Prevents Gemini 400 errors from dangling tool calls.
27
+ - **`convertToGeminiMessages`: image tool results supported** — `toolResult` messages containing image content are now forwarded correctly. Gemini 3+ models receive them as `functionResponse.parts`; older models get a separate user image turn.
28
+ - **`convertToGeminiMessages`: tighter same-model guard** — thought signature replay now also requires `api === "google-generative-ai"` so signatures from non-Gemini providers (e.g. Claude) are never incorrectly forwarded.
29
+ - **`convertToGeminiMessages`: removed `id` from `functionCall` parts** — the `requiresToolCallId` heuristic was wrong; Gemini does not use tool call IDs in `functionCall` parts.
30
+
31
+ ### Updated
32
+ - `claude-opus-4-6`: `maxTokens` corrected to `128000` (was `32000`)
33
+ - `claude-sonnet-4-6`: `maxTokens` corrected to `128000` (was `64000`)
34
+ - `convertToolsForGemini` / `convertTools`: signatures tightened from `any[]` to typed `Tool[]`
35
+
36
+ *Bug fixes co-discovered with [lhl/pi-vertex](https://github.com/lhl/pi-vertex), a respected community fork. Credit: @lhl.*
37
+
38
+ ## [1.1.5] - 2026-05-16
39
+ ### Added
40
+ - **xAI Grok models** (new publisher on Vertex MaaS OpenAI-compat endpoint):
41
+ - `grok-4.20-reasoning` — flagship model, 200K context, text+image input, reasoning+tools, $1.25/$2.50 per 1M tokens
42
+ - `grok-4.1-fast-reasoning` — cost-effective model, 128K context, text+image input, reasoning+tools, $0.20/$0.50 per 1M tokens
43
+ - **Claude Opus 4.7** (`claude-opus-4-7`) — 1M context, 128K max output tokens (up from 4.6's 32K), $5.00/$25.00 per 1M, same cache pricing as Opus 4.6
44
+ - **Gemma 4 26B A4B IT** (`gemma-4-26b-a4b-it`) — Google's MoE instruction-tuned model via MaaS, 262K context, 128K max output, text+image input, $0.15/$0.60 per 1M tokens
45
+
5
46
  ## [1.1.4] - 2026-03-30
6
47
  ### Fixed
7
48
  - Removed error message override for `400 (no body)` responses from Vertex MaaS models. The original message now passes through to `isContextOverflow()` which already handles this pattern, enabling proper auto-compact instead of showing a raw error to the user.
package/README.md CHANGED
@@ -15,18 +15,18 @@ Set your GCP project and credentials. Vertex AI models (Gemini, Claude, Llama, D
15
15
 
16
16
  ## Features
17
17
 
18
- - **43 models** across 4 categories:
19
- - **Gemini** (8): 3.1 Pro, 3 Pro, 3 Flash, 2.5 Pro, 2.5 Flash, 2.0 Flash, and more
20
- - **Claude** (12): Opus 4.6, Sonnet 4.6, 4.5, 4.1, 4, 3.7 Sonnet, 3.5 Sonnet v2, 3.5 Sonnet, 3 Haiku
18
+ - **48 models** across 4 categories:
19
+ - **Gemini** (9): 3.1 Pro, 3.1 Flash-Lite, 3 Flash, 2.5 Pro, 2.5 Flash, 2.5 Flash-Lite, 2.0 Flash, 2.0 Flash-Lite, and more
20
+ - **Claude** (10): Opus 4.7, Opus 4.6, Sonnet 4.6, Opus/Sonnet/Haiku 4.5, Opus 4.1, Opus 4, Sonnet 4, 3.5 Sonnet v2
21
21
  - **Llama** (3): 4 Maverick, 4 Scout, 3.3 70B
22
- - **Other MaaS** (20): AI21 Jamba, Mistral, DeepSeek, Qwen, OpenAI GPT-OSS, Kimi, MiniMax, GLM
22
+ - **Other MaaS** (26): Grok, Gemma, Mistral, DeepSeek, Qwen, OpenAI GPT-OSS, Kimi, MiniMax, GLM
23
23
 
24
24
  - **Unified streaming**: Single provider, multiple model families
25
25
  - **Full tool calling support**: All models with multi-turn tool use and proper tool result handling
26
26
  - **Thinking/reasoning**: Gemini 3 thinking levels, Gemini 2.5 thinking budgets, thought signature preservation
27
27
  - **Automatic auth**: Uses Google Application Default Credentials
28
28
  - **Region awareness**: Global endpoints where supported, regional where required
29
- - **Pricing tracking**: Built-in cost per token for all models (including thinking tokens)
29
+ - **Pricing tracking**: Built-in cost per token for all models (including thinking tokens and regional endpoint premiums)
30
30
 
31
31
  ## Installation
32
32
 
@@ -128,17 +128,20 @@ alias pil="GOOGLE_CLOUD_PROJECT=your-project pi --provider vertex --model llama-
128
128
 
129
129
  ### Claude Models
130
130
 
131
- | Model | Context | Max Tokens | Input | Reasoning | Price (in/out) | Region |
132
- |-------|---------|------------|-------|-----------|----------------|--------|
133
- | claude-opus-4-6 | 1M | 32,000 | text, image | | $5.00/$25.00 | global |
134
- | claude-sonnet-4-6 | 1M | 64,000 | text, image | ✅ | $3.00/$15.00 | global |
135
- | claude-opus-4-5 | 200K | 32,000 | text, image | ✅ | $5.00/$25.00 | global |
136
- | claude-sonnet-4-5 | 200K | 64,000 | text, image | ✅ | $3.00/$15.00 | global |
137
- | claude-haiku-4-5 | 200K | 64,000 | text, image | ✅ | $1.00/$5.00 | global |
138
- | claude-opus-4-1 | 200K | 32,000 | text, image | ✅ | $15.00/$75.00 | global |
139
- | claude-opus-4 | 200K | 32,000 | text, image | ✅ | $15.00/$75.00 | global |
140
- | claude-sonnet-4 | 200K | 64,000 | text, image | ✅ | $3.00/$15.00 | global |
141
- | claude-3-5-sonnet-v2 | 200K | 8,192 | text, image | | $3.00/$15.00 | global |
131
+ Prices shown are for the **global** endpoint. Non-global regions (us-east5, europe-west1, asia-southeast1, us/eu multi-region) carry a 10% premium — cost tracking adjusts automatically based on your configured `GOOGLE_CLOUD_LOCATION`. Models marked *(uniform)* have no regional price variant, so cost tracking uses the same rate on every endpoint.
132
+
133
+ | Model | Context | Max Tokens | Input | Reasoning | Price global (in/out) | Price regional (in/out) |
134
+ |-------|---------|------------|-------|-----------|----------------------|------------------------|
135
+ | claude-opus-4-7 | 1M | 128,000 | text, image | ✅ | $5.00/$25.00 | $5.50/$27.50 |
136
+ | claude-opus-4-6 | 1M | 128,000 | text, image | ✅ | $5.00/$25.00 | $5.50/$27.50 |
137
+ | claude-sonnet-4-6 | 1M | 128,000 | text, image | ✅ | $3.00/$15.00 | $3.30/$16.50 |
138
+ | claude-opus-4-5 | 200K | 32,000 | text, image | ✅ | $5.00/$25.00 | $5.50/$27.50 |
139
+ | claude-sonnet-4-5 | 200K | 64,000 | text, image | ✅ | $3.00/$15.00 | $3.30/$16.50 |
140
+ | claude-haiku-4-5 | 200K | 64,000 | text, image | ✅ | $1.00/$5.00 | $1.10/$5.50 |
141
+ | claude-opus-4-1 | 200K | 32,000 | text, image | | $15.00/$75.00 | (uniform) |
142
+ | claude-opus-4 | 200K | 32,000 | text, image | ✅ | $15.00/$75.00 | (uniform) |
143
+ | claude-sonnet-4 | 200K | 64,000 | text, image | ✅ | $3.00/$15.00 | (uniform) |
144
+ | claude-3-5-sonnet-v2 | 200K | 8,192 | text, image | ❌ | $3.00/$15.00 | (uniform) |
142
145
 
143
146
  ### Llama Models
144
147
 
@@ -170,6 +173,9 @@ alias pil="GOOGLE_CLOUD_PROJECT=your-project pi --provider vertex --model llama-
170
173
  | minimax-m2 | 196K | minimaxai | $0.30/$1.20 | global |
171
174
  | glm-5 | 200K | zai-org | $1.00/$3.20 | global |
172
175
  | glm-4.7 | 200K | zai-org | $0.60/$2.20 | global |
176
+ | grok-4.20-reasoning | 200K | xai | $1.25/$2.50 | global |
177
+ | grok-4.1-fast-reasoning | 128K | xai | $0.20/$0.50 | global |
178
+ | gemma-4-26b-a4b-it | 262K | google | $0.15/$0.60 | global |
173
179
 
174
180
  ## Regional Endpoints
175
181
 
@@ -218,6 +224,10 @@ export GOOGLE_CLOUD_LOCATION=us-central1
218
224
  - `@mariozechner/pi-ai`: Peer dependency
219
225
  - `@mariozechner/pi-coding-agent`: Peer dependency
220
226
 
227
+ ## Acknowledgments
228
+
229
+ [lhl](https://github.com/lhl) maintains [lhl/pi-vertex](https://github.com/lhl/pi-vertex), an independent fork that added comprehensive unit tests and CI, and identified several important bugs. Several fixes in v1.1.6 were co-discovered through review of that work, including the `maxTokens/2` halving bug, Gemini cached-token double-counting, `sanitizeText` emoji corruption, missing tool result flushing, and image tool result forwarding. Kudos.
230
+
221
231
  ## License
222
232
 
223
233
  MIT
package/models/claude.ts CHANGED
@@ -2,13 +2,42 @@
2
2
  * Claude model definitions for Vertex AI
3
3
  * Source: https://cloud.google.com/vertex-ai/generative-ai/docs/partner-models/use-partner-models
4
4
  * Pricing: https://cloud.google.com/vertex-ai/generative-ai/pricing#partner-models
5
- * All prices per 1M tokens (global endpoint, <= 200K input tokens)
5
+ * All prices per 1M tokens (<=200K input tokens)
6
+ * `cost` = global endpoint; `costRegional` = non-global (us-east5, europe-west1,
7
+ * asia-southeast1, us/eu multi-region) — uniformly 10% above global.
6
8
  * Cache write prices are for 5-minute TTL
7
9
  */
8
10
 
9
11
  import type { VertexModelConfig } from "../types.js";
10
12
 
11
13
  export const CLAUDE_MODELS: VertexModelConfig[] = [
14
+ // Claude 4.7 series
15
+ {
16
+ id: "claude-opus-4-7",
17
+ name: "Claude Opus 4.7",
18
+ apiId: "claude-opus-4-7",
19
+ publisher: "anthropic",
20
+ endpointType: "maas",
21
+ contextWindow: 1000000,
22
+ maxTokens: 128000,
23
+ input: ["text", "image"],
24
+ reasoning: true,
25
+ tools: true,
26
+ cost: {
27
+ input: 5.00,
28
+ output: 25.00,
29
+ cacheRead: 0.50,
30
+ cacheWrite: 6.25,
31
+ },
32
+ costRegional: {
33
+ input: 5.50,
34
+ output: 27.50,
35
+ cacheRead: 0.55,
36
+ cacheWrite: 6.875,
37
+ },
38
+ region: "global",
39
+ },
40
+
12
41
  // Claude 4.6 series
13
42
  {
14
43
  id: "claude-opus-4-6",
@@ -17,7 +46,7 @@ export const CLAUDE_MODELS: VertexModelConfig[] = [
17
46
  publisher: "anthropic",
18
47
  endpointType: "maas",
19
48
  contextWindow: 1000000,
20
- maxTokens: 32000,
49
+ maxTokens: 128000,
21
50
  input: ["text", "image"],
22
51
  reasoning: true,
23
52
  tools: true,
@@ -27,6 +56,12 @@ export const CLAUDE_MODELS: VertexModelConfig[] = [
27
56
  cacheRead: 0.50,
28
57
  cacheWrite: 6.25,
29
58
  },
59
+ costRegional: {
60
+ input: 5.50,
61
+ output: 27.50,
62
+ cacheRead: 0.55,
63
+ cacheWrite: 6.875,
64
+ },
30
65
  region: "global",
31
66
  },
32
67
  {
@@ -36,7 +71,7 @@ export const CLAUDE_MODELS: VertexModelConfig[] = [
36
71
  publisher: "anthropic",
37
72
  endpointType: "maas",
38
73
  contextWindow: 1000000,
39
- maxTokens: 64000,
74
+ maxTokens: 128000,
40
75
  input: ["text", "image"],
41
76
  reasoning: true,
42
77
  tools: true,
@@ -46,6 +81,12 @@ export const CLAUDE_MODELS: VertexModelConfig[] = [
46
81
  cacheRead: 0.30,
47
82
  cacheWrite: 3.75,
48
83
  },
84
+ costRegional: {
85
+ input: 3.30,
86
+ output: 16.50,
87
+ cacheRead: 0.33,
88
+ cacheWrite: 4.125,
89
+ },
49
90
  region: "global",
50
91
  },
51
92
 
@@ -67,6 +108,12 @@ export const CLAUDE_MODELS: VertexModelConfig[] = [
67
108
  cacheRead: 0.50,
68
109
  cacheWrite: 6.25,
69
110
  },
111
+ costRegional: {
112
+ input: 5.50,
113
+ output: 27.50,
114
+ cacheRead: 0.55,
115
+ cacheWrite: 6.875,
116
+ },
70
117
  region: "global",
71
118
  },
72
119
  {
@@ -86,6 +133,12 @@ export const CLAUDE_MODELS: VertexModelConfig[] = [
86
133
  cacheRead: 0.30,
87
134
  cacheWrite: 3.75,
88
135
  },
136
+ costRegional: {
137
+ input: 3.30,
138
+ output: 16.50,
139
+ cacheRead: 0.33,
140
+ cacheWrite: 4.125,
141
+ },
89
142
  region: "global",
90
143
  },
91
144
  {
@@ -105,6 +158,12 @@ export const CLAUDE_MODELS: VertexModelConfig[] = [
105
158
  cacheRead: 0.10,
106
159
  cacheWrite: 1.25,
107
160
  },
161
+ costRegional: {
162
+ input: 1.10,
163
+ output: 5.50,
164
+ cacheRead: 0.11,
165
+ cacheWrite: 1.375,
166
+ },
108
167
  region: "global",
109
168
  },
110
169
 
package/models/maas.ts CHANGED
@@ -8,6 +8,46 @@
8
8
  import type { VertexModelConfig } from "../types.js";
9
9
 
10
10
  export const MAAS_MODELS: VertexModelConfig[] = [
11
+ // --- xAI Grok ---
12
+ {
13
+ id: "grok-4.20-reasoning",
14
+ name: "Grok 4.20 Reasoning",
15
+ apiId: "grok-4.20-reasoning",
16
+ publisher: "xai",
17
+ endpointType: "maas",
18
+ contextWindow: 200000,
19
+ maxTokens: 32000,
20
+ input: ["text", "image"],
21
+ reasoning: true,
22
+ tools: true,
23
+ cost: {
24
+ input: 1.25,
25
+ output: 2.50,
26
+ cacheRead: 0.20,
27
+ cacheWrite: 0,
28
+ },
29
+ region: "global",
30
+ },
31
+ {
32
+ id: "grok-4.1-fast-reasoning",
33
+ name: "Grok 4.1 Fast Reasoning",
34
+ apiId: "grok-4.1-fast-reasoning",
35
+ publisher: "xai",
36
+ endpointType: "maas",
37
+ contextWindow: 128000,
38
+ maxTokens: 32000,
39
+ input: ["text", "image"],
40
+ reasoning: true,
41
+ tools: true,
42
+ cost: {
43
+ input: 0.20,
44
+ output: 0.50,
45
+ cacheRead: 0.05,
46
+ cacheWrite: 0,
47
+ },
48
+ region: "global",
49
+ },
50
+
11
51
  // --- Meta Llama ---
12
52
  {
13
53
  id: "llama-4-maverick",
@@ -383,6 +423,27 @@ export const MAAS_MODELS: VertexModelConfig[] = [
383
423
  region: "global",
384
424
  },
385
425
 
426
+ // --- Google Gemma (MaaS) ---
427
+ {
428
+ id: "gemma-4-26b-a4b-it",
429
+ name: "Gemma 4 26B A4B IT",
430
+ apiId: "gemma-4-26b-a4b-it-maas",
431
+ publisher: "google",
432
+ endpointType: "maas",
433
+ contextWindow: 262144,
434
+ maxTokens: 128000,
435
+ input: ["text", "image"],
436
+ reasoning: false,
437
+ tools: false,
438
+ cost: {
439
+ input: 0.15,
440
+ output: 0.60,
441
+ cacheRead: 0,
442
+ cacheWrite: 0,
443
+ },
444
+ region: "global",
445
+ },
446
+
386
447
  // --- GLM (Zhipu AI) ---
387
448
  {
388
449
  id: "glm-5",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ssweens/pi-vertex",
3
- "version": "1.1.4",
3
+ "version": "1.1.7",
4
4
  "description": "Google Vertex AI provider for Pi coding agent - supports Gemini, Claude, and all MaaS models",
5
5
  "type": "module",
6
6
  "main": "index.ts",
@@ -24,14 +24,53 @@ const THINKING_LEVEL_MAP: Record<string, ThinkingLevel> = {
24
24
  high: ThinkingLevel.HIGH,
25
25
  };
26
26
 
27
+ interface GeminiThinkingConfig {
28
+ includeThoughts?: boolean;
29
+ thinkingBudget?: number;
30
+ thinkingLevel?: ThinkingLevel;
31
+ }
32
+
33
+ function isGemini3ProModel(modelId: string): boolean {
34
+ return /gemini-3(?:\.\d+)?-pro/.test(modelId.toLowerCase());
35
+ }
36
+
37
+ function isGemini3FlashModel(modelId: string): boolean {
38
+ return /gemini-3(?:\.\d+)?-flash/.test(modelId.toLowerCase());
39
+ }
40
+
41
+ function isGemini25ProModel(modelId: string): boolean {
42
+ return /gemini-2\.5-pro/.test(modelId.toLowerCase());
43
+ }
44
+
45
+ function getGemini3ThinkingLevel(effort: string, modelId: string): ThinkingLevel {
46
+ if (isGemini3ProModel(modelId)) {
47
+ // Pro only supports LOW/MEDIUM/HIGH — floor minimal/low to LOW
48
+ if (effort === "minimal" || effort === "low") return ThinkingLevel.LOW;
49
+ if (effort === "medium") return ThinkingLevel.MEDIUM;
50
+ return ThinkingLevel.HIGH;
51
+ }
52
+ return THINKING_LEVEL_MAP[effort];
53
+ }
54
+
55
+ function getLowestThinkingConfig(modelId: string): GeminiThinkingConfig {
56
+ if (isGemini3ProModel(modelId)) {
57
+ return { thinkingLevel: ThinkingLevel.LOW };
58
+ }
59
+ if (isGemini3FlashModel(modelId)) {
60
+ return { thinkingLevel: ThinkingLevel.MINIMAL };
61
+ }
62
+ if (isGemini25ProModel(modelId)) {
63
+ return { thinkingBudget: 128 };
64
+ }
65
+ return { thinkingBudget: 0 };
66
+ }
67
+
27
68
  function mapGeminiStopReason(reason: string): "stop" | "length" | "toolUse" | "error" {
28
69
  switch (reason) {
29
70
  case FinishReason.STOP:
30
71
  return "stop";
31
72
  case FinishReason.MAX_TOKENS:
32
73
  return "length";
33
- case FinishReason.SAFETY:
34
- case FinishReason.RECITATION:
35
74
  default:
36
75
  return "error";
37
76
  }
@@ -79,9 +118,11 @@ export function streamGemini(
79
118
  // Convert messages with model ID for proper thinking/tool handling
80
119
  const contents = convertToGeminiMessages(context.messages, model.apiId);
81
120
 
82
- // Build config — only set temperature when explicitly provided
83
- const config: any = {
84
- maxOutputTokens: options?.maxTokens || Math.floor(model.maxTokens / 2),
121
+ // Build config — only set temperature when explicitly provided.
122
+ // The Vertex Gemini config shape is sprawling; use Record to avoid
123
+ // fighting the SDK's incomplete typings.
124
+ const config: Record<string, unknown> = {
125
+ maxOutputTokens: options?.maxTokens || model.maxTokens,
85
126
  ...(options?.temperature !== undefined && { temperature: options.temperature }),
86
127
  };
87
128
 
@@ -95,28 +136,33 @@ export function streamGemini(
95
136
  config.tools = convertToolsForGemini(context.tools);
96
137
  }
97
138
 
98
- // Add thinking configuration (matches pi-mono's buildParams logic)
99
- if (model.reasoning && options?.reasoning) {
100
- const effort = options.reasoning === "xhigh" ? "high" : options.reasoning;
101
- const isGemini3 = model.apiId.startsWith("gemini-3");
102
-
103
- const thinkingConfig: any = { includeThoughts: true };
139
+ // Add thinking configuration (matches pi-mono's buildParams logic).
140
+ // For reasoning models: always set a minimum thinking config so the model
141
+ // doesn't silently suppress thoughts when no effort level is specified.
142
+ if (model.reasoning) {
143
+ if (options?.reasoning) {
144
+ const effort = options.reasoning === "xhigh" ? "high" : options.reasoning;
145
+ const isGemini3 = model.apiId.startsWith("gemini-3");
146
+ const thinkingConfig: GeminiThinkingConfig = { includeThoughts: true };
147
+
148
+ if (isGemini3) {
149
+ // Gemini 3 Pro doesn't support MINIMAL; Flash models do.
150
+ thinkingConfig.thinkingLevel = getGemini3ThinkingLevel(effort, model.apiId);
151
+ } else {
152
+ // Gemini 2.5 models use thinking budgets (token counts)
153
+ const budgets: Record<string, number> = {
154
+ minimal: 128,
155
+ low: 2048,
156
+ medium: 8192,
157
+ high: model.apiId.includes("2.5-pro") ? 32768 : 24576,
158
+ };
159
+ thinkingConfig.thinkingBudget = budgets[effort] ?? 8192;
160
+ }
104
161
 
105
- if (isGemini3) {
106
- // Gemini 3 models use thinking levels (MINIMAL/LOW/MEDIUM/HIGH)
107
- thinkingConfig.thinkingLevel = THINKING_LEVEL_MAP[effort];
162
+ config.thinkingConfig = thinkingConfig;
108
163
  } else {
109
- // Gemini 2.5 models use thinking budgets (token counts)
110
- const budgets: Record<string, number> = {
111
- minimal: 128,
112
- low: 2048,
113
- medium: 8192,
114
- high: model.apiId.includes("2.5-pro") ? 32768 : 24576,
115
- };
116
- thinkingConfig.thinkingBudget = budgets[effort] ?? 8192;
164
+ config.thinkingConfig = getLowestThinkingConfig(model.apiId);
117
165
  }
118
-
119
- config.thinkingConfig = thinkingConfig;
120
166
  }
121
167
 
122
168
  // Pass abort signal to SDK for in-flight cancellation
@@ -136,8 +182,10 @@ export function streamGemini(
136
182
  config,
137
183
  });
138
184
 
139
- // Track current content block for thinking/text transitions
140
- let currentBlock: any = null;
185
+ // Track current content block for thinking/text transitions.
186
+ type StreamingTextBlock = { type: "text"; text: string; textSignature?: string };
187
+ type StreamingThinkingBlock = { type: "thinking"; thinking: string; thinkingSignature?: string };
188
+ let currentBlock: StreamingTextBlock | StreamingThinkingBlock | null = null;
141
189
  let currentBlockType: "text" | "thinking" | null = null;
142
190
 
143
191
  for await (const chunk of response) {
@@ -152,13 +200,11 @@ export function streamGemini(
152
200
 
153
201
  // Check if we need to transition to a new block
154
202
  if (currentBlockType !== targetType) {
155
- // End previous block
156
- if (currentBlock && currentBlockType) {
157
- if (currentBlockType === "text") {
158
- stream.push({ type: "text_end", contentIndex: output.content.length - 1, content: currentBlock.text, partial: output });
159
- } else {
160
- stream.push({ type: "thinking_end", contentIndex: output.content.length - 1, content: currentBlock.thinking, partial: output });
161
- }
203
+ // End previous block (narrow on type for correct field access)
204
+ if (currentBlock?.type === "text") {
205
+ stream.push({ type: "text_end", contentIndex: output.content.length - 1, content: currentBlock.text, partial: output });
206
+ } else if (currentBlock?.type === "thinking") {
207
+ stream.push({ type: "thinking_end", contentIndex: output.content.length - 1, content: currentBlock.thinking, partial: output });
162
208
  }
163
209
 
164
210
  // Start new block
@@ -174,12 +220,12 @@ export function streamGemini(
174
220
  currentBlockType = targetType;
175
221
  }
176
222
 
177
- // Accumulate content
178
- if (currentBlockType === "thinking") {
223
+ // Accumulate content (narrow on discriminant for type safety)
224
+ if (currentBlock?.type === "thinking") {
179
225
  currentBlock.thinking += part.text;
180
226
  currentBlock.thinkingSignature = retainThoughtSignature(currentBlock.thinkingSignature, part.thoughtSignature);
181
227
  stream.push({ type: "thinking_delta", contentIndex: output.content.length - 1, delta: part.text, partial: output });
182
- } else {
228
+ } else if (currentBlock?.type === "text") {
183
229
  currentBlock.text += part.text;
184
230
  currentBlock.textSignature = retainThoughtSignature(currentBlock.textSignature, part.thoughtSignature);
185
231
  stream.push({ type: "text_delta", contentIndex: output.content.length - 1, delta: part.text, partial: output });
@@ -188,12 +234,12 @@ export function streamGemini(
188
234
 
189
235
  if (part.functionCall) {
190
236
  // End current text/thinking block before tool call
191
- if (currentBlock && currentBlockType) {
192
- if (currentBlockType === "text") {
193
- stream.push({ type: "text_end", contentIndex: output.content.length - 1, content: currentBlock.text, partial: output });
194
- } else {
195
- stream.push({ type: "thinking_end", contentIndex: output.content.length - 1, content: currentBlock.thinking, partial: output });
196
- }
237
+ if (currentBlock?.type === "text") {
238
+ stream.push({ type: "text_end", contentIndex: output.content.length - 1, content: currentBlock.text, partial: output });
239
+ } else if (currentBlock?.type === "thinking") {
240
+ stream.push({ type: "thinking_end", contentIndex: output.content.length - 1, content: currentBlock.thinking, partial: output });
241
+ }
242
+ if (currentBlock) {
197
243
  currentBlock = null;
198
244
  currentBlockType = null;
199
245
  }
@@ -210,7 +256,7 @@ export function streamGemini(
210
256
  type: "toolCall" as const,
211
257
  id: toolCallId,
212
258
  name: part.functionCall.name || "",
213
- arguments: (part.functionCall.args as Record<string, any>) ?? {},
259
+ arguments: (part.functionCall.args as Record<string, unknown>) ?? {},
214
260
  ...(part.thoughtSignature && { thoughtSignature: part.thoughtSignature }),
215
261
  };
216
262
 
@@ -230,18 +276,26 @@ export function streamGemini(
230
276
  output.errorMessage = "Content blocked by safety filters";
231
277
  }
232
278
  // Override to toolUse if any tool calls are present (matches pi-mono)
233
- if (output.content.some((b: any) => b.type === "toolCall")) {
279
+ if (output.content.some((b) => b.type === "toolCall")) {
234
280
  output.stopReason = "toolUse";
235
281
  }
236
282
  }
237
283
 
238
- // Update usage — include thoughtsTokenCount in output (matches pi-mono)
284
+ // Update usage — include thoughtsTokenCount in output (matches pi-mono).
285
+ // Subtract cached tokens from prompt to avoid double-counting in input cost.
239
286
  if (chunk.usageMetadata) {
240
- const meta = chunk.usageMetadata as any;
287
+ const meta = chunk.usageMetadata as {
288
+ cachedContentTokenCount?: number;
289
+ promptTokenCount?: number;
290
+ candidatesTokenCount?: number;
291
+ thoughtsTokenCount?: number;
292
+ totalTokenCount?: number;
293
+ };
294
+ const cachedTokens = meta.cachedContentTokenCount || 0;
241
295
  output.usage = {
242
- input: meta.promptTokenCount || 0,
296
+ input: Math.max(0, (meta.promptTokenCount || 0) - cachedTokens),
243
297
  output: (meta.candidatesTokenCount || 0) + (meta.thoughtsTokenCount || 0),
244
- cacheRead: meta.cachedContentTokenCount || 0,
298
+ cacheRead: cachedTokens,
245
299
  cacheWrite: 0,
246
300
  totalTokens: meta.totalTokenCount || 0,
247
301
  cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 },
@@ -251,15 +305,17 @@ export function streamGemini(
251
305
  }
252
306
 
253
307
  // End final block
254
- if (currentBlock && currentBlockType) {
255
- if (currentBlockType === "text") {
256
- stream.push({ type: "text_end", contentIndex: output.content.length - 1, content: currentBlock.text, partial: output });
257
- } else {
258
- stream.push({ type: "thinking_end", contentIndex: output.content.length - 1, content: currentBlock.thinking, partial: output });
259
- }
308
+ if (currentBlock?.type === "text") {
309
+ stream.push({ type: "text_end", contentIndex: output.content.length - 1, content: currentBlock.text, partial: output });
310
+ } else if (currentBlock?.type === "thinking") {
311
+ stream.push({ type: "thinking_end", contentIndex: output.content.length - 1, content: currentBlock.thinking, partial: output });
312
+ }
313
+
314
+ if (options?.signal?.aborted) {
315
+ throw new Error("Request was aborted");
260
316
  }
261
317
 
262
- stream.push({ type: "done", reason: output.stopReason as any, message: output });
318
+ stream.push({ type: "done", reason: output.stopReason, message: output });
263
319
  stream.end();
264
320
  } catch (error) {
265
321
  output.stopReason = options?.signal?.aborted ? "aborted" : "error";
package/streaming/maas.ts CHANGED
@@ -53,6 +53,11 @@ async function streamAnthropic(
53
53
  const location = resolveLocation(model.region);
54
54
  const auth = getAuthConfig(location);
55
55
 
56
+ // Use regional pricing when the resolved endpoint is not the global one.
57
+ // Models without costRegional (e.g. Opus 4.1, Sonnet 4) have uniform pricing.
58
+ const effectiveCost =
59
+ auth.location !== "global" && model.costRegional ? model.costRegional : model.cost;
60
+
56
61
  const client = new AnthropicVertex({
57
62
  projectId: auth.projectId,
58
63
  region: auth.location,
@@ -218,7 +223,7 @@ async function streamAnthropic(
218
223
 
219
224
  const params: any = {
220
225
  model: model.apiId,
221
- max_tokens: options?.maxTokens || Math.floor(model.maxTokens / 2),
226
+ max_tokens: options?.maxTokens || model.maxTokens,
222
227
  messages,
223
228
  ...(context.systemPrompt ? { system: context.systemPrompt } : {}),
224
229
  ...(tools && tools.length > 0 ? { tools } : {}),
@@ -314,7 +319,7 @@ async function streamAnthropic(
314
319
  }
315
320
 
316
321
  output.usage.totalTokens = output.usage.input + output.usage.output + output.usage.cacheRead + output.usage.cacheWrite;
317
- calculateCost(model as any, output.usage);
322
+ calculateCost({ ...model, cost: effectiveCost } as any, output.usage);
318
323
 
319
324
  if (output.content.some((b: any) => b.type === "toolCall")) {
320
325
  output.stopReason = "toolUse";
@@ -371,7 +376,7 @@ export function streamMaaS(
371
376
  const innerStream = streamSimpleOpenAICompletions(modelForPi, context as any, {
372
377
  ...options,
373
378
  apiKey: accessToken,
374
- maxTokens: options?.maxTokens || Math.floor(model.maxTokens / 2),
379
+ maxTokens: options?.maxTokens || model.maxTokens,
375
380
  temperature: options?.temperature,
376
381
  });
377
382
 
package/types.ts CHANGED
@@ -47,7 +47,17 @@ export interface VertexModelConfig {
47
47
  input: ModelInputType[];
48
48
  reasoning: boolean;
49
49
  tools: boolean;
50
+ /** Pricing for the global endpoint (default). */
50
51
  cost: ModelCost;
52
+ /**
53
+ * Pricing for non-global regional endpoints (us-east5, europe-west1,
54
+ * asia-southeast1, us/eu multi-region, etc.).
55
+ *
56
+ * When the resolved GCP location is not "global" and this field is set,
57
+ * the streaming layer uses these costs instead of `cost`.
58
+ * Omit for models whose pricing is uniform across all regions.
59
+ */
60
+ costRegional?: ModelCost;
51
61
  region: string;
52
62
  }
53
63
 
package/utils.ts CHANGED
@@ -7,18 +7,24 @@
7
7
 
8
8
  import type {
9
9
  AssistantMessage,
10
+ ImageContent,
10
11
  Message,
11
12
  TextContent,
12
13
  ThinkingContent,
14
+ Tool,
13
15
  ToolCall,
14
16
  ToolResultMessage,
15
17
  } from "./types.js";
16
18
 
17
19
  /**
18
- * Sanitize text by removing invalid surrogate pairs
20
+ * Sanitize text by removing unpaired surrogate code units.
21
+ * Valid surrogate pairs (emoji) are preserved.
19
22
  */
20
23
  export function sanitizeText(text: string): string {
21
- return text.replace(/[\uD800-\uDFFF]/g, "\uFFFD");
24
+ return text.replace(
25
+ /[\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF]/g,
26
+ "",
27
+ );
22
28
  }
23
29
 
24
30
  // --- Thought signature helpers (matching pi-mono google-shared.ts) ---
@@ -50,12 +56,20 @@ export function retainThoughtSignature(
50
56
  return existing;
51
57
  }
52
58
 
53
- /**
54
- * Whether a model requires explicit tool call IDs in functionCall parts.
55
- * Claude and GPT-OSS models on Vertex require them; native Gemini models don't.
56
- */
57
- function requiresToolCallId(modelId: string): boolean {
58
- return modelId.startsWith("claude-") || modelId.startsWith("gpt-oss-");
59
+ type GeminiContent = {
60
+ role: "user" | "model";
61
+ parts: Array<Record<string, unknown>>;
62
+ };
63
+
64
+ function getGeminiMajorVersion(modelId: string): number | undefined {
65
+ const match = modelId.toLowerCase().match(/^gemini(?:-live)?-(\d+)/);
66
+ return match ? Number.parseInt(match[1], 10) : undefined;
67
+ }
68
+
69
+ function supportsMultimodalFunctionResponse(modelId: string): boolean {
70
+ const majorVersion = getGeminiMajorVersion(modelId);
71
+ if (majorVersion !== undefined) return majorVersion >= 3;
72
+ return true;
59
73
  }
60
74
 
61
75
  /**
@@ -64,12 +78,77 @@ function requiresToolCallId(modelId: string): boolean {
64
78
  * Handles the full pi-ai Message union: UserMessage, AssistantMessage (with
65
79
  * TextContent, ThinkingContent, ToolCall blocks), and ToolResultMessage.
66
80
  */
67
- export function convertToGeminiMessages(messages: Message[], modelId: string): any[] {
68
- const result: any[] = [];
81
+ export function convertToGeminiMessages(messages: Message[], modelId: string): GeminiContent[] {
82
+ const result: GeminiContent[] = [];
69
83
  const isGemini3 = modelId.startsWith("gemini-3");
84
+ let pendingToolCalls: ToolCall[] = [];
85
+ let existingToolResultIds = new Set<string>();
86
+
87
+ const pushToolResult = (
88
+ toolCallId: string,
89
+ toolName: string,
90
+ content: ToolResultMessage["content"],
91
+ isError: boolean,
92
+ ) => {
93
+ const textContent = content.filter((c): c is TextContent => c.type === "text");
94
+ const textResult = textContent.map((c) => c.text).join("\n");
95
+ const imageContent = content.filter((c): c is ImageContent => c.type === "image");
96
+ const hasText = textResult.length > 0;
97
+ const hasImages = imageContent.length > 0;
98
+ const responseValue = hasText
99
+ ? sanitizeText(textResult)
100
+ : hasImages
101
+ ? "(see attached image)"
102
+ : "";
103
+
104
+ const imageParts = imageContent.map((img) => ({
105
+ inlineData: { mimeType: img.mimeType, data: img.data },
106
+ }));
107
+
108
+ const functionResponsePart: Record<string, unknown> = {
109
+ functionResponse: {
110
+ name: toolName,
111
+ response: isError ? { error: responseValue } : { output: responseValue },
112
+ ...(hasImages && supportsMultimodalFunctionResponse(modelId) ? { parts: imageParts } : {}),
113
+ },
114
+ };
115
+
116
+ // Merge consecutive tool results into a single user turn (required by Gemini API)
117
+ const lastContent = result[result.length - 1];
118
+ if (lastContent?.role === "user" && lastContent.parts?.some((p) => "functionResponse" in p)) {
119
+ lastContent.parts.push(functionResponsePart);
120
+ } else {
121
+ result.push({ role: "user", parts: [functionResponsePart] });
122
+ }
123
+
124
+ // Gemini < 3: carry image tool results as a separate user image turn
125
+ if (hasImages && !supportsMultimodalFunctionResponse(modelId)) {
126
+ result.push({
127
+ role: "user",
128
+ parts: [{ text: "Tool result image:" }, ...imageParts],
129
+ });
130
+ }
131
+ };
132
+
133
+ const flushMissingToolResults = () => {
134
+ if (pendingToolCalls.length === 0) return;
135
+ for (const toolCall of pendingToolCalls) {
136
+ if (!existingToolResultIds.has(toolCall.id)) {
137
+ pushToolResult(
138
+ toolCall.id,
139
+ toolCall.name,
140
+ [{ type: "text", text: "No result provided" }],
141
+ true,
142
+ );
143
+ }
144
+ }
145
+ pendingToolCalls = [];
146
+ existingToolResultIds = new Set<string>();
147
+ };
70
148
 
71
149
  for (const msg of messages) {
72
150
  if (msg.role === "user") {
151
+ flushMissingToolResults();
73
152
  if (typeof msg.content === "string") {
74
153
  if (msg.content.trim()) {
75
154
  result.push({
@@ -78,33 +157,34 @@ export function convertToGeminiMessages(messages: Message[], modelId: string): a
78
157
  });
79
158
  }
80
159
  } else {
81
- const parts = msg.content.map((item) => {
82
- if (item.type === "text") {
83
- return { text: sanitizeText(item.text) };
84
- } else {
85
- return {
86
- inlineData: {
87
- mimeType: item.mimeType,
88
- data: item.data,
89
- },
90
- };
91
- }
92
- });
160
+ const parts: Array<Record<string, unknown>> = msg.content.map(
161
+ (item: TextContent | ImageContent) => {
162
+ if (item.type === "text") {
163
+ return { text: sanitizeText(item.text) };
164
+ }
165
+ return { inlineData: { mimeType: item.mimeType, data: item.data } };
166
+ },
167
+ );
93
168
  if (parts.length > 0) {
94
169
  result.push({ role: "user", parts });
95
170
  }
96
171
  }
97
172
  } else if (msg.role === "assistant") {
98
173
  const assistantMsg = msg as AssistantMessage;
174
+ flushMissingToolResults();
99
175
 
100
176
  // Skip errored/aborted messages — they're incomplete turns
101
177
  if (assistantMsg.stopReason === "error" || assistantMsg.stopReason === "aborted") {
102
178
  continue;
103
179
  }
104
180
 
181
+ // Also require api match so cross-provider thought signatures aren't replayed
105
182
  const isSameProviderAndModel =
106
- assistantMsg.provider === "vertex" && assistantMsg.model === modelId;
107
- const parts: any[] = [];
183
+ assistantMsg.provider === "vertex" &&
184
+ assistantMsg.api === "google-generative-ai" &&
185
+ assistantMsg.model === modelId;
186
+ const parts: Array<Record<string, unknown>> = [];
187
+ const toolCalls: ToolCall[] = [];
108
188
 
109
189
  for (const block of assistantMsg.content) {
110
190
  if (block.type === "text") {
@@ -134,13 +214,13 @@ export function convertToGeminiMessages(messages: Message[], modelId: string): a
134
214
  }
135
215
  } else if (block.type === "toolCall") {
136
216
  const toolCallBlock = block as ToolCall;
217
+ toolCalls.push(toolCallBlock);
137
218
  const thoughtSig = resolveThoughtSignature(isSameProviderAndModel, toolCallBlock.thoughtSignature);
138
219
 
139
- const part: any = {
220
+ const part: Record<string, unknown> = {
140
221
  functionCall: {
141
222
  name: toolCallBlock.name,
142
223
  args: toolCallBlock.arguments ?? {},
143
- ...(requiresToolCallId(modelId) ? { id: toolCallBlock.id } : {}),
144
224
  },
145
225
  };
146
226
  if (thoughtSig) {
@@ -159,31 +239,24 @@ export function convertToGeminiMessages(messages: Message[], modelId: string): a
159
239
  if (parts.length > 0) {
160
240
  result.push({ role: "model", parts });
161
241
  }
242
+ if (toolCalls.length > 0) {
243
+ pendingToolCalls = toolCalls;
244
+ existingToolResultIds = new Set<string>();
245
+ }
162
246
  } else if (msg.role === "toolResult") {
163
247
  const toolResultMsg = msg as ToolResultMessage;
164
- const textContent = toolResultMsg.content.filter((c) => c.type === "text") as TextContent[];
165
- const textResult = textContent.map((c) => c.text).join("\n");
166
- const responseValue = textResult || "";
167
-
168
- const includeId = requiresToolCallId(modelId);
169
- const functionResponsePart: any = {
170
- functionResponse: {
171
- name: toolResultMsg.toolName,
172
- response: toolResultMsg.isError ? { error: responseValue } : { output: responseValue },
173
- ...(includeId ? { id: toolResultMsg.toolCallId } : {}),
174
- },
175
- };
176
-
177
- // Merge consecutive tool results into a single user turn (required by Gemini API)
178
- const lastContent = result[result.length - 1];
179
- if (lastContent?.role === "user" && lastContent.parts?.some((p: any) => p.functionResponse)) {
180
- lastContent.parts.push(functionResponsePart);
181
- } else {
182
- result.push({ role: "user", parts: [functionResponsePart] });
183
- }
248
+ existingToolResultIds.add(toolResultMsg.toolCallId);
249
+ pushToolResult(
250
+ toolResultMsg.toolCallId,
251
+ toolResultMsg.toolName,
252
+ toolResultMsg.content,
253
+ toolResultMsg.isError,
254
+ );
184
255
  }
185
256
  }
186
257
 
258
+ flushMissingToolResults();
259
+
187
260
  return result;
188
261
  }
189
262
 
@@ -191,7 +264,9 @@ export function convertToGeminiMessages(messages: Message[], modelId: string): a
191
264
  * Convert tools to Gemini format using parametersJsonSchema (full JSON Schema support).
192
265
  * This differs from OpenAI format — Gemini uses functionDeclarations wrapped in an array.
193
266
  */
194
- export function convertToolsForGemini(tools: any[]): any[] | undefined {
267
+ export function convertToolsForGemini(
268
+ tools: Tool[],
269
+ ): Array<{ functionDeclarations: Array<Record<string, unknown>> }> | undefined {
195
270
  if (!tools || tools.length === 0) return undefined;
196
271
  return [
197
272
  {
@@ -207,7 +282,7 @@ export function convertToolsForGemini(tools: any[]): any[] | undefined {
207
282
  /**
208
283
  * Convert tools to OpenAI format (for Claude and MaaS models)
209
284
  */
210
- export function convertTools(tools: any[]): any[] {
285
+ export function convertTools(tools: Tool[]): Array<Record<string, unknown>> {
211
286
  return tools.map((tool) => ({
212
287
  type: "function",
213
288
  function: {