copilot-custom-endpoint 1.3.13 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -24,9 +24,9 @@ That's it. No code, no servers to manage (unless the model specifically needs th
24
24
  | **MiMo V2 Flash** | Xiaomi | No | ❌ | [Setup](docs/models/mimo.md) |
25
25
  | **MiMo V2.5** | Xiaomi | No | ✅ | [Setup](docs/models/mimo.md) |
26
26
  | **MiMo V2.5 Pro** | Xiaomi | No | ❌ | [Setup](docs/models/mimo.md) |
27
- | **Kimi K2.6** | Moonshot | **Yes** | ✅ | [Setup](docs/models/kimi.md) |
28
- | **Qwen 3.7 Plus** | DashScope | Optional | ✅ | [Setup](docs/models/qwen.md) |
29
- | **Qwen 3.7 Max** | DashScope | Optional | ❌ | [Setup](docs/models/qwen.md) |
27
+ | **Kimi K2.7 Code / K2.6** | Moonshot | **Yes** | ✅ | [Setup](docs/models/kimi.md) |
28
+ | **Qwen 3.7 Plus** | DashScope | Optional (recommended) | ✅ | [Setup](docs/models/qwen.md) |
29
+ | **Qwen 3.7 Max** | DashScope | Optional (recommended) | ❌ | [Setup](docs/models/qwen.md) |
30
30
  | **MiniMax M3** | MiniMax | No | ✅ | [Setup](docs/models/minimax.md) |
31
31
  | **GLM 5.1** | Z.ai | No | ❌ | [Setup](docs/models/glm.md) |
32
32
  | **GLM 5V Turbo** | Z.ai | No | ✅ | [Setup](docs/models/glm.md) |
@@ -88,16 +88,17 @@ npx copilot-custom-endpoint clean # Remove debug_log/
88
88
 
89
89
  ## Pricing snapshot
90
90
 
91
- All prices are **USD per 1M tokens** (cache miss). 1 AI credit = $0.01.
91
+ All prices are **USD per 1M tokens** (cache miss). 1 AI credit = $0.01. **MiniMax M3** figures already include its permanent 50% discount off list price — see the model doc for the full rate card.
92
92
 
93
93
  | Model | Input | Output | Context |
94
94
  | ---------------------------- | ----- | ------ | ------- |
95
95
  | **MiMo V2 Flash** 🏆 | $0.10 | $0.30 | 256K |
96
96
  | **DeepSeek V4 Flash** 🏆 | $0.14 | $0.28 | 1M |
97
- | **Kimi K2.6** (non-thinking) | $0.16 | $0.95 | 256K |
97
+ | **Kimi K2.6** (non-thinking) | $0.16 | $0.95 | 262K |
98
+ | **Kimi K2.7 Code** | $0.19 | $4.00 | 262K |
99
+ | **MiniMax M3** | $0.30 | $1.20 | 1M |
98
100
  | **MiMo V2.5** | $0.40 | $2.00 | 1M |
99
101
  | **Qwen 3.7 Plus** | $0.40 | $1.60 | 1M |
100
- | **MiniMax M3** | $0.60 | $2.40 | 1M |
101
102
  | **MiMo V2.5 Pro** | $1.00 | $3.00 | 1M |
102
103
  | **GLM 5V Turbo** | $1.20 | $4.00 | 200K |
103
104
  | **GLM 5.1** | $1.40 | $4.40 | 200K |
@@ -3,6 +3,8 @@
3
3
  Here's a complete, real-world `chatLanguageModels.json` that combines **all the providers documented in this repo**. Copy what you need, leave the rest out.
4
4
 
5
5
  > **Note:** The `apiKey` fields are left as empty strings — set them via the **Chat: Manage Language Models** UI (Command Palette → right-click provider group → **Update API Key**). After you set a key via the UI, VS Code replaces the empty string with a `${input:chat.lm.secret.<id>}` secret reference.
6
+ >
7
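+ > This combined config reflects the same provider blocks as the live `chatLanguageModels.json`. Qwen is pointed at the local proxy, so its entries carry no `requestBody.enable_thinking` override — the proxy manages thinking dynamically. Add `enable_thinking: false` back only if you switch Qwen to the direct DashScope URL.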
6
8
 
7
9
  ```json
8
10
  [
@@ -15,24 +17,18 @@ Here's a complete, real-world `chatLanguageModels.json` that combines **all the
15
17
  {
16
18
  "id": "qwen3.7-max",
17
19
  "name": "Qwen 3.7 Max (text)",
18
- "url": "https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions",
20
+ "url": "http://127.0.0.1:3458/v1/chat/completions",
19
21
  "toolCalling": true,
20
22
  "vision": false,
21
- "streaming": true,
22
- "requestBody": {
23
- "enable_thinking": false
24
- }
23
+ "streaming": true
25
24
  },
26
25
  {
27
26
  "id": "qwen3.7-plus",
28
27
  "name": "Qwen 3.7 Plus (vision)",
29
- "url": "https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions",
28
+ "url": "http://127.0.0.1:3458/v1/chat/completions",
30
29
  "toolCalling": true,
31
30
  "vision": true,
32
- "streaming": true,
33
- "requestBody": {
34
- "enable_thinking": false
35
- }
31
+ "streaming": true
36
32
  }
37
33
  ]
38
34
  },
@@ -54,6 +50,20 @@ Here's a complete, real-world `chatLanguageModels.json` that combines **all the
54
50
  "streaming": true,
55
51
  "maxInputTokens": 262144,
56
52
  "maxOutputTokens": 32768
53
+ },
54
+ {
55
+ "id": "kimi-k2.7-code",
56
+ "name": "Kimi K2.7 Code (vision)",
57
+ "url": "http://127.0.0.1:3457/v1/chat/completions",
58
+ "requestBody": {
59
+ "temperature": 1,
60
+ "max_tokens": 4096
61
+ },
62
+ "toolCalling": true,
63
+ "vision": true,
64
+ "streaming": true,
65
+ "maxInputTokens": 262144,
66
+ "maxOutputTokens": 4096
57
67
  }
58
68
  ]
59
69
  },
@@ -155,7 +165,6 @@ Here's a complete, real-world `chatLanguageModels.json` that combines **all the
155
165
  "top_p": 0.95
156
166
  }
157
167
  },
158
-
159
168
  {
160
169
  "id": "glm-5v-turbo",
161
170
  "name": "GLM 5V Turbo (vision)",
@@ -180,7 +189,7 @@ Here's a complete, real-world `chatLanguageModels.json` that combines **all the
180
189
 
181
190
  If you only need one provider, jump straight to its setup guide:
182
191
 
183
- - [Kimi K2.6](kimi.md)
192
+ - [Kimi K2.6 / K2.7 Code](kimi.md)
184
193
  - [Qwen 3.7 Plus / 3.7 Max](qwen.md)
185
194
  - [Xiaomi MiMo (V2.5 / V2.5 Pro / V2 Flash)](mimo.md)
186
195
  - [MiniMax M3](minimax.md)
@@ -10,7 +10,7 @@
10
10
  | Vision | ✅ Yes (`glm-5v-turbo` only) |
11
11
  | Tool calling | ✅ Yes (native multimodal tool use on `glm-5v-turbo`) |
12
12
  | Context (flagship) | 200K (`glm-5.1` / `glm-5v-turbo`) |
13
- | Max output (flagship) | 128K |
13
+ | Max output (flagship) | 131072 |
14
14
  | Required `requestBody` | `thinking: { type: "enabled" }` (recommended) |
15
15
  | Endpoint (intl) | `https://api.z.ai/api/paas/v4/chat/completions` |
16
16
  | Endpoint (China) | `https://open.bigmodel.cn/api/paas/v4/chat/completions` |
@@ -20,8 +20,8 @@
20
20
 
21
21
  | Model | Vision | Context | Max output | Thinking | Cost (in / out per 1M) | Role |
22
22
  | -------------- | ------ | ------- | ---------- | --------- | ---------------------- | --------------------------------------------------------- |
23
- | `glm-5.1` | ❌ | 200K | 128K | `enabled` | $1.40 / $4.40 | Current flagship — long-horizon / 8h autonomous work |
24
- | `glm-5v-turbo` | ✅ | 200K | 128K | `enabled` | $1.20 / $4.00 | Multimodal **coding** model — vision-based agentic coding |
23
+ | `glm-5.1` | ❌ | 200K | 131072 | `enabled` | $1.40 / $4.40 | Current flagship — long-horizon / 8h autonomous work |
24
+ | `glm-5v-turbo` | ✅ | 200K | 131072 | `enabled` | $1.20 / $4.00 | Multimodal **coding** model — vision-based agentic coding |
25
25
 
26
26
  > Other GLM models — `glm-5`, `glm-5-turbo`, `glm-4.6v-flashx`, `glm-4.5`, `glm-4.5-air`, `glm-4.5-flash`, `glm-4.5-x`, `glm-4.5-airx`, `glm-4-32b-0414-128k` — are callable on the same endpoint but are intentionally **not** added to the default `chatLanguageModels.json` block below. Add them in the same shape if you need them. Note: `glm-4.6v-flashx` was previously in the default block but has been **removed** because live testing showed it is not reliable for tool calling.
27
27
 
@@ -54,7 +54,7 @@ Config file location:
54
54
  "models": [
55
55
  {
56
56
  "id": "glm-5.1",
57
- "name": "GLM 5.1 (flagship)",
57
+ "name": "GLM 5.1 (text)",
58
58
  "url": "https://api.z.ai/api/paas/v4/chat/completions",
59
59
  "toolCalling": true,
60
60
  "vision": false,
@@ -69,7 +69,7 @@ Config file location:
69
69
  },
70
70
  {
71
71
  "id": "glm-5v-turbo",
72
- "name": "GLM 5V Turbo (vision flagship)",
72
+ "name": "GLM 5V Turbo (vision)",
73
73
  "url": "https://api.z.ai/api/paas/v4/chat/completions",
74
74
  "toolCalling": true,
75
75
  "vision": true,
@@ -1,19 +1,36 @@
1
1
  # Kimi — VS Code Custom Endpoint Setup Guide
2
2
 
3
- > **TL;DR:** Kimi K2.6 requires the local proxy. The K2 family locks `temperature: 1` and `top_p: 0.95`, and requires `thinking: { type: "disabled" }` on tool turns. The proxy rewrites sampling values, suppresses thinking on tool turns, and preserves streaming. Direct VS Code → Moonshot integration is not viable in this environment.
3
+ > **TL;DR:** Kimi models require the local proxy. The K2 family locks `temperature: 1` and `top_p: 0.95`. K2.6 requires `thinking: { type: "disabled" }` on tool turns; **K2.7 Code is always-thinking and rejects `thinking: disabled`**, so the proxy detects `kimi-k2.7*` and skips that rewrite while keeping sampling enforcement. Direct VS Code → Moonshot integration is not viable in this environment.
4
4
 
5
5
  ## At a Glance
6
6
 
7
- | Field | Value |
8
- | ---------------------- | --------------------------------------------- |
9
- | Mode | **Proxy required** (local on `:3457`) |
10
- | Vision | ✅ Yes |
11
- | Tool calling | ✅ Yes (proxy forces `thinking: disabled`) |
12
- | Context | 256K |
13
- | Max output | 32K |
14
- | Required `requestBody` | `temperature: 1` |
15
- | Upstream endpoint | `https://api.moonshot.ai/v1/chat/completions` |
16
- | Proxy endpoint | `http://127.0.0.1:3457/v1/chat/completions` |
7
+ | Field | Value |
8
+ | ----------------- | --------------------------------------------- |
9
+ | Mode | **Proxy required** (local on `:3457`) |
10
+ | Vision | ✅ Yes |
11
+ | Tool calling | ✅ Yes |
12
+ | Upstream endpoint | `https://api.moonshot.ai/v1/chat/completions` |
13
+ | Proxy endpoint | `http://127.0.0.1:3457/v1/chat/completions` |
14
+
15
+ ### K2.6
16
+
17
+ | Field | Value |
18
+ | ---------------------- | ------------------------------------ |
19
+ | Model id | `kimi-k2.6` |
20
+ | Context | 262K |
21
+ | Max output | 32768 |
22
+ | Required `requestBody` | `temperature: 1` |
23
+ | Tool calling | ✅ Proxy forces `thinking: disabled` |
24
+
25
+ ### K2.7 Code
26
+
27
+ | Field | Value |
28
+ | ---------------------- | ---------------------------------------------------------- |
29
+ | Model id | `kimi-k2.7-code` |
30
+ | Context | 262K |
31
+ | Max output | 4096 |
32
+ | Required `requestBody` | `temperature: 1`, `max_tokens: 4096` |
33
+ | Tool calling | ✅ Proxy lets K2.7 think (it rejects `thinking: disabled`) |
17
34
 
18
35
  ## Quick Start
19
36
 
@@ -23,7 +40,7 @@
23
40
  - `npx copilot-custom-endpoint` (also starts the Qwen proxy concurrently)
24
41
  2. **Edit `chatLanguageModels.json`** — add the Kimi block from [Setup](#setup) below.
25
42
  3. **Set your Moonshot API key** via the Command Palette → **Chat: Manage Language Models**.
26
- 4. **Restart VS Code** and pick "Kimi K2.6" in the chat picker.
43
+ 4. **Restart VS Code** and pick "Kimi K2.6" or "Kimi K2.7 Code" in the chat picker.
27
44
 
28
45
  ## Setup
29
46
 
@@ -46,7 +63,7 @@ Config file location:
46
63
  "models": [
47
64
  {
48
65
  "id": "kimi-k2.6",
49
- "name": "Kimi K2.6",
66
+ "name": "Kimi K2.6 (vision)",
50
67
  "url": "http://127.0.0.1:3457/v1/chat/completions",
51
68
  "requestBody": {
52
69
  "temperature": 1
@@ -56,11 +73,27 @@ Config file location:
56
73
  "streaming": true,
57
74
  "maxInputTokens": 262144,
58
75
  "maxOutputTokens": 32768
76
+ },
77
+ {
78
+ "id": "kimi-k2.7-code",
79
+ "name": "Kimi K2.7 Code",
80
+ "url": "http://127.0.0.1:3457/v1/chat/completions",
81
+ "requestBody": {
82
+ "temperature": 1,
83
+ "max_tokens": 4096
84
+ },
85
+ "toolCalling": true,
86
+ "vision": true,
87
+ "streaming": true,
88
+ "maxInputTokens": 262144,
89
+ "maxOutputTokens": 4096
59
90
  }
60
91
  ]
61
92
  }
62
93
  ```
63
94
 
95
+ > **K2.7 note:** `max_tokens` and `maxOutputTokens` are intentionally conservative at **4096**. K2.7 is always-thinking, so reasoning tokens inflate response size. Values in the 24K–32K range triggered VS Code's "Response too long" error in agent mode during validation. Raise this only if you have tested your specific workload.
96
+
64
97
  ### 2. API key
65
98
 
66
99
  1. Open the Command Palette (`Ctrl+Shift+P`).
@@ -84,15 +117,15 @@ Config file location:
84
117
 
85
118
  All can be set in a `.env` file at the repo root (both proxies `import 'dotenv/config'` automatically).
86
119
 
87
- | Variable | Default | Purpose |
88
- | ------------------------------------------- | ----------------------------------------------------- | ------------------------------------------------------- |
89
- | `KIMI_PROXY_PORT` | `3457` (falls back to `PORT`) | Local listen port |
90
- | `KIMI_UPSTREAM_URL` | `https://api.moonshot.ai/v1/chat/completions` | Upstream Moonshot endpoint |
91
- | `KIMI_PROXY_FORCE_TEMPERATURE` | `1` | Temperature for thinking-mode requests |
92
- | `KIMI_PROXY_FORCE_NON_THINKING_TEMPERATURE` | `0.6` | Temperature when thinking is disabled (tool requests) |
93
- | `KIMI_PROXY_FORCE_TOP_P` | `0.95` | `top_p` forced into request body |
94
- | `KIMI_PROXY_DISABLE_THINKING_WITH_TOOLS` | `1` | Force `thinking={"type":"disabled"}` when tools present |
95
- | `KIMI_PROXY_LOG` | `debug_log/kimi-proxy.ndjson` (relative to repo root) | Redacted NDJSON log path |
120
+ | Variable | Default | Purpose |
121
+ | ------------------------------------------- | -------------------------------------------------------- | ------------------------------------------------------- |
122
+ | `KIMI_PROXY_PORT` | `3457` (falls back to `PORT`) | Local listen port |
123
+ | `KIMI_UPSTREAM_URL` | `https://api.moonshot.ai/v1/chat/completions` | Upstream Moonshot endpoint |
124
+ | `KIMI_PROXY_FORCE_TEMPERATURE` | `1` | Temperature for thinking-mode requests |
125
+ | `KIMI_PROXY_FORCE_NON_THINKING_TEMPERATURE` | `0.6` | Temperature when thinking is disabled (tool requests) |
126
+ | `KIMI_PROXY_FORCE_TOP_P` | `0.95` | `top_p` forced into request body |
127
+ | `KIMI_PROXY_DISABLE_THINKING_WITH_TOOLS` | `1` | Force `thinking={"type":"disabled"}` when tools present |
128
+ | `KIMI_PROXY_LOG` | `debug_log/kimi-proxy.ndjson` (relative to proxy script) | Redacted NDJSON log path |
96
129
 
97
130
  #### Health check response
98
131
 
@@ -110,7 +143,8 @@ All can be set in a `.env` file at the repo root (both proxies `import 'dotenv/c
110
143
 
111
144
  - Forwards the existing `Authorization` header upstream.
112
145
  - Rewrites plain-chat requests to `temperature: 1` and `top_p: 0.95`.
113
- - Rewrites tool-enabled requests to `thinking: {"type": "disabled"}`, `temperature: 0.6`, and `top_p: 0.95`.
146
+ - For **K2.5/K2.6**: rewrites tool-enabled requests to `thinking: {"type": "disabled"}`, `temperature: 0.6`, and `top_p: 0.95`.
147
+ - For **K2.7 Code**: keeps thinking enabled (K2.7 rejects `thinking: disabled` with HTTP 400) and rewrites to `temperature: 1`, `top_p: 0.95`.
114
148
  - Preserves streaming responses.
115
149
  - Writes redacted request summaries to `debug_log/kimi-proxy.ndjson`.
116
150
 
@@ -125,10 +159,11 @@ All can be set in a `.env` file at the repo root (both proxies `import 'dotenv/c
125
159
 
126
160
  ### Thinking mode
127
161
 
128
- | Turn type | Behavior |
129
- | ------------ | ----------------------------------------------------------- |
130
- | Plain chat | Thinking enabled, `temperature: 1` |
131
- | Tool-enabled | `thinking: { type: "disabled" }` forced, `temperature: 0.6` |
162
+ | Model | Turn type | Behavior |
163
+ | ----------- | ------------ | ----------------------------------------------------------- |
164
+ | K2.5 / K2.6 | Plain chat | Thinking enabled, `temperature: 1`, `top_p: 0.95` |
165
+ | K2.5 / K2.6 | Tool-enabled | `thinking: { type: "disabled" }` forced, `temperature: 0.6`, `top_p: 0.95` |
166
+ | K2.7 Code | All turns | Always-thinking, `temperature: 1`, `top_p: 0.95` |
132
167
 
133
168
  ### Capabilities
134
169
 
@@ -151,12 +186,14 @@ All can be set in a `.env` file at the repo root (both proxies `import 'dotenv/c
151
186
 
152
187
  ## Pricing
153
188
 
154
- For the cross-provider comparison, see [docs/pricing.md](../pricing.md). Kimi K2.6 on the **Moonshot direct platform**:
189
+ For the cross-provider comparison, see [docs/pricing.md](../pricing.md). Kimi models on the **Moonshot direct platform**:
155
190
 
156
- | Model | Input | Output (non-thinking) | Output (thinking) |
157
- | ----------- | ---------- | --------------------- | ----------------- |
158
- | `kimi-k2.6` | $0.16 / 1M | $0.95 / 1M | $4.00 / 1M |
191
+ | Model | Input | Cached input | Output (non-thinking) | Output (thinking) |
192
+ | ---------------- | ---------- | ------------ | --------------------- | ----------------- |
193
+ | `kimi-k2.6` | $0.16 / 1M | — | $0.95 / 1M | $4.00 / 1M |
194
+ | `kimi-k2.7-code` | $0.19 / 1M | $0.95 / 1M | — | $4.00 / 1M |
159
195
 
196
+ > **K2.7:** No non-thinking mode — always-thinking. Cached input pricing applies.
160
197
  > Via DashScope, K2.6 is also available at $0.89 / 1M input and $3.71 / 1M output (same model, regional pricing).
161
198
 
162
199
  ---
@@ -213,11 +250,25 @@ The model-level `requestBody.temperature = 1` override validated locally but was
213
250
  - Redacted proxy logs confirmed `temperature 0.1 -> 1` and `top_p 1 -> 0.95` for plain-chat requests.
214
251
  - Redacted proxy logs later confirmed `thinking undefined -> disabled` and `temperature 0.1 -> 0.6` for tool-enabled requests.
215
252
 
253
+ ### K2.7 Code validation results (June 14, 2026)
254
+
255
+ | Check | Result |
256
+ | ----------------------------------------------------- | ------------------------------------------ |
257
+ | `GET /v1/models` — slug confirmed | ✅ `kimi-k2.7-code` |
258
+ | Plain chat via proxy | ✅ |
259
+ | Tool turn with `thinking: disabled` | ❌ HTTP 400 — rejected by model |
260
+ | Tool turn letting K2.7 think | ✅ |
261
+ | Two-turn tool loop via proxy | ✅ No `reasoning_content is missing` error |
262
+ | VS Code Agent mode — integrated browser opened Google | ✅ |
263
+ | `maxOutputTokens` 24K–32K in agent mode | ❌ VS Code "Response too long" |
264
+ | `maxOutputTokens` 4096 in agent mode | ✅ |
265
+
216
266
  ### Final verdict
217
267
 
218
268
  - Acceptable for plain chat: **yes** (proxy)
219
269
  - Acceptable for streaming chat: **yes** (proxy)
220
270
  - Acceptable for tool-enabled agent use: **yes**, with the local proxy workaround
271
+ - K2.7 specifically: **yes**, but keep `maxOutputTokens` low (4096 validated) to avoid VS Code's response-size limit
221
272
  - Acceptable without a proxy: **no**
222
273
 
223
274
  ## References
@@ -233,3 +284,4 @@ The model-level `requestBody.temperature = 1` override validated locally but was
233
284
  - Kimi web search guide: `https://platform.kimi.ai/docs/guide/use-web-search.md`
234
285
  - Kimi coding tools / agent guide: `https://platform.kimi.ai/docs/guide/agent-support.md`
235
286
  - Kimi K2.6 pricing: `https://platform.kimi.ai/docs/pricing/chat-k26`
287
+ - Kimi K2.7 Code pricing: `https://platform.kimi.ai/docs/pricing/chat-k27-code`
@@ -10,7 +10,7 @@
10
10
  | Vision | ✅ Yes (`mimo-v2.5` only) |
11
11
  | Tool calling | ✅ Yes (with `thinking: disabled`) |
12
12
  | Context | 1M (V2.5 Pro / V2.5) / 256K (V2 Flash) |
13
- | Max output | 128K (V2.5 Pro) / 32K (V2.5) / 64K (V2 Flash) |
13
+ | Max output | 131072 (V2.5 Pro) / 32768 (V2.5) / 65536 (V2 Flash) |
14
14
  | Required `requestBody` | `thinking: { type: "disabled" }` |
15
15
  | Endpoint | `https://api.xiaomimimo.com/v1/chat/completions` |
16
16
 
@@ -51,7 +51,7 @@ Config file location:
51
51
  "models": [
52
52
  {
53
53
  "id": "mimo-v2.5-pro",
54
- "name": "MiMo V2.5 Pro",
54
+ "name": "MiMo V2.5 Pro (text)",
55
55
  "url": "https://api.xiaomimimo.com/v1/chat/completions",
56
56
  "toolCalling": true,
57
57
  "vision": false,
@@ -66,7 +66,7 @@ Config file location:
66
66
  },
67
67
  {
68
68
  "id": "mimo-v2.5",
69
- "name": "MiMo V2.5",
69
+ "name": "MiMo V2.5 (vision)",
70
70
  "url": "https://api.xiaomimimo.com/v1/chat/completions",
71
71
  "toolCalling": true,
72
72
  "vision": true,
@@ -81,7 +81,7 @@ Config file location:
81
81
  },
82
82
  {
83
83
  "id": "mimo-v2-flash",
84
- "name": "MiMo V2 Flash",
84
+ "name": "MiMo V2 Flash (text)",
85
85
  "url": "https://api.xiaomimimo.com/v1/chat/completions",
86
86
  "toolCalling": true,
87
87
  "vision": false,
@@ -10,7 +10,7 @@
10
10
  | Vision | ✅ Yes (image + video) |
11
11
  | Tool calling | ✅ Yes |
12
12
  | Context | 1M (guaranteed 512K) |
13
- | Max output | 512K (recommended 128K) |
13
+ | Max output | 131072 |
14
14
  | Required `requestBody` | `thinking: { type: "adaptive" }, reasoning_split: true` |
15
15
  | Endpoint (international) | `https://api.minimax.io/v1/chat/completions` |
16
16
  | Endpoint (China) | `https://api.minimaxi.com/v1/chat/completions` |
@@ -42,7 +42,7 @@ Config file location:
42
42
  "models": [
43
43
  {
44
44
  "id": "MiniMax-M3",
45
- "name": "MiniMax M3",
45
+ "name": "MiniMax M3 (vision)",
46
46
  "url": "https://api.minimax.io/v1/chat/completions",
47
47
  "toolCalling": true,
48
48
  "vision": true,
@@ -149,7 +149,7 @@ For the cross-provider comparison, see [docs/pricing.md](../pricing.md). MiniMax
149
149
 
150
150
  \* Input tokens above 512K are available in limited quantity for a limited time.
151
151
 
152
- > **Promo:** A 7-day 50% off promotion is available for new accounts, making the 512K tier effectively $0.30 / 1M input and $1.20 / 1M output for the first week.
152
+ > **Permanent 50% off:** A standing 50% discount applies to all MiniMax-M3 pay-as-you-go usage on both the Standard and Priority tiers (verified June 9, 2026). The effective rates are $0.30 / 1M input, $1.20 / 1M output, and $0.06 / 1M cached input (≤ 512K tier).
153
153
 
154
154
  ### Token Plan (subscription)
155
155
 
@@ -1,19 +1,19 @@
1
1
  # Qwen (DashScope) — VS Code Custom Endpoint Setup Guide
2
2
 
3
- > **TL;DR:** Direct path works for `qwen3.7-plus` (vision) and `qwen3.7-max` (text-only) without a proxy. The optional `proxy/qwen-proxy.mjs` adds dynamic thinking suppression: reasoning stays ON in plain chat but turns OFF automatically when tools are invoked. Pick the mode that matches your tradeoff.
3
+ > **TL;DR:** The live config points `qwen3.7-plus` (vision) and `qwen3.7-max` (text-only) at `proxy/qwen-proxy.mjs` for dynamic thinking suppression: reasoning stays ON in plain chat but turns OFF automatically when tools are invoked. A direct DashScope path with static `enable_thinking: false` is also supported if you prefer not to run the proxy.
4
4
 
5
5
  ## At a Glance
6
6
 
7
7
  | Field | Value |
8
8
  | ------------------------------- | ------------------------------------------------------------------------- |
9
- | Mode | **Direct** (no proxy) **or** **Proxy** (optional, for dynamic thinking) |
10
- | Vision | ✅ Yes (`qwen3.7-plus`) |
11
- | Tool calling | ✅ Yes |
12
- | Context | 1M |
13
- | Required `requestBody` (direct) | `enable_thinking: false` |
14
- | Required `requestBody` (proxy) | none — proxy injects based on tool activity in the conversation |
15
- | Endpoint | `https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions` |
16
- | Proxy endpoint | `http://127.0.0.1:3458/v1/chat/completions` |
9
+ | Mode | **Proxy** (local on `:3458`) **or** **Direct** (static `enable_thinking: false`) |
10
+ | Vision | ✅ Yes (`qwen3.7-plus`) |
11
+ | Tool calling | ✅ Yes |
12
+ | Context | 1M |
13
+ | Required `requestBody` (direct) | `enable_thinking: false` |
14
+ | Required `requestBody` (proxy) | none — proxy injects based on tool activity in the conversation |
15
+ | Endpoint | `https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions` |
16
+ | Proxy endpoint | `http://127.0.0.1:3458/v1/chat/completions` |
17
17
 
18
18
  ### Models at a glance
19
19
 
@@ -22,15 +22,9 @@
22
22
  | `qwen3.7-plus` | ✅ Yes | Primary model with image understanding |
23
23
  | `qwen3.7-max` | ❌ No | Larger text-only model |
24
24
 
25
- > The snapshot `qwen3.7-plus-2026-05-26` is also available; the floating `qwen3.7-plus` alias is preferred.
25
+ > The live `chatLanguageModels.json` points Qwen models at the local proxy by default; the direct DashScope URL is shown for users who prefer a static `enable_thinking: false` setup.
26
26
 
27
- ## Quick Start — Direct Path (Recommended for Simplicity)
28
-
29
- 1. **Edit `chatLanguageModels.json`** — add the Qwen block from [Setup § Direct](#direct-path) below.
30
- 2. **Set your `DASHSCOPE_API_KEY`** via Command Palette → **Chat: Manage Language Models**.
31
- 3. **Restart VS Code** and pick "Qwen 3.7 Plus" or "Qwen 3.7 Max".
32
-
33
- ## Quick Start — With Proxy (Dynamic Thinking)
27
+ ## Quick Start — With Proxy (Recommended)
34
28
 
35
29
  1. **Start the proxy** — choose one:
36
30
  - `npm run proxy:qwen` (from the repo root)
@@ -40,6 +34,12 @@
40
34
  3. **Set your DashScope API key** via the Language Models UI.
41
35
  4. **Restart VS Code.** Reasoning will be visible in plain chat and suppressed on tool turns.
42
36
 
37
+ ## Quick Start — Direct Path (No Proxy)
38
+
39
+ 1. **Edit `chatLanguageModels.json`** — add the Qwen block from [Setup § Direct](#direct-path) below.
40
+ 2. **Set your `DASHSCOPE_API_KEY`** via Command Palette → **Chat: Manage Language Models**.
41
+ 3. **Restart VS Code** and pick "Qwen 3.7 Plus" or "Qwen 3.7 Max".
42
+
43
43
  ## Setup
44
44
 
45
45
  ### Regional endpoints
@@ -63,7 +63,7 @@ DashScope is region-specific — your API key only works on the endpoint it was
63
63
  "models": [
64
64
  {
65
65
  "id": "qwen3.7-max",
66
- "name": "Qwen 3.7 Max",
66
+ "name": "Qwen 3.7 Max (text)",
67
67
  "url": "https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions",
68
68
  "toolCalling": true,
69
69
  "vision": false,
@@ -74,7 +74,7 @@ DashScope is region-specific — your API key only works on the endpoint it was
74
74
  },
75
75
  {
76
76
  "id": "qwen3.7-plus",
77
- "name": "Qwen 3.7 Plus",
77
+ "name": "Qwen 3.7 Plus (vision)",
78
78
  "url": "https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions",
79
79
  "toolCalling": true,
80
80
  "vision": true,
@@ -89,6 +89,8 @@ DashScope is region-specific — your API key only works on the endpoint it was
89
89
 
90
90
  > **`enable_thinking: false`** suppresses the Qwen3 family's default thinking mode, which prevents `reasoning_content` issues during tool loops.
91
91
 
92
+ > **Live config note:** The checked-in `chatLanguageModels.json` points Qwen at the local proxy (`http://127.0.0.1:3458`) with no `requestBody` override, so the proxy manages `enable_thinking` dynamically. Use the snippet above only if you are not running the proxy.
93
+
92
94
  ### Proxy path
93
95
 
94
96
  #### 1. Start the proxy
@@ -132,7 +134,7 @@ Expected response:
132
134
  "models": [
133
135
  {
134
136
  "id": "qwen3.7-max",
135
- "name": "Qwen 3.7 Max",
137
+ "name": "Qwen 3.7 Max (text)",
136
138
  "url": "http://127.0.0.1:3458/v1/chat/completions",
137
139
  "toolCalling": true,
138
140
  "vision": false,
@@ -140,7 +142,7 @@ Expected response:
140
142
  },
141
143
  {
142
144
  "id": "qwen3.7-plus",
143
- "name": "Qwen 3.7 Plus",
145
+ "name": "Qwen 3.7 Plus (vision)",
144
146
  "url": "http://127.0.0.1:3458/v1/chat/completions",
145
147
  "toolCalling": true,
146
148
  "vision": true,
@@ -160,7 +162,7 @@ All can be set in a `.env` file at the repo root (both proxies `import 'dotenv/c
160
162
  | ---------------------------------------- | ------------------------------------------------------------------------- | -------------------------------------------------- |
161
163
  | `QWEN_PROXY_PORT` | `3458` (falls back to `PORT`) | Local listen port |
162
164
  | `QWEN_UPSTREAM_URL` | `https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions` | Upstream DashScope endpoint |
163
- | `QWEN_PROXY_LOG` | `debug_log/qwen-proxy.ndjson` (relative to repo root) | Redacted NDJSON log path |
165
+ | `QWEN_PROXY_LOG` | `debug_log/qwen-proxy.ndjson` (relative to proxy script) | Redacted NDJSON log path |
164
166
  | `QWEN_PROXY_DISABLE_THINKING_WITH_TOOLS` | `1` | Set to `0` to skip tool-aware thinking suppression |
165
167
 
166
168
  #### Proxy request rewriting rules
@@ -175,6 +177,8 @@ The proxy detects active tool use by examining the conversation state, not just
175
177
  > **Why delete rather than set `true`?** Omitting the key lets Qwen use its built-in default (`true`). Deletion is closer to "don't interfere."
176
178
  >
177
179
  > **Why not check `body.tools`?** The proxy checks for tool _activity_ — tool results in the message history or an explicit `tool_choice` directive — rather than the mere presence of a tools array. This correctly handles tool-enabled conversations even when the client sends `tools` in an earlier request but omits it from subsequent turns.
180
+ >
181
+ > **Proxy vs. direct:** The live config uses the proxy URL with no `requestBody` override so this dynamic behavior is applied to every request. The direct-path snippet above keeps `enable_thinking: false` static in `requestBody` as a no-proxy alternative.
178
182
 
179
183
  ### API key
180
184
 
@@ -197,6 +201,8 @@ The Qwen3 hybrid-thinking models default to `enable_thinking: true`, producing `
197
201
  | Proxy path | Thinking ON (default preserved) | Thinking OFF (auto-injected) |
198
202
  | No config (default) | Thinking ON | Risk: history may be rejected |
199
203
 
204
+ > The live `chatLanguageModels.json` uses the proxy path by default, so plain-chat reasoning is visible and tool turns are stable.
205
+
200
206
  ### Vision (`qwen3.7-plus`)
201
207
 
202
208
  - Image input via OpenAI-compatible `content` array format (base64 data URIs).
package/docs/pricing.md CHANGED
@@ -47,6 +47,7 @@ These are the models available through GitHub Copilot's model roster as of June
47
47
  | **MiMo V2 Flash** | Xiaomi | $0.10 | $0.01 | $0.30 | 256K |
48
48
  | **DeepSeek V4 Flash** | DeepSeek | $0.14 | $0.0028 | $0.28 | 1M |
49
49
  | **Kimi K2.6** | Moonshot | $0.16 | — | $0.95 (non-thinking) / $4.00 (thinking) | 262K |
50
+ | **Kimi K2.7 Code** | Moonshot | $0.19 | $0.95 | $4.00 | 262K |
50
51
  | **Qwen 3.7 Plus** | DashScope | $0.40 (≤256K) / $1.20 (>256K) | — | $1.60 (≤256K) / $4.80 (>256K) | 1M |
51
52
  | **MiMo V2.5** | Xiaomi | $0.40 | $0.08 | $2.00 | 1M |
52
53
  | **DeepSeek V4 Pro** | DeepSeek | $0.435 | $0.003625 | $0.87 | 1M |
@@ -66,7 +67,7 @@ These are the models available through GitHub Copilot's model roster as of June
66
67
  > - **Qwen** models use **tiered pricing** — determined by total input tokens per request. Prices above are for non-thinking mode.
67
68
  > - **Kimi K2.6** pricing is from the **Moonshot platform** (direct). Via DashScope: $0.89 input / $3.71 output.
68
69
  > - **DashScope** offers a **free quota** of 1M input + 1M output tokens per model, valid for 90 days.
69
- > - **MiniMax M3** uses **tiered pricing** — input price doubles above 512K input tokens. Cache hits are priced at 20% of the input rate ($0.12/M ≤512K, $0.24/M >512K). A 7-day 50% off promotion is available for new accounts.
70
+ > - **MiniMax M3** uses **tiered pricing** — input price doubles above 512K input tokens. Cache hits are priced at 20% of the input rate ($0.12/M ≤512K, $0.24/M >512K). A **permanent 50% discount** applies to all MiniMax-M3 pay-as-you-go usage (Standard and Priority tiers), making the effective rates half the list prices above.
70
71
  > - **GLM** models support prompt caching — cache hits are priced at $0.24/M for 5V Turbo and $0.26/M for 5.1.
71
72
  > - **MiMo** offers a **Token Plan** subscription model with discounted rates and a free cache-writing promotion.
72
73
  > - For typical Copilot chat usage (short-to-medium prompts), you'll almost always fall in the lowest pricing tier.
@@ -80,12 +81,13 @@ For a typical coding session (~10K input + ~2K output tokens per turn, 50 turns)
80
81
  | MiMo V2 Flash | ~$0.08 |
81
82
  | DeepSeek V4 Flash | ~$0.10 |
82
83
  | Kimi K2.6 (non-thinking) | ~$0.18 |
84
+ | MiniMax M3 (50% off) | ~$0.27 |
83
85
  | DeepSeek V4 Pro | ~$0.30 |
84
86
  | Raptor mini | ~$0.33 |
85
87
  | Qwen 3.7 Plus | ~$0.36 |
86
88
  | MiMo V2.5 | ~$0.40 |
87
89
  | Kimi K2.6 (thinking) | ~$0.48 |
88
- | MiniMax M3 | ~$0.54 |
90
+ | Kimi K2.7 Code | ~$0.50 |
89
91
  | Gemini 3 Flash | ~$0.55 |
90
92
  | MiMo V2.5 Pro | ~$0.80 |
91
93
  | GPT-5.4 mini | ~$0.83 |
@@ -102,7 +104,7 @@ For a typical coding session (~10K input + ~2K output tokens per turn, 50 turns)
102
104
 
103
105
  > **How long does 7,000 credits last?** A Pro+ subscriber running 50-turn sessions could afford roughly **13 GPT-5.5 sessions**, **23 Opus sessions**, or **212 Raptor mini sessions** per month — or mix and match. (Multiply session cost by 100 to convert to AI credits.)
104
106
 
105
- > Prices last verified: June 1, 2026. Always check the official pages for the latest rates:
107
+ > Prices last verified: June 9, 2026. Always check the official pages for the latest rates:
106
108
  >
107
109
  > - [GitHub Copilot models & pricing](https://docs.github.com/en/copilot/reference/copilot-billing/models-and-pricing)
108
110
  > - [OpenAI pricing](https://openai.com/api/pricing/)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "copilot-custom-endpoint",
3
- "version": "1.3.13",
3
+ "version": "1.4.0",
4
4
  "description": "Local proxies for VS Code Copilot custom endpoints — Kimi K2 & Qwen 3.x",
5
5
  "license": "MIT",
6
6
  "type": "module",
@@ -8,12 +8,16 @@ import { createProxy } from '../lib/create-proxy.mjs'
8
8
  * - Validated in this repo with `kimi-k2.6`.
9
9
  * - Expected to work for `kimi-k2.5`, because Kimi documents the same fixed
10
10
  * sampling and thinking behavior for `kimi-k2.6` / `kimi-k2.5`.
11
+ * - Validated in this repo with `kimi-k2.7-code` (June 14, 2026). K2.7 is
12
+ * always-thinking and rejects `thinking: { type: 'disabled' }`. The proxy
13
+ * detects K2.7 and skips the thinking-disable rewrite while keeping
14
+ * temperature/top_p enforcement.
11
15
  * - Not intended for `moonshot-v1` models or non-Kimi providers, because this
12
16
  * proxy rewrites requests to K2-family-specific values:
13
17
  * - thinking mode temperature = 1.0
14
18
  * - non-thinking mode temperature = 0.6
15
19
  * - top_p = 0.95
16
- * - tool-enabled requests force `thinking: { type: 'disabled' }`
20
+ * - tool-enabled requests force `thinking: { type: 'disabled' }` (K2.5/K2.6 only)
17
21
  */
18
22
  const upstreamUrl =
19
23
  process.env.KIMI_UPSTREAM_URL ?? 'https://api.moonshot.ai/v1/chat/completions'
@@ -104,6 +108,12 @@ function rewriteKimi(payload) {
104
108
  const incomingTemperature = payload.temperature
105
109
  const incomingTopP = payload.top_p
106
110
  const incomingThinkingType = payload?.thinking?.type
111
+ const model = payload.model ?? ''
112
+
113
+ // K2.7 is always-thinking and rejects thinking: disabled.
114
+ // Detect K2.7 variants (e.g. kimi-k2.7-code) and skip the thinking-disable
115
+ // rewrite while keeping temperature/top_p enforcement.
116
+ const isK27 = model.startsWith('kimi-k2.7')
107
117
 
108
118
  // Determine if a tool is actually being invoked:
109
119
  // - tool_choice is set and not "none"
@@ -116,7 +126,7 @@ function rewriteKimi(payload) {
116
126
  (toolChoice !== undefined && toolChoice !== 'none' && toolChoice !== null)
117
127
  const hasTools = hasActiveToolCall
118
128
 
119
- const useNonThinkingMode = disableThinkingWithTools && hasTools
129
+ const useNonThinkingMode = !isK27 && disableThinkingWithTools && hasTools
120
130
  const rewrittenTemperature = useNonThinkingMode
121
131
  ? forcedNonThinkingTemperature
122
132
  : forcedTemperature
@@ -134,6 +144,8 @@ function rewriteKimi(payload) {
134
144
 
135
145
  const rewrittenThinkingType = payload.thinking?.type
136
146
  const rewriteInfo = {
147
+ model,
148
+ isK27,
137
149
  incomingTemperature,
138
150
  rewrittenTemperature,
139
151
  incomingTopP,
@@ -145,7 +157,8 @@ function rewriteKimi(payload) {
145
157
  const summary = summarizePayload(payload, hasTools, rewriteInfo)
146
158
 
147
159
  const modeTag = hasTools ? '[tools]' : '[chat]'
148
- const consoleMsg = `${modeTag} temperature ${String(incomingTemperature)} -> ${String(rewrittenTemperature)}, top_p ${String(incomingTopP)} -> ${String(forcedTopP)}, thinking ${String(incomingThinkingType)} -> ${String(rewrittenThinkingType)}`
160
+ const k27Tag = isK27 ? '[k2.7]' : ''
161
+ const consoleMsg = `${k27Tag}${modeTag} temperature ${String(incomingTemperature)} -> ${String(rewrittenTemperature)}, top_p ${String(incomingTopP)} -> ${String(forcedTopP)}, thinking ${String(incomingThinkingType)} -> ${String(rewrittenThinkingType)}`
149
162
 
150
163
  // Clean up internal key before forwarding
151
164
  delete payload.__incomingThinkingType