copilot-custom-endpoint 1.3.14 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -4
- package/docs/example-config.md +21 -12
- package/docs/models/glm.md +5 -5
- package/docs/models/kimi.md +83 -31
- package/docs/models/mimo.md +4 -4
- package/docs/models/minimax.md +2 -2
- package/docs/models/qwen.md +28 -22
- package/docs/pricing.md +2 -0
- package/package.json +1 -1
- package/proxy/kimi-proxy.mjs +16 -3
package/README.md
CHANGED
|
@@ -24,9 +24,9 @@ That's it. No code, no servers to manage (unless the model specifically needs th
|
|
|
24
24
|
| **MiMo V2 Flash** | Xiaomi | No | ❌ | [Setup](docs/models/mimo.md) |
|
|
25
25
|
| **MiMo V2.5** | Xiaomi | No | ✅ | [Setup](docs/models/mimo.md) |
|
|
26
26
|
| **MiMo V2.5 Pro** | Xiaomi | No | ❌ | [Setup](docs/models/mimo.md) |
|
|
27
|
-
| **Kimi K2.6**
|
|
28
|
-
| **Qwen 3.7 Plus** | DashScope | Optional
|
|
29
|
-
| **Qwen 3.7 Max** | DashScope | Optional
|
|
27
|
+
| **Kimi K2.7 Code / K2.6** | Moonshot | **Yes** | ✅ | [Setup](docs/models/kimi.md) |
|
|
28
|
+
| **Qwen 3.7 Plus** | DashScope | Optional (recommended) | ✅ | [Setup](docs/models/qwen.md) |
|
|
29
|
+
| **Qwen 3.7 Max** | DashScope | Optional (recommended) | ❌ | [Setup](docs/models/qwen.md) |
|
|
30
30
|
| **MiniMax M3** | MiniMax | No | ✅ | [Setup](docs/models/minimax.md) |
|
|
31
31
|
| **GLM 5.1** | Z.ai | No | ❌ | [Setup](docs/models/glm.md) |
|
|
32
32
|
| **GLM 5V Turbo** | Z.ai | No | ✅ | [Setup](docs/models/glm.md) |
|
|
@@ -94,7 +94,8 @@ All prices are **USD per 1M tokens** (cache miss). 1 AI credit = $0.01. **MiniMa
|
|
|
94
94
|
| ---------------------------- | ----- | ------ | ------- |
|
|
95
95
|
| **MiMo V2 Flash** 🏆 | $0.10 | $0.30 | 256K |
|
|
96
96
|
| **DeepSeek V4 Flash** 🏆 | $0.14 | $0.28 | 1M |
|
|
97
|
-
| **Kimi K2.6** (non-thinking) | $0.16 | $0.95 |
|
|
97
|
+
| **Kimi K2.6** (non-thinking) | $0.16 | $0.95 | 262K |
|
|
98
|
+
| **Kimi K2.7 Code** | $0.19 | $4.00 | 262K |
|
|
98
99
|
| **MiniMax M3** | $0.30 | $1.20 | 1M |
|
|
99
100
|
| **MiMo V2.5** | $0.40 | $2.00 | 1M |
|
|
100
101
|
| **Qwen 3.7 Plus** | $0.40 | $1.60 | 1M |
|
package/docs/example-config.md
CHANGED
|
@@ -3,6 +3,8 @@
|
|
|
3
3
|
Here's a complete, real-world `chatLanguageModels.json` that combines **all the providers documented in this repo**. Copy what you need, leave the rest out.
|
|
4
4
|
|
|
5
5
|
> **Note:** The `apiKey` fields are left as empty strings — set them via the **Chat: Manage Language Models** UI (Command Palette → right-click provider group → **Update API Key**). After you set a key via the UI, VS Code replaces the empty string with a `${input:chat.lm.secret.<id>}` secret reference.
|
|
6
|
+
>
|
|
7
|
+
> This combined config reflects the same provider blocks as the live `chatLanguageModels.json`. Qwen is pointed at the local proxy; remove `requestBody.enable_thinking` when using the proxy.
|
|
6
8
|
|
|
7
9
|
```json
|
|
8
10
|
[
|
|
@@ -15,24 +17,18 @@ Here's a complete, real-world `chatLanguageModels.json` that combines **all the
|
|
|
15
17
|
{
|
|
16
18
|
"id": "qwen3.7-max",
|
|
17
19
|
"name": "Qwen 3.7 Max (text)",
|
|
18
|
-
"url": "
|
|
20
|
+
"url": "http://127.0.0.1:3458/v1/chat/completions",
|
|
19
21
|
"toolCalling": true,
|
|
20
22
|
"vision": false,
|
|
21
|
-
"streaming": true
|
|
22
|
-
"requestBody": {
|
|
23
|
-
"enable_thinking": false
|
|
24
|
-
}
|
|
23
|
+
"streaming": true
|
|
25
24
|
},
|
|
26
25
|
{
|
|
27
26
|
"id": "qwen3.7-plus",
|
|
28
27
|
"name": "Qwen 3.7 Plus (vision)",
|
|
29
|
-
"url": "
|
|
28
|
+
"url": "http://127.0.0.1:3458/v1/chat/completions",
|
|
30
29
|
"toolCalling": true,
|
|
31
30
|
"vision": true,
|
|
32
|
-
"streaming": true
|
|
33
|
-
"requestBody": {
|
|
34
|
-
"enable_thinking": false
|
|
35
|
-
}
|
|
31
|
+
"streaming": true
|
|
36
32
|
}
|
|
37
33
|
]
|
|
38
34
|
},
|
|
@@ -54,6 +50,20 @@ Here's a complete, real-world `chatLanguageModels.json` that combines **all the
|
|
|
54
50
|
"streaming": true,
|
|
55
51
|
"maxInputTokens": 262144,
|
|
56
52
|
"maxOutputTokens": 32768
|
|
53
|
+
},
|
|
54
|
+
{
|
|
55
|
+
"id": "kimi-k2.7-code",
|
|
56
|
+
"name": "Kimi K2.7 Code (vision)",
|
|
57
|
+
"url": "http://127.0.0.1:3457/v1/chat/completions",
|
|
58
|
+
"requestBody": {
|
|
59
|
+
"temperature": 1,
|
|
60
|
+
"max_tokens": 4096
|
|
61
|
+
},
|
|
62
|
+
"toolCalling": true,
|
|
63
|
+
"vision": true,
|
|
64
|
+
"streaming": true,
|
|
65
|
+
"maxInputTokens": 262144,
|
|
66
|
+
"maxOutputTokens": 4096
|
|
57
67
|
}
|
|
58
68
|
]
|
|
59
69
|
},
|
|
@@ -155,7 +165,6 @@ Here's a complete, real-world `chatLanguageModels.json` that combines **all the
|
|
|
155
165
|
"top_p": 0.95
|
|
156
166
|
}
|
|
157
167
|
},
|
|
158
|
-
|
|
159
168
|
{
|
|
160
169
|
"id": "glm-5v-turbo",
|
|
161
170
|
"name": "GLM 5V Turbo (vision)",
|
|
@@ -180,7 +189,7 @@ Here's a complete, real-world `chatLanguageModels.json` that combines **all the
|
|
|
180
189
|
|
|
181
190
|
If you only need one provider, jump straight to its setup guide:
|
|
182
191
|
|
|
183
|
-
- [Kimi K2.6](kimi.md)
|
|
192
|
+
- [Kimi K2.6 / K2.7 Code](kimi.md)
|
|
184
193
|
- [Qwen 3.7 Plus / 3.7 Max](qwen.md)
|
|
185
194
|
- [Xiaomi MiMo (V2.5 / V2.5 Pro / V2 Flash)](mimo.md)
|
|
186
195
|
- [MiniMax M3](minimax.md)
|
package/docs/models/glm.md
CHANGED
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
| Vision | ✅ Yes (`glm-5v-turbo` only) |
|
|
11
11
|
| Tool calling | ✅ Yes (native multimodal tool use on `glm-5v-turbo`) |
|
|
12
12
|
| Context (flagship) | 200K (`glm-5.1` / `glm-5v-turbo`) |
|
|
13
|
-
| Max output (flagship) |
|
|
13
|
+
| Max output (flagship) | 131072 |
|
|
14
14
|
| Required `requestBody` | `thinking: { type: "enabled" }` (recommended) |
|
|
15
15
|
| Endpoint (intl) | `https://api.z.ai/api/paas/v4/chat/completions` |
|
|
16
16
|
| Endpoint (China) | `https://open.bigmodel.cn/api/paas/v4/chat/completions` |
|
|
@@ -20,8 +20,8 @@
|
|
|
20
20
|
|
|
21
21
|
| Model | Vision | Context | Max output | Thinking | Cost (in / out per 1M) | Role |
|
|
22
22
|
| -------------- | ------ | ------- | ---------- | --------- | ---------------------- | --------------------------------------------------------- |
|
|
23
|
-
| `glm-5.1` | ❌ | 200K |
|
|
24
|
-
| `glm-5v-turbo` | ✅ | 200K |
|
|
23
|
+
| `glm-5.1` | ❌ | 200K | 131072 | `enabled` | $1.40 / $4.40 | Current flagship — long-horizon / 8h autonomous work |
|
|
24
|
+
| `glm-5v-turbo` | ✅ | 200K | 131072 | `enabled` | $1.20 / $4.00 | Multimodal **coding** model — vision-based agentic coding |
|
|
25
25
|
|
|
26
26
|
> Other GLM models — `glm-5`, `glm-5-turbo`, `glm-4.6v-flashx`, `glm-4.5`, `glm-4.5-air`, `glm-4.5-flash`, `glm-4.5-x`, `glm-4.5-airx`, `glm-4-32b-0414-128k` — are callable on the same endpoint but are intentionally **not** added to the default `chatLanguageModels.json` block below. Add them in the same shape if you need them. Note: `glm-4.6v-flashx` was previously in the default block but has been **removed** because live testing showed it is not reliable for tool calling.
|
|
27
27
|
|
|
@@ -54,7 +54,7 @@ Config file location:
|
|
|
54
54
|
"models": [
|
|
55
55
|
{
|
|
56
56
|
"id": "glm-5.1",
|
|
57
|
-
"name": "GLM 5.1 (
|
|
57
|
+
"name": "GLM 5.1 (text)",
|
|
58
58
|
"url": "https://api.z.ai/api/paas/v4/chat/completions",
|
|
59
59
|
"toolCalling": true,
|
|
60
60
|
"vision": false,
|
|
@@ -69,7 +69,7 @@ Config file location:
|
|
|
69
69
|
},
|
|
70
70
|
{
|
|
71
71
|
"id": "glm-5v-turbo",
|
|
72
|
-
"name": "GLM 5V Turbo (vision
|
|
72
|
+
"name": "GLM 5V Turbo (vision)",
|
|
73
73
|
"url": "https://api.z.ai/api/paas/v4/chat/completions",
|
|
74
74
|
"toolCalling": true,
|
|
75
75
|
"vision": true,
|
package/docs/models/kimi.md
CHANGED
|
@@ -1,19 +1,36 @@
|
|
|
1
1
|
# Kimi — VS Code Custom Endpoint Setup Guide
|
|
2
2
|
|
|
3
|
-
> **TL;DR:** Kimi
|
|
3
|
+
> **TL;DR:** Kimi models require the local proxy. The K2 family locks `temperature: 1` and `top_p: 0.95`. K2.6 requires `thinking: { type: "disabled" }` on tool turns; **K2.7 Code is always-thinking and rejects `thinking: disabled`**, so the proxy detects `kimi-k2.7*` and skips that rewrite while keeping sampling enforcement. Direct VS Code → Moonshot integration is not viable in this environment.
|
|
4
4
|
|
|
5
5
|
## At a Glance
|
|
6
6
|
|
|
7
|
-
| Field
|
|
8
|
-
|
|
|
9
|
-
| Mode
|
|
10
|
-
| Vision
|
|
11
|
-
| Tool calling
|
|
12
|
-
|
|
|
13
|
-
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
7
|
+
| Field | Value |
|
|
8
|
+
| ----------------- | --------------------------------------------- |
|
|
9
|
+
| Mode | **Proxy required** (local on `:3457`) |
|
|
10
|
+
| Vision | ✅ Yes |
|
|
11
|
+
| Tool calling | ✅ Yes |
|
|
12
|
+
| Upstream endpoint | `https://api.moonshot.ai/v1/chat/completions` |
|
|
13
|
+
| Proxy endpoint | `http://127.0.0.1:3457/v1/chat/completions` |
|
|
14
|
+
|
|
15
|
+
### K2.6
|
|
16
|
+
|
|
17
|
+
| Field | Value |
|
|
18
|
+
| ---------------------- | ------------------------------------ |
|
|
19
|
+
| Model id | `kimi-k2.6` |
|
|
20
|
+
| Context | 262K |
|
|
21
|
+
| Max output | 32768 |
|
|
22
|
+
| Required `requestBody` | `temperature: 1` |
|
|
23
|
+
| Tool calling | ✅ Proxy forces `thinking: disabled` |
|
|
24
|
+
|
|
25
|
+
### K2.7 Code
|
|
26
|
+
|
|
27
|
+
| Field | Value |
|
|
28
|
+
| ---------------------- | ---------------------------------------------------------- |
|
|
29
|
+
| Model id | `kimi-k2.7-code` |
|
|
30
|
+
| Context | 262K |
|
|
31
|
+
| Max output | 4096 |
|
|
32
|
+
| Required `requestBody` | `temperature: 1`, `max_tokens: 4096` |
|
|
33
|
+
| Tool calling | ✅ Proxy lets K2.7 think (it rejects `thinking: disabled`) |
|
|
17
34
|
|
|
18
35
|
## Quick Start
|
|
19
36
|
|
|
@@ -23,7 +40,7 @@
|
|
|
23
40
|
- `npx copilot-custom-endpoint` (also starts the Qwen proxy concurrently)
|
|
24
41
|
2. **Edit `chatLanguageModels.json`** — add the Kimi block from [Setup](#setup) below.
|
|
25
42
|
3. **Set your Moonshot API key** via the Command Palette → **Chat: Manage Language Models**.
|
|
26
|
-
4. **Restart VS Code** and pick "Kimi K2.6" in the chat picker.
|
|
43
|
+
4. **Restart VS Code** and pick "Kimi K2.6" or "Kimi K2.7 Code" in the chat picker.
|
|
27
44
|
|
|
28
45
|
## Setup
|
|
29
46
|
|
|
@@ -46,7 +63,7 @@ Config file location:
|
|
|
46
63
|
"models": [
|
|
47
64
|
{
|
|
48
65
|
"id": "kimi-k2.6",
|
|
49
|
-
"name": "Kimi K2.6",
|
|
66
|
+
"name": "Kimi K2.6 (vision)",
|
|
50
67
|
"url": "http://127.0.0.1:3457/v1/chat/completions",
|
|
51
68
|
"requestBody": {
|
|
52
69
|
"temperature": 1
|
|
@@ -56,11 +73,27 @@ Config file location:
|
|
|
56
73
|
"streaming": true,
|
|
57
74
|
"maxInputTokens": 262144,
|
|
58
75
|
"maxOutputTokens": 32768
|
|
76
|
+
},
|
|
77
|
+
{
|
|
78
|
+
"id": "kimi-k2.7-code",
|
|
79
|
+
"name": "Kimi K2.7 Code",
|
|
80
|
+
"url": "http://127.0.0.1:3457/v1/chat/completions",
|
|
81
|
+
"requestBody": {
|
|
82
|
+
"temperature": 1,
|
|
83
|
+
"max_tokens": 4096
|
|
84
|
+
},
|
|
85
|
+
"toolCalling": true,
|
|
86
|
+
"vision": true,
|
|
87
|
+
"streaming": true,
|
|
88
|
+
"maxInputTokens": 262144,
|
|
89
|
+
"maxOutputTokens": 4096
|
|
59
90
|
}
|
|
60
91
|
]
|
|
61
92
|
}
|
|
62
93
|
```
|
|
63
94
|
|
|
95
|
+
> **K2.7 note:** `max_tokens` and `maxOutputTokens` are intentionally conservative at **4096**. K2.7 is always-thinking, so reasoning tokens inflate response size. Values in the 24K–32K range triggered VS Code's "Response too long" error in agent mode during validation, while 4096 was validated successfully. Raise this only if you have tested your specific workload.
|
|
96
|
+
|
|
64
97
|
### 2. API key
|
|
65
98
|
|
|
66
99
|
1. Open the Command Palette (`Ctrl+Shift+P`).
|
|
@@ -84,15 +117,15 @@ Config file location:
|
|
|
84
117
|
|
|
85
118
|
All can be set in a `.env` file at the repo root (both proxies `import 'dotenv/config'` automatically).
|
|
86
119
|
|
|
87
|
-
| Variable | Default
|
|
88
|
-
| ------------------------------------------- |
|
|
89
|
-
| `KIMI_PROXY_PORT` | `3457` (falls back to `PORT`)
|
|
90
|
-
| `KIMI_UPSTREAM_URL` | `https://api.moonshot.ai/v1/chat/completions`
|
|
91
|
-
| `KIMI_PROXY_FORCE_TEMPERATURE` | `1`
|
|
92
|
-
| `KIMI_PROXY_FORCE_NON_THINKING_TEMPERATURE` | `0.6`
|
|
93
|
-
| `KIMI_PROXY_FORCE_TOP_P` | `0.95`
|
|
94
|
-
| `KIMI_PROXY_DISABLE_THINKING_WITH_TOOLS` | `1`
|
|
95
|
-
| `KIMI_PROXY_LOG` | `debug_log/kimi-proxy.ndjson` (relative to
|
|
120
|
+
| Variable | Default | Purpose |
|
|
121
|
+
| ------------------------------------------- | -------------------------------------------------------- | ------------------------------------------------------- |
|
|
122
|
+
| `KIMI_PROXY_PORT` | `3457` (falls back to `PORT`) | Local listen port |
|
|
123
|
+
| `KIMI_UPSTREAM_URL` | `https://api.moonshot.ai/v1/chat/completions` | Upstream Moonshot endpoint |
|
|
124
|
+
| `KIMI_PROXY_FORCE_TEMPERATURE` | `1` | Temperature for thinking-mode requests |
|
|
125
|
+
| `KIMI_PROXY_FORCE_NON_THINKING_TEMPERATURE` | `0.6` | Temperature when thinking is disabled (tool requests) |
|
|
126
|
+
| `KIMI_PROXY_FORCE_TOP_P` | `0.95` | `top_p` forced into request body |
|
|
127
|
+
| `KIMI_PROXY_DISABLE_THINKING_WITH_TOOLS` | `1` | Force `thinking={"type":"disabled"}` when tools present |
|
|
128
|
+
| `KIMI_PROXY_LOG` | `debug_log/kimi-proxy.ndjson` (relative to proxy script) | Redacted NDJSON log path |
|
|
96
129
|
|
|
97
130
|
#### Health check response
|
|
98
131
|
|
|
@@ -110,7 +143,8 @@ All can be set in a `.env` file at the repo root (both proxies `import 'dotenv/c
|
|
|
110
143
|
|
|
111
144
|
- Forwards the existing `Authorization` header upstream.
|
|
112
145
|
- Rewrites plain-chat requests to `temperature: 1` and `top_p: 0.95`.
|
|
113
|
-
-
|
|
146
|
+
- For **K2.5/K2.6**: rewrites tool-enabled requests to `thinking: {"type": "disabled"}`, `temperature: 0.6`, and `top_p: 0.95`.
|
|
147
|
+
- For **K2.7 Code**: keeps thinking enabled (K2.7 rejects `thinking: disabled` with HTTP 400) and rewrites to `temperature: 1`, `top_p: 0.95`.
|
|
114
148
|
- Preserves streaming responses.
|
|
115
149
|
- Writes redacted request summaries to `debug_log/kimi-proxy.ndjson`.
|
|
116
150
|
|
|
@@ -125,10 +159,11 @@ All can be set in a `.env` file at the repo root (both proxies `import 'dotenv/c
|
|
|
125
159
|
|
|
126
160
|
### Thinking mode
|
|
127
161
|
|
|
128
|
-
| Turn type | Behavior |
|
|
129
|
-
| ------------ | ----------------------------------------------------------- |
|
|
130
|
-
| Plain chat | Thinking enabled, `temperature: 1`
|
|
131
|
-
| Tool-enabled | `thinking: { type: "disabled" }` forced, `temperature: 0.6` |
|
|
162
|
+
| Model | Turn type | Behavior |
|
|
163
|
+
| ----------- | ------------ | ----------------------------------------------------------- |
|
|
164
|
+
| K2.5 / K2.6 | Plain chat | Thinking enabled, `temperature: 1`, `top_p: 0.95` |
|
|
165
|
+
| K2.5 / K2.6 | Tool-enabled | `thinking: { type: "disabled" }` forced, `temperature: 0.6`, `top_p: 0.95` |
|
|
166
|
+
| K2.7 Code | All turns | Always-thinking, `temperature: 1`, `top_p: 0.95` |
|
|
132
167
|
|
|
133
168
|
### Capabilities
|
|
134
169
|
|
|
@@ -151,12 +186,14 @@ All can be set in a `.env` file at the repo root (both proxies `import 'dotenv/c
|
|
|
151
186
|
|
|
152
187
|
## Pricing
|
|
153
188
|
|
|
154
|
-
For the cross-provider comparison, see [docs/pricing.md](../pricing.md). Kimi
|
|
189
|
+
For the cross-provider comparison, see [docs/pricing.md](../pricing.md). Kimi models on the **Moonshot direct platform**:
|
|
155
190
|
|
|
156
|
-
| Model
|
|
157
|
-
|
|
|
158
|
-
| `kimi-k2.6`
|
|
191
|
+
| Model | Input | Cached input | Output (non-thinking) | Output (thinking) |
|
|
192
|
+
| ---------------- | ---------- | ------------ | --------------------- | ----------------- |
|
|
193
|
+
| `kimi-k2.6` | $0.16 / 1M | — | $0.95 / 1M | $4.00 / 1M |
|
|
194
|
+
| `kimi-k2.7-code` | $0.19 / 1M | $0.95 / 1M | — | $4.00 / 1M |
|
|
159
195
|
|
|
196
|
+
> **K2.7:** No non-thinking mode — always-thinking. Cached input pricing applies.
|
|
160
197
|
> Via DashScope, K2.6 is also available at $0.89 / 1M input and $3.71 / 1M output (same model, regional pricing).
|
|
161
198
|
|
|
162
199
|
---
|
|
@@ -213,11 +250,25 @@ The model-level `requestBody.temperature = 1` override validated locally but was
|
|
|
213
250
|
- Redacted proxy logs confirmed `temperature 0.1 -> 1` and `top_p 1 -> 0.95` for plain-chat requests.
|
|
214
251
|
- Redacted proxy logs later confirmed `thinking undefined -> disabled` and `temperature 0.1 -> 0.6` for tool-enabled requests.
|
|
215
252
|
|
|
253
|
+
### K2.7 Code validation results (June 14, 2026)
|
|
254
|
+
|
|
255
|
+
| Check | Result |
|
|
256
|
+
| ----------------------------------------------------- | ------------------------------------------ |
|
|
257
|
+
| `GET /v1/models` — slug confirmed | ✅ `kimi-k2.7-code` |
|
|
258
|
+
| Plain chat via proxy | ✅ |
|
|
259
|
+
| Tool turn with `thinking: disabled` | ❌ HTTP 400 — rejected by model |
|
|
260
|
+
| Tool turn letting K2.7 think | ✅ |
|
|
261
|
+
| Two-turn tool loop via proxy | ✅ No `reasoning_content is missing` error |
|
|
262
|
+
| VS Code Agent mode — integrated browser opened Google | ✅ |
|
|
263
|
+
| `maxOutputTokens` 24K–32K in agent mode | ❌ VS Code "Response too long" |
|
|
264
|
+
| `maxOutputTokens` 4096 in agent mode | ✅ |
|
|
265
|
+
|
|
216
266
|
### Final verdict
|
|
217
267
|
|
|
218
268
|
- Acceptable for plain chat: **yes** (proxy)
|
|
219
269
|
- Acceptable for streaming chat: **yes** (proxy)
|
|
220
270
|
- Acceptable for tool-enabled agent use: **yes**, with the local proxy workaround
|
|
271
|
+
- K2.7 specifically: **yes**, but keep `maxOutputTokens` low (4096 validated) to avoid VS Code's response-size limit
|
|
221
272
|
- Acceptable without a proxy: **no**
|
|
222
273
|
|
|
223
274
|
## References
|
|
@@ -233,3 +284,4 @@ The model-level `requestBody.temperature = 1` override validated locally but was
|
|
|
233
284
|
- Kimi web search guide: `https://platform.kimi.ai/docs/guide/use-web-search.md`
|
|
234
285
|
- Kimi coding tools / agent guide: `https://platform.kimi.ai/docs/guide/agent-support.md`
|
|
235
286
|
- Kimi K2.6 pricing: `https://platform.kimi.ai/docs/pricing/chat-k26`
|
|
287
|
+
- Kimi K2.7 Code pricing: `https://platform.kimi.ai/docs/pricing/chat-k27-code`
|
package/docs/models/mimo.md
CHANGED
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
| Vision | ✅ Yes (`mimo-v2.5` only) |
|
|
11
11
|
| Tool calling | ✅ Yes (with `thinking: disabled`) |
|
|
12
12
|
| Context | 1M (V2.5 Pro / V2.5) / 256K (V2 Flash) |
|
|
13
|
-
| Max output |
|
|
13
|
+
| Max output | 131072 (V2.5 Pro) / 32768 (V2.5) / 65536 (V2 Flash) |
|
|
14
14
|
| Required `requestBody` | `thinking: { type: "disabled" }` |
|
|
15
15
|
| Endpoint | `https://api.xiaomimimo.com/v1/chat/completions` |
|
|
16
16
|
|
|
@@ -51,7 +51,7 @@ Config file location:
|
|
|
51
51
|
"models": [
|
|
52
52
|
{
|
|
53
53
|
"id": "mimo-v2.5-pro",
|
|
54
|
-
"name": "MiMo V2.5 Pro",
|
|
54
|
+
"name": "MiMo V2.5 Pro (text)",
|
|
55
55
|
"url": "https://api.xiaomimimo.com/v1/chat/completions",
|
|
56
56
|
"toolCalling": true,
|
|
57
57
|
"vision": false,
|
|
@@ -66,7 +66,7 @@ Config file location:
|
|
|
66
66
|
},
|
|
67
67
|
{
|
|
68
68
|
"id": "mimo-v2.5",
|
|
69
|
-
"name": "MiMo V2.5",
|
|
69
|
+
"name": "MiMo V2.5 (vision)",
|
|
70
70
|
"url": "https://api.xiaomimimo.com/v1/chat/completions",
|
|
71
71
|
"toolCalling": true,
|
|
72
72
|
"vision": true,
|
|
@@ -81,7 +81,7 @@ Config file location:
|
|
|
81
81
|
},
|
|
82
82
|
{
|
|
83
83
|
"id": "mimo-v2-flash",
|
|
84
|
-
"name": "MiMo V2 Flash",
|
|
84
|
+
"name": "MiMo V2 Flash (text)",
|
|
85
85
|
"url": "https://api.xiaomimimo.com/v1/chat/completions",
|
|
86
86
|
"toolCalling": true,
|
|
87
87
|
"vision": false,
|
package/docs/models/minimax.md
CHANGED
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
| Vision | ✅ Yes (image + video) |
|
|
11
11
|
| Tool calling | ✅ Yes |
|
|
12
12
|
| Context | 1M (guaranteed 512K) |
|
|
13
|
-
| Max output |
|
|
13
|
+
| Max output | 131072 |
|
|
14
14
|
| Required `requestBody` | `thinking: { type: "adaptive" }, reasoning_split: true` |
|
|
15
15
|
| Endpoint (international) | `https://api.minimax.io/v1/chat/completions` |
|
|
16
16
|
| Endpoint (China) | `https://api.minimaxi.com/v1/chat/completions` |
|
|
@@ -42,7 +42,7 @@ Config file location:
|
|
|
42
42
|
"models": [
|
|
43
43
|
{
|
|
44
44
|
"id": "MiniMax-M3",
|
|
45
|
-
"name": "MiniMax M3",
|
|
45
|
+
"name": "MiniMax M3 (vision)",
|
|
46
46
|
"url": "https://api.minimax.io/v1/chat/completions",
|
|
47
47
|
"toolCalling": true,
|
|
48
48
|
"vision": true,
|
package/docs/models/qwen.md
CHANGED
|
@@ -1,19 +1,19 @@
|
|
|
1
1
|
# Qwen (DashScope) — VS Code Custom Endpoint Setup Guide
|
|
2
2
|
|
|
3
|
-
> **TL;DR:**
|
|
3
|
+
> **TL;DR:** The live config points `qwen3.7-plus` (vision) and `qwen3.7-max` (text-only) at `proxy/qwen-proxy.mjs` for dynamic thinking suppression: reasoning stays ON in plain chat but turns OFF automatically when tools are invoked. A direct DashScope path with static `enable_thinking: false` is also supported if you prefer not to run the proxy.
|
|
4
4
|
|
|
5
5
|
## At a Glance
|
|
6
6
|
|
|
7
7
|
| Field | Value |
|
|
8
8
|
| ------------------------------- | ------------------------------------------------------------------------- |
|
|
9
|
-
| Mode | **
|
|
10
|
-
| Vision | ✅ Yes (`qwen3.7-plus`)
|
|
11
|
-
| Tool calling | ✅ Yes
|
|
12
|
-
| Context | 1M
|
|
13
|
-
| Required `requestBody` (direct) | `enable_thinking: false`
|
|
14
|
-
| Required `requestBody` (proxy) | none — proxy injects based on tool activity in the conversation
|
|
15
|
-
| Endpoint | `https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions`
|
|
16
|
-
| Proxy endpoint | `http://127.0.0.1:3458/v1/chat/completions`
|
|
9
|
+
| Mode | **Proxy** (local on `:3458`) **or** **Direct** (static `enable_thinking: false`) |
|
|
10
|
+
| Vision | ✅ Yes (`qwen3.7-plus`) |
|
|
11
|
+
| Tool calling | ✅ Yes |
|
|
12
|
+
| Context | 1M |
|
|
13
|
+
| Required `requestBody` (direct) | `enable_thinking: false` |
|
|
14
|
+
| Required `requestBody` (proxy) | none — proxy injects based on tool activity in the conversation |
|
|
15
|
+
| Endpoint | `https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions` |
|
|
16
|
+
| Proxy endpoint | `http://127.0.0.1:3458/v1/chat/completions` |
|
|
17
17
|
|
|
18
18
|
### Models at a glance
|
|
19
19
|
|
|
@@ -22,15 +22,9 @@
|
|
|
22
22
|
| `qwen3.7-plus` | ✅ Yes | Primary model with image understanding |
|
|
23
23
|
| `qwen3.7-max` | ❌ No | Larger text-only model |
|
|
24
24
|
|
|
25
|
-
> The
|
|
25
|
+
> The live `chatLanguageModels.json` points Qwen models at the local proxy by default; the direct DashScope URL is shown for users who prefer a static `enable_thinking: false` setup.
|
|
26
26
|
|
|
27
|
-
## Quick Start —
|
|
28
|
-
|
|
29
|
-
1. **Edit `chatLanguageModels.json`** — add the Qwen block from [Setup § Direct](#direct-path) below.
|
|
30
|
-
2. **Set your `DASHSCOPE_API_KEY`** via Command Palette → **Chat: Manage Language Models**.
|
|
31
|
-
3. **Restart VS Code** and pick "Qwen 3.7 Plus" or "Qwen 3.7 Max".
|
|
32
|
-
|
|
33
|
-
## Quick Start — With Proxy (Dynamic Thinking)
|
|
27
|
+
## Quick Start — With Proxy (Recommended)
|
|
34
28
|
|
|
35
29
|
1. **Start the proxy** — choose one:
|
|
36
30
|
- `npm run proxy:qwen` (from the repo root)
|
|
@@ -40,6 +34,12 @@
|
|
|
40
34
|
3. **Set your DashScope API key** via the Language Models UI.
|
|
41
35
|
4. **Restart VS Code.** Reasoning will be visible in plain chat and suppressed on tool turns.
|
|
42
36
|
|
|
37
|
+
## Quick Start — Direct Path (No Proxy)
|
|
38
|
+
|
|
39
|
+
1. **Edit `chatLanguageModels.json`** — add the Qwen block from [Setup § Direct](#direct-path) below.
|
|
40
|
+
2. **Set your `DASHSCOPE_API_KEY`** via Command Palette → **Chat: Manage Language Models**.
|
|
41
|
+
3. **Restart VS Code** and pick "Qwen 3.7 Plus" or "Qwen 3.7 Max".
|
|
42
|
+
|
|
43
43
|
## Setup
|
|
44
44
|
|
|
45
45
|
### Regional endpoints
|
|
@@ -63,7 +63,7 @@ DashScope is region-specific — your API key only works on the endpoint it was
|
|
|
63
63
|
"models": [
|
|
64
64
|
{
|
|
65
65
|
"id": "qwen3.7-max",
|
|
66
|
-
"name": "Qwen 3.7 Max",
|
|
66
|
+
"name": "Qwen 3.7 Max (text)",
|
|
67
67
|
"url": "https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions",
|
|
68
68
|
"toolCalling": true,
|
|
69
69
|
"vision": false,
|
|
@@ -74,7 +74,7 @@ DashScope is region-specific — your API key only works on the endpoint it was
|
|
|
74
74
|
},
|
|
75
75
|
{
|
|
76
76
|
"id": "qwen3.7-plus",
|
|
77
|
-
"name": "Qwen 3.7 Plus",
|
|
77
|
+
"name": "Qwen 3.7 Plus (vision)",
|
|
78
78
|
"url": "https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions",
|
|
79
79
|
"toolCalling": true,
|
|
80
80
|
"vision": true,
|
|
@@ -89,6 +89,8 @@ DashScope is region-specific — your API key only works on the endpoint it was
|
|
|
89
89
|
|
|
90
90
|
> **`enable_thinking: false`** suppresses the Qwen3 family's default thinking mode, which prevents `reasoning_content` issues during tool loops.
|
|
91
91
|
|
|
92
|
+
> **Live config note:** The checked-in `chatLanguageModels.json` points Qwen at the local proxy (`http://127.0.0.1:3458`) with no `requestBody` override, so the proxy manages `enable_thinking` dynamically. Use the snippet above only if you are not running the proxy.
|
|
93
|
+
|
|
92
94
|
### Proxy path
|
|
93
95
|
|
|
94
96
|
#### 1. Start the proxy
|
|
@@ -132,7 +134,7 @@ Expected response:
|
|
|
132
134
|
"models": [
|
|
133
135
|
{
|
|
134
136
|
"id": "qwen3.7-max",
|
|
135
|
-
"name": "Qwen 3.7 Max",
|
|
137
|
+
"name": "Qwen 3.7 Max (text)",
|
|
136
138
|
"url": "http://127.0.0.1:3458/v1/chat/completions",
|
|
137
139
|
"toolCalling": true,
|
|
138
140
|
"vision": false,
|
|
@@ -140,7 +142,7 @@ Expected response:
|
|
|
140
142
|
},
|
|
141
143
|
{
|
|
142
144
|
"id": "qwen3.7-plus",
|
|
143
|
-
"name": "Qwen 3.7 Plus",
|
|
145
|
+
"name": "Qwen 3.7 Plus (vision)",
|
|
144
146
|
"url": "http://127.0.0.1:3458/v1/chat/completions",
|
|
145
147
|
"toolCalling": true,
|
|
146
148
|
"vision": true,
|
|
@@ -160,7 +162,7 @@ All can be set in a `.env` file at the repo root (both proxies `import 'dotenv/c
|
|
|
160
162
|
| ---------------------------------------- | ------------------------------------------------------------------------- | -------------------------------------------------- |
|
|
161
163
|
| `QWEN_PROXY_PORT` | `3458` (falls back to `PORT`) | Local listen port |
|
|
162
164
|
| `QWEN_UPSTREAM_URL` | `https://dashscope-intl.aliyuncs.com/compatible-mode/v1/chat/completions` | Upstream DashScope endpoint |
|
|
163
|
-
| `QWEN_PROXY_LOG` | `debug_log/qwen-proxy.ndjson` (relative to
|
|
165
|
+
| `QWEN_PROXY_LOG` | `debug_log/qwen-proxy.ndjson` (relative to proxy script) | Redacted NDJSON log path |
|
|
164
166
|
| `QWEN_PROXY_DISABLE_THINKING_WITH_TOOLS` | `1` | Set to `0` to skip tool-aware thinking suppression |
|
|
165
167
|
|
|
166
168
|
#### Proxy request rewriting rules
|
|
@@ -175,6 +177,8 @@ The proxy detects active tool use by examining the conversation state, not just
|
|
|
175
177
|
> **Why delete rather than set `true`?** Omitting the key lets Qwen use its built-in default (`true`). Deletion is closer to "don't interfere."
|
|
176
178
|
>
|
|
177
179
|
> **Why not check `body.tools`?** The proxy checks for tool _activity_ — tool results in the message history or an explicit `tool_choice` directive — rather than the mere presence of a tools array. This correctly handles tool-enabled conversations even when the client sends `tools` in an earlier request but omits it from subsequent turns.
|
|
180
|
+
>
|
|
181
|
+
> **Proxy vs. direct:** The live config uses the proxy URL with no `requestBody` override so this dynamic behavior is applied to every request. The direct-path snippet above keeps `enable_thinking: false` static in `requestBody` as a no-proxy alternative.
|
|
178
182
|
|
|
179
183
|
### API key
|
|
180
184
|
|
|
@@ -197,6 +201,8 @@ The Qwen3 hybrid-thinking models default to `enable_thinking: true`, producing `
|
|
|
197
201
|
| Proxy path | Thinking ON (default preserved) | Thinking OFF (auto-injected) |
|
|
198
202
|
| No config (default) | Thinking ON | Risk: history may be rejected |
|
|
199
203
|
|
|
204
|
+
> The live `chatLanguageModels.json` uses the proxy path by default, so plain-chat reasoning is visible and tool turns are stable.
|
|
205
|
+
|
|
200
206
|
### Vision (`qwen3.7-plus`)
|
|
201
207
|
|
|
202
208
|
- Image input via OpenAI-compatible `content` array format (base64 data URIs).
|
package/docs/pricing.md
CHANGED
|
@@ -47,6 +47,7 @@ These are the models available through GitHub Copilot's model roster as of June
|
|
|
47
47
|
| **MiMo V2 Flash** | Xiaomi | $0.10 | $0.01 | $0.30 | 256K |
|
|
48
48
|
| **DeepSeek V4 Flash** | DeepSeek | $0.14 | $0.0028 | $0.28 | 1M |
|
|
49
49
|
| **Kimi K2.6** | Moonshot | $0.16 | — | $0.95 (non-thinking) / $4.00 (thinking) | 262K |
|
|
50
|
+
| **Kimi K2.7 Code** | Moonshot | $0.19 | $0.95 | $4.00 | 262K |
|
|
50
51
|
| **Qwen 3.7 Plus** | DashScope | $0.40 (≤256K) / $1.20 (>256K) | — | $1.60 (≤256K) / $4.80 (>256K) | 1M |
|
|
51
52
|
| **MiMo V2.5** | Xiaomi | $0.40 | $0.08 | $2.00 | 1M |
|
|
52
53
|
| **DeepSeek V4 Pro** | DeepSeek | $0.435 | $0.003625 | $0.87 | 1M |
|
|
@@ -86,6 +87,7 @@ For a typical coding session (~10K input + ~2K output tokens per turn, 50 turns)
|
|
|
86
87
|
| Qwen 3.7 Plus | ~$0.36 |
|
|
87
88
|
| MiMo V2.5 | ~$0.40 |
|
|
88
89
|
| Kimi K2.6 (thinking) | ~$0.48 |
|
|
90
|
+
| Kimi K2.7 Code | ~$0.50 |
|
|
89
91
|
| Gemini 3 Flash | ~$0.55 |
|
|
90
92
|
| MiMo V2.5 Pro | ~$0.80 |
|
|
91
93
|
| GPT-5.4 mini | ~$0.83 |
|
package/package.json
CHANGED
package/proxy/kimi-proxy.mjs
CHANGED
|
@@ -8,12 +8,16 @@ import { createProxy } from '../lib/create-proxy.mjs'
|
|
|
8
8
|
* - Validated in this repo with `kimi-k2.6`.
|
|
9
9
|
* - Expected to work for `kimi-k2.5`, because Kimi documents the same fixed
|
|
10
10
|
* sampling and thinking behavior for `kimi-k2.6` / `kimi-k2.5`.
|
|
11
|
+
* - Validated in this repo with `kimi-k2.7-code` (June 14, 2026). K2.7 is
|
|
12
|
+
* always-thinking and rejects `thinking: { type: 'disabled' }`. The proxy
|
|
13
|
+
* detects K2.7 and skips the thinking-disable rewrite while keeping
|
|
14
|
+
* temperature/top_p enforcement.
|
|
11
15
|
* - Not intended for `moonshot-v1` models or non-Kimi providers, because this
|
|
12
16
|
* proxy rewrites requests to K2-family-specific values:
|
|
13
17
|
* - thinking mode temperature = 1.0
|
|
14
18
|
* - non-thinking mode temperature = 0.6
|
|
15
19
|
* - top_p = 0.95
|
|
16
|
-
* - tool-enabled requests force `thinking: { type: 'disabled' }`
|
|
20
|
+
* - tool-enabled requests force `thinking: { type: 'disabled' }` (K2.5/K2.6 only)
|
|
17
21
|
*/
|
|
18
22
|
const upstreamUrl =
|
|
19
23
|
process.env.KIMI_UPSTREAM_URL ?? 'https://api.moonshot.ai/v1/chat/completions'
|
|
@@ -104,6 +108,12 @@ function rewriteKimi(payload) {
|
|
|
104
108
|
const incomingTemperature = payload.temperature
|
|
105
109
|
const incomingTopP = payload.top_p
|
|
106
110
|
const incomingThinkingType = payload?.thinking?.type
|
|
111
|
+
const model = payload.model ?? ''
|
|
112
|
+
|
|
113
|
+
// K2.7 is always-thinking and rejects thinking: disabled.
|
|
114
|
+
// Detect K2.7 variants (e.g. kimi-k2.7-code) and skip the thinking-disable
|
|
115
|
+
// rewrite while keeping temperature/top_p enforcement.
|
|
116
|
+
const isK27 = model.startsWith('kimi-k2.7')
|
|
107
117
|
|
|
108
118
|
// Determine if a tool is actually being invoked:
|
|
109
119
|
// - tool_choice is set and not "none"
|
|
@@ -116,7 +126,7 @@ function rewriteKimi(payload) {
|
|
|
116
126
|
(toolChoice !== undefined && toolChoice !== 'none' && toolChoice !== null)
|
|
117
127
|
const hasTools = hasActiveToolCall
|
|
118
128
|
|
|
119
|
-
const useNonThinkingMode = disableThinkingWithTools && hasTools
|
|
129
|
+
const useNonThinkingMode = !isK27 && disableThinkingWithTools && hasTools
|
|
120
130
|
const rewrittenTemperature = useNonThinkingMode
|
|
121
131
|
? forcedNonThinkingTemperature
|
|
122
132
|
: forcedTemperature
|
|
@@ -134,6 +144,8 @@ function rewriteKimi(payload) {
|
|
|
134
144
|
|
|
135
145
|
const rewrittenThinkingType = payload.thinking?.type
|
|
136
146
|
const rewriteInfo = {
|
|
147
|
+
model,
|
|
148
|
+
isK27,
|
|
137
149
|
incomingTemperature,
|
|
138
150
|
rewrittenTemperature,
|
|
139
151
|
incomingTopP,
|
|
@@ -145,7 +157,8 @@ function rewriteKimi(payload) {
|
|
|
145
157
|
const summary = summarizePayload(payload, hasTools, rewriteInfo)
|
|
146
158
|
|
|
147
159
|
const modeTag = hasTools ? '[tools]' : '[chat]'
|
|
148
|
-
const
|
|
160
|
+
const k27Tag = isK27 ? '[k2.7]' : ''
|
|
161
|
+
const consoleMsg = `${k27Tag}${modeTag} temperature ${String(incomingTemperature)} -> ${String(rewrittenTemperature)}, top_p ${String(incomingTopP)} -> ${String(forcedTopP)}, thinking ${String(incomingThinkingType)} -> ${String(rewrittenThinkingType)}`
|
|
149
162
|
|
|
150
163
|
// Clean up internal key before forwarding
|
|
151
164
|
delete payload.__incomingThinkingType
|