glm-mcp-copilot 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,312 @@
1
+ #!/usr/bin/env node
2
+ // index.js -- GLM MCP server.
3
+ // Exposes GLM (Zhipu/Z.ai) as a delegation subagent for Claude Code via four tools (the three listed below plus glm_agent, which runs GLM as a file-accessing coding agent):
4
+ // glm_delegate -- run a self-contained subtask on GLM and get the result
5
+ // glm_recommend -- advisory: should this task go to GLM or Opus? which model?
6
+ // glm_status -- peak window, model picks, cost multipliers, config sanity
7
+ //
8
+ // Design notes baked in (see docs/research):
9
+ // * "auto" model selection defaults to GLM-5.2 in both windows; since GLM-5.2 carries the
10
+ // ~3x peak surcharge, the router routes less work to GLM during peak.
11
+ // * Calls are serialized through a concurrency gate to respect GLM's ~1-in-flight cap.
12
+ // * Output stays high-signal: a short metadata header + GLM's answer, capped.
13
+
14
+ import "./loadEnv.js";
15
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
16
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
17
+ import { z } from "zod";
18
+ import { glmMessage, config, usageSummary } from "./glmClient.js";
19
+ import {
20
+ resolveModel,
21
+ recommend,
22
+ isPeak,
23
+ chinaHour,
24
+ peakMultiplier,
25
+ estimateCost,
26
+ MODELS,
27
+ resolveMaxTokens,
28
+ MAXTOK,
29
+ USE_HAIKU,
30
+ } from "./router.js";
31
+ import { runGlmAgent } from "./glmAgent.js";
32
+
33
// Maximum number of characters of tool output returned to the caller; applied by clip().
const CHARACTER_LIMIT = 50000;

const server = new McpServer({ name: "glm-mcp", version: "1.0.0" });

// Task categories accepted by glm_recommend's `task_type`, grouped by how well they suit GLM.
const STRONG_GLM_TASKS = [
  "frontend", "ui", "boilerplate", "scaffolding", "config", "crud", "regex",
  "docs", "i18n", "type_lint", "unit_test", "refactor_local", "prototype", "toolcall_single",
];
const MILD_GLM_TASKS = [
  "sql", "etl", "cicd", "cli", "notebook", "integration_test",
  "algorithm", "research", "summarization", "toolcall_fanout",
];
const NEUTRAL_TASKS = ["general", "ml_training"];
const OPUS_LEANING_TASKS = [
  "iac", "dependency_upgrade", "debugging", "code_review", "perf", "api_integration",
  "migration", "systems", "refactor_large", "architecture", "security",
  "agentic_loop", "toolcall_heavy",
];

// Flattened in the same order as the groups above: strong GLM, mild GLM, neutral, lean/strong Opus.
const TASK_TYPES = [
  ...STRONG_GLM_TASKS,
  ...MILD_GLM_TASKS,
  ...NEUTRAL_TASKS,
  ...OPUS_LEANING_TASKS,
];
51
+
52
// ----------------------------- glm_delegate -----------------------------
// Text-only delegation: sends `task` (plus optional `context`) to GLM as a single user message
// and returns GLM's answer prefixed with a token/cost header. Failures are returned as an
// `isError` tool result with suggested next steps rather than thrown.
server.registerTool(
  "glm_delegate",
  {
    title: "Delegate a subtask to GLM",
    description:
      "Text-in/text-out subtask on GLM (~10x cheaper than Opus); returns text only. GLM has no " +
      "file/tool access -- put everything in task+context. For file edits use glm_agent instead. " +
      "Not for sensitive, long-debugging, large-refactor, or parallel work.",
    inputSchema: {
      task: z
        .string()
        .min(1)
        .describe("The instruction for GLM. Be explicit and self-contained, e.g. 'Write a React component that...'."),
      context: z
        .string()
        .optional()
        .describe("Supporting material GLM needs: code to modify, file contents, specs, examples. GLM has no file access."),
      model: z
        .string()
        .optional()
        .describe("Model id or 'auto' (default). e.g. glm-5.2, glm-4.7, glm-4.5-air. 'auto' picks peak-aware."),
      system: z.string().optional().describe("Optional system prompt to steer GLM's role/format."),
      thinking: z.boolean().optional().describe("Enable GLM reasoning mode for harder tasks (slower). Default false."),
      max_tokens: z.number().int().min(256).max(131072).optional().describe("Max output tokens (ceiling; billed for actual). Default generous."),
      format: z.enum(["concise", "detailed"]).optional().describe("concise (default) or detailed metadata."),
    },
    annotations: {
      readOnlyHint: false,
      destructiveHint: false,
      idempotentHint: false,
      openWorldHint: true,
    },
  },
  async ({ task, context, model = "auto", system, thinking = false, max_tokens, format = "concise" }) => {
    // One timestamp for the whole call so model choice and cost use the same peak window.
    const now = new Date();
    const chosen = resolveModel(model, now);

    const userContent = context ? `${task}\n\n--- CONTEXT ---\n${context}` : task;
    try {
      const { text, usage } = await glmMessage({
        model: chosen,
        system,
        messages: [{ role: "user", content: userContent }],
        maxTokens: resolveMaxTokens(max_tokens),
        thinking,
      });

      // A response without a `usage` object must not turn a completed call into a reported
      // failure, so missing counts are treated as 0.
      const inTok = usage?.input_tokens ?? 0;
      const outTok = usage?.output_tokens ?? 0;
      const totalTok = inTok + outTok;
      const cost = estimateCost(chosen, inTok, outTok, now);
      const opusCost = estimateCost("claude-opus", inTok, outTok, now);

      // Every output reports how many tokens were delegated to GLM.
      const tokLine = `[GLM delegated ${totalTok} tokens (${inTok} in / ${outTok} out) to ${chosen} — est $${cost}]`;

      let out;
      if (format === "detailed") {
        // Guard the ratio: either cost may be 0 (e.g. no usage reported).
        out =
          `${tokLine}\n[peak=${isPeak(now)} (CN ${chinaHour(now)}:00) | Opus would be ~$${opusCost}, ` +
          `~${opusCost && cost ? Math.round(opusCost / cost) : "?"}x more]\n\n${text}`;
      } else {
        out = `${tokLine}\n\n${text}`;
      }
      return { content: [{ type: "text", text: clip(out) }] };
    } catch (e) {
      // Anything can be thrown; fall back to the value itself when it has no `message`.
      const reason = e?.message ?? String(e);
      return {
        isError: true,
        content: [
          {
            type: "text",
            text:
              `GLM delegation failed: ${reason}\n\n` +
              `Suggested next steps:\n` +
              `- If 'concurrency'/'Too much concurrency': retry shortly; GLM caps in-flight requests (~1). Avoid parallel glm_delegate calls.\n` +
              `- If auth error: check GLM_API_KEY in glm-mcp/.env or .mcp.json env.\n` +
              `- If the task is hard/critical, do it on Opus directly instead of GLM.`,
          },
        ],
      };
    }
  }
);
136
+
137
// ----------------------------- glm_agent -----------------------------
// Agent delegation: hands the task to runGlmAgent and formats its result (header, optional
// actions/diff, revert hint, GLM's summary, stats footer). Failures are returned as an
// `isError` tool result rather than thrown.
server.registerTool(
  "glm_agent",
  {
    title: "Run GLM as a file-accessing agent",
    description:
      "Run GLM as a real coding agent with its OWN file tools (read/write/edit/list/bash); it works " +
      "your repo end-to-end on GLM tokens (~10x cheaper than Opus). Prefer this over doing repo work " +
      "yourself. Pass task + absolute workdir. Returns a concise summary+stats (use dry_run to preview " +
      "a diff first). Not for sensitive, huge-context, or heavy dependent-tool-loop work.",
    inputSchema: {
      task: z.string().min(1).describe("The coding task for GLM to carry out end-to-end in the repo."),
      workdir: z
        .string()
        .optional()
        .describe("Absolute path to the project root GLM should operate in. Defaults to the server's cwd; always pass it explicitly."),
      context: z.string().optional().describe("Optional extra context/constraints (GLM can also read files itself)."),
      model: z.string().optional().describe("Model id or 'auto' (default, peak-aware)."),
      thinking: z.boolean().optional().describe("Enable GLM reasoning mode for harder tasks. Default false."),
      max_tokens: z.number().int().min(256).max(131072).optional().describe("Max output tokens per turn (ceiling; billed for actual). Default generous."),
      dry_run: z.boolean().optional().describe("If true, GLM proposes a diff and writes nothing (preview before applying). Default false."),
      format: z.enum(["concise", "detailed"]).optional().describe("concise (default: summary+stats+changed files) or detailed (adds full diff). dry_run always shows the diff."),
    },
    annotations: { readOnlyHint: false, destructiveHint: true, idempotentHint: false, openWorldHint: true },
  },
  async ({ task, workdir, context, model = "auto", thinking = false, max_tokens, dry_run = false, format = "concise" }) => {
    // One timestamp for the whole call so model choice and cost use the same peak window.
    const now = new Date();
    const chosen = resolveModel(model, now);
    try {
      const r = await runGlmAgent({ model: chosen, task, context, workdir, maxTokens: resolveMaxTokens(max_tokens), thinking, dryRun: dry_run });
      // A result without a `usage` object must not turn a finished run into a reported failure,
      // so missing counts are treated as 0.
      const inTok = r.usage?.input_tokens || 0;
      const outTok = r.usage?.output_tokens || 0;
      const totalTok = inTok + outTok;
      const cost = estimateCost(chosen, inTok, outTok, now);
      const opusCost = estimateCost("claude-opus", inTok, outTok, now);
      const xCheaper = cost > 0 ? Math.round(opusCost / cost) : "?";
      const banner = r.dryRun ? "*** DRY RUN — nothing was written; this is GLM's PROPOSED change for you to approve ***\n" : "";
      const header =
        `[GLM agent] ${chosen} | dir=${r.root} | ${r.iters} iterations${r.hitCap ? " (HIT CAP -- may be incomplete)" : ""} | ${r.actions.length} actions | ${r.changedFiles.length} files`;
      // Concise by default: keep what the caller (Claude) must read minimal -> more burden stays on
      // GLM. The full diff + per-action list appear only for dry_run or format:"detailed".
      const showDiff = r.dryRun || format === "detailed";
      const actions = (showDiff && r.actions.length) ? `\nActions:\n- ${r.actions.join("\n- ")}` : "";
      const diff = showDiff
        ? (r.diff ? `\n\n=== DIFF (review this) ===\n${r.diff}` : "\n\n(no file changes)")
        : (r.changedFiles.length
          ? `\n\nChanged files: ${r.changedFiles.join(", ")} (call with format:"detailed" or dry_run:true to see the full diff, or use \`git diff\`)`
          : "\n\n(no file changes)");
      // The revert hint is only shown for real (non-dry-run) runs that reported one.
      const revert = !r.dryRun && r.git && r.git.revertHint ? `\n\nRevert: ${r.git.revertHint}` : "";
      // Prominent stats footer, shown after every glm_agent run finishes.
      const stats =
        `\n\n=== GLM STATS (this subagent) ===\n` +
        `model: ${chosen}\n` +
        `tokens: ${totalTok} delegated to GLM (${inTok} in / ${outTok} out)\n` +
        `iterations: ${r.iters}${r.hitCap ? " (hit cap)" : ""} files changed: ${r.changedFiles.length}\n` +
        `est. cost: $${cost} (~${xCheaper}x cheaper than Opus)`;
      return { content: [{ type: "text", text: clip(`${banner}${header}${actions}${diff}${revert}\n\n=== GLM SUMMARY ===\n${r.text}${stats}`) }] };
    } catch (e) {
      // Anything can be thrown; fall back to the value itself when it has no `message`.
      const reason = e?.message ?? String(e);
      return {
        isError: true,
        content: [
          {
            type: "text",
            text:
              `GLM agent failed: ${reason}\n\n` +
              `- 'concurrency'/'Too much concurrency': retry shortly (GLM caps in-flight ~1).\n` +
              `- auth error: check GLM_API_KEY.\n` +
              `- If GLM is looping or the task is hard/critical, run it on Opus instead.`,
          },
        ],
      };
    }
  }
);
211
+
212
// ----------------------------- glm_recommend -----------------------------
// Advisory tool: maps the snake_case tool arguments onto router.recommend()'s camelCase options
// and returns the verdict as pretty-printed JSON.
server.registerTool(
  "glm_recommend",
  {
    title: "Recommend GLM vs Opus for a task",
    description:
      "Free local advisory (no GLM call): given a task profile, returns GLM-vs-Opus, model, " +
      "confidence, and reasons. Call when unsure which engine to use.",
    inputSchema: {
      task_type: z.enum(TASK_TYPES).optional().describe("Closest task category. Default 'general'."),
      complexity: z.enum(["low", "medium", "high"]).optional().describe("Default 'medium'."),
      sensitive: z.boolean().optional().describe("True if proprietary/security-critical (forces Opus)."),
      needs_parallel: z.boolean().optional().describe("True if it needs several concurrent agents (forces Opus)."),
      long_horizon: z.boolean().optional().describe("True if many sequential steps / multi-hour autonomy."),
      latency_sensitive: z.boolean().optional().describe("True if a tight interactive loop (forces Opus)."),
      vision: z.boolean().optional().describe("True if input includes images/screenshots/GUI/computer-use (forces Opus)."),
      input_tokens: z.number().int().optional().describe("Approx context size needed. >128K forces Opus (GLM degrades past ~100K)."),
      steps: z.number().int().optional().describe("Approx number of dependent sequential steps. >20 forces Opus (goal drift)."),
      tool_pattern: z.enum(["none", "single", "fanout", "heavy"]).optional().describe("Tool-use shape: single one-shot call / short independent fanout (GLM-ok) vs heavy dependent agentic loop (forces Opus)."),
      unfamiliar_api: z.boolean().optional().describe("True if it uses a niche/post-cutoff/internal API GLM can't know (-2; paste docs or use Opus)."),
      chinese: z.boolean().optional().describe("True if Chinese or Chinese-English bilingual (GLM strength, +1)."),
    },
    annotations: { readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
  },
  async (profile) => {
    const now = new Date();

    // Omitted arguments stay undefined so recommend() applies its own defaults.
    const routerOptions = {
      taskType: profile.task_type,
      complexity: profile.complexity,
      sensitive: profile.sensitive,
      needsParallel: profile.needs_parallel,
      longHorizon: profile.long_horizon,
      latencySensitive: profile.latency_sensitive,
      vision: profile.vision,
      inputTokens: profile.input_tokens,
      steps: profile.steps,
      toolPattern: profile.tool_pattern,
      unfamiliarApi: profile.unfamiliar_api,
      chinese: profile.chinese,
    };
    const rec = recommend(routerOptions, now);

    // A GLM model id is only reported when the verdict is GLM.
    const glmModel = rec.engine === "glm" ? rec.model : null;
    const body = {
      decision: rec.engine.toUpperCase(),
      glm_model: glmModel,
      confidence: rec.confidence,
      peak_now: isPeak(now),
      china_hour: chinaHour(now),
      reasons: rec.reasons,
    };
    const text = JSON.stringify(body, null, 2);
    return { content: [{ type: "text", text }] };
  }
);
266
+
267
// ----------------------------- glm_status -----------------------------
// Status tool: reports the current peak window, model picks, client config, usage ledger and
// output-token policy as pretty-printed JSON.
server.registerTool(
  "glm_status",
  {
    title: "GLM status & config",
    description:
      "Free local status: peak window, active model, GLM usage ledger (proof of GLM tokens spent), and config health. No GLM call.",
    inputSchema: {},
    annotations: { readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
  },
  async () => {
    const now = new Date();

    let orchestration;
    if (USE_HAIKU) {
      orchestration = "Haiku `glm` subagent allowed (spends some Claude tokens to orchestrate).";
    } else {
      orchestration = "Direct GLM only (GLM_USE_HAIKU=off) -> call glm_agent directly; keeps all tokens on GLM.";
    }

    let note;
    if (config.hasKey) {
      note = "Ready.";
    } else {
      note = "No API key loaded -- set GLM_API_KEY in glm-mcp/.env or .mcp.json env before delegating.";
    }

    // Key order is kept stable because it determines the layout of the JSON output.
    const status = {
      china_hour: chinaHour(now),
      peak_now: isPeak(now),
      flagship_multiplier: `${peakMultiplier(now)}x`,
      auto_model_now: resolveModel("auto", now),
      models: {
        offpeak: MODELS.OFFPEAK_MODELS.join(", "),
        peak: MODELS.PEAK_MODELS.join(", "),
        cheap: MODELS.CHEAP_MODEL,
      },
      base_url: config.BASE_URL,
      api_key_loaded: config.hasKey,
      max_concurrent: config.MAX_CONCURRENT,
      glm_usage_ledger: usageSummary(),
      use_haiku_subagent: USE_HAIKU,
      orchestration,
      max_tokens: {
        cap_enabled: MAXTOK.capEnabled,
        default_per_call: resolveMaxTokens(undefined),
        cap_value_when_on: MAXTOK.capValue,
        hard_ceiling: MAXTOK.uncappedMax,
      },
      note,
    };
    const text = JSON.stringify(status, null, 2);
    return { content: [{ type: "text", text }] };
  }
);
306
+
307
/**
 * Truncate tool output so it does not flood the caller's context.
 *
 * @param {string} s - Text to return to the caller.
 * @param {number} [limit=CHARACTER_LIMIT] - Maximum number of UTF-16 code units of `s` to keep.
 * @returns {string} `s` unchanged when it fits; otherwise its first `limit` code units (one
 *   fewer if the cut would fall inside a surrogate pair) followed by a truncation marker.
 */
function clip(s, limit = CHARACTER_LIMIT) {
  if (s.length <= limit) return s;

  let end = limit;
  // Never cut between the two halves of a surrogate pair: a lone high surrogate is invalid
  // text and renders as a replacement character.
  const lastKept = end > 0 ? s.charCodeAt(end - 1) : 0;
  if (lastKept >= 0xd800 && lastKept <= 0xdbff) end -= 1;

  return s.slice(0, end) + "\n…[truncated to fit context]";
}
310
+
311
// Serve the registered tools over stdio; the top-level await waits for the connection to be set up.
await server.connect(new StdioServerTransport());
@@ -0,0 +1,24 @@
1
+ // loadEnv.js -- zero-dependency .env loader (so we don't pull in dotenv).
2
+ // Loaded first by index.js / smoke.js. Silently no-ops if .env is absent.
3
+ import { readFileSync } from "node:fs";
4
+ import { fileURLToPath } from "node:url";
5
+ import { dirname, join } from "node:path";
6
+
7
const __dirname = dirname(fileURLToPath(import.meta.url));
// The .env file is looked up one directory above this module.
const envPath = join(__dirname, "..", ".env");

/**
 * Parse one line of a .env file.
 *
 * Accepts `KEY=value` with optional whitespace around the key, the `=` and the value. Keys are
 * limited to `A-Z`, `0-9` and `_`. A value wrapped in a matching pair of single or double
 * quotes has the quotes removed. Trailing whitespace after the value is dropped before the
 * quote check, so `KEY="v"   ` yields `v`.
 *
 * @param {string} line - A single line without its line terminator.
 * @returns {[string, string] | null} `[key, value]`, or null when the line is not an assignment
 *   (blank lines, `#` comments, lowercase keys, ...).
 */
function parseEnvLine(line) {
  // The value group is lazy so the trailing `\s*` consumes end-of-line whitespace.
  const m = line.match(/^\s*([A-Z0-9_]+)\s*=\s*(.*?)\s*$/);
  if (!m) return null;

  const key = m[1];
  let val = m[2];
  const quoted =
    (val.startsWith('"') && val.endsWith('"')) || (val.startsWith("'") && val.endsWith("'"));
  if (quoted) val = val.slice(1, -1);
  return [key, val];
}

try {
  const raw = readFileSync(envPath, "utf8");
  for (const line of raw.split(/\r?\n/)) {
    const entry = parseEnvLine(line);
    if (!entry) continue;
    const [key, val] = entry;
    // Variables already present in the real environment take precedence over .env.
    if (process.env[key] === undefined) process.env[key] = val;
  }
} catch (err) {
  // A missing .env is expected: rely on real environment variables. Any other failure (e.g.
  // permissions) is reported on stderr, since stdout carries the MCP stdio transport.
  if (err?.code !== "ENOENT") {
    console.error(`loadEnv: could not read ${envPath}: ${err?.message ?? err}`);
  }
}
@@ -0,0 +1,305 @@
1
+ // router.js
2
+ // Peak-awareness + GLM-vs-Opus decision logic.
3
+ // Pure functions, no I/O, no API calls -> cheap to call, easy to unit test.
4
+ //
5
+ // Facts encoded here come from docs/research:
6
+ // - GLM-5.2 quota multiplier: ~3x peak / ~2x off-peak (1x off-peak under promo).
7
+ // - Peak window: ~14:00-18:00 China time (UTC+8).
8
+ // - GLM-4.7 carries NO multiplier (a cheaper option, not the default).
9
+ // - Concurrency cap is ~1 in-flight even on paid tiers.
10
+
11
// Peak-billing window as China-time (UTC+8) hours. The end hour is exclusive.
const PEAK_START_HOUR_CN = intEnv("GLM_PEAK_START_CN", 14); // 14:00 UTC+8
const PEAK_END_HOUR_CN = intEnv("GLM_PEAK_END_CN", 18); // 18:00 UTC+8 (exclusive)

// Candidate models for "auto". Each variable may hold one model id or a COMMA-SEPARATED LIST;
// with a list, the router picks one per task (most capable for hard / off-peak work, cheapest
// for easy / peak work) unless the caller names a specific model.
// Example: GLM_OFFPEAK_MODEL="glm-5.2,glm-5-turbo"
const {
  GLM_OFFPEAK_MODEL: offpeakModelEnv,
  GLM_PEAK_MODEL: peakModelEnv,
  GLM_CHEAP_MODEL: cheapModelEnv,
} = process.env;
const OFFPEAK_MODELS = splitModels(offpeakModelEnv, "glm-5.2");
const PEAK_MODELS = splitModels(peakModelEnv, "glm-5.2");
const CHEAP_MODEL = cheapModelEnv || "glm-4.5-air";
21
+
22
/**
 * Turn a comma-separated model list into an array of trimmed, non-empty ids.
 *
 * @param {string|undefined} val - Raw setting; when falsy, `fallback` is parsed instead.
 * @param {string} fallback - Default used when `val` is falsy or contains no usable id.
 * @returns {string[]} The parsed ids, or `[fallback]` when nothing usable was found.
 */
function splitModels(val, fallback) {
  const ids = [];
  for (const piece of (val || fallback).split(",")) {
    const id = piece.trim();
    if (id) ids.push(id);
  }
  return ids.length > 0 ? ids : [fallback];
}
26
+
27
// Approximate per-1M-token prices in USD, used for cost estimation only.
// These figures come from research and WILL drift -- treat them as indicative.
const perMillion = (input, output) => ({ in: input, out: output });

const RATES = {
  "glm-5.2": perMillion(0.6, 2.2),
  "glm-5.2[1m]": perMillion(1.2, 4.4),
  "glm-5-turbo": perMillion(0.3, 1.1),
  "glm-4.7": perMillion(0.4, 1.75),
  "glm-4.6": perMillion(0.4, 1.75),
  "glm-4.5": perMillion(0.4, 1.6),
  "glm-4.5-air": perMillion(0.1, 0.6),
  // Opus reference, for comparison output only:
  "claude-opus": perMillion(5.0, 25.0),
};
40
+
41
/**
 * Read an integer setting from the environment.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Returned when the variable is unset, empty, or not parseable.
 * @returns {number} The base-10 integer parsed from the variable, or `fallback`.
 */
function intEnv(name, fallback) {
  const raw = process.env[name] || "";
  const parsed = Number.parseInt(raw, 10);
  if (Number.isFinite(parsed)) return parsed;
  return fallback;
}
45
+
46
/**
 * Read a floating-point setting from the environment.
 *
 * @param {string} name - Environment variable name.
 * @param {number} fallback - Returned when the variable is unset, empty, or not parseable.
 * @returns {number} The number parsed from the variable, or `fallback`.
 */
function numEnv(name, fallback) {
  const raw = process.env[name] || "";
  const parsed = Number.parseFloat(raw);
  if (Number.isFinite(parsed)) return parsed;
  return fallback;
}
50
+
51
/**
 * Read an on/off setting from the environment. The value is trimmed and compared
 * case-insensitively.
 *
 * @param {string} name - Environment variable name.
 * @param {boolean} fallback - Returned when the variable is unset or not a recognised word.
 * @returns {boolean} true for 1/on/true/yes, false for 0/off/false/no, otherwise `fallback`.
 */
function boolEnv(name, fallback) {
  const word = (process.env[name] || "").trim().toLowerCase();
  switch (word) {
    case "1":
    case "on":
    case "true":
    case "yes":
      return true;
    case "0":
    case "off":
    case "false":
    case "no":
      return false;
    default:
      return fallback;
  }
}
57
+
58
// Use the Haiku-orchestrated `glm` subagent? DEFAULT false -> skip Haiku and call GLM directly
// (mcp__glm__glm_agent), so the burden and the tokens stay on GLM (the Haiku subagent's own
// writing would spend Claude tokens). Set GLM_USE_HAIKU=on in .env to allow the subagent path.
export const USE_HAIKU = boolEnv("GLM_USE_HAIKU", false);

// GLM is ~10x cheaper than Opus, so by default GLM carries the overwhelming majority of the
// burden: with GLM_COST_BIAS=7, ~98-100% of tasks route to GLM (measured across all task types,
// peak and off-peak). Opus is used only for what GLM genuinely can't/shouldn't do -- vision,
// parallel fan-out, >128K context, sensitive code, and heavy dependent tool-loops (the hard
// overrides). LOWER GLM_COST_BIAS (e.g. 1.5) if you want Opus to handle more of the hard tasks
// (debugging, architecture, security, big refactors); set 0 to decide on capability alone.
const COST_BIAS = numEnv("GLM_COST_BIAS", 7);

// --- Output token policy ---------------------------------------------------
// max_tokens is a CEILING, not a target: GLM stops when done and you're billed for
// ACTUAL output, so being generous just prevents truncation at no extra cost.
//
// DEFAULT: the cap is OFF -> every call may use up to GLM_MAX_TOKENS_CEILING (131072, generous).
// Turn it ON with GLM_CAP=on to enforce GLM_MAX_TOKENS as a hard limit on every call and clamp
// any larger per-call request down to it -- handy when you want to control spend.
//
// GLM_CAP goes through boolEnv like GLM_USE_HAIKU, so surrounding whitespace in the value
// (e.g. "on " from a hand-edited .env) no longer leaves the cap silently off.
const CAP_ENABLED = boolEnv("GLM_CAP", false);
const CAP_VALUE = intEnv("GLM_MAX_TOKENS", 32768);
const UNCAPPED_MAX = intEnv("GLM_MAX_TOKENS_CEILING", 131072);
81
+
82
/**
 * Resolve the max_tokens value to send, honoring the on/off cap and any per-call request.
 *
 * The active limit is GLM_MAX_TOKENS when the cap is on, otherwise GLM_MAX_TOKENS_CEILING.
 * A finite per-call request is clamped to that limit; with the cap off it previously passed
 * through unclamped, so a lowered ceiling (reported as `hard_ceiling` by glm_status) was not
 * enforced.
 *
 * @param {number} [requested] - Per-call max_tokens; ignored unless it is a finite number.
 * @returns {number} The request clamped to the active limit, or the limit itself when no
 *   finite request was given.
 */
export function resolveMaxTokens(requested) {
  const limit = CAP_ENABLED ? CAP_VALUE : UNCAPPED_MAX;
  return Number.isFinite(requested) ? Math.min(requested, limit) : limit;
}

// Snapshot of the output-token policy, exposed for status reporting.
export const MAXTOK = { capEnabled: CAP_ENABLED, capValue: CAP_VALUE, uncappedMax: UNCAPPED_MAX };
89
+
90
/**
 * Current hour (0-23) in China time (UTC+8), independent of the host time zone.
 *
 * The instant is shifted by +8h and read with getUTCHours(), so the host's zone and DST rules
 * never enter the calculation. The previous getTimezoneOffset()/getHours() round trip applied
 * the host offset of two different instants and was off by one hour when a host DST transition
 * fell between them.
 *
 * @param {Date} [date=new Date()] - Instant to convert.
 * @returns {number} Hour of day in UTC+8.
 */
export function chinaHour(date = new Date()) {
  const CHINA_UTC_OFFSET_MS = 8 * 3600000;
  return new Date(date.getTime() + CHINA_UTC_OFFSET_MS).getUTCHours();
}
96
+
97
/**
 * Report whether `date` falls inside the GLM peak-billing window.
 *
 * The window runs from PEAK_START_HOUR_CN (inclusive) to PEAK_END_HOUR_CN (exclusive) in China
 * hours. When the start hour is greater than the end hour, the window wraps past midnight.
 *
 * @param {Date} [date=new Date()] - Instant to test.
 * @returns {boolean} true when the China hour of `date` is inside the window.
 */
export function isPeak(date = new Date()) {
  const hour = chinaHour(date);
  const atOrAfterStart = hour >= PEAK_START_HOUR_CN;
  const beforeEnd = hour < PEAK_END_HOUR_CN;
  const wrapsMidnight = PEAK_START_HOUR_CN > PEAK_END_HOUR_CN;
  return wrapsMidnight ? atOrAfterStart || beforeEnd : atOrAfterStart && beforeEnd;
}
106
+
107
/**
 * Quota/cost multiplier in effect for the flagship model at `date`.
 *
 * @param {Date} [date=new Date()] - Instant to evaluate.
 * @returns {number} 3 inside the peak window, 2 outside it.
 */
export function peakMultiplier(date = new Date()) {
  if (isPeak(date)) return 3;
  return 2;
}
111
+
112
/**
 * Rough capability score derived from a model id (higher = more capable).
 *
 * The base score is the first number found in the id (0 when there is none). Variant markers
 * lower it: "turbo" by 0.3, "air" by 0.6, "flash" by 0.5 (matched case-insensitively).
 *
 * @param {string} m - Model id, e.g. "glm-4.5-air".
 * @returns {number} The heuristic score.
 */
function modelCapability(m) {
  const versionMatch = m.match(/(\d+(?:\.\d+)?)/);
  let score = parseFloat(versionMatch ? versionMatch[1] : "0");

  const variantPenalties = [
    [/turbo/i, 0.3],
    [/air/i, 0.6],
    [/flash/i, 0.5],
  ];
  for (const [marker, penalty] of variantPenalties) {
    if (marker.test(m)) score -= penalty;
  }
  return score;
}
121
+
122
/**
 * Effective per-call cost proxy for a model: input rate plus output rate, multiplied by the
 * current peak multiplier for ids starting with "glm-5".
 *
 * Rates are looked up by the exact id, then with a `[...]` suffix removed, then fall back to a
 * built-in default.
 *
 * @param {string} m - Model id.
 * @param {Date} date - Instant used to pick the multiplier.
 * @returns {number} Comparable cost figure (not a USD amount for any particular call).
 */
function modelEffCost(m, date) {
  const defaultRate = { in: 0.4, out: 1.75 };
  const rate = RATES[m] || RATES[m.replace(/\[.*?\]/, "")] || defaultRate;
  const combined = rate.in + rate.out;
  return /^glm-5/.test(m) ? combined * peakMultiplier(date) : combined;
}
128
+
129
/**
 * Choose one model from a candidate list.
 *
 * Rules: "high" complexity -> most capable; "low" -> cheapest; anything else -> cheapest during
 * the peak window, most capable outside it. A list with zero or one entry returns its first
 * element (undefined when empty); a falsy `list` is returned as-is.
 *
 * @param {string[]} list - Candidate model ids.
 * @param {string} complexity - "low" | "medium" | "high".
 * @param {Date} date - Instant used for peak detection and cost comparison.
 * @returns {string|undefined} The chosen model id.
 */
function pickFromList(list, complexity, date) {
  if (!list) return list;
  if (list.length <= 1) return list[0];

  // Both rankings sort a copy, so the caller's list keeps its order.
  const mostCapable = () =>
    [...list].sort((a, b) => modelCapability(b) - modelCapability(a))[0];
  const cheapest = () =>
    [...list].sort((a, b) => modelEffCost(a, date) - modelEffCost(b, date))[0];

  switch (complexity) {
    case "high":
      return mostCapable();
    case "low":
      return cheapest();
    default:
      return isPeak(date) ? cheapest() : mostCapable();
  }
}
138
+
139
/**
 * Resolve the model id to use.
 *
 * A specific id is returned unchanged. A falsy value or "auto" selects from the peak or
 * off-peak candidate list (depending on `date`) according to pickFromList's rules.
 *
 * @param {string} [requested] - Model id or "auto".
 * @param {Date} [date=new Date()] - Instant used for peak detection.
 * @param {string} [complexity="medium"] - "low" | "medium" | "high"; only used for "auto".
 * @returns {string} The resolved model id.
 */
export function resolveModel(requested, date = new Date(), complexity = "medium") {
  const wantsAuto = !requested || requested === "auto";
  if (!wantsAuto) return requested;

  const candidates = isPeak(date) ? PEAK_MODELS : OFFPEAK_MODELS;
  return pickFromList(candidates, complexity, date);
}
148
+
149
/**
 * Estimate the USD cost of a call from a model id and token counts.
 *
 * Rates are looked up by the exact id, then with a `[...]` suffix removed (the same lookup
 * modelEffCost uses), then fall back to the glm-4.7 rates. Ids starting with "glm-5" are
 * multiplied by the current peak multiplier; every other id uses 1.
 *
 * @param {string} model - Model id (the special id "claude-opus" gives the Opus reference).
 * @param {number} inputTokens - Input token count; non-finite values count as 0.
 * @param {number} outputTokens - Output token count; non-finite values count as 0.
 * @param {Date} [date=new Date()] - Instant used to pick the multiplier.
 * @returns {number} Estimated cost in USD, rounded to 6 decimal places.
 */
export function estimateCost(model, inputTokens, outputTokens, date = new Date()) {
  const base = RATES[model] || RATES[model.replace(/\[.*?\]/, "")] || RATES["glm-4.7"];
  const mult = model.startsWith("glm-5") ? peakMultiplier(date) : 1; // 4.x = no multiplier
  // Missing usage figures must not turn the estimate into NaN.
  const inTok = Number.isFinite(inputTokens) ? inputTokens : 0;
  const outTok = Number.isFinite(outputTokens) ? outputTokens : 0;
  const usd = ((inTok / 1e6) * base.in + (outTok / 1e6) * base.out) * mult;
  return Math.round(usd * 1e6) / 1e6;
}
157
+
158
/**
 * Decide GLM vs Opus for a task. Pure advisory -- the caller (Claude) acts on it.
 * Encodes the rules synthesized in docs/RULES.md.
 *
 * Flow: a set of hard overrides returns "opus" immediately; otherwise a score is built from the
 * task-type fit, the cost bias, soft signals, complexity and peak timing. A score of 1 or more
 * selects GLM (with an auto-picked model); anything lower selects Opus.
 *
 * @param {object} [opts] - Task profile; null/undefined is treated as an empty profile.
 * @param {string} [opts.taskType="general"] - Task category; unknown categories score 0.
 * @param {"low"|"medium"|"high"} [opts.complexity="medium"]
 * @param {boolean} [opts.sensitive=false] - Proprietary/security-critical code or data.
 * @param {boolean} [opts.needsParallel=false] - Requires several concurrent agents.
 * @param {boolean} [opts.longHorizon=false] - Many sequential steps / multi-hour autonomy.
 * @param {boolean} [opts.latencySensitive=false] - Tight interactive loop.
 * @param {boolean} [opts.vision=false] - Input includes images/screenshots/GUI/computer-use.
 * @param {number} [opts.inputTokens=0] - Approximate context size the task needs.
 * @param {number} [opts.steps=0] - Approximate number of dependent sequential steps.
 * @param {"none"|"single"|"fanout"|"heavy"} [opts.toolPattern="none"] - Tool-use shape.
 * @param {boolean} [opts.unfamiliarApi=false] - Niche/post-cutoff/internal API.
 * @param {boolean} [opts.chinese=false] - Chinese or Chinese-English bilingual task.
 * @param {Date} [date=new Date()] - Instant used for peak detection.
 * @returns {{engine:"glm"|"opus", model:string|null, confidence:number, reasons:string[]}}
 */
export function recommend(opts = {}, date = new Date()) {
  const {
    taskType = "general", // see TASK_FIT below
    complexity = "medium", // "low" | "medium" | "high"
    sensitive = false, // proprietary/security-critical code or data
    needsParallel = false, // requires several concurrent agents
    longHorizon = false, // many sequential steps / multi-hour autonomy
    latencySensitive = false, // tight interactive loop
    // --- conditions surfaced by the scenario research ---
    vision = false, // input includes images/screenshots/GUI/computer-use
    inputTokens = 0, // approx size of context the task needs
    steps = 0, // approx number of dependent sequential steps
    toolPattern = "none", // "none" | "single" | "fanout" | "heavy" (dependent agentic loop)
    unfamiliarApi = false, // niche/post-cutoff/internal API the model can't know
    chinese = false, // Chinese or Chinese-English bilingual task
  } = opts ?? {}; // an explicit null must not throw on destructuring

  const reasons = [];

  // ---- Hard overrides -> Opus, regardless of cost ----
  if (sensitive) {
    return done("opus", null, 0.95, [
      "Sensitive/proprietary: GLM routes through servers in China and Zhipu is on the US Entity List; keep secrets/security-critical work on Opus.",
    ]);
  }
  if (vision) {
    return done("opus", null, 0.9, [
      "Vision input (images/screenshots/GUI): GLM's text endpoints have no native vision in this setup; Opus handles it.",
    ]);
  }
  if (needsParallel) {
    return done("opus", null, 0.85, [
      "Needs parallel/concurrent agents: GLM has a ~1 in-flight concurrency cap that breaks fan-out; Opus handles parallel subagents.",
    ]);
  }
  if (latencySensitive) {
    return done("opus", null, 0.7, [
      "Latency-sensitive loop: GLM is among the slowest frontier coders (~50-100 tok/s).",
    ]);
  }
  if (toolPattern === "heavy") {
    return done("opus", null, 0.88, [
      "Tool-heavy dependent agentic loop: GLM plans-then-acts and depends on reasoning-state passthrough, so it drifts/loops across many dependent tool calls; Opus interleaves thinking with tool use. (One-shot/short independent tool calls are fine on GLM.)",
    ]);
  }
  if (inputTokens > 128000) {
    return done("opus", null, 0.8, [
      `Large context (~${Math.round(inputTokens / 1000)}K tokens): GLM degrades well before its advertised limit (~100K usable); use Opus, or glm-5.2[1m] for pure retrieval/extraction only.`,
    ]);
  }
  if (steps > 20 || (longHorizon && complexity === "high")) {
    return done("opus", null, 0.85, [
      "Long-horizon (>20 dependent steps / sustained single goal): GLM exhibits goal drift; Opus holds the plan.",
    ]);
  }

  // ---- Capability-fit scoring. >0 favors GLM, <0 favors Opus ----
  const TASK_FIT = {
    // Strong GLM: well-specified, single-purpose, cheap-to-verify
    frontend: 2, ui: 2, boilerplate: 2, scaffolding: 2, config: 2, crud: 2,
    regex: 2, docs: 2, i18n: 2, type_lint: 2, unit_test: 2,
    refactor_local: 2, prototype: 2, toolcall_single: 2,
    // Mild GLM
    sql: 1, etl: 1, cicd: 1, cli: 1, notebook: 1, integration_test: 1,
    algorithm: 1, research: 1, summarization: 1, toolcall_fanout: 1,
    // Neutral / toss-up
    general: 0, ml_training: 0,
    // Lean Opus (errors costly or subtle)
    iac: -1, dependency_upgrade: -1,
    debugging: -2, code_review: -2, perf: -2, api_integration: -2,
    migration: -2, systems: -2,
    // Strong Opus
    refactor_large: -3, architecture: -3, security: -3,
    agentic_loop: -3, toolcall_heavy: -3,
  };
  // Own-property check: a taskType such as "constructor" or "toString" would otherwise pick up
  // an inherited Object.prototype member and corrupt the numeric score.
  let score = Object.hasOwn(TASK_FIT, taskType) ? TASK_FIT[taskType] : 0;
  reasons.push(`Task type "${taskType}" capability fit ${score >= 0 ? "+" : ""}${score}.`);

  // Cost is a first-class factor: GLM is ~10x cheaper, so bias toward it. The
  // capability penalties above are what claw hard/risky work back to Opus.
  score += COST_BIAS;
  reasons.push(
    `GLM ~10x cheaper than Opus (still cheaper even at peak) -> cost bias +${COST_BIAS} toward GLM; Opus is the "pay up for quality" exception.`
  );

  // Soft signals from the research.
  if (unfamiliarApi) {
    score -= 2;
    reasons.push("Unfamiliar/niche/post-cutoff API: GLM hallucinates obscure APIs (-2). Paste authoritative docs into the prompt, or use Opus.");
  }
  if (chinese) {
    score += 1;
    reasons.push("Chinese / bilingual task: GLM is a strength here (+1).");
  }
  if (toolPattern === "single") {
    score += 1;
    reasons.push("Single one-shot tool call: GLM's schema adherence is best-in-class (+1).");
  }

  if (complexity === "high") {
    score -= 2;
    reasons.push("High complexity -2 (GLM self-correction is weaker than Opus).");
  } else if (complexity === "low") {
    score += 1;
    reasons.push("Low complexity +1 (well-specified work is GLM's sweet spot).");
  }

  // Cost-timing modifier. Off-peak nudges toward GLM (+0.5). At peak, GLM is penalized only if
  // the model "auto" would pick carries the surcharge (glm-5.x): the penalty is
  // (multiplier - 1) * 0.5, capped at 2. A no-surcharge peak model (e.g. glm-4.7) gets no
  // penalty at all.
  if (!isPeak(date)) {
    score += 0.5;
    reasons.push("Off-peak in China (UTC+8): GLM cheapest now (+0.5).");
  } else {
    const m = resolveModel("auto", date, complexity);
    const mult = peakMultiplier(date);
    const penalty = /^glm-5/.test(m) ? Math.min((mult - 1) * 0.5, 2) : 0;
    if (penalty > 0) {
      score -= penalty;
      reasons.push(`Peak window (UTC+8): "auto" model ${m} costs ~${mult}x now -> -${penalty} toward GLM (route less to GLM at peak).`);
    } else {
      reasons.push(`Peak window (UTC+8): "auto" model ${m} has no peak surcharge -> no GLM penalty (fine to use).`);
    }
  }

  if (score >= 1) {
    return done("glm", resolveModel("auto", date, complexity), clamp(0.5 + score * 0.1), reasons);
  }
  // NOTE(review): Opus confidence grows with |score|, so a score just under the GLM threshold
  // (e.g. 0.9) reports more confidence in Opus than a score of 0 -- confirm this is intended.
  return done("opus", null, clamp(0.5 + Math.abs(score) * 0.1), reasons);

  // Builds the result object; confidence is rounded to two decimals. Hoisted, so the early
  // returns above can call it.
  function done(engine, model, confidence, rs) {
    return { engine, model, confidence: Math.round(confidence * 100) / 100, reasons: rs };
  }
}
300
+
301
/**
 * Restrict a confidence value to the range [0.5, 0.97].
 *
 * @param {number} n - Raw confidence.
 * @returns {number} `n` limited to the range.
 */
function clamp(n) {
  const FLOOR = 0.5;
  const CEILING = 0.97;
  const capped = Math.min(CEILING, n);
  return Math.max(FLOOR, capped);
}
304
+
305
// Model configuration bundle: candidate lists, the cheap model id and the rate table.
export const MODELS = {
  OFFPEAK_MODELS,
  PEAK_MODELS,
  CHEAP_MODEL,
  RATES,
};