glm-mcp-claude 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,306 @@
1
+ #!/usr/bin/env node
2
+ // index.js -- GLM MCP server.
3
+ // Exposes GLM (Zhipu/Z.ai) as a delegation subagent for Claude Code via four tools:
4
+ // glm_delegate -- run a self-contained subtask on GLM and get the result
+ // glm_agent -- run GLM as a file-accessing coding agent inside a working directory
5
+ // glm_recommend -- advisory: should this task go to GLM or Opus? which model?
6
+ // glm_status -- peak window, model picks, cost multipliers, config sanity
7
+ //
8
+ // Design notes baked in (see docs/research):
9
+ // * "auto" model selection defaults to GLM-5.2 in both windows; since GLM-5.2 carries the
10
+ // ~3x peak surcharge, the router routes less work to GLM during peak.
11
+ // * Calls are serialized through a concurrency gate to respect GLM's ~1-in-flight cap.
12
+ // * Output stays high-signal: a short metadata header + GLM's answer, capped.
13
+
14
+ import "./loadEnv.js";
15
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
16
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
17
+ import { z } from "zod";
18
+ import { glmMessage, config } from "./glmClient.js";
19
+ import {
20
+ resolveModel,
21
+ recommend,
22
+ isPeak,
23
+ chinaHour,
24
+ peakMultiplier,
25
+ estimateCost,
26
+ MODELS,
27
+ resolveMaxTokens,
28
+ MAXTOK,
29
+ } from "./router.js";
30
+ import { runGlmAgent } from "./glmAgent.js";
31
+
32
// Longest tool result (in UTF-16 code units) that clip() passes through untruncated.
const CHARACTER_LIMIT = 50_000;

// The MCP server instance every tool in this file is registered on.
const serverInfo = { name: "glm-mcp", version: "1.0.0" };
const server = new McpServer(serverInfo);
35
+
36
// Task categories offered by the glm_recommend `task_type` enum, grouped by how
// strongly each one leans toward GLM or Opus. Order is preserved in TASK_TYPES.
const STRONG_GLM_TASKS = [
  "frontend", "ui", "boilerplate", "scaffolding", "config", "crud", "regex",
  "docs", "i18n", "type_lint", "unit_test", "refactor_local", "prototype", "toolcall_single",
];
const MILD_GLM_TASKS = [
  "sql", "etl", "cicd", "cli", "notebook", "integration_test",
  "algorithm", "research", "summarization", "toolcall_fanout",
];
const NEUTRAL_TASKS = ["general", "ml_training"];
const OPUS_LEANING_TASKS = [
  "iac", "dependency_upgrade", "debugging", "code_review", "perf", "api_integration",
  "migration", "systems", "refactor_large", "architecture", "security",
  "agentic_loop", "toolcall_heavy",
];

const TASK_TYPES = [
  ...STRONG_GLM_TASKS,
  ...MILD_GLM_TASKS,
  ...NEUTRAL_TASKS,
  ...OPUS_LEANING_TASKS,
];
50
+
51
+ // ----------------------------- glm_delegate -----------------------------
52
// Tool description shown to the client; joined without separators so the pieces
// read as one paragraph.
const delegateDescription = [
  "Run a SELF-CONTAINED subtask on the GLM model (Zhipu/Z.ai) and return its result. ",
  "Use this to offload cheap, well-specified work (frontend/UI, boilerplate, scaffolding, ",
  "CRUD, local refactors, docs, summarization, algorithmic codegen) from Opus and save cost. ",
  "GLM cannot call Claude's tools, so pass everything it needs in `task` + `context` ",
  "(paste the relevant code/specs). It returns text only. ",
  "Model defaults to 'auto' (GLM-5.2; since GLM-5.2 costs ~3x at China peak, the router routes less to GLM during peak). ",
  "Do NOT use for security-sensitive/proprietary code, subtle long debugging, large multi-step ",
  "refactors, or anything needing parallel agents -- keep those on Opus (call glm_recommend if unsure).",
].join("");

// Argument schema for glm_delegate. Only `task` is required.
const delegateInputSchema = {
  task: z
    .string()
    .min(1)
    .describe("The instruction for GLM. Be explicit and self-contained, e.g. 'Write a React component that...'."),
  context: z
    .string()
    .optional()
    .describe("Supporting material GLM needs: code to modify, file contents, specs, examples. GLM has no file access."),
  model: z
    .string()
    .optional()
    .describe("Model id or 'auto' (default). e.g. glm-5.2, glm-4.7, glm-4.5-air. 'auto' picks peak-aware."),
  system: z.string().optional().describe("Optional system prompt to steer GLM's role/format."),
  thinking: z.boolean().optional().describe("Enable GLM reasoning mode for harder tasks (slower). Default false."),
  max_tokens: z.number().int().min(256).max(131072).optional().describe("Max output tokens for this call (a ceiling, not a target — you pay for actual output). By default the cap is OFF (up to 131072, generous). Set GLM_CAP=on in .env to enforce GLM_MAX_TOKENS instead."),
  format: z
    .enum(["concise", "detailed"])
    .optional()
    .describe("'concise' (default) = answer + 1-line meta. 'detailed' = full cost/usage/peak metadata."),
};

/** Build the error result returned when a delegation attempt throws. */
function delegateFailure(e) {
  const lines = [
    `GLM delegation failed: ${e.message}`,
    "",
    "Suggested next steps:",
    "- If 'concurrency'/'Too much concurrency': retry shortly; GLM caps in-flight requests (~1). Avoid parallel glm_delegate calls.",
    "- If auth error: check GLM_API_KEY in glm-mcp/.env or .mcp.json env.",
    "- If the task is hard/critical, do it on Opus directly instead of GLM.",
  ];
  return { isError: true, content: [{ type: "text", text: lines.join("\n") }] };
}

/**
 * glm_delegate handler: sends `task` (plus optional `context`) to GLM as a single
 * user message and returns GLM's text prefixed with a token/cost header.
 * Any error raised while calling GLM or formatting the reply is reported as an
 * `isError` result rather than thrown.
 */
async function handleDelegate({ task, context, model = "auto", system, thinking = false, max_tokens, format = "concise" }) {
  const now = new Date();
  const chosen = resolveModel(model, now);
  const prompt = context ? `${task}\n\n--- CONTEXT ---\n${context}` : task;

  try {
    const reply = await glmMessage({
      model: chosen,
      system,
      messages: [{ role: "user", content: prompt }],
      maxTokens: resolveMaxTokens(max_tokens),
      thinking,
    });
    const answer = reply.text;

    const inTok = reply.usage.input_tokens ?? 0;
    const outTok = reply.usage.output_tokens ?? 0;
    const totalTok = inTok + outTok;
    const cost = estimateCost(chosen, inTok, outTok, now);
    const opusCost = estimateCost("claude-opus", inTok, outTok, now);

    // Every output reports how many tokens were delegated to GLM.
    const headerLines = [
      `[GLM delegated ${totalTok} tokens (${inTok} in / ${outTok} out) to ${chosen} — est $${cost}]`,
    ];
    if (format === "detailed") {
      const peak = isPeak(now);
      const hour = chinaHour(now);
      const ratio = opusCost && cost ? Math.round(opusCost / cost) : "?";
      headerLines.push(`[peak=${peak} (CN ${hour}:00) | Opus would be ~$${opusCost}, ~${ratio}x more]`);
    }

    const out = `${headerLines.join("\n")}\n\n${answer}`;
    return { content: [{ type: "text", text: clip(out) }] };
  } catch (e) {
    return delegateFailure(e);
  }
}

server.registerTool(
  "glm_delegate",
  {
    title: "Delegate a subtask to GLM",
    description: delegateDescription,
    inputSchema: delegateInputSchema,
    annotations: {
      readOnlyHint: false,
      destructiveHint: false,
      idempotentHint: false,
      openWorldHint: true,
    },
  },
  handleDelegate
);
143
+
144
+ // ----------------------------- glm_agent -----------------------------
145
// Registers glm_agent: runs GLM through runGlmAgent() against `workdir` and returns
// a report made of a usage/cost header, the action list, the diff, an optional
// revert hint, and GLM's own summary. Failures are returned as `isError` results.
server.registerTool(
  "glm_agent",
  {
    title: "Run GLM as a file-accessing agent",
    description:
      "Run GLM as a REAL coding agent with direct filesystem access. Unlike glm_delegate " +
      "(text-in/text-out), this gives GLM its own tools -- read_file, write_file, edit_file, " +
      "list_dir, run_bash -- and loops, executing GLM's tool calls against your repo until it " +
      "finishes. Use this to hand GLM a self-contained coding task it should carry out end-to-end " +
      "(inspect files, make edits, run tests) at ~10x lower cost than Opus. " +
      "PASS `workdir` = the absolute path of the project/repo to work in. " +
      "Best for bounded, well-specified work; for long dependent agentic loops, large refactors, " +
      "or sensitive correctness-critical changes, prefer Opus (see glm_recommend).",
    inputSchema: {
      task: z.string().min(1).describe("The coding task for GLM to carry out end-to-end in the repo."),
      workdir: z
        .string()
        .optional()
        .describe("Absolute path to the project root GLM should operate in. Defaults to the server's cwd; always pass it explicitly."),
      context: z.string().optional().describe("Optional extra context/constraints (GLM can also read files itself)."),
      model: z.string().optional().describe("Model id or 'auto' (default, peak-aware)."),
      thinking: z.boolean().optional().describe("Enable GLM reasoning mode for harder tasks. Default false."),
      max_tokens: z.number().int().min(256).max(131072).optional().describe("Max output tokens per turn (a ceiling, not a target — you pay for actual output). By default the cap is OFF (up to 131072, generous). Set GLM_CAP=on in .env to enforce GLM_MAX_TOKENS instead."),
      dry_run: z
        .boolean()
        .optional()
        .describe("If true, GLM PROPOSES changes (returns a diff) and writes NOTHING to disk -- for Opus to review/approve before a real apply pass. bash is disabled. Default false."),
    },
    annotations: { readOnlyHint: false, destructiveHint: true, idempotentHint: false, openWorldHint: true },
  },
  async ({ task, workdir, context, model = "auto", thinking = false, max_tokens, dry_run = false }) => {
    const now = new Date();
    const chosen = resolveModel(model, now);
    try {
      const r = await runGlmAgent({ model: chosen, task, context, workdir, maxTokens: resolveMaxTokens(max_tokens), thinking, dryRun: dry_run });

      // Normalize the token counts ONCE and use them everywhere. Previously the raw
      // (possibly missing) values went to estimateCost while the header used `|| 0`,
      // so a missing usage field produced "est $NaN" next to "0 in / 0 out".
      const inTok = r.usage.input_tokens || 0;
      const outTok = r.usage.output_tokens || 0;
      const totalTok = inTok + outTok;
      const cost = estimateCost(chosen, inTok, outTok, now);

      const banner = r.dryRun ? "*** DRY RUN — nothing was written; this is GLM's PROPOSED change for you to approve ***\n" : "";
      const header =
        `[GLM agent] delegated ${totalTok} tokens (${inTok} in / ${outTok} out) to ${chosen} — est $${cost} | ` +
        `dir=${r.root} | iterations=${r.iters}${r.hitCap ? " (HIT CAP -- may be incomplete)" : ""} | actions=${r.actions.length} | files=${r.changedFiles.length}`;
      const actions = r.actions.length ? `\nActions:\n- ${r.actions.join("\n- ")}` : "";
      const diff = r.diff ? `\n\n=== DIFF (review this) ===\n${r.diff}` : "\n\n(no file changes)";
      // The revert hint is only shown for real (non-dry-run) runs that reported one.
      const revert = !r.dryRun && r.git && r.git.revertHint ? `\n\nRevert: ${r.git.revertHint}` : "";
      return { content: [{ type: "text", text: clip(`${banner}${header}${actions}${diff}${revert}\n\n=== GLM SUMMARY ===\n${r.text}`) }] };
    } catch (e) {
      return {
        isError: true,
        content: [
          {
            type: "text",
            text:
              `GLM agent failed: ${e.message}\n\n` +
              `- 'concurrency'/'Too much concurrency': retry shortly (GLM caps in-flight ~1).\n` +
              `- auth error: check GLM_API_KEY.\n` +
              `- If GLM is looping or the task is hard/critical, run it on Opus instead.`,
          },
        ],
      };
    }
  }
);
207
+
208
+ // ----------------------------- glm_recommend -----------------------------
209
// Argument schema for glm_recommend; every field is optional.
const recommendInputSchema = {
  task_type: z.enum(TASK_TYPES).optional().describe("Closest task category. Default 'general'. Note tool-calling splits into toolcall_single/toolcall_fanout (GLM-ok) vs toolcall_heavy/agentic_loop (Opus)."),
  complexity: z.enum(["low", "medium", "high"]).optional().describe("Default 'medium'."),
  sensitive: z.boolean().optional().describe("True if proprietary/security-critical (forces Opus)."),
  needs_parallel: z.boolean().optional().describe("True if it needs several concurrent agents (forces Opus)."),
  long_horizon: z.boolean().optional().describe("True if many sequential steps / multi-hour autonomy."),
  latency_sensitive: z.boolean().optional().describe("True if a tight interactive loop (forces Opus)."),
  vision: z.boolean().optional().describe("True if input includes images/screenshots/GUI/computer-use (forces Opus)."),
  input_tokens: z.number().int().optional().describe("Approx context size needed. >128K forces Opus (GLM degrades past ~100K)."),
  steps: z.number().int().optional().describe("Approx number of dependent sequential steps. >20 forces Opus (goal drift)."),
  tool_pattern: z.enum(["none", "single", "fanout", "heavy"]).optional().describe("Tool-use shape: single one-shot call / short independent fanout (GLM-ok) vs heavy dependent agentic loop (forces Opus)."),
  unfamiliar_api: z.boolean().optional().describe("True if it uses a niche/post-cutoff/internal API GLM can't know (-2; paste docs or use Opus)."),
  chinese: z.boolean().optional().describe("True if Chinese or Chinese-English bilingual (GLM strength, +1)."),
};

/**
 * glm_recommend handler: maps the snake_case tool arguments onto the camelCase
 * options recommend() expects and returns its verdict as pretty-printed JSON.
 */
async function handleRecommend(args) {
  const now = new Date();
  const profile = {
    taskType: args.task_type,
    complexity: args.complexity,
    sensitive: args.sensitive,
    needsParallel: args.needs_parallel,
    longHorizon: args.long_horizon,
    latencySensitive: args.latency_sensitive,
    vision: args.vision,
    inputTokens: args.input_tokens,
    steps: args.steps,
    toolPattern: args.tool_pattern,
    unfamiliarApi: args.unfamiliar_api,
    chinese: args.chinese,
  };
  const rec = recommend(profile, now);

  // A GLM model id is only reported when GLM is the chosen engine.
  const glmModel = rec.engine === "glm" ? rec.model : null;
  const body = {
    decision: rec.engine.toUpperCase(),
    glm_model: glmModel,
    confidence: rec.confidence,
    peak_now: isPeak(now),
    china_hour: chinaHour(now),
    reasons: rec.reasons,
  };
  return { content: [{ type: "text", text: JSON.stringify(body, null, 2) }] };
}

server.registerTool(
  "glm_recommend",
  {
    title: "Recommend GLM vs Opus for a task",
    description:
      "Cheap, no-API advisory. Given a task profile, returns whether to route to GLM or Opus, " +
      "which GLM model to use, a confidence score, and reasons. Call this before delegating when " +
      "unsure. It factors in task type, complexity, sensitivity, parallelism needs, and the current " +
      "China peak-billing window. Runs locally -- no tokens spent on GLM.",
    inputSchema: recommendInputSchema,
    annotations: { readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
  },
  handleRecommend
);
264
+
265
+ // ----------------------------- glm_status -----------------------------
266
/**
 * glm_status handler: snapshots the router's view of "now" (China hour, peak flag,
 * multiplier, auto model) together with client config and the max-token policy,
 * and returns it as pretty-printed JSON.
 */
async function handleStatus() {
  const now = new Date();

  const china_hour = chinaHour(now);
  const peak_now = isPeak(now);
  const flagship_multiplier = `${peakMultiplier(now)}x`;
  const auto_model_now = resolveModel("auto", now);

  const models = {
    offpeak: MODELS.OFFPEAK_MODELS.join(", "),
    peak: MODELS.PEAK_MODELS.join(", "),
    cheap: MODELS.CHEAP_MODEL,
  };

  const max_tokens = {
    cap_enabled: MAXTOK.capEnabled,
    default_per_call: resolveMaxTokens(undefined),
    cap_value_when_on: MAXTOK.capValue,
    hard_ceiling: MAXTOK.uncappedMax,
  };

  let note;
  if (config.hasKey) {
    note = "Ready.";
  } else {
    note = "No API key loaded -- set GLM_API_KEY in glm-mcp/.env or .mcp.json env before delegating.";
  }

  const status = {
    china_hour,
    peak_now,
    flagship_multiplier,
    auto_model_now,
    models,
    base_url: config.BASE_URL,
    api_key_loaded: config.hasKey,
    max_concurrent: config.MAX_CONCURRENT,
    max_tokens,
    note,
  };
  return { content: [{ type: "text", text: JSON.stringify(status, null, 2) }] };
}

server.registerTool(
  "glm_status",
  {
    title: "GLM status & config",
    description:
      "Report current peak window, peak-aware model picks, cost multiplier, and config sanity " +
      "(base URL, whether an API key is loaded, concurrency cap). No GLM tokens spent.",
    inputSchema: {},
    annotations: { readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
  },
  handleStatus
);
300
+
301
/**
 * Truncate `s` so its kept portion is at most CHARACTER_LIMIT UTF-16 code units,
 * appending a marker line when anything was cut. Strings within the limit are
 * returned unchanged.
 *
 * @param {string} s - text to bound
 * @returns {string} `s`, or its truncated prefix followed by the truncation marker
 */
function clip(s) {
  if (s.length <= CHARACTER_LIMIT) return s;

  let end = CHARACTER_LIMIT;
  // If the cut would land between the two halves of a surrogate pair, drop the
  // dangling high surrogate so the output never contains a lone surrogate.
  const lastUnit = s.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;

  return s.slice(0, end) + "\n…[truncated to fit context]";
}
304
+
305
// Connect the server over a stdio transport; top-level await keeps the module
// pending until connect() resolves.
await server.connect(new StdioServerTransport());
@@ -0,0 +1,24 @@
1
+ // loadEnv.js -- zero-dependency .env loader (so we don't pull in dotenv).
2
+ // Loaded first by index.js / smoke.js. Silently no-ops if .env is absent.
3
+ import { readFileSync } from "node:fs";
4
+ import { fileURLToPath } from "node:url";
5
+ import { dirname, join } from "node:path";
6
+
7
const __dirname = dirname(fileURLToPath(import.meta.url));
const envPath = join(__dirname, "..", ".env");

/**
 * Parse one line of a .env file.
 *
 * Accepts `KEY=value` where KEY is made of A-Z, 0-9 and `_`. Whitespace around the
 * key, around `=`, and at the END of the value is ignored. A value wrapped in a
 * matching pair of single or double quotes has the quotes removed.
 *
 * @param {string} line - a single line, without its line terminator
 * @returns {[string, string] | null} `[key, value]`, or null if the line is not an assignment
 */
function parseEnvLine(line) {
  const m = line.match(/^\s*([A-Z0-9_]+)\s*=(.*)$/);
  if (!m) return null;
  // trim() also removes trailing whitespace, which the old greedy `(.*)\s*$`
  // pattern left inside the value (and which stopped quoted values being unquoted).
  let val = m[2].trim();
  const quote = val[0];
  // Require length >= 2 so a lone quote character is not treated as an empty quoted string.
  if (val.length >= 2 && (quote === '"' || quote === "'") && val.endsWith(quote)) {
    val = val.slice(1, -1);
  }
  return [m[1], val];
}

try {
  // Strip a UTF-8 BOM so the first key in the file still matches.
  const raw = readFileSync(envPath, "utf8").replace(/^\uFEFF/, "");
  for (const line of raw.split(/\r?\n/)) {
    const entry = parseEnvLine(line);
    if (!entry) continue;
    const [key, val] = entry;
    // Real environment variables win over .env entries.
    if (process.env[key] === undefined) process.env[key] = val;
  }
} catch (err) {
  // No .env present -- rely on real environment variables. Any OTHER failure
  // (permissions, path is a directory, ...) is reported on stderr so a present but
  // unreadable .env does not silently look like "no API key"; stdout is left
  // untouched because index.js uses it for the stdio transport.
  if (!err || err.code !== "ENOENT") {
    console.error(`[loadEnv] could not read ${envPath}: ${err && err.message ? err.message : err}`);
  }
}
@@ -0,0 +1,291 @@
1
+ // router.js
2
+ // Peak-awareness + GLM-vs-Opus decision logic.
3
+ // Pure functions, no I/O, no API calls -> cheap to call, easy to unit test.
4
+ //
5
+ // Facts encoded here come from docs/research:
6
+ // - GLM-5.2 quota multiplier: ~3x peak / ~2x off-peak (1x off-peak under promo).
7
+ // - Peak window: ~14:00-18:00 China time (UTC+8).
8
+ // - GLM-4.7 carries NO multiplier (a cheaper option, not the default).
9
+ // - Concurrency cap is ~1 in-flight even on paid tiers.
10
+
11
+ const PEAK_START_HOUR_CN = intEnv("GLM_PEAK_START_CN", 14); // 14:00 UTC+8
12
+ const PEAK_END_HOUR_CN = intEnv("GLM_PEAK_END_CN", 18); // 18:00 UTC+8 (exclusive)
13
+
14
// Candidate models used when a caller asks for "auto". Each env var may hold one
// model id or a comma-separated list; when a list is given, the router chooses one
// entry per task unless a specific model is requested.
// Example: GLM_OFFPEAK_MODEL="glm-5.2,glm-5-turbo"
const OFFPEAK_MODELS = splitModels(process.env.GLM_OFFPEAK_MODEL, "glm-5.2");
const PEAK_MODELS = splitModels(process.env.GLM_PEAK_MODEL, "glm-5.2");
const CHEAP_MODEL = process.env.GLM_CHEAP_MODEL || "glm-4.5-air";

/**
 * Split a comma-separated model list into trimmed, non-empty ids.
 * Uses `fallback` as the source text when `val` is empty/unset, and returns
 * `[fallback]` when splitting yields no ids at all.
 */
function splitModels(val, fallback) {
  const source = val || fallback;
  const ids = [];
  for (const piece of source.split(",")) {
    const id = piece.trim();
    if (id) ids.push(id);
  }
  return ids.length > 0 ? ids : [fallback];
}
26
+
27
+ // Rough public-ish per-1M-token rates (USD) for cost estimation only.
28
+ // These are approximations from research and WILL drift -- treat as indicative.
29
+ const RATES = {
30
+ "glm-5.2": { in: 0.6, out: 2.2 },
31
+ "glm-5.2[1m]": { in: 1.2, out: 4.4 },
32
+ "glm-5-turbo": { in: 0.3, out: 1.1 },
33
+ "glm-4.7": { in: 0.4, out: 1.75 },
34
+ "glm-4.6": { in: 0.4, out: 1.75 },
35
+ "glm-4.5": { in: 0.4, out: 1.6 },
36
+ "glm-4.5-air": { in: 0.1, out: 0.6 },
37
+ // Opus reference, for comparison output only:
38
+ "claude-opus": { in: 5.0, out: 25.0 },
39
+ };
40
+
41
/**
 * Read environment variable `name` as a base-10 integer.
 * Returns `fallback` when the variable is unset, empty, or does not start with digits.
 */
function intEnv(name, fallback) {
  const raw = process.env[name] || "";
  const parsed = parseInt(raw, 10);
  if (!Number.isFinite(parsed)) {
    return fallback;
  }
  return parsed;
}
45
+
46
/**
 * Read environment variable `name` as a floating-point number.
 * Returns `fallback` when the variable is unset, empty, unparseable, or not finite.
 */
function numEnv(name, fallback) {
  const raw = process.env[name] || "";
  const parsed = parseFloat(raw);
  if (!Number.isFinite(parsed)) {
    return fallback;
  }
  return parsed;
}
50
+
51
+ // GLM is ~10x cheaper than Opus (and still ~3-4x cheaper even at peak), so the
52
+ // correct default is GLM unless quality/risk justifies paying up for Opus.
53
+ // This is a standing thumb on the scale toward GLM. Raise GLM_COST_BIAS to be more
54
+ // aggressive about cost; set 0 to ignore price and decide on capability alone.
55
+ const COST_BIAS = numEnv("GLM_COST_BIAS", 1.5);
56
+
57
+ // --- Output token policy ---------------------------------------------------
58
+ // max_tokens is a CEILING, not a target: GLM stops when done and you're billed for
59
+ // ACTUAL output, so being generous just prevents truncation at no extra cost.
60
+ //
61
+ // DEFAULT: the cap is OFF -> every call may use up to GLM_MAX_TOKENS_CEILING (131072, generous).
62
+ // Turn it ON with GLM_CAP=on to enforce GLM_MAX_TOKENS as a hard limit on every call and clamp
63
+ // any larger per-call request down to it -- handy when you want to control spend.
64
+ const CAP_ENABLED = /^(1|on|true|yes)$/i.test(process.env.GLM_CAP || "off");
65
+ const CAP_VALUE = intEnv("GLM_MAX_TOKENS", 32768);
66
+ const UNCAPPED_MAX = intEnv("GLM_MAX_TOKENS_CEILING", 131072);
67
+
68
/**
 * Resolve the max_tokens value to send for a call.
 *
 * The effective limit is CAP_VALUE when the cap is enabled and UNCAPPED_MAX
 * otherwise. With no (or a non-finite) request the limit itself is returned.
 * A per-call request is floored to an integer, raised to at least 1, and clamped
 * to the limit -- so the configured ceiling is honored even when the cap is off,
 * matching the `hard_ceiling` value the status tool reports.
 *
 * @param {number} [requested] - per-call max_tokens asked for by the caller
 * @returns {number} the max_tokens value to use
 */
export function resolveMaxTokens(requested) {
  const limit = CAP_ENABLED ? CAP_VALUE : UNCAPPED_MAX;
  if (!Number.isFinite(requested)) return limit;
  const wanted = Math.max(1, Math.floor(requested));
  return Math.min(wanted, limit);
}
74
+ export const MAXTOK = { capEnabled: CAP_ENABLED, capValue: CAP_VALUE, uncappedMax: UNCAPPED_MAX };
75
+
76
/**
 * Hour of day (0-23) in China time (UTC+8) for `date`, independent of the host
 * time zone.
 *
 * Computed directly from the UTC hour. China observes no DST, so UTC+8 is a fixed
 * offset. The previous approach shifted the timestamp by the host's
 * getTimezoneOffset() and read the LOCAL hour, which is off by one whenever the
 * shifted instant falls on the other side of a host DST transition.
 *
 * @param {Date} [date=new Date()] - instant to convert
 * @returns {number} hour in UTC+8 (NaN for an invalid Date)
 */
export function chinaHour(date = new Date()) {
  return (date.getUTCHours() + 8) % 24;
}
82
+
83
/**
 * Whether `date` falls inside the peak window [PEAK_START_HOUR_CN, PEAK_END_HOUR_CN)
 * measured in China hours. When start > end the window is treated as wrapping
 * past midnight.
 */
export function isPeak(date = new Date()) {
  const hour = chinaHour(date);
  const atOrAfterStart = hour >= PEAK_START_HOUR_CN;
  const beforeEnd = hour < PEAK_END_HOUR_CN;
  const wrapsMidnight = PEAK_START_HOUR_CN > PEAK_END_HOUR_CN;
  if (wrapsMidnight) {
    return atOrAfterStart || beforeEnd;
  }
  return atOrAfterStart && beforeEnd;
}
92
+
93
/** Multiplier applied to the flagship model at `date`: 3 inside the peak window, 2 outside it. */
export function peakMultiplier(date = new Date()) {
  if (isPeak(date)) {
    return 3;
  }
  return 2;
}
97
+
98
/**
 * Rough capability score for a model id (higher = more capable): the first number
 * found in the id, minus a fixed deduction for each of the "turbo", "air" and
 * "flash" markers it contains. Ids with no number start from 0.
 */
function modelCapability(m) {
  const versionMatch = m.match(/(\d+(?:\.\d+)?)/);
  const version = parseFloat((versionMatch || [])[1] || "0");
  // Applied in this order so the floating-point result matches exactly.
  const deductions = [
    [/turbo/i, 0.3],
    [/air/i, 0.6],
    [/flash/i, 0.5],
  ];
  return deductions.reduce(
    (score, [marker, amount]) => (marker.test(m) ? score - amount : score),
    version
  );
}
107
+
108
/**
 * Cost proxy used to rank models: input rate + output rate, multiplied by the
 * current peak multiplier for ids starting with "glm-5". Rates are looked up by
 * exact id, then with a "[...]" suffix removed, then a built-in default.
 */
function modelEffCost(m, date) {
  const defaultRate = { in: 0.4, out: 1.75 };
  const rate = RATES[m] || RATES[m.replace(/\[.*?\]/, "")] || defaultRate;
  const carriesMultiplier = /^glm-5/.test(m);
  const multiplier = carriesMultiplier ? peakMultiplier(date) : 1;
  const combinedRate = rate.in + rate.out;
  return combinedRate * multiplier;
}
114
+
115
/**
 * Choose one model from `list`.
 * - empty/one-element/missing list: returns the first element (or the falsy list itself)
 * - "high" complexity: the most capable candidate
 * - "low" complexity: the cheapest candidate
 * - anything else: cheapest during peak, most capable off-peak
 */
function pickFromList(list, complexity, date) {
  if (!list || list.length <= 1) return list && list[0];

  const byCapabilityDesc = list.slice().sort((a, b) => modelCapability(b) - modelCapability(a));
  const byCostAsc = list.slice().sort((a, b) => modelEffCost(a, date) - modelEffCost(b, date));
  const mostCapable = byCapabilityDesc[0];
  const cheapest = byCostAsc[0];

  switch (complexity) {
    case "high":
      return mostCapable;
    case "low":
      return cheapest;
    default:
      return isPeak(date) ? cheapest : mostCapable;
  }
}
124
+
125
/**
 * Turn a requested model into a concrete model id.
 * Any truthy value other than "auto" is returned unchanged. Otherwise one model is
 * picked from the peak or off-peak candidate list (depending on `date`) according
 * to `complexity`.
 */
export function resolveModel(requested, date = new Date(), complexity = "medium") {
  const wantsAuto = !requested || requested === "auto";
  if (!wantsAuto) {
    return requested;
  }
  const candidates = isPeak(date) ? PEAK_MODELS : OFFPEAK_MODELS;
  return pickFromList(candidates, complexity, date);
}
134
+
135
/**
 * Estimate the USD cost of a call from per-1M-token rates.
 *
 * Rates are looked up by exact model id, then with a "[...]" suffix removed (the
 * same fallback modelEffCost uses), then default to the "glm-4.7" entry. Ids
 * starting with "glm-5" are multiplied by the current peak multiplier; all other
 * ids use 1. Missing or non-finite token counts are treated as 0 so the estimate
 * is never NaN.
 *
 * @param {string} model - model id
 * @param {number} inputTokens - prompt tokens used
 * @param {number} outputTokens - completion tokens used
 * @param {Date} [date=new Date()] - instant used to decide the peak multiplier
 * @returns {number} estimated cost in USD, rounded to 6 decimal places
 */
export function estimateCost(model, inputTokens, outputTokens, date = new Date()) {
  const id = String(model ?? "");
  const base = RATES[id] || RATES[id.replace(/\[.*?\]/, "")] || RATES["glm-4.7"];
  const mult = id.startsWith("glm-5") ? peakMultiplier(date) : 1; // 4.x = no multiplier
  const inTok = Number.isFinite(inputTokens) ? inputTokens : 0;
  const outTok = Number.isFinite(outputTokens) ? outputTokens : 0;
  const usd = ((inTok / 1e6) * base.in + (outTok / 1e6) * base.out) * mult;
  return Math.round(usd * 1e6) / 1e6;
}
143
+
144
// Capability-fit score per task type: >0 favors GLM, <0 favors Opus.
// Built once at module load instead of on every recommend() call.
const TASK_FIT = Object.freeze({
  // Strong GLM: well-specified, single-purpose, cheap-to-verify
  frontend: 2, ui: 2, boilerplate: 2, scaffolding: 2, config: 2, crud: 2,
  regex: 2, docs: 2, i18n: 2, type_lint: 2, unit_test: 2,
  refactor_local: 2, prototype: 2, toolcall_single: 2,
  // Mild GLM
  sql: 1, etl: 1, cicd: 1, cli: 1, notebook: 1, integration_test: 1,
  algorithm: 1, research: 1, summarization: 1, toolcall_fanout: 1,
  // Neutral / toss-up
  general: 0, ml_training: 0,
  // Lean Opus (errors costly or subtle)
  iac: -1, dependency_upgrade: -1,
  debugging: -2, code_review: -2, perf: -2, api_integration: -2,
  migration: -2, systems: -2,
  // Strong Opus
  refactor_large: -3, architecture: -3, security: -3,
  agentic_loop: -3, toolcall_heavy: -3,
});

/**
 * Decide GLM vs Opus for a task. Pure advisory -- the caller acts on it.
 *
 * Flow:
 *  1. Hard overrides return "opus" immediately (sensitive, vision, parallel,
 *     latency-sensitive, heavy tool loop, >128000 input tokens, >20 steps or
 *     long-horizon + high complexity).
 *  2. Otherwise a score is built from the task-type fit, COST_BIAS, soft signals
 *     (unfamiliar API, Chinese, single tool call), complexity, and a peak/off-peak
 *     timing modifier.
 *  3. score >= 1 -> "glm" with the auto-resolved model; anything lower -> "opus".
 *
 * Unknown task types -- including names that only exist on Object.prototype, such
 * as "constructor" -- score 0 rather than picking up an inherited value.
 *
 * @param {object} [opts] - task profile; every field is optional
 * @param {Date} [date=new Date()] - instant used for the peak-window checks
 * @returns {{engine:"glm"|"opus", model:string|null, confidence:number, reasons:string[]}}
 */
export function recommend(opts = {}, date = new Date()) {
  const {
    taskType = "general", // key into TASK_FIT
    complexity = "medium", // "low" | "medium" | "high"
    sensitive = false, // proprietary/security-critical code or data
    needsParallel = false, // requires several concurrent agents
    longHorizon = false, // many sequential steps / multi-hour autonomy
    latencySensitive = false, // tight interactive loop
    vision = false, // input includes images/screenshots/GUI/computer-use
    inputTokens = 0, // approx size of context the task needs
    steps = 0, // approx number of dependent sequential steps
    toolPattern = "none", // "none" | "single" | "fanout" | "heavy" (dependent agentic loop)
    unfamiliarApi = false, // niche/post-cutoff/internal API the model can't know
    chinese = false, // Chinese or Chinese-English bilingual task
  } = opts;

  // Package a verdict, rounding confidence to two decimals.
  const done = (engine, model, confidence, rs) => ({
    engine,
    model,
    confidence: Math.round(confidence * 100) / 100,
    reasons: rs,
  });

  // ---- Hard overrides -> Opus, regardless of cost ----
  if (sensitive) {
    return done("opus", null, 0.95, [
      "Sensitive/proprietary: GLM routes through servers in China and Zhipu is on the US Entity List; keep secrets/security-critical work on Opus.",
    ]);
  }
  if (vision) {
    return done("opus", null, 0.9, [
      "Vision input (images/screenshots/GUI): GLM's text endpoints have no native vision in this setup; Opus handles it.",
    ]);
  }
  if (needsParallel) {
    return done("opus", null, 0.85, [
      "Needs parallel/concurrent agents: GLM has a ~1 in-flight concurrency cap that breaks fan-out; Opus handles parallel subagents.",
    ]);
  }
  if (latencySensitive) {
    return done("opus", null, 0.7, [
      "Latency-sensitive loop: GLM is among the slowest frontier coders (~50-100 tok/s).",
    ]);
  }
  if (toolPattern === "heavy") {
    return done("opus", null, 0.88, [
      "Tool-heavy dependent agentic loop: GLM plans-then-acts and depends on reasoning-state passthrough, so it drifts/loops across many dependent tool calls; Opus interleaves thinking with tool use. (One-shot/short independent tool calls are fine on GLM.)",
    ]);
  }
  if (inputTokens > 128000) {
    return done("opus", null, 0.8, [
      `Large context (~${Math.round(inputTokens / 1000)}K tokens): GLM degrades well before its advertised limit (~100K usable); use Opus, or glm-5.2[1m] for pure retrieval/extraction only.`,
    ]);
  }
  if (steps > 20 || (longHorizon && complexity === "high")) {
    return done("opus", null, 0.85, [
      "Long-horizon (>20 dependent steps / sustained single goal): GLM exhibits goal drift; Opus holds the plan.",
    ]);
  }

  // ---- Capability-fit scoring. >0 favors GLM, <0 favors Opus ----
  const reasons = [];
  // Own-property check: a plain `TASK_FIT[taskType]` would resolve inherited keys
  // (e.g. "constructor") to functions and turn the score into a string/NaN.
  const knownType = Object.prototype.hasOwnProperty.call(TASK_FIT, taskType);
  let score = knownType ? TASK_FIT[taskType] : 0;
  reasons.push(`Task type "${taskType}" capability fit ${score >= 0 ? "+" : ""}${score}.`);

  // Cost is a first-class factor: bias toward GLM. The capability penalties are
  // what pull hard/risky work back to Opus.
  score += COST_BIAS;
  reasons.push(
    `GLM ~10x cheaper than Opus (still cheaper even at peak) -> cost bias +${COST_BIAS} toward GLM; Opus is the "pay up for quality" exception.`
  );

  // Soft signals.
  if (unfamiliarApi) {
    score -= 2;
    reasons.push("Unfamiliar/niche/post-cutoff API: GLM hallucinates obscure APIs (-2). Paste authoritative docs into the prompt, or use Opus.");
  }
  if (chinese) {
    score += 1;
    reasons.push("Chinese / bilingual task: GLM is a strength here (+1).");
  }
  if (toolPattern === "single") {
    score += 1;
    reasons.push("Single one-shot tool call: GLM's schema adherence is best-in-class (+1).");
  }

  if (complexity === "high") {
    score -= 2;
    reasons.push("High complexity -2 (GLM self-correction is weaker than Opus).");
  } else if (complexity === "low") {
    score += 1;
    reasons.push("Low complexity +1 (well-specified work is GLM's sweet spot).");
  }

  // Cost-timing modifier. Off-peak nudges toward GLM. At peak, GLM is penalized
  // only if the model "auto" resolves to carries the surcharge (id starts with
  // "glm-5"); the penalty scales with the multiplier and is capped at 2.
  if (!isPeak(date)) {
    score += 0.5;
    reasons.push("Off-peak in China (UTC+8): GLM cheapest now (+0.5).");
  } else {
    const m = resolveModel("auto", date, complexity);
    const mult = peakMultiplier(date);
    const penalty = /^glm-5/.test(m) ? Math.min((mult - 1) * 0.5, 2) : 0;
    if (penalty > 0) {
      score -= penalty;
      reasons.push(`Peak window (UTC+8): "auto" model ${m} costs ~${mult}x now -> -${penalty} toward GLM (route less to GLM at peak).`);
    } else {
      reasons.push(`Peak window (UTC+8): "auto" model ${m} has no peak surcharge -> no GLM penalty (fine to use).`);
    }
  }

  if (score >= 1) {
    return done("glm", resolveModel("auto", date, complexity), clamp(0.5 + score * 0.1), reasons);
  }
  return done("opus", null, clamp(0.5 + Math.abs(score) * 0.1), reasons);
}
286
+
287
/** Bound a confidence value to the range [0.5, 0.97]. */
function clamp(n) {
  const cappedAbove = Math.min(0.97, n);
  return Math.max(0.5, cappedAbove);
}
290
+
291
+ export const MODELS = { OFFPEAK_MODELS, PEAK_MODELS, CHEAP_MODEL, RATES };
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env node
2
+ // smoke.js -- quick offline + online sanity check. Run: npm run smoke
3
+ // Offline checks always run. The live API call runs only if a key is present.
4
+ import "./loadEnv.js";
5
+ import { isPeak, chinaHour, resolveModel, recommend, estimateCost } from "./router.js";
6
+ import { glmMessage, config } from "./glmClient.js";
7
+
8
// Compact JSON rendering used for the recommend() samples below.
const asJson = (value) => JSON.stringify(value);

/** Offline checks: print config and router output; no network access. */
function printOfflineChecks() {
  console.log("=== GLM MCP smoke test ===");
  console.log("base_url:", config.BASE_URL);
  console.log("api_key_loaded:", config.hasKey);
  console.log("china_hour:", chinaHour(), "peak_now:", isPeak());
  console.log("auto model now:", resolveModel("auto"));
  console.log("recommend(frontend/low):", asJson(recommend({ taskType: "frontend", complexity: "low" })));
  console.log("recommend(architecture/high):", asJson(recommend({ taskType: "architecture", complexity: "high" })));
  console.log("recommend(sensitive):", asJson(recommend({ sensitive: true })));
  console.log("est cost glm-5.2 (1k in / 2k out):", `$${estimateCost("glm-5.2", 1000, 2000)}`);
}

/** Live check: one tiny GLM request; exits with status 1 if it throws. */
async function runLiveCheck() {
  console.log("\nLive call to GLM…");
  try {
    const reply = await glmMessage({
      model: resolveModel("auto"),
      messages: [{ role: "user", content: "Reply with exactly: GLM_OK" }],
      maxTokens: 32,
    });
    console.log("response:", reply.text);
    console.log("usage:", asJson(reply.usage));
    if (reply.text.includes("GLM_OK")) {
      console.log("\n✅ Live call OK.");
    } else {
      console.log("\n⚠️ Live call returned unexpected text.");
    }
  } catch (e) {
    console.error("\n❌ Live call failed:", e.message);
    process.exit(1);
  }
}

printOfflineChecks();

if (!config.hasKey) {
  console.log("\nNo API key -> skipping live call. Offline checks passed.");
  process.exit(0);
}

await runLiveCheck();