@zhihand/mcp 0.17.1 → 0.18.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,6 +15,10 @@ export interface ControlParams {
15
15
  startYRatio?: number;
16
16
  endXRatio?: number;
17
17
  endYRatio?: number;
18
+ appPackage?: string;
19
+ bundleId?: string;
20
+ urlScheme?: string;
21
+ appName?: string;
18
22
  }
19
23
  export interface QueuedControlCommand {
20
24
  type: string;
@@ -9,20 +9,23 @@ export function createControlCommand(params) {
9
9
  return { type: "receive_click", payload: { x: params.xRatio, y: params.yRatio } };
10
10
  case "doubleclick":
11
11
  return { type: "receive_doubleclick", payload: { x: params.xRatio, y: params.yRatio } };
12
+ case "longclick":
13
+ return { type: "receive_longclick", payload: { x: params.xRatio, y: params.yRatio, time: params.durationMs ?? 800 } };
12
14
  case "rightclick":
13
15
  return { type: "receive_rightclick", payload: { x: params.xRatio, y: params.yRatio } };
14
16
  case "middleclick":
15
17
  return { type: "receive_middleclick", payload: { x: params.xRatio, y: params.yRatio } };
16
18
  case "type":
17
- return { type: "receive_type", payload: { text: params.text } };
19
+ return { type: "receive_input", payload: { input: params.text, mode: "auto", submit: false } };
18
20
  case "swipe":
19
21
  return {
20
- type: "receive_swipe",
22
+ type: "receive_slide",
21
23
  payload: {
22
- startX: params.startXRatio,
23
- startY: params.startYRatio,
24
- endX: params.endXRatio,
25
- endY: params.endYRatio,
24
+ x1: params.startXRatio,
25
+ y1: params.startYRatio,
26
+ x2: params.endXRatio,
27
+ y2: params.endYRatio,
28
+ time: params.durationMs ?? 300,
26
29
  },
27
30
  };
28
31
  case "scroll":
@@ -36,12 +39,30 @@ export function createControlCommand(params) {
36
39
  },
37
40
  };
38
41
  case "keycombo":
39
- return { type: "receive_keycombo", payload: { keys: params.keys } };
42
+ return { type: "receive_key_combo", payload: { keys: params.keys } };
43
+ case "back":
44
+ return { type: "receive_back", payload: {} };
45
+ case "home":
46
+ return { type: "receive_home", payload: {} };
47
+ case "enter":
48
+ return { type: "receive_enter", payload: {} };
40
49
  case "clipboard":
41
50
  return {
42
51
  type: "receive_clipboard",
43
52
  payload: { action: params.clipboardAction, text: params.text },
44
53
  };
54
+ case "open_app": {
55
+ const appPayload = {};
56
+ if (params.appPackage)
57
+ appPayload.app_package = params.appPackage;
58
+ if (params.bundleId)
59
+ appPayload.bundle_id = params.bundleId;
60
+ if (params.urlScheme)
61
+ appPayload.url_scheme = params.urlScheme;
62
+ if (params.appName)
63
+ appPayload.app_name = params.appName;
64
+ return { type: "receive_app", payload: appPayload };
65
+ }
45
66
  case "screenshot":
46
67
  return { type: "receive_screenshot", payload: {} };
47
68
  default:
@@ -329,13 +329,35 @@ export function killActiveChild() {
329
329
  function wrapPrompt(userPrompt) {
330
330
  return `You are ZhiHand, an AI assistant connected to the user's mobile phone via MCP tools.
331
331
 
332
- You have the following MCP tools to interact with the phone:
333
- - zhihand_screenshot: Take a screenshot of the phone screen. Use this when the user asks to see, check, or look at their screen.
334
- - zhihand_control: Control the phone — click, type, swipe, scroll, key combos, clipboard, wait. Requires "action" parameter. For clicks, provide xRatio/yRatio (0-1 normalized coordinates).
335
- - zhihand_pair: Pair a new device (rarely needed).
332
+ ## Available MCP Tools
336
333
 
337
- When the user asks you to see their screen, look at something, or check what's on the phone, ALWAYS call zhihand_screenshot first.
338
- When the user asks you to tap, click, type, swipe, or interact with the phone, use zhihand_control.
334
+ ### zhihand_screenshot
335
+ Take a screenshot of the phone screen. Use this when the user asks to see, check, or look at their screen.
336
+
337
+ ### zhihand_control
338
+ Control the phone. Requires "action" parameter. All coordinates use normalized ratios [0,1].
339
+
340
+ **Supported actions:**
341
+ - click: Tap at position. Params: xRatio, yRatio
342
+ - doubleclick: Double tap. Params: xRatio, yRatio
343
+ - longclick: Long press. Params: xRatio, yRatio, durationMs (default 800)
344
+ - type: Type text into focused field. Params: text
345
+ - swipe: Swipe gesture. Params: startXRatio, startYRatio, endXRatio, endYRatio, durationMs (default 300)
346
+ - scroll: Scroll at position. Params: xRatio, yRatio, direction (up/down/left/right), amount (default 3)
347
+ - keycombo: Keyboard shortcut. Params: keys (e.g. "ctrl+c", "alt+tab")
348
+ - back: Press Back button (no params)
349
+ - home: Press Home button (no params)
350
+ - enter: Press Enter key (no params)
351
+ - open_app: Open an app. Params: appPackage (Android, e.g. "com.tencent.mm"), bundleId (iOS), urlScheme (e.g. "weixin://")
352
+ - clipboard: Read/write clipboard. Params: clipboardAction ("get"/"set"), text
353
+ - screenshot: Capture screen via control (same as zhihand_screenshot)
354
+ - wait: Wait before next action. Params: durationMs (default 1000)
355
+
356
+ ## Rules
357
+ - When the user asks to see their screen, ALWAYS call zhihand_screenshot first.
358
+ - When the user asks to open an app (e.g. WeChat, Settings), use open_app action.
359
+ - When the user asks to go back/home, use back/home actions.
360
+ - For all tap/click operations, use xRatio and yRatio (0-1 normalized coordinates based on the screenshot).
339
361
 
340
362
  User message:
341
363
  ${userPrompt}`;
@@ -413,48 +435,16 @@ function dispatchClaude(prompt, startTime, model) {
413
435
  activeChild = child;
414
436
  return collectChildOutput(child, startTime);
415
437
  }
416
- // ── Codex JSONL Output Parser ──────────────────────────────
417
- /** Parse codex JSONL output and extract agent message text. */
418
- function parseCodexJsonl(raw) {
419
- const lines = raw.split("\n").filter(Boolean);
420
- const texts = [];
421
- let hasError = false;
422
- for (const line of lines) {
423
- try {
424
- const event = JSON.parse(line);
425
- const type = String(event.type ?? "");
426
- // Extract text from completed agent messages
427
- if (type === "item.completed") {
428
- const item = event.item;
429
- if (item && typeof item.text === "string" && item.text.trim()) {
430
- texts.push(item.text.trim());
431
- }
432
- }
433
- // Capture errors
434
- if (type === "error") {
435
- const msg = String(event.message ?? "");
436
- if (msg)
437
- texts.push(`Error: ${msg}`);
438
- hasError = true;
439
- }
440
- if (type === "turn.failed") {
441
- hasError = true;
442
- }
443
- }
444
- catch {
445
- // Not valid JSON — skip (truncated line or stderr mixed in)
446
- }
447
- }
448
- if (texts.length > 0) {
449
- return { text: texts.join("\n\n"), success: !hasError };
450
- }
451
- return { text: raw.trim(), success: false };
452
- }
438
+ /**
439
+ * Collect codex JSONL output with streaming line parsing.
440
+ * Processes each JSONL line as it arrives so we extract agent text
441
+ * without buffering large binary payloads (e.g. base64 screenshots).
442
+ */
453
443
  function collectCodexOutput(child, startTime) {
454
444
  return new Promise((resolve) => {
455
- const chunks = [];
456
- let totalBytes = 0;
457
- let truncated = false;
445
+ const texts = [];
446
+ let hasError = false;
447
+ let lineBuffer = "";
458
448
  let settled = false;
459
449
  function settle(result) {
460
450
  if (settled)
@@ -462,36 +452,59 @@ function collectCodexOutput(child, startTime) {
462
452
  settled = true;
463
453
  resolve(result);
464
454
  }
465
- const timer = setTimeout(() => { closeChild(child); }, CLI_TIMEOUT);
466
- const collectOutput = (data) => {
467
- if (truncated)
455
+ function processLine(line) {
456
+ if (!line.trim())
468
457
  return;
469
- totalBytes += data.length;
470
- if (totalBytes > MAX_OUTPUT_BYTES) {
471
- truncated = true;
472
- chunks.push(data.subarray(0, MAX_OUTPUT_BYTES - (totalBytes - data.length)));
458
+ try {
459
+ const event = JSON.parse(line);
460
+ const type = String(event.type ?? "");
461
+ if (type === "item.completed") {
462
+ const item = event.item;
463
+ if (item && typeof item.text === "string" && item.text.trim()) {
464
+ texts.push(item.text.trim());
465
+ }
466
+ }
467
+ if (type === "error") {
468
+ const msg = String(event.message ?? "");
469
+ if (msg)
470
+ texts.push(`Error: ${msg}`);
471
+ hasError = true;
472
+ }
473
+ if (type === "turn.failed") {
474
+ hasError = true;
475
+ }
473
476
  }
474
- else {
475
- chunks.push(data);
477
+ catch {
478
+ // Not valid JSON — skip
479
+ }
480
+ }
481
+ const timer = setTimeout(() => { closeChild(child); }, CLI_TIMEOUT);
482
+ const onData = (data) => {
483
+ lineBuffer += data.toString("utf8");
484
+ const lines = lineBuffer.split("\n");
485
+ // Keep the last (possibly incomplete) line in the buffer
486
+ lineBuffer = lines.pop() ?? "";
487
+ for (const line of lines) {
488
+ processLine(line);
476
489
  }
477
490
  };
478
- child.stdout?.on("data", collectOutput);
479
- child.stderr?.on("data", collectOutput);
491
+ child.stdout?.on("data", onData);
492
+ // stderr is not JSONL, just discard
493
+ child.stderr?.resume();
480
494
  child.on("close", (code) => {
481
495
  clearTimeout(timer);
482
496
  activeChild = null;
497
+ // Process any remaining data in the buffer
498
+ if (lineBuffer.trim())
499
+ processLine(lineBuffer);
483
500
  const durationMs = Date.now() - startTime;
484
- const raw = Buffer.concat(chunks).toString("utf8").trim();
485
- const parsed = parseCodexJsonl(raw);
486
- let text = parsed.text;
487
- if (truncated)
488
- text += "\n\n[Output truncated at 100KB]";
501
+ let text = texts.join("\n\n");
489
502
  if (!text) {
490
503
  text = code === 0
491
504
  ? "Task completed (no output)."
492
505
  : `CLI process exited with code ${code}.`;
493
506
  }
494
- settle({ text, success: parsed.success && code === 0, durationMs });
507
+ settle({ text, success: !hasError && code === 0, durationMs });
495
508
  });
496
509
  child.on("error", (err) => {
497
510
  clearTimeout(timer);
package/dist/index.js CHANGED
@@ -5,7 +5,7 @@ import { controlSchema, screenshotSchema, pairSchema } from "./tools/schemas.js"
5
5
  import { executeControl } from "./tools/control.js";
6
6
  import { handleScreenshot } from "./tools/screenshot.js";
7
7
  import { handlePair } from "./tools/pair.js";
8
- const PACKAGE_VERSION = "0.17.1";
8
+ const PACKAGE_VERSION = "0.18.0";
9
9
  export function createServer(deviceName) {
10
10
  const server = new McpServer({
11
11
  name: "zhihand",
@@ -1,6 +1,6 @@
1
1
  import { z } from "zod";
2
2
  export declare const controlSchema: {
3
- action: z.ZodEnum<["click", "doubleclick", "rightclick", "middleclick", "type", "swipe", "scroll", "keycombo", "clipboard", "wait", "screenshot"]>;
3
+ action: z.ZodEnum<["click", "doubleclick", "longclick", "rightclick", "middleclick", "type", "swipe", "scroll", "keycombo", "back", "home", "enter", "clipboard", "open_app", "wait", "screenshot"]>;
4
4
  xRatio: z.ZodOptional<z.ZodNumber>;
5
5
  yRatio: z.ZodOptional<z.ZodNumber>;
6
6
  text: z.ZodOptional<z.ZodString>;
@@ -13,6 +13,10 @@ export declare const controlSchema: {
13
13
  startYRatio: z.ZodOptional<z.ZodNumber>;
14
14
  endXRatio: z.ZodOptional<z.ZodNumber>;
15
15
  endYRatio: z.ZodOptional<z.ZodNumber>;
16
+ appPackage: z.ZodOptional<z.ZodString>;
17
+ bundleId: z.ZodOptional<z.ZodString>;
18
+ urlScheme: z.ZodOptional<z.ZodString>;
19
+ appName: z.ZodOptional<z.ZodString>;
16
20
  };
17
21
  export declare const screenshotSchema: {};
18
22
  export declare const pairSchema: {
@@ -1,9 +1,10 @@
1
1
  import { z } from "zod";
2
2
  export const controlSchema = {
3
3
  action: z.enum([
4
- "click", "doubleclick", "rightclick", "middleclick",
4
+ "click", "doubleclick", "longclick", "rightclick", "middleclick",
5
5
  "type", "swipe", "scroll", "keycombo",
6
- "clipboard",
6
+ "back", "home", "enter",
7
+ "clipboard", "open_app",
7
8
  "wait", "screenshot",
8
9
  ]),
9
10
  xRatio: z.number().min(0).max(1).optional().describe("Normalized horizontal position [0,1]"),
@@ -13,11 +14,15 @@ export const controlSchema = {
13
14
  amount: z.number().int().positive().default(3).optional().describe("Scroll steps (default 3)"),
14
15
  keys: z.string().optional().describe("Key combo string, e.g. 'ctrl+c', 'alt+tab'"),
15
16
  clipboardAction: z.enum(["get", "set"]).optional().describe("Clipboard action"),
16
- durationMs: z.number().int().positive().max(10000).default(1000).optional().describe("Duration in ms for wait (default 1000, max 10000)"),
17
+ durationMs: z.number().int().positive().max(10000).default(1000).optional().describe("Duration in ms for wait, longclick, or swipe (default 1000, max 10000)"),
17
18
  startXRatio: z.number().min(0).max(1).optional().describe("Swipe start X [0,1]"),
18
19
  startYRatio: z.number().min(0).max(1).optional().describe("Swipe start Y [0,1]"),
19
20
  endXRatio: z.number().min(0).max(1).optional().describe("Swipe end X [0,1]"),
20
21
  endYRatio: z.number().min(0).max(1).optional().describe("Swipe end Y [0,1]"),
22
+ appPackage: z.string().optional().describe("Android package name, e.g. 'com.tencent.mm'"),
23
+ bundleId: z.string().optional().describe("iOS bundle ID, e.g. 'com.tencent.xin'"),
24
+ urlScheme: z.string().optional().describe("URL scheme, e.g. 'weixin://'"),
25
+ appName: z.string().optional().describe("Human-readable app name (for logging)"),
21
26
  };
22
27
  export const screenshotSchema = {};
23
28
  export const pairSchema = {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@zhihand/mcp",
3
- "version": "0.17.1",
3
+ "version": "0.18.0",
4
4
  "private": false,
5
5
  "type": "module",
6
6
  "description": "ZhiHand MCP Server — phone control tools for Claude Code, Codex, Gemini CLI, and OpenClaw",