@midscene/shared 1.9.1 → 1.9.2-beta-20260605084246.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,6 +7,7 @@ import {
7
7
  unwrapZodField,
8
8
  } from '../zod-schema-utils';
9
9
  import { getErrorMessage } from './error-formatter';
10
+ import type { ToolDefaults } from './tool-defaults';
10
11
  import type {
11
12
  ActionSpaceItem,
12
13
  BaseAgent,
@@ -290,6 +291,62 @@ function normalizeActionArgs(
290
291
  );
291
292
  }
292
293
 
294
/**
 * Fill a single locate object with default values, never overriding a value
 * the caller supplied explicitly.
 *
 * A key counts as "explicitly set" when its value in `locate` is anything
 * other than `undefined`, so `null` and `false` are preserved as-is.
 * `deepThink` is a deprecated alias for `deepLocate`: when the caller set
 * `deepThink`, the `deepLocate` default is skipped as well.
 *
 * @param locate - Caller-provided locate object. Never mutated.
 * @param defaults - Values to apply for keys the caller left unset.
 * @returns `locate` itself when no default was applied; otherwise a shallow
 *   copy of `locate` with the missing defaults filled in.
 */
function mergeLocateDefaults(
  locate: Record<string, unknown>,
  defaults: Record<string, unknown>,
): Record<string, unknown> {
  // Created lazily so the "nothing to add" case returns the same reference.
  let merged: Record<string, unknown> | undefined;

  for (const [key, value] of Object.entries(defaults)) {
    const explicitlySet = locate[key] !== undefined;
    // An explicit `deepThink` already expresses the caller's `deepLocate` choice.
    const setViaAlias = key === 'deepLocate' && locate.deepThink !== undefined;
    if (explicitlySet || setViaAlias) {
      continue;
    }

    if (merged === undefined) {
      merged = { ...locate };
    }
    merged[key] = value;
  }

  return merged ?? locate;
}
316
+
317
/**
 * Apply `locateDefaults` to every locator-typed field present in an action's
 * args.
 *
 * An entry of `args` receives defaults only when all of the following hold:
 * the action's param schema declares a field under the same key, that field
 * passes `isMidsceneLocatorField`, and the arg value passes `isRecord`. Every
 * other entry is carried over unchanged. The default keys themselves are
 * handled generically by `mergeLocateDefaults`, so adding a new default needs
 * no change here.
 *
 * @param args - Action arguments keyed by parameter name.
 * @param paramSchema - The action's parameter schema, if it has one.
 * @param locateDefaults - Defaults to merge into each locator-typed arg.
 * @returns `args` itself when there is no schema, no defaults, or no object
 *   shape can be obtained from the schema; otherwise a new object with the
 *   same keys as `args`.
 */
function applyLocateDefaults(
  args: Record<string, unknown>,
  paramSchema: z.ZodTypeAny | undefined,
  locateDefaults: Record<string, unknown>,
): Record<string, unknown> {
  if (!paramSchema || Object.keys(locateDefaults).length === 0) {
    return args;
  }

  const shape = getZodObjectShape(paramSchema);
  if (!shape) {
    return args;
  }

  const withDefaults = Object.entries(args).map(
    ([key, value]): [string, unknown] => {
      const fieldSchema = shape[key] as z.ZodTypeAny | undefined;
      const isLocateObject =
        fieldSchema && isMidsceneLocatorField(fieldSchema) && isRecord(value);
      return [
        key,
        isLocateObject ? mergeLocateDefaults(value, locateDefaults) : value,
      ];
    },
  );

  // Object.fromEntries defines own data properties, so unusual keys such as
  // `__proto__` are copied as plain entries rather than touching the prototype.
  return Object.fromEntries(withDefaults);
}
349
+
293
350
  /**
294
351
  * Serialize args to human-readable description for AI action
295
352
  */
@@ -490,6 +547,7 @@ export function generateToolsFromActionSpace(
490
547
  ) => args,
491
548
  initArgSchema: ToolSchema = {},
492
549
  initArgCliMetadata?: ToolCliMetadata,
550
+ toolDefaults: ToolDefaults = {},
493
551
  ): ToolDefinition[] {
494
552
  return actionSpace.map((action) => {
495
553
  const schema = {
@@ -505,10 +563,17 @@ export function generateToolsFromActionSpace(
505
563
  handler: async (args: Record<string, unknown>) => {
506
564
  try {
507
565
  const agent = await getAgent(args);
508
- const normalizedArgs = normalizeActionArgs(
566
+ let normalizedArgs = normalizeActionArgs(
509
567
  sanitizeArgs(args),
510
568
  action.paramSchema,
511
569
  );
570
+ if (toolDefaults.locate) {
571
+ normalizedArgs = applyLocateDefaults(
572
+ normalizedArgs,
573
+ action.paramSchema,
574
+ toolDefaults.locate,
575
+ );
576
+ }
512
577
  let actionResult: unknown;
513
578
 
514
579
  try {
@@ -553,6 +618,7 @@ export function generateCommonTools(
553
618
  getAgent: (args?: Record<string, unknown>) => Promise<BaseAgent>,
554
619
  initArgSchema: ToolSchema = {},
555
620
  initArgCliMetadata?: ToolCliMetadata,
621
+ toolDefaults: ToolDefaults = {},
556
622
  ): ToolDefinition[] {
557
623
  return [
558
624
  {
@@ -597,6 +663,18 @@ export function generateCommonTools(
597
663
  .describe(
598
664
  'Natural language description of the action to perform, e.g. "press Command+Space, type Safari, press Enter"',
599
665
  ),
666
+ deepLocate: z
667
+ .boolean()
668
+ .optional()
669
+ .describe(
670
+ 'Use deep locate for every element this action targets. Improves precision for small or ambiguous targets at the cost of speed. Defaults to the server --deep-locate setting.',
671
+ ),
672
+ deepThink: z
673
+ .boolean()
674
+ .optional()
675
+ .describe(
676
+ 'Plan this action with deep thinking (richer context and sub-goal decomposition). Helps with complex multi-step instructions at the cost of speed. Defaults to the server --deep-think setting.',
677
+ ),
600
678
  ...initArgSchema,
601
679
  },
602
680
  cli: mergeToolCliMetadata(undefined, initArgCliMetadata),
@@ -609,7 +687,19 @@ export function generateCommonTools(
609
687
  if (!agent.aiAction) {
610
688
  return createErrorResult('act is not supported by this agent');
611
689
  }
612
- const result = await agent.aiAction(prompt, { deepThink: false });
690
+ // Start from the act defaults (deepThink off), overlay the server
691
+ // tool defaults, then let explicit per-call args win.
692
+ const actOptions: Record<string, unknown> = {
693
+ deepThink: false,
694
+ ...toolDefaults.act,
695
+ };
696
+ if (args.deepLocate !== undefined) {
697
+ actOptions.deepLocate = args.deepLocate;
698
+ }
699
+ if (args.deepThink !== undefined) {
700
+ actOptions.deepThink = args.deepThink;
701
+ }
702
+ const result = await agent.aiAction(prompt, actOptions);
613
703
  return await captureScreenshotResult(agent, 'act', result);
614
704
  } catch (error: unknown) {
615
705
  const errorMessage = getErrorMessage(error);
@@ -628,6 +718,12 @@ export function generateCommonTools(
628
718
  .describe(
629
719
  'Natural language assertion to verify, e.g. "there is a login button visible"',
630
720
  ),
721
+ message: z
722
+ .string()
723
+ .optional()
724
+ .describe(
725
+ 'Custom error message to throw when the assertion fails, e.g. "the login button should be visible".',
726
+ ),
631
727
  ...promptInputExtraSchema,
632
728
  ...initArgSchema,
633
729
  },
@@ -636,6 +732,7 @@ export function generateCommonTools(
636
732
  args: Record<string, unknown> = {},
637
733
  ): Promise<ToolResult> => {
638
734
  const prompt = args.prompt as string;
735
+ const message = args.message as string | undefined;
639
736
  try {
640
737
  const agent = await getAgent(args);
641
738
  if (!agent.aiAssert) {
@@ -647,7 +744,7 @@ export function generateCommonTools(
647
744
  imageName: args.imageName,
648
745
  convertHttpImage2Base64: args.convertHttpImage2Base64,
649
746
  });
650
- await agent.aiAssert(userPrompt);
747
+ await agent.aiAssert(userPrompt, message);
651
748
  return {
652
749
  content: [{ type: 'text', text: 'Assertion passed.' }],
653
750
  };
package/src/mcp/types.ts CHANGED
@@ -1,5 +1,6 @@
1
1
  import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
2
2
  import type { z } from 'zod';
3
+ import type { ToolDefaults } from './tool-defaults';
3
4
 
4
5
  // Avoid circular dependency: don't import from @midscene/core
5
6
  // Instead, use generic types that will be provided by implementation
@@ -148,4 +149,5 @@ export interface IMidsceneTools {
148
149
  attachToServer(server: McpServer): void;
149
150
  initTools(): Promise<void>;
150
151
  destroy?(): Promise<void>;
152
+ setToolDefaults?(toolDefaults: ToolDefaults): void;
151
153
  }