@hypothesi/tauri-mcp-server 0.8.2 → 0.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,43 @@
1
1
  /**
2
2
  * Shared ref resolver - always available via window.__MCP__.resolveRef.
3
- * Accepts a ref ID ("e3", "ref=e3", "[ref=e3]") or CSS selector.
3
+ * Accepts a ref ID ("e3", "ref=e3", "[ref=e3]"), CSS selector, XPath, or text.
4
4
  * Returns the DOM element, or null if not found.
5
5
  *
6
6
  * Reads window.__MCP__.reverseRefs dynamically at call time so it always
7
7
  * uses the latest snapshot's data.
8
+ *
9
+ * Also provides:
10
+ * - resolveAll(selector, strategy) - returns an Array of matching elements
11
+ * - countAll(selector, strategy) - returns the total match count
8
12
  */
9
13
  (function() {
10
14
  window.__MCP__ = window.__MCP__ || {};
11
- window.__MCP__.resolveRef = function(selectorOrRef) {
15
+
16
+ var REF_PATTERN = /^\[?(?:ref=)?(e\d+)\]?$/;
17
+
18
+ function xpathForText(text) {
19
+ // Escape single quotes for XPath by splitting on ' and using concat()
20
+ if (text.indexOf("'") === -1) {
21
+ return "//*[contains(text(), '" + text + "')]";
22
+ }
23
+ var parts = text.split("'");
24
+ var expr = 'concat(' + parts.map(function(p, i) {
25
+ return (i > 0 ? ",\"'\",": '') + "'" + p + "'";
26
+ }).join('') + ')';
27
+ return '//*[contains(text(), ' + expr + ')]';
28
+ }
29
+
30
+ /**
31
+ * Resolve a single element by selector and strategy.
32
+ * @param {string} selectorOrRef - Selector, ref ID, XPath, or text
33
+ * @param {string} [strategy] - 'css' (default), 'xpath', or 'text'
34
+ * @returns {Element|null}
35
+ */
36
+ window.__MCP__.resolveRef = function(selectorOrRef, strategy) {
12
37
  if (!selectorOrRef) return null;
13
- var refMatch = selectorOrRef.match(/^\[?(?:ref=)?(e\d+)\]?$/);
38
+
39
+ // Ref IDs always take priority regardless of strategy
40
+ var refMatch = selectorOrRef.match(REF_PATTERN);
14
41
  if (refMatch) {
15
42
  var reverseRefs = window.__MCP__.reverseRefs;
16
43
  if (!reverseRefs) {
@@ -18,6 +45,68 @@
18
45
  }
19
46
  return reverseRefs.get(refMatch[1]) || null;
20
47
  }
48
+
49
+ if (strategy === 'text') {
50
+ var xpath = xpathForText(selectorOrRef);
51
+ var result = document.evaluate(xpath, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
52
+ return result.singleNodeValue;
53
+ }
54
+
55
+ if (strategy === 'xpath') {
56
+ var result = document.evaluate(selectorOrRef, document, null, XPathResult.FIRST_ORDERED_NODE_TYPE, null);
57
+ return result.singleNodeValue;
58
+ }
59
+
60
+ // Default: CSS selector
21
61
  return document.querySelector(selectorOrRef);
22
62
  };
63
+
64
+ /**
65
+ * Resolve all matching elements as an Array.
66
+ * @param {string} selector - Selector, XPath, or text
67
+ * @param {string} [strategy] - 'css' (default), 'xpath', or 'text'
68
+ * @returns {Element[]}
69
+ */
70
+ window.__MCP__.resolveAll = function(selector, strategy) {
71
+ if (!selector) return [];
72
+
73
+ // Ref IDs resolve to a single element
74
+ var refMatch = selector.match(REF_PATTERN);
75
+ if (refMatch) {
76
+ var el = window.__MCP__.resolveRef(selector);
77
+ return el ? [el] : [];
78
+ }
79
+
80
+ if (strategy === 'text') {
81
+ var xpath = xpathForText(selector);
82
+ var snapshot = document.evaluate(xpath, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
83
+ var results = [];
84
+ for (var i = 0; i < snapshot.snapshotLength; i++) {
85
+ results.push(snapshot.snapshotItem(i));
86
+ }
87
+ return results;
88
+ }
89
+
90
+ if (strategy === 'xpath') {
91
+ var snapshot = document.evaluate(selector, document, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
92
+ var results = [];
93
+ for (var i = 0; i < snapshot.snapshotLength; i++) {
94
+ results.push(snapshot.snapshotItem(i));
95
+ }
96
+ return results;
97
+ }
98
+
99
+ // Default: CSS
100
+ return Array.from(document.querySelectorAll(selector));
101
+ };
102
+
103
+ /**
104
+ * Count all matching elements.
105
+ * @param {string} selector - Selector, XPath, or text
106
+ * @param {string} [strategy] - 'css' (default), 'xpath', or 'text'
107
+ * @returns {number}
108
+ */
109
+ window.__MCP__.countAll = function(selector, strategy) {
110
+ return window.__MCP__.resolveAll(selector, strategy).length;
111
+ };
23
112
  })();
@@ -4,34 +4,38 @@
4
4
  * @param {Object} params
5
5
  * @param {string} params.type - What to wait for: 'selector', 'text', 'ipc-event'
6
6
  * @param {string} params.value - Selector/ref ID, text, or event name to wait for
7
+ * @param {string} params.strategy - Selector strategy (applies when type is 'selector'): 'css', 'xpath', or 'text'
7
8
  * @param {number} params.timeout - Timeout in milliseconds
8
9
  */
9
10
  (async function(params) {
10
- const { type, value, timeout } = params;
11
+ const { type, value, strategy, timeout } = params;
11
12
  const startTime = Date.now();
12
13
 
13
14
  function resolveElement(selectorOrRef) {
14
15
  if (!selectorOrRef) return null;
15
- return window.__MCP__.resolveRef(selectorOrRef);
16
+ return window.__MCP__.resolveRef(selectorOrRef, strategy);
16
17
  }
17
18
 
18
- return new Promise((resolve, reject) => {
19
+ return new Promise(function(resolve, reject) {
19
20
  function check() {
20
21
  if (Date.now() - startTime > timeout) {
21
- reject(new Error(`Timeout waiting for ${type}: ${value}`));
22
+ reject(new Error('Timeout waiting for ' + type + ': ' + value));
22
23
  return;
23
24
  }
24
25
 
25
26
  if (type === 'selector') {
26
- const element = resolveElement(value);
27
+ var element = resolveElement(value);
27
28
  if (element) {
28
- resolve(`Element found: ${value}`);
29
+ var msg = 'Element found: ' + value;
30
+ var count = window.__MCP__.countAll(value, strategy);
31
+ if (count > 1) msg += ' (+' + (count - 1) + ' more match' + (count - 1 === 1 ? '' : 'es') + ')';
32
+ resolve(msg);
29
33
  return;
30
34
  }
31
35
  } else if (type === 'text') {
32
- const found = document.body.innerText.includes(value);
36
+ var found = document.body.innerText.includes(value);
33
37
  if (found) {
34
- resolve(`Text found: ${value}`);
38
+ resolve('Text found: ' + value);
35
39
  return;
36
40
  }
37
41
  } else if (type === 'ipc-event') {
@@ -15,12 +15,22 @@ export const WindowTargetSchema = z.object({
15
15
  appIdentifier: z.union([z.string(), z.number()]).optional().describe('App port or bundle ID to target. Defaults to the only connected app or the default app if multiple are connected.'),
16
16
  });
17
17
  // ============================================================================
18
+ // Shared Selector Strategy
19
+ // ============================================================================
20
+ /**
21
+ * Reusable strategy field for tools that accept a selector.
22
+ * Defaults to 'css' for backward compatibility.
23
+ */
24
+ const selectorStrategyField = z.enum(['css', 'xpath', 'text']).default('css').describe('Selector strategy: "css" (default) for CSS selectors, "xpath" for XPath expressions, ' +
25
+ '"text" to find elements containing the given text. Ref IDs (e.g., "ref=e3") work with any strategy.');
26
+ // ============================================================================
18
27
  // Schemas
19
28
  // ============================================================================
20
29
  export const InteractSchema = WindowTargetSchema.extend({
21
30
  action: z.enum(['click', 'double-click', 'long-press', 'scroll', 'swipe', 'focus'])
22
31
  .describe('Type of interaction to perform'),
23
- selector: z.string().optional().describe('CSS selector for the element to interact with'),
32
+ selector: z.string().optional().describe('Element selector: CSS selector (default), XPath expression, text content, or ref ID (e.g., "ref=e3")'),
33
+ strategy: selectorStrategyField,
24
34
  x: z.number().optional().describe('X coordinate for direct coordinate interaction'),
25
35
  y: z.number().optional().describe('Y coordinate for direct coordinate interaction'),
26
36
  duration: z.number().optional()
@@ -42,7 +52,9 @@ export const ScreenshotSchema = WindowTargetSchema.extend({
42
52
  export const KeyboardSchema = WindowTargetSchema.extend({
43
53
  action: z.enum(['type', 'press', 'down', 'up'])
44
54
  .describe('Keyboard action type: "type" for typing text into an element, "press/down/up" for key events'),
45
- selector: z.string().optional().describe('CSS selector for element to type into (required for "type" action)'),
55
+ selector: z.string().optional().describe('Element selector for element to type into (required for "type" action): ' +
56
+ 'CSS selector (default), XPath, text content, or ref ID'),
57
+ strategy: selectorStrategyField,
46
58
  text: z.string().optional().describe('Text to type (required for "type" action)'),
47
59
  key: z.string().optional().describe('Key to press (required for "press/down/up" actions, e.g., "Enter", "a", "Escape")'),
48
60
  modifiers: z.array(z.enum(['Control', 'Alt', 'Shift', 'Meta'])).optional().describe('Modifier keys to hold'),
@@ -50,10 +62,12 @@ export const KeyboardSchema = WindowTargetSchema.extend({
50
62
  export const WaitForSchema = WindowTargetSchema.extend({
51
63
  type: z.enum(['selector', 'text', 'ipc-event']).describe('What to wait for'),
52
64
  value: z.string().describe('Selector, text content, or IPC event name to wait for'),
65
+ strategy: selectorStrategyField.describe('Selector strategy (applies when type is "selector"): "css" (default), "xpath", or "text".'),
53
66
  timeout: z.number().optional().default(5000).describe('Timeout in milliseconds (default: 5000ms)'),
54
67
  });
55
68
  export const GetStylesSchema = WindowTargetSchema.extend({
56
- selector: z.string().describe('CSS selector for element(s) to get styles from'),
69
+ selector: z.string().describe('Element selector: CSS selector (default), XPath expression, text content, or ref ID'),
70
+ strategy: selectorStrategyField,
57
71
  properties: z.array(z.string()).optional().describe('Specific CSS properties to retrieve. If omitted, returns all computed styles'),
58
72
  multiple: z.boolean().optional().default(false)
59
73
  .describe('Whether to get styles for all matching elements (true) or just the first (false)'),
@@ -68,8 +82,9 @@ export const FocusElementSchema = WindowTargetSchema.extend({
68
82
  selector: z.string().describe('CSS selector for element to focus'),
69
83
  });
70
84
  export const FindElementSchema = WindowTargetSchema.extend({
71
- selector: z.string(),
72
- strategy: z.enum(['css', 'xpath', 'text']).default('css'),
85
+ selector: z.string().describe('The selector to find: CSS selector (default), XPath expression, text content, or ref ID (e.g., "ref=e3"). ' +
86
+ 'Interpretation depends on strategy.'),
87
+ strategy: selectorStrategyField,
73
88
  });
74
89
  export const GetConsoleLogsSchema = WindowTargetSchema.extend({
75
90
  filter: z.string().optional().describe('Regex or keyword to filter logs'),
@@ -77,13 +92,14 @@ export const GetConsoleLogsSchema = WindowTargetSchema.extend({
77
92
  });
78
93
  export const DomSnapshotSchema = WindowTargetSchema.extend({
79
94
  type: z.enum(['accessibility', 'structure']).describe('Snapshot type'),
80
- selector: z.string().optional().describe('CSS selector to scope the snapshot. If omitted, snapshots entire document.'),
95
+ selector: z.string().optional().describe('Selector to scope the snapshot: CSS selector (default), XPath, text content, or ref ID. If omitted, snapshots entire document.'),
96
+ strategy: selectorStrategyField,
81
97
  });
82
98
  // ============================================================================
83
99
  // Implementation Functions
84
100
  // ============================================================================
85
101
  export async function interact(options) {
86
- const { action, selector, x, y, duration, scrollX, scrollY, fromX, fromY, toX, toY, windowId, appIdentifier } = options;
102
+ const { action, selector, strategy, x, y, duration, scrollX, scrollY, fromX, fromY, toX, toY, windowId, appIdentifier } = options;
87
103
  // Handle swipe action separately since it has different logic
88
104
  if (action === 'swipe') {
89
105
  return performSwipe({ fromX, fromY, toX, toY, duration, windowId, appIdentifier });
@@ -93,11 +109,12 @@ export async function interact(options) {
93
109
  if (!selector) {
94
110
  throw new Error('Focus action requires a selector');
95
111
  }
96
- return focusElement({ selector, windowId, appIdentifier });
112
+ return focusElement({ selector, strategy, windowId, appIdentifier });
97
113
  }
98
114
  const script = buildScript(SCRIPTS.interact, {
99
115
  action,
100
116
  selector: selector ?? null,
117
+ strategy: strategy ?? 'css',
101
118
  x: x ?? null,
102
119
  y: y ?? null,
103
120
  duration: duration ?? 500,
@@ -146,7 +163,7 @@ export async function screenshot(options = {}) {
146
163
  return result;
147
164
  }
148
165
  export async function keyboard(options) {
149
- const { action, selectorOrKey, textOrModifiers, modifiers, windowId, appIdentifier } = options;
166
+ const { action, selectorOrKey, strategy, textOrModifiers, modifiers, windowId, appIdentifier } = options;
150
167
  // Handle the different parameter combinations based on action
151
168
  if (action === 'type') {
152
169
  const selector = selectorOrKey;
@@ -154,7 +171,7 @@ export async function keyboard(options) {
154
171
  if (!selector || !text) {
155
172
  throw new Error('Type action requires both selector and text parameters');
156
173
  }
157
- const script = buildTypeScript(selector, text);
174
+ const script = buildTypeScript(selector, text, strategy);
158
175
  try {
159
176
  return await executeInWebview(script, windowId, appIdentifier);
160
177
  }
@@ -179,8 +196,8 @@ export async function keyboard(options) {
179
196
  }
180
197
  }
181
198
  export async function waitFor(options) {
182
- const { type, value, timeout = 5000, windowId, appIdentifier } = options;
183
- const script = buildScript(SCRIPTS.waitFor, { type, value, timeout });
199
+ const { type, value, strategy, timeout = 5000, windowId, appIdentifier } = options;
200
+ const script = buildScript(SCRIPTS.waitFor, { type, value, strategy: strategy ?? 'css', timeout });
184
201
  try {
185
202
  return await executeInWebview(script, windowId, appIdentifier);
186
203
  }
@@ -190,9 +207,10 @@ export async function waitFor(options) {
190
207
  }
191
208
  }
192
209
  export async function getStyles(options) {
193
- const { selector, properties, multiple = false, windowId, appIdentifier } = options;
210
+ const { selector, strategy, properties, multiple = false, windowId, appIdentifier } = options;
194
211
  const script = buildScript(SCRIPTS.getStyles, {
195
212
  selector,
213
+ strategy: strategy ?? 'css',
196
214
  properties: properties || [],
197
215
  multiple,
198
216
  });
@@ -232,8 +250,8 @@ export async function executeJavaScript(options) {
232
250
  }
233
251
  }
234
252
  export async function focusElement(options) {
235
- const { selector, windowId, appIdentifier } = options;
236
- const script = buildScript(SCRIPTS.focus, { selector });
253
+ const { selector, strategy, windowId, appIdentifier } = options;
254
+ const script = buildScript(SCRIPTS.focus, { selector, strategy: strategy ?? 'css' });
237
255
  try {
238
256
  return await executeInWebview(script, windowId, appIdentifier);
239
257
  }
@@ -274,13 +292,13 @@ export async function getConsoleLogs(options = {}) {
274
292
  * Uses aria-api for comprehensive, spec-compliant accessibility computation.
275
293
  */
276
294
  export async function domSnapshot(options) {
277
- const { type, selector, windowId, appIdentifier } = options;
295
+ const { type, selector, strategy, windowId, appIdentifier } = options;
278
296
  // Only load aria-api for accessibility snapshots
279
297
  if (type === 'accessibility') {
280
298
  await ensureAriaApiLoaded(windowId);
281
299
  }
282
300
  // Then execute the snapshot script
283
- const script = buildScript(SCRIPTS.domSnapshot, { type, selector: selector ?? null });
301
+ const script = buildScript(SCRIPTS.domSnapshot, { type, selector: selector ?? null, strategy: strategy ?? 'css' });
284
302
  try {
285
303
  return await executeInWebview(script, windowId, appIdentifier);
286
304
  }
@@ -91,6 +91,28 @@ Once changes are approved and made:
91
91
  - The plugin only runs in debug builds so it won't affect production
92
92
  - The WebSocket server binds to \`0.0.0.0:9223\` by default
93
93
  - For localhost-only access, use \`Builder::new().bind_address("127.0.0.1").build()\``;
94
+ const SELECT_ELEMENT_PROMPT = (message) => {
95
+ const lines = [
96
+ 'The user wants to visually select an element in their running Tauri app so they can discuss it with you.',
97
+ '',
98
+ 'Follow these steps:',
99
+ '',
100
+ '1. **Ensure a session is active** - Use `driver_session` with action "start" if not already connected',
101
+ '',
102
+ '2. **Activate the element picker** - Call `webview_select_element` to show the picker overlay in the app.',
103
+ 'The user will see a blue highlight following their cursor and can click to select an element.',
104
+ 'They can press Escape or click X to cancel.',
105
+ '',
106
+ '3. **Review the result** - You will receive the element\'s metadata (tag, id, classes, CSS selector, XPath,',
107
+ 'bounding rect, attributes, computed styles, parent chain) and an annotated screenshot with the element highlighted.',
108
+ '',
109
+ '4. **Respond to the user** - Use the element context and screenshot to address their request.',
110
+ ];
111
+ if (message) {
112
+ lines.push('', '## User\'s Message About the Element', '', message);
113
+ }
114
+ return lines.join('\n');
115
+ };
94
116
  /**
95
117
  * Complete registry of all available prompts
96
118
  */
@@ -114,6 +136,30 @@ export const PROMPTS = [
114
136
  ];
115
137
  },
116
138
  },
139
+ {
140
+ name: 'select',
141
+ description: 'Visually select an element in the running Tauri app. ' +
142
+ 'Activates a picker overlay — click an element to send its metadata and an annotated screenshot to the agent. ' +
143
+ 'Optionally include a message describing what you want to do with the element.',
144
+ arguments: [
145
+ {
146
+ name: 'message',
147
+ description: 'What you want to discuss or do with the selected element (e.g. "this button should be green instead of blue")',
148
+ required: false,
149
+ },
150
+ ],
151
+ handler: (args) => {
152
+ return [
153
+ {
154
+ role: 'user',
155
+ content: {
156
+ type: 'text',
157
+ text: SELECT_ELEMENT_PROMPT(args.message),
158
+ },
159
+ },
160
+ ];
161
+ },
162
+ },
117
163
  {
118
164
  name: 'setup',
119
165
  description: 'Set up or update the MCP Bridge plugin in a Tauri project. ' +
@@ -8,6 +8,7 @@ import { manageDriverSession, ManageDriverSessionSchema, } from './driver/sessio
8
8
  import { readLogs, ReadLogsSchema } from './monitor/logs.js';
9
9
  import { executeIPCCommand, manageIPCMonitoring, getIPCEvents, emitTestEvent, getBackendState, manageWindow, ExecuteIPCCommandSchema, ManageIPCMonitoringSchema, GetIPCEventsSchema, EmitTestEventSchema, GetBackendStateSchema, ManageWindowSchema, } from './driver/plugin-commands.js';
10
10
  import { interact, screenshot, keyboard, waitFor, getStyles, executeJavaScript, findElement, domSnapshot, InteractSchema, ScreenshotSchema, KeyboardSchema, WaitForSchema, GetStylesSchema, ExecuteJavaScriptSchema, FindElementSchema, DomSnapshotSchema, } from './driver/webview-interactions.js';
11
+ import { selectElement, getPointedElement, SelectElementSchema, GetPointedElementSchema, } from './driver/element-picker.js';
11
12
  import { PLUGIN_VERSION_CARGO } from './version.js';
12
13
  /**
13
14
  * Standard multi-app description for webview tools.
@@ -48,123 +49,27 @@ First, verify this is a Tauri v2 project:
48
49
  Examine these files and report what needs to be added or updated:
49
50
 
50
51
  ### 1. Rust Plugin Dependency
51
-
52
- Check \`src-tauri/Cargo.toml\` for \`tauri-plugin-mcp-bridge\`.
53
- It should be an **optional** dependency behind a Cargo feature
54
- so that it is completely excluded from production builds:
55
-
52
+ Check \`src-tauri/Cargo.toml\` for \`tauri-plugin-mcp-bridge\`. If missing or outdated, note that it needs:
56
53
  \`\`\`toml
57
54
  [dependencies]
58
- tauri-plugin-mcp-bridge = { version = "${PLUGIN_VERSION_CARGO}", optional = true }
59
- \`\`\`
60
-
61
- Under \`[features]\`, add a feature that enables it:
62
-
63
- \`\`\`toml
64
- [features]
65
- mcp-bridge = ["dep:tauri-plugin-mcp-bridge"]
55
+ tauri-plugin-mcp-bridge = "${PLUGIN_VERSION_CARGO}"
66
56
  \`\`\`
67
57
 
68
58
  ### 2. Plugin Registration
69
-
70
- Check \`src-tauri/src/lib.rs\` or \`src-tauri/src/main.rs\` for plugin
71
- registration. It should be gated behind the \`mcp-bridge\` feature flag:
72
-
59
+ Check \`src-tauri/src/lib.rs\` or \`src-tauri/src/main.rs\` for plugin registration. It should have:
73
60
  \`\`\`rust
74
- #[cfg(all(feature = "mcp-bridge", debug_assertions))]
61
+ #[cfg(debug_assertions)]
75
62
  {
76
63
  builder = builder.plugin(tauri_plugin_mcp_bridge::init());
77
64
  }
78
65
  \`\`\`
79
66
 
80
67
  ### 3. Global Tauri Setting
81
-
82
68
  Check \`src-tauri/tauri.conf.json\` for \`withGlobalTauri: true\` under the \`app\` section.
83
69
  **This is required** - without it, the MCP bridge cannot communicate with the webview.
84
70
 
85
- This setting should only be enabled for development. If the project
86
- uses a \`tauri.dev.conf.json\` overlay (applied only during
87
- \`cargo tauri dev\`), prefer placing it there:
88
-
89
- \`\`\`json
90
- {
91
- "app": {
92
- "withGlobalTauri": true
93
- }
94
- }
95
- \`\`\`
96
-
97
- ### 4. Plugin Capability (Conditional via build.rs)
98
-
99
- The \`mcp-bridge:default\` permission must **not** be added to
100
- \`src-tauri/capabilities/default.json\`. Instead, it should be
101
- conditionally generated by the build script so that it only exists
102
- when the \`mcp-bridge\` feature is active.
103
-
104
- Check \`src-tauri/build.rs\` and update it to conditionally write
105
- (or remove) a separate capability file before
106
- \`tauri_build::build()\` runs. Tauri auto-discovers all \`.json\`
107
- files in \`capabilities/\`, so this ensures the permission is only
108
- present when the feature is enabled:
109
-
110
- \`\`\`rust
111
- fn main() {
112
- let mcp_cap_path = std::path::Path::new("capabilities/mcp-bridge.json");
113
- #[cfg(all(feature = "mcp-bridge", debug_assertions))]
114
- {
115
- let cap = r#"{
116
- "identifier": "mcp-bridge",
117
- "description": "enables MCP bridge for development",
118
- "windows": [
119
- "main"
120
- ],
121
- "permissions": [
122
- "mcp-bridge:default"
123
- ]
124
- }"#;
125
- std::fs::write(mcp_cap_path, cap)
126
- .expect("failed to write mcp-bridge capability");
127
- }
128
- #[cfg(not(all(feature = "mcp-bridge", debug_assertions)))]
129
- {
130
- let _ = std::fs::remove_file(mcp_cap_path);
131
- }
132
-
133
- tauri_build::build()
134
- }
135
- \`\`\`
136
-
137
- If \`build.rs\` already has other logic, integrate the conditional
138
- block before the \`tauri_build::build()\` call.
139
-
140
- ### 5. Gitignore the Generated Capability File
141
-
142
- Since \`capabilities/mcp-bridge.json\` is generated at build time, add it to \`src-tauri/.gitignore\`:
143
-
144
- \`\`\`gitignore
145
- /capabilities/mcp-bridge.json
146
- \`\`\`
147
-
148
- ### 6. Dev Scripts (package.json)
149
-
150
- If the project uses npm scripts to run \`tauri dev\`, add
151
- \`--features mcp-bridge\` to the dev scripts so the feature is
152
- automatically enabled. For example:
153
-
154
- \`\`\`json
155
- {
156
- "scripts": {
157
- "dev": "tauri dev --features mcp-bridge",
158
- "dev:ios": "tauri ios dev --features mcp-bridge",
159
- "dev:android": "tauri android dev --features mcp-bridge"
160
- }
161
- }
162
- \`\`\`
163
-
164
- Do **not** add \`--features mcp-bridge\` to release-profile dev
165
- scripts (e.g. those using \`--release\`), as \`debug_assertions\`
166
- is false in release builds and the guard will exclude the plugin
167
- anyway.
71
+ ### 4. Plugin Permissions
72
+ Check \`src-tauri/capabilities/default.json\` (or similar) for \`"mcp-bridge:default"\` permission.
168
73
 
169
74
  ## Response Format
170
75
 
@@ -179,19 +84,13 @@ Only after the user says yes should you make any modifications.
179
84
  ## After Setup
180
85
 
181
86
  Once changes are approved and made:
182
- 1. Run the Tauri app in development mode if npm scripts were
183
- updated, use \`npm run dev\`. Otherwise use
184
- \`cargo tauri dev --features mcp-bridge\` directly.
87
+ 1. Run the Tauri app in development mode (\`cargo tauri dev\`)
185
88
  2. Use \`driver_session\` with action "start" to connect
186
89
  3. Use \`driver_session\` with action "status" to verify
187
90
 
188
91
  ## Notes
189
92
 
190
- - The plugin is completely excluded from production builds both
191
- \`cfg(feature = "mcp-bridge")\` and \`cfg(debug_assertions)\` must
192
- be true, so even if the feature flag is accidentally enabled in a
193
- release build, the plugin will not be included
194
- - The \`mcp-bridge\` Cargo feature must be passed explicitly — either via npm dev scripts or \`cargo tauri dev --features mcp-bridge\`
93
+ - The plugin only runs in debug builds so it won't affect production
195
94
  - The WebSocket server binds to \`0.0.0.0:9223\` by default
196
95
  - For localhost-only access, use \`Builder::new().bind_address("127.0.0.1").build()\`
197
96
  `;
@@ -271,6 +170,8 @@ export const TOOLS = [
271
170
  {
272
171
  name: 'webview_find_element',
273
172
  description: '[Tauri Apps Only] Find DOM elements in a running Tauri app\'s webview. ' +
173
+ 'Supports CSS selectors (default), XPath expressions, and text content matching via the strategy parameter. ' +
174
+ 'Returns the element\'s HTML. ' +
274
175
  'Requires active driver_session. ' +
275
176
  MULTI_APP_DESC + ' ' +
276
177
  'For browser pages or documentation sites, use Chrome DevTools MCP instead.',
@@ -314,6 +215,7 @@ export const TOOLS = [
314
215
  name: 'webview_interact',
315
216
  description: '[Tauri Apps Only] Click, scroll, swipe, focus, or perform gestures in a Tauri app webview. ' +
316
217
  'Supported actions: click, double-click, long-press, scroll, swipe, focus. ' +
218
+ 'Supports CSS selectors (default), XPath, and text content matching via the strategy parameter. ' +
317
219
  'Requires active driver_session. ' +
318
220
  'For browser interaction, use Chrome DevTools MCP instead.',
319
221
  category: TOOL_CATEGORIES.UI_AUTOMATION,
@@ -364,6 +266,8 @@ export const TOOLS = [
364
266
  {
365
267
  name: 'webview_keyboard',
366
268
  description: '[Tauri Apps Only] Type text or send keyboard events in a Tauri app. ' +
269
+ 'The selector parameter (for "type" action) supports CSS selectors (default), ' +
270
+ 'XPath, and text content matching via the strategy parameter. ' +
367
271
  'Requires active driver_session. ' +
368
272
  MULTI_APP_DESC + ' ' +
369
273
  'For browser keyboard input, use Chrome DevTools MCP instead.',
@@ -381,6 +285,7 @@ export const TOOLS = [
381
285
  return await keyboard({
382
286
  action: parsed.action,
383
287
  selectorOrKey: parsed.selector,
288
+ strategy: parsed.strategy,
384
289
  textOrModifiers: parsed.text,
385
290
  windowId: parsed.windowId,
386
291
  appIdentifier: parsed.appIdentifier,
@@ -398,6 +303,7 @@ export const TOOLS = [
398
303
  {
399
304
  name: 'webview_wait_for',
400
305
  description: '[Tauri Apps Only] Wait for elements, text, or IPC events in a Tauri app. ' +
306
+ 'When type is "selector", supports CSS (default), XPath, and text strategies via the strategy parameter. ' +
401
307
  'Requires active driver_session. ' +
402
308
  MULTI_APP_DESC + ' ' +
403
309
  'For browser waits, use Chrome DevTools MCP instead.',
@@ -413,6 +319,7 @@ export const TOOLS = [
413
319
  return await waitFor({
414
320
  type: parsed.type,
415
321
  value: parsed.value,
322
+ strategy: parsed.strategy,
416
323
  timeout: parsed.timeout,
417
324
  windowId: parsed.windowId,
418
325
  appIdentifier: parsed.appIdentifier,
@@ -422,6 +329,7 @@ export const TOOLS = [
422
329
  {
423
330
  name: 'webview_get_styles',
424
331
  description: '[Tauri Apps Only] Get computed CSS styles from elements in a Tauri app. ' +
332
+ 'Supports CSS selectors (default), XPath, and text content matching via the strategy parameter. ' +
425
333
  'Requires active driver_session. ' +
426
334
  MULTI_APP_DESC + ' ' +
427
335
  'For browser style inspection, use Chrome DevTools MCP instead.',
@@ -436,6 +344,7 @@ export const TOOLS = [
436
344
  const parsed = GetStylesSchema.parse(args);
437
345
  return await getStyles({
438
346
  selector: parsed.selector,
347
+ strategy: parsed.strategy,
439
348
  properties: parsed.properties,
440
349
  multiple: parsed.multiple,
441
350
  windowId: parsed.windowId,
@@ -480,6 +389,7 @@ export const TOOLS = [
480
389
  'with element tag names, IDs, CSS classes, and data-testid attributes (if present). ' +
481
390
  'Use this for understanding page layout, debugging CSS selectors, or locating elements by class/ID. ' +
482
391
  'Use the optional selector parameter to scope the snapshot to a subtree. ' +
392
+ 'The selector supports CSS (default), XPath, and text content matching via the strategy parameter. ' +
483
393
  'Requires active driver_session. ' +
484
394
  MULTI_APP_DESC,
485
395
  category: TOOL_CATEGORIES.UI_AUTOMATION,
@@ -494,6 +404,57 @@ export const TOOLS = [
494
404
  return await domSnapshot({
495
405
  type: parsed.type,
496
406
  selector: parsed.selector,
407
+ strategy: parsed.strategy,
408
+ windowId: parsed.windowId,
409
+ appIdentifier: parsed.appIdentifier,
410
+ });
411
+ },
412
+ },
413
+ // Element Picker Tools
414
+ {
415
+ name: 'webview_select_element',
416
+ description: '[Tauri Apps Only] Activates an element picker overlay in the Tauri app. ' +
417
+ 'The user visually selects an element by clicking it, and the tool returns ' +
418
+ 'rich element metadata (tag, id, classes, attributes, text, bounding rect, ' +
419
+ 'CSS selector, computed styles, parent chain) plus an annotated screenshot ' +
420
+ 'with the element highlighted. ' +
421
+ 'Requires active driver_session. ' +
422
+ MULTI_APP_DESC,
423
+ category: TOOL_CATEGORIES.UI_AUTOMATION,
424
+ schema: SelectElementSchema,
425
+ annotations: {
426
+ title: 'Select Element (Visual Picker)',
427
+ readOnlyHint: true,
428
+ openWorldHint: false,
429
+ },
430
+ handler: async (args) => {
431
+ const parsed = SelectElementSchema.parse(args);
432
+ return await selectElement({
433
+ timeout: parsed.timeout,
434
+ windowId: parsed.windowId,
435
+ appIdentifier: parsed.appIdentifier,
436
+ });
437
+ },
438
+ },
439
+ {
440
+ name: 'webview_get_pointed_element',
441
+ description: '[Tauri Apps Only] Retrieves element metadata for an element the user previously ' +
442
+ 'pointed at via Alt+Shift+Click in the Tauri app. Returns the same rich metadata ' +
443
+ 'as webview_select_element (tag, id, classes, attributes, text, bounding rect, ' +
444
+ 'CSS selector, computed styles, parent chain) plus an annotated screenshot. ' +
445
+ 'The user must Alt+Shift+Click an element first before calling this tool. ' +
446
+ 'Requires active driver_session. ' +
447
+ MULTI_APP_DESC,
448
+ category: TOOL_CATEGORIES.UI_AUTOMATION,
449
+ schema: GetPointedElementSchema,
450
+ annotations: {
451
+ title: 'Get Pointed Element',
452
+ readOnlyHint: true,
453
+ openWorldHint: false,
454
+ },
455
+ handler: async (args) => {
456
+ const parsed = GetPointedElementSchema.parse(args);
457
+ return await getPointedElement({
497
458
  windowId: parsed.windowId,
498
459
  appIdentifier: parsed.appIdentifier,
499
460
  });