screenpipe-mcp 0.5.0 → 0.7.0

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -54,11 +54,6 @@ function getCurrentDateInfo() {
54
54
  }),
55
55
  };
56
56
  }
57
- // Detect OS
58
- const CURRENT_OS = process.platform;
59
- const IS_MACOS = CURRENT_OS === "darwin";
60
- const IS_WINDOWS = CURRENT_OS === "win32";
61
- const IS_LINUX = CURRENT_OS === "linux";
62
57
  // Parse command line arguments
63
58
  const args = process.argv.slice(2);
64
59
  let port = 3030;
@@ -71,7 +66,7 @@ const SCREENPIPE_API = `http://localhost:${port}`;
71
66
  // Initialize server
72
67
  const server = new index_js_1.Server({
73
68
  name: "screenpipe",
74
- version: "0.5.0",
69
+ version: "0.7.0",
75
70
  }, {
76
71
  capabilities: {
77
72
  tools: {},
@@ -101,7 +96,7 @@ const BASE_TOOLS = [
101
96
  content_type: {
102
97
  type: "string",
103
98
  enum: ["all", "ocr", "audio", "ui"],
104
- description: "Content type filter. Default: 'all'",
99
+ description: "Content type filter: 'ocr' (screen text), 'audio' (transcriptions), 'ui' (legacy UI monitoring), 'all'. Default: 'all'. For keyboard/mouse/accessibility events, use search-ui-events tool instead.",
105
100
  default: "all",
106
101
  },
107
102
  limit: {
@@ -145,55 +140,15 @@ const BASE_TOOLS = [
145
140
  description: "Include base64 screenshots (OCR only). Default: false",
146
141
  default: false,
147
142
  },
148
- },
149
- },
150
- },
151
- {
152
- name: "pixel-control",
153
- description: "Control mouse and keyboard at the pixel level. This is a cross-platform tool that works on all operating systems. " +
154
- "Use this to type text, press keys, move the mouse, and click buttons.",
155
- annotations: {
156
- title: "Pixel Control",
157
- destructiveHint: true,
158
- },
159
- inputSchema: {
160
- type: "object",
161
- properties: {
162
- action_type: {
143
+ speaker_ids: {
163
144
  type: "string",
164
- enum: ["WriteText", "KeyPress", "MouseMove", "MouseClick"],
165
- description: "Type of input action to perform",
145
+ description: "Comma-separated speaker IDs to filter audio results (e.g., '1,2,3')",
166
146
  },
167
- data: {
168
- oneOf: [
169
- {
170
- type: "string",
171
- description: "Text to type or key to press (for WriteText and KeyPress)",
172
- },
173
- {
174
- type: "object",
175
- properties: {
176
- x: {
177
- type: "integer",
178
- description: "X coordinate for mouse movement",
179
- },
180
- y: {
181
- type: "integer",
182
- description: "Y coordinate for mouse movement",
183
- },
184
- },
185
- description: "Coordinates for MouseMove",
186
- },
187
- {
188
- type: "string",
189
- enum: ["left", "right", "middle"],
190
- description: "Button to click for MouseClick",
191
- },
192
- ],
193
- description: "Action-specific data",
147
+ speaker_name: {
148
+ type: "string",
149
+ description: "Filter audio by speaker name (case-insensitive partial match)",
194
150
  },
195
151
  },
196
- required: ["action_type", "data"],
197
152
  },
198
153
  },
199
154
  {
@@ -230,230 +185,88 @@ const BASE_TOOLS = [
230
185
  required: ["start_time", "end_time"],
231
186
  },
232
187
  },
233
- ];
234
- const MACOS_TOOLS = [
235
188
  {
236
- name: "find-elements",
237
- description: "Find UI elements with a specific role in an application. " +
238
- "This tool is especially useful for identifying interactive elements. " +
239
- "\n\nMacOS Accessibility Roles Guide:\n" +
240
- "- Basic roles: 'button', 'textfield', 'checkbox', 'menu', 'list'\n" +
241
- "- MacOS specific roles: 'AXButton', 'AXTextField', 'AXCheckBox', 'AXMenu', etc.\n" +
242
- "- Text inputs can be: 'AXTextField', 'AXTextArea', 'AXComboBox', 'AXSearchField'\n" +
243
- "- Clickable items: 'AXButton', 'AXMenuItem', 'AXMenuBarItem', 'AXImage', 'AXStaticText'\n" +
244
- "- Web content may use: 'AXWebArea', 'AXLink', 'AXHeading', 'AXRadioButton'\n\n" +
245
- "Use MacOS Accessibility Inspector app to identify the exact roles in your target application.",
189
+ name: "search-ui-events",
190
+ description: "Search UI input events captured via accessibility APIs (macOS). " +
191
+ "This is the third modality alongside vision (OCR) and audio. " +
192
+ "Captures: mouse clicks, keyboard text input, scroll events, app/window switches, clipboard operations. " +
193
+ "Events include app context, element info (accessibility labels), and precise timestamps. " +
194
+ "Great for understanding user workflow, what was typed, clicked, or copied.",
246
195
  annotations: {
247
- title: "Find Elements",
196
+ title: "Search UI Events (Accessibility)",
248
197
  readOnlyHint: true,
249
198
  },
250
199
  inputSchema: {
251
200
  type: "object",
252
201
  properties: {
253
- app: {
254
- type: "string",
255
- description: "The name of the application (e.g., 'Chrome', 'Finder', 'Terminal')",
256
- },
257
- window: {
258
- type: "string",
259
- description: "The window name or title (optional)",
260
- },
261
- role: {
262
- type: "string",
263
- description: "The role to search for (e.g., 'button', 'textfield', 'AXButton', 'AXTextField'). For best results, use MacOS AX prefixed roles.",
264
- },
265
- max_results: {
266
- type: "integer",
267
- description: "Maximum number of elements to return",
268
- default: 10,
269
- },
270
- max_depth: {
271
- type: "integer",
272
- description: "Maximum depth of element tree to search",
273
- },
274
- use_background_apps: {
275
- type: "boolean",
276
- description: "Whether to look in background apps",
277
- default: true,
278
- },
279
- activate_app: {
280
- type: "boolean",
281
- description: "Whether to activate the app before searching",
282
- default: true,
283
- },
284
- },
285
- required: ["app", "role"],
286
- },
287
- },
288
- {
289
- name: "click-element",
290
- description: "Click an element in an application using its id (MacOS only)",
291
- annotations: {
292
- title: "Click Element",
293
- destructiveHint: true,
294
- },
295
- inputSchema: {
296
- type: "object",
297
- properties: {
298
- app: {
299
- type: "string",
300
- description: "The name of the application",
301
- },
302
- window: {
303
- type: "string",
304
- description: "The window name (optional)",
305
- },
306
- id: {
307
- type: "string",
308
- description: "The id of the element to click",
309
- },
310
- use_background_apps: {
311
- type: "boolean",
312
- description: "Whether to look in background apps",
313
- default: true,
314
- },
315
- activate_app: {
316
- type: "boolean",
317
- description: "Whether to activate the app before clicking",
318
- default: true,
319
- },
320
- },
321
- required: ["app", "id"],
322
- },
323
- },
324
- {
325
- name: "fill-element",
326
- description: "Type text into an element in an application (MacOS only)",
327
- annotations: {
328
- title: "Fill Element",
329
- destructiveHint: true,
330
- },
331
- inputSchema: {
332
- type: "object",
333
- properties: {
334
- app: {
335
- type: "string",
336
- description: "The name of the application",
337
- },
338
- window: {
339
- type: "string",
340
- description: "The window name (optional)",
341
- },
342
- id: {
202
+ q: {
343
203
  type: "string",
344
- description: "The id of the element to fill",
204
+ description: "Search query for text content, app name, window title. Optional - omit to return recent events.",
345
205
  },
346
- text: {
206
+ event_type: {
347
207
  type: "string",
348
- description: "The text to type into the element",
208
+ enum: ["click", "text", "scroll", "key", "app_switch", "window_focus", "clipboard"],
209
+ description: "Filter by event type. 'text' = aggregated keyboard input, 'click' = mouse clicks with element context, 'app_switch'/'window_focus' = app usage tracking, 'clipboard' = copy/paste events.",
349
210
  },
350
- use_background_apps: {
351
- type: "boolean",
352
- description: "Whether to look in background apps",
353
- default: true,
354
- },
355
- activate_app: {
356
- type: "boolean",
357
- description: "Whether to activate the app before typing",
358
- default: true,
359
- },
360
- },
361
- required: ["app", "id", "text"],
362
- },
363
- },
364
- {
365
- name: "scroll-element",
366
- description: "Scroll an element in a specific direction (MacOS only)",
367
- annotations: {
368
- title: "Scroll Element",
369
- destructiveHint: true,
370
- },
371
- inputSchema: {
372
- type: "object",
373
- properties: {
374
- app: {
211
+ app_name: {
375
212
  type: "string",
376
- description: "The name of the application",
213
+ description: "Filter by application name (e.g., 'Google Chrome', 'Slack', 'Code')",
377
214
  },
378
- window: {
215
+ window_name: {
379
216
  type: "string",
380
- description: "The window name (optional)",
217
+ description: "Filter by window title",
381
218
  },
382
- id: {
219
+ start_time: {
383
220
  type: "string",
384
- description: "The id of the element to scroll",
221
+ format: "date-time",
222
+ description: "ISO 8601 UTC start time (e.g., 2024-01-15T10:00:00Z)",
385
223
  },
386
- direction: {
224
+ end_time: {
387
225
  type: "string",
388
- enum: ["up", "down", "left", "right"],
389
- description: "The direction to scroll",
226
+ format: "date-time",
227
+ description: "ISO 8601 UTC end time (e.g., 2024-01-15T18:00:00Z)",
390
228
  },
391
- amount: {
229
+ limit: {
392
230
  type: "integer",
393
- description: "The amount to scroll in pixels",
394
- },
395
- use_background_apps: {
396
- type: "boolean",
397
- description: "Whether to look in background apps",
398
- default: true,
399
- },
400
- activate_app: {
401
- type: "boolean",
402
- description: "Whether to activate the app before scrolling",
403
- default: true,
231
+ description: "Max results. Default: 50",
232
+ default: 50,
404
233
  },
405
- },
406
- required: ["app", "id", "direction", "amount"],
407
- },
408
- },
409
- {
410
- name: "open-application",
411
- description: "Open an application by name",
412
- annotations: {
413
- title: "Open Application",
414
- destructiveHint: true,
415
- },
416
- inputSchema: {
417
- type: "object",
418
- properties: {
419
- app_name: {
420
- type: "string",
421
- description: "The name of the application to open",
234
+ offset: {
235
+ type: "integer",
236
+ description: "Skip N results for pagination. Default: 0",
237
+ default: 0,
422
238
  },
423
239
  },
424
- required: ["app_name"],
425
240
  },
426
241
  },
427
242
  {
428
- name: "open-url",
429
- description: "Open a URL in a browser",
243
+ name: "get-ui-event-stats",
244
+ description: "Get aggregated statistics of UI events by app and event type. " +
245
+ "Useful for understanding app usage patterns, productivity analysis, or finding which apps were used most.",
430
246
  annotations: {
431
- title: "Open URL",
432
- destructiveHint: true,
247
+ title: "UI Event Statistics",
248
+ readOnlyHint: true,
433
249
  },
434
250
  inputSchema: {
435
251
  type: "object",
436
252
  properties: {
437
- url: {
253
+ start_time: {
438
254
  type: "string",
439
- description: "The URL to open",
255
+ format: "date-time",
256
+ description: "ISO 8601 UTC start time for stats period",
440
257
  },
441
- browser: {
258
+ end_time: {
442
259
  type: "string",
443
- description: "The browser to use (optional)",
260
+ format: "date-time",
261
+ description: "ISO 8601 UTC end time for stats period",
444
262
  },
445
263
  },
446
- required: ["url"],
447
264
  },
448
265
  },
449
266
  ];
450
267
  // List tools handler
451
268
  server.setRequestHandler(types_js_1.ListToolsRequestSchema, async () => {
452
- const tools = [...BASE_TOOLS];
453
- if (IS_MACOS) {
454
- tools.push(...MACOS_TOOLS);
455
- }
456
- return { tools };
269
+ return { tools: BASE_TOOLS };
457
270
  });
458
271
  // MCP Resources - provide dynamic context data
459
272
  const RESOURCES = [
@@ -517,18 +330,20 @@ server.setRequestHandler(types_js_1.ReadResourceRequestSchema, async (request) =
517
330
  mimeType: "text/markdown",
518
331
  text: `# Screenpipe Search Guide
519
332
 
333
+ ## Three Data Modalities
334
+
335
+ Screenpipe captures three types of data:
336
+ 1. **Vision (OCR)** - Screen text from screenshots
337
+ 2. **Audio** - Transcribed speech from microphone/system audio
338
+ 3. **UI Events (Accessibility)** - Keyboard input, mouse clicks, app switches, clipboard (macOS)
339
+
520
340
  ## Quick Start
521
341
  - **Get recent activity**: Call search-content with no parameters
522
342
  - **Search text**: \`{"q": "search term", "content_type": "ocr"}\`
523
- - **Time filter**: Use start_time/end_time with ISO 8601 UTC timestamps
343
+ - **Get keyboard input**: Use search-ui-events with \`event_type: "text"\`
344
+ - **Track app usage**: Use get-ui-event-stats for aggregated data
524
345
 
525
- ## Content Types
526
- - \`ocr\`: Screen text (what you see)
527
- - \`audio\`: Transcribed speech
528
- - \`ui\`: UI element interactions
529
- - \`all\`: Everything (default)
530
-
531
- ## Key Parameters
346
+ ## search-content (Vision + Audio)
532
347
  | Parameter | Description | Default |
533
348
  |-----------|-------------|---------|
534
349
  | q | Search query | (none - returns all) |
@@ -539,11 +354,27 @@ server.setRequestHandler(types_js_1.ReadResourceRequestSchema, async (request) =
539
354
  | app_name | Filter by app | (no filter) |
540
355
  | include_frames | Include screenshots | false |
541
356
 
357
+ ## search-ui-events (Accessibility Data)
358
+ | Parameter | Description | Default |
359
+ |-----------|-------------|---------|
360
+ | q | Search text content, app, window | (none) |
361
+ | event_type | click/text/scroll/key/app_switch/window_focus/clipboard | (all types) |
362
+ | app_name | Filter by application | (no filter) |
363
+ | limit | Max results | 50 |
364
+
365
+ ### Event Types
366
+ - \`text\`: Aggregated keyboard input (what was typed)
367
+ - \`click\`: Mouse clicks with element context (accessibility labels)
368
+ - \`app_switch\`: When user switched applications
369
+ - \`window_focus\`: When window focus changed
370
+ - \`clipboard\`: Copy/paste operations
371
+ - \`scroll\`: Scroll events with delta values
372
+
542
373
  ## Tips
543
374
  1. Read screenpipe://context first to get current timestamps
544
- 2. Omit \`q\` to get all content (useful for "what was I doing?")
545
- 3. Use \`limit: 50-100\` for comprehensive searches
546
- 4. Combine app_name + time filters for focused results`,
375
+ 2. Use search-ui-events for "what did I type?" queries
376
+ 3. Use get-ui-event-stats to understand app usage patterns
377
+ 4. Combine search-content (what was on screen) with search-ui-events (what was done)`,
547
378
  },
548
379
  ],
549
380
  };
@@ -731,25 +562,6 @@ server.setRequestHandler(types_js_1.CallToolRequestSchema, async (request) => {
731
562
  if (!args) {
732
563
  throw new Error("Missing arguments");
733
564
  }
734
- // Check if the tool is MacOS-only and we're not on MacOS
735
- const macosOnlyTools = [
736
- "click-element",
737
- "fill-element",
738
- "find-elements",
739
- "scroll-element",
740
- "open-application",
741
- "open-url",
742
- ];
743
- if (macosOnlyTools.includes(name) && !IS_MACOS) {
744
- return {
745
- content: [
746
- {
747
- type: "text",
748
- text: `The '${name}' tool is only available on MacOS. Current platform: ${CURRENT_OS}`,
749
- },
750
- ],
751
- };
752
- }
753
565
  try {
754
566
  switch (name) {
755
567
  case "search-content": {
@@ -821,47 +633,6 @@ server.setRequestHandler(types_js_1.CallToolRequestSchema, async (request) => {
821
633
  }
822
634
  return { content: contentItems };
823
635
  }
824
- case "pixel-control": {
825
- const action = {
826
- type: args.action_type,
827
- data: args.data,
828
- };
829
- const response = await fetchAPI("/experimental/operator/pixel", {
830
- method: "POST",
831
- body: JSON.stringify({ action }),
832
- });
833
- if (!response.ok) {
834
- throw new Error(`HTTP error: ${response.status}`);
835
- }
836
- const data = await response.json();
837
- if (!data.success) {
838
- return {
839
- content: [
840
- {
841
- type: "text",
842
- text: `Failed to perform input control: ${data.error || "unknown error"}`,
843
- },
844
- ],
845
- };
846
- }
847
- let resultText = "Successfully performed input control action";
848
- if (args.action_type === "WriteText") {
849
- resultText = `Successfully typed text: '${args.data}'`;
850
- }
851
- else if (args.action_type === "KeyPress") {
852
- resultText = `Successfully pressed key: '${args.data}'`;
853
- }
854
- else if (args.action_type === "MouseMove") {
855
- const coords = args.data;
856
- resultText = `Successfully moved mouse to coordinates: x=${coords.x}, y=${coords.y}`;
857
- }
858
- else if (args.action_type === "MouseClick") {
859
- resultText = `Successfully clicked ${args.data} mouse button`;
860
- }
861
- return {
862
- content: [{ type: "text", text: resultText }],
863
- };
864
- }
865
636
  case "export-video": {
866
637
  const startTime = args.start_time;
867
638
  const endTime = args.end_time;
@@ -925,7 +696,8 @@ server.setRequestHandler(types_js_1.CallToolRequestSchema, async (request) => {
925
696
  // Sort frame IDs
926
697
  frameIds.sort((a, b) => a - b);
927
698
  // Step 2: Connect to WebSocket and export video
928
- const wsUrl = `ws://localhost:${port}/frames/export?frame_ids=${frameIds.join(",")}&fps=${fps}`;
699
+ // Send frame_ids in message body to avoid URL length limits
700
+ const wsUrl = `ws://localhost:${port}/frames/export?fps=${fps}`;
929
701
  const exportResult = await new Promise((resolve) => {
930
702
  const ws = new ws_1.WebSocket(wsUrl);
931
703
  let resolved = false;
@@ -936,6 +708,10 @@ server.setRequestHandler(types_js_1.CallToolRequestSchema, async (request) => {
936
708
  resolve({ success: false, error: "Export timed out after 5 minutes" });
937
709
  }
938
710
  }, 5 * 60 * 1000); // 5 minute timeout
711
+ ws.on("open", () => {
712
+ // Send frame_ids in message body to avoid URL length limits
713
+ ws.send(JSON.stringify({ frame_ids: frameIds }));
714
+ });
939
715
  ws.on("error", (error) => {
940
716
  if (!resolved) {
941
717
  resolved = true;
@@ -1007,226 +783,119 @@ server.setRequestHandler(types_js_1.CallToolRequestSchema, async (request) => {
1007
783
  };
1008
784
  }
1009
785
  }
1010
- case "click-element": {
1011
- const selector = {
1012
- app_name: args.app,
1013
- window_name: args.window,
1014
- locator: `#${args.id}`,
1015
- use_background_apps: args.use_background_apps ?? true,
1016
- activate_app: args.activate_app ?? true,
1017
- };
1018
- const response = await fetchAPI("/experimental/operator/click", {
1019
- method: "POST",
1020
- body: JSON.stringify({ selector }),
1021
- });
1022
- if (!response.ok) {
1023
- throw new Error(`HTTP error: ${response.status}`);
1024
- }
1025
- const data = await response.json();
1026
- if (!data.success) {
1027
- return {
1028
- content: [
1029
- {
1030
- type: "text",
1031
- text: `Failed to click element: ${data.error || "unknown error"}`,
1032
- },
1033
- ],
1034
- };
1035
- }
1036
- const result = data.result || {};
1037
- const method = result.method || "unknown";
1038
- const details = result.details || "click operation completed";
1039
- return {
1040
- content: [
1041
- {
1042
- type: "text",
1043
- text: `Successfully clicked element using ${method}. ${details}`,
1044
- },
1045
- ],
1046
- };
1047
- }
1048
- case "fill-element": {
1049
- const selector = {
1050
- app_name: args.app,
1051
- window_name: args.window,
1052
- locator: `#${args.id}`,
1053
- use_background_apps: args.use_background_apps ?? true,
1054
- activate_app: args.activate_app ?? true,
1055
- };
1056
- const response = await fetchAPI("/experimental/operator/type", {
1057
- method: "POST",
1058
- body: JSON.stringify({ selector, text: args.text || "" }),
1059
- });
1060
- if (!response.ok) {
1061
- throw new Error(`HTTP error: ${response.status}`);
1062
- }
1063
- const data = await response.json();
1064
- if (!data.success) {
1065
- return {
1066
- content: [
1067
- {
1068
- type: "text",
1069
- text: `Failed to fill element: ${data.error || "unknown error"}`,
1070
- },
1071
- ],
1072
- };
786
+ case "search-ui-events": {
787
+ const params = new URLSearchParams();
788
+ for (const [key, value] of Object.entries(args)) {
789
+ if (value !== null && value !== undefined) {
790
+ // Map event_type to the API parameter
791
+ params.append(key, String(value));
792
+ }
1073
793
  }
1074
- return {
1075
- content: [
1076
- { type: "text", text: "Successfully filled element with text" },
1077
- ],
1078
- };
1079
- }
1080
- case "find-elements": {
1081
- const selector = {
1082
- app_name: args.app,
1083
- window_name: args.window,
1084
- locator: args.role || "",
1085
- use_background_apps: args.use_background_apps ?? true,
1086
- activate_app: args.activate_app ?? true,
1087
- };
1088
- const response = await fetchAPI("/experimental/operator", {
1089
- method: "POST",
1090
- body: JSON.stringify({
1091
- selector,
1092
- max_results: args.max_results || 10,
1093
- max_depth: args.max_depth,
1094
- }),
1095
- });
794
+ const response = await fetchAPI(`/ui-events?${params.toString()}`);
1096
795
  if (!response.ok) {
1097
796
  throw new Error(`HTTP error: ${response.status}`);
1098
797
  }
1099
798
  const data = await response.json();
1100
- if (!data.success) {
1101
- return {
1102
- content: [
1103
- {
1104
- type: "text",
1105
- text: `Failed to find elements: ${data.error || "unknown error"}`,
1106
- },
1107
- ],
1108
- };
1109
- }
1110
- const elements = data.data || [];
1111
- if (elements.length === 0) {
799
+ const events = data.data || [];
800
+ const pagination = data.pagination || {};
801
+ if (events.length === 0) {
1112
802
  return {
1113
803
  content: [
1114
804
  {
1115
805
  type: "text",
1116
- text: `No elements found matching role '${args.role}' in app '${args.app}'`,
806
+ text: "No UI events found. This feature requires:\n" +
807
+ "1. macOS with Accessibility permissions granted\n" +
808
+ "2. UI Events enabled in screenpipe settings\n" +
809
+ "Try: broader time range or different event_type filter.",
1117
810
  },
1118
811
  ],
1119
812
  };
1120
813
  }
1121
- let resultText = `Found ${elements.length} elements matching role '${args.role}' in app '${args.app}':\n\n`;
1122
- elements.forEach((element, i) => {
1123
- resultText +=
1124
- `Element ${i + 1}:\n` +
1125
- `ID: ${element.id || "N/A"}\n` +
1126
- `Role: ${element.role || "N/A"}\n` +
1127
- `Text: ${element.text || "N/A"}\n` +
1128
- `Description: ${element.description || "N/A"}\n` +
1129
- "---\n";
1130
- });
1131
- return {
1132
- content: [{ type: "text", text: resultText }],
1133
- };
1134
- }
1135
- case "scroll-element": {
1136
- const selector = {
1137
- app_name: args.app,
1138
- window_name: args.window,
1139
- locator: `#${args.id}`,
1140
- use_background_apps: args.use_background_apps ?? true,
1141
- activate_app: args.activate_app ?? true,
1142
- };
1143
- const response = await fetchAPI("/experimental/operator/scroll", {
1144
- method: "POST",
1145
- body: JSON.stringify({
1146
- selector,
1147
- direction: args.direction,
1148
- amount: args.amount,
1149
- }),
1150
- });
1151
- if (!response.ok) {
1152
- throw new Error(`HTTP error: ${response.status}`);
1153
- }
1154
- const data = await response.json();
1155
- if (!data.success) {
1156
- return {
1157
- content: [
1158
- {
1159
- type: "text",
1160
- text: `Failed to scroll element: ${data.error || "unknown error"}`,
1161
- },
1162
- ],
1163
- };
814
+ const formattedEvents = [];
815
+ for (const event of events) {
816
+ const parts = [
817
+ `[${event.event_type?.toUpperCase() || "?"}]`,
818
+ event.app_name || "?",
819
+ event.window_title ? `| ${event.window_title}` : "",
820
+ ];
821
+ let details = "";
822
+ if (event.event_type === "text" && event.text_content) {
823
+ details = `Text: "${event.text_content}"`;
824
+ }
825
+ else if (event.event_type === "click") {
826
+ details = `Click at (${event.x || 0}, ${event.y || 0})`;
827
+ if (event.element?.label) {
828
+ details += ` on "${event.element.label}"`;
829
+ }
830
+ }
831
+ else if (event.event_type === "clipboard" && event.text_content) {
832
+ details = `Clipboard: "${event.text_content.substring(0, 100)}${event.text_content.length > 100 ? "..." : ""}"`;
833
+ }
834
+ else if (event.event_type === "app_switch" || event.event_type === "window_focus") {
835
+ details = `Switched to: ${event.app_name}${event.window_title ? ` - ${event.window_title}` : ""}`;
836
+ }
837
+ else if (event.event_type === "scroll") {
838
+ details = `Scroll: dx=${event.delta_x || 0}, dy=${event.delta_y || 0}`;
839
+ }
840
+ formattedEvents.push(`${parts.join(" ")}\n` +
841
+ `${event.timestamp || ""}\n` +
842
+ `${details}`);
1164
843
  }
844
+ const header = `UI Events: ${events.length}/${pagination.total || "?"}` +
845
+ (pagination.total > events.length ? ` (use offset=${(pagination.offset || 0) + events.length} for more)` : "");
1165
846
  return {
1166
847
  content: [
1167
848
  {
1168
849
  type: "text",
1169
- text: `Successfully scrolled element ${args.direction} by ${args.amount} pixels`,
850
+ text: header + "\n\n" + formattedEvents.join("\n---\n"),
1170
851
  },
1171
852
  ],
1172
853
  };
1173
854
  }
1174
- case "open-application": {
1175
- const response = await fetchAPI("/experimental/operator/open-application", {
1176
- method: "POST",
1177
- body: JSON.stringify({ app_name: args.app_name || "" }),
1178
- });
855
+ case "get-ui-event-stats": {
856
+ const params = new URLSearchParams();
857
+ if (args.start_time)
858
+ params.append("start_time", String(args.start_time));
859
+ if (args.end_time)
860
+ params.append("end_time", String(args.end_time));
861
+ const response = await fetchAPI(`/ui-events/stats?${params.toString()}`);
1179
862
  if (!response.ok) {
1180
863
  throw new Error(`HTTP error: ${response.status}`);
1181
864
  }
1182
- const data = await response.json();
1183
- if (!data.success) {
865
+ const stats = await response.json();
866
+ if (!stats || stats.length === 0) {
1184
867
  return {
1185
868
  content: [
1186
869
  {
1187
870
  type: "text",
1188
- text: `Failed to open application: ${data.error || "unknown error"}`,
871
+ text: "No UI event statistics available. UI Events may not be enabled or no events have been captured yet.",
1189
872
  },
1190
873
  ],
1191
874
  };
1192
875
  }
1193
- return {
1194
- content: [
1195
- {
1196
- type: "text",
1197
- text: `Successfully opened application '${args.app_name}'`,
1198
- },
1199
- ],
1200
- };
1201
- }
1202
- case "open-url": {
1203
- const response = await fetchAPI("/experimental/operator/open-url", {
1204
- method: "POST",
1205
- body: JSON.stringify({
1206
- url: args.url || "",
1207
- browser: args.browser,
1208
- }),
1209
- });
1210
- if (!response.ok) {
1211
- throw new Error(`HTTP error: ${response.status}`);
1212
- }
1213
- const data = await response.json();
1214
- if (!data.success) {
1215
- return {
1216
- content: [
1217
- {
1218
- type: "text",
1219
- text: `Failed to open URL: ${data.error || "unknown error"}`,
1220
- },
1221
- ],
1222
- };
876
+ // Group by app
877
+ const byApp = {};
878
+ for (const stat of stats) {
879
+ const app = stat.app_name || "Unknown";
880
+ if (!byApp[app]) {
881
+ byApp[app] = { app, events: {}, total: 0 };
882
+ }
883
+ byApp[app].events[stat.event_type] = stat.count;
884
+ byApp[app].total += stat.count;
1223
885
  }
1224
- const browserInfo = args.browser ? ` using ${args.browser}` : "";
886
+ // Sort by total events
887
+ const sorted = Object.values(byApp).sort((a, b) => b.total - a.total);
888
+ const lines = sorted.map(({ app, events, total }) => {
889
+ const eventDetails = Object.entries(events)
890
+ .map(([type, count]) => `${type}: ${count}`)
891
+ .join(", ");
892
+ return `${app}: ${total} events (${eventDetails})`;
893
+ });
1225
894
  return {
1226
895
  content: [
1227
896
  {
1228
897
  type: "text",
1229
- text: `Successfully opened URL '${args.url}'${browserInfo}`,
898
+ text: `UI Event Statistics:\n\n${lines.join("\n")}`,
1230
899
  },
1231
900
  ],
1232
901
  };