screenpipe-mcp 0.5.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts CHANGED
@@ -30,12 +30,6 @@ function getCurrentDateInfo(): { isoDate: string; localDate: string } {
30
30
  };
31
31
  }
32
32
 
33
- // Detect OS
34
- const CURRENT_OS = process.platform;
35
- const IS_MACOS = CURRENT_OS === "darwin";
36
- const IS_WINDOWS = CURRENT_OS === "win32";
37
- const IS_LINUX = CURRENT_OS === "linux";
38
-
39
33
  // Parse command line arguments
40
34
  const args = process.argv.slice(2);
41
35
  let port = 3030;
@@ -51,7 +45,7 @@ const SCREENPIPE_API = `http://localhost:${port}`;
51
45
  const server = new Server(
52
46
  {
53
47
  name: "screenpipe",
54
- version: "0.5.0",
48
+ version: "0.7.0",
55
49
  },
56
50
  {
57
51
  capabilities: {
@@ -85,7 +79,7 @@ const BASE_TOOLS: Tool[] = [
85
79
  content_type: {
86
80
  type: "string",
87
81
  enum: ["all", "ocr", "audio", "ui"],
88
- description: "Content type filter. Default: 'all'",
82
+ description: "Content type filter: 'ocr' (screen text), 'audio' (transcriptions), 'ui' (legacy UI monitoring), 'all'. Default: 'all'. For keyboard/mouse/accessibility events, use search-ui-events tool instead.",
89
83
  default: "all",
90
84
  },
91
85
  limit: {
@@ -129,57 +123,15 @@ const BASE_TOOLS: Tool[] = [
129
123
  description: "Include base64 screenshots (OCR only). Default: false",
130
124
  default: false,
131
125
  },
132
- },
133
- },
134
- },
135
- {
136
- name: "pixel-control",
137
- description:
138
- "Control mouse and keyboard at the pixel level. This is a cross-platform tool that works on all operating systems. " +
139
- "Use this to type text, press keys, move the mouse, and click buttons.",
140
- annotations: {
141
- title: "Pixel Control",
142
- destructiveHint: true,
143
- },
144
- inputSchema: {
145
- type: "object",
146
- properties: {
147
- action_type: {
126
+ speaker_ids: {
148
127
  type: "string",
149
- enum: ["WriteText", "KeyPress", "MouseMove", "MouseClick"],
150
- description: "Type of input action to perform",
128
+ description: "Comma-separated speaker IDs to filter audio results (e.g., '1,2,3')",
151
129
  },
152
- data: {
153
- oneOf: [
154
- {
155
- type: "string",
156
- description:
157
- "Text to type or key to press (for WriteText and KeyPress)",
158
- },
159
- {
160
- type: "object",
161
- properties: {
162
- x: {
163
- type: "integer",
164
- description: "X coordinate for mouse movement",
165
- },
166
- y: {
167
- type: "integer",
168
- description: "Y coordinate for mouse movement",
169
- },
170
- },
171
- description: "Coordinates for MouseMove",
172
- },
173
- {
174
- type: "string",
175
- enum: ["left", "right", "middle"],
176
- description: "Button to click for MouseClick",
177
- },
178
- ],
179
- description: "Action-specific data",
130
+ speaker_name: {
131
+ type: "string",
132
+ description: "Filter audio by speaker name (case-insensitive partial match)",
180
133
  },
181
134
  },
182
- required: ["action_type", "data"],
183
135
  },
184
136
  },
185
137
  {
@@ -220,236 +172,91 @@ const BASE_TOOLS: Tool[] = [
220
172
  required: ["start_time", "end_time"],
221
173
  },
222
174
  },
223
- ];
224
-
225
- const MACOS_TOOLS: Tool[] = [
226
175
  {
227
- name: "find-elements",
176
+ name: "search-ui-events",
228
177
  description:
229
- "Find UI elements with a specific role in an application. " +
230
- "This tool is especially useful for identifying interactive elements. " +
231
- "\n\nMacOS Accessibility Roles Guide:\n" +
232
- "- Basic roles: 'button', 'textfield', 'checkbox', 'menu', 'list'\n" +
233
- "- MacOS specific roles: 'AXButton', 'AXTextField', 'AXCheckBox', 'AXMenu', etc.\n" +
234
- "- Text inputs can be: 'AXTextField', 'AXTextArea', 'AXComboBox', 'AXSearchField'\n" +
235
- "- Clickable items: 'AXButton', 'AXMenuItem', 'AXMenuBarItem', 'AXImage', 'AXStaticText'\n" +
236
- "- Web content may use: 'AXWebArea', 'AXLink', 'AXHeading', 'AXRadioButton'\n\n" +
237
- "Use MacOS Accessibility Inspector app to identify the exact roles in your target application.",
178
+ "Search UI input events captured via accessibility APIs (macOS). " +
179
+ "This is the third modality alongside vision (OCR) and audio. " +
180
+ "Captures: mouse clicks, keyboard text input, scroll events, app/window switches, clipboard operations. " +
181
+ "Events include app context, element info (accessibility labels), and precise timestamps. " +
182
+ "Great for understanding user workflow, what was typed, clicked, or copied.",
238
183
  annotations: {
239
- title: "Find Elements",
184
+ title: "Search UI Events (Accessibility)",
240
185
  readOnlyHint: true,
241
186
  },
242
187
  inputSchema: {
243
188
  type: "object",
244
189
  properties: {
245
- app: {
246
- type: "string",
247
- description:
248
- "The name of the application (e.g., 'Chrome', 'Finder', 'Terminal')",
249
- },
250
- window: {
251
- type: "string",
252
- description: "The window name or title (optional)",
253
- },
254
- role: {
255
- type: "string",
256
- description:
257
- "The role to search for (e.g., 'button', 'textfield', 'AXButton', 'AXTextField'). For best results, use MacOS AX prefixed roles.",
258
- },
259
- max_results: {
260
- type: "integer",
261
- description: "Maximum number of elements to return",
262
- default: 10,
263
- },
264
- max_depth: {
265
- type: "integer",
266
- description: "Maximum depth of element tree to search",
267
- },
268
- use_background_apps: {
269
- type: "boolean",
270
- description: "Whether to look in background apps",
271
- default: true,
272
- },
273
- activate_app: {
274
- type: "boolean",
275
- description: "Whether to activate the app before searching",
276
- default: true,
277
- },
278
- },
279
- required: ["app", "role"],
280
- },
281
- },
282
- {
283
- name: "click-element",
284
- description:
285
- "Click an element in an application using its id (MacOS only)",
286
- annotations: {
287
- title: "Click Element",
288
- destructiveHint: true,
289
- },
290
- inputSchema: {
291
- type: "object",
292
- properties: {
293
- app: {
294
- type: "string",
295
- description: "The name of the application",
296
- },
297
- window: {
298
- type: "string",
299
- description: "The window name (optional)",
300
- },
301
- id: {
302
- type: "string",
303
- description: "The id of the element to click",
304
- },
305
- use_background_apps: {
306
- type: "boolean",
307
- description: "Whether to look in background apps",
308
- default: true,
309
- },
310
- activate_app: {
311
- type: "boolean",
312
- description: "Whether to activate the app before clicking",
313
- default: true,
314
- },
315
- },
316
- required: ["app", "id"],
317
- },
318
- },
319
- {
320
- name: "fill-element",
321
- description: "Type text into an element in an application (MacOS only)",
322
- annotations: {
323
- title: "Fill Element",
324
- destructiveHint: true,
325
- },
326
- inputSchema: {
327
- type: "object",
328
- properties: {
329
- app: {
330
- type: "string",
331
- description: "The name of the application",
332
- },
333
- window: {
334
- type: "string",
335
- description: "The window name (optional)",
336
- },
337
- id: {
190
+ q: {
338
191
  type: "string",
339
- description: "The id of the element to fill",
192
+ description: "Search query for text content, app name, window title. Optional - omit to return recent events.",
340
193
  },
341
- text: {
194
+ event_type: {
342
195
  type: "string",
343
- description: "The text to type into the element",
344
- },
345
- use_background_apps: {
346
- type: "boolean",
347
- description: "Whether to look in background apps",
348
- default: true,
196
+ enum: ["click", "text", "scroll", "key", "app_switch", "window_focus", "clipboard"],
197
+ description: "Filter by event type. 'text' = aggregated keyboard input, 'click' = mouse clicks with element context, 'app_switch'/'window_focus' = app usage tracking, 'clipboard' = copy/paste events.",
349
198
  },
350
- activate_app: {
351
- type: "boolean",
352
- description: "Whether to activate the app before typing",
353
- default: true,
354
- },
355
- },
356
- required: ["app", "id", "text"],
357
- },
358
- },
359
- {
360
- name: "scroll-element",
361
- description: "Scroll an element in a specific direction (MacOS only)",
362
- annotations: {
363
- title: "Scroll Element",
364
- destructiveHint: true,
365
- },
366
- inputSchema: {
367
- type: "object",
368
- properties: {
369
- app: {
199
+ app_name: {
370
200
  type: "string",
371
- description: "The name of the application",
201
+ description: "Filter by application name (e.g., 'Google Chrome', 'Slack', 'Code')",
372
202
  },
373
- window: {
203
+ window_name: {
374
204
  type: "string",
375
- description: "The window name (optional)",
205
+ description: "Filter by window title",
376
206
  },
377
- id: {
207
+ start_time: {
378
208
  type: "string",
379
- description: "The id of the element to scroll",
209
+ format: "date-time",
210
+ description: "ISO 8601 UTC start time (e.g., 2024-01-15T10:00:00Z)",
380
211
  },
381
- direction: {
212
+ end_time: {
382
213
  type: "string",
383
- enum: ["up", "down", "left", "right"],
384
- description: "The direction to scroll",
214
+ format: "date-time",
215
+ description: "ISO 8601 UTC end time (e.g., 2024-01-15T18:00:00Z)",
385
216
  },
386
- amount: {
217
+ limit: {
387
218
  type: "integer",
388
- description: "The amount to scroll in pixels",
219
+ description: "Max results. Default: 50",
220
+ default: 50,
389
221
  },
390
- use_background_apps: {
391
- type: "boolean",
392
- description: "Whether to look in background apps",
393
- default: true,
394
- },
395
- activate_app: {
396
- type: "boolean",
397
- description: "Whether to activate the app before scrolling",
398
- default: true,
399
- },
400
- },
401
- required: ["app", "id", "direction", "amount"],
402
- },
403
- },
404
- {
405
- name: "open-application",
406
- description: "Open an application by name",
407
- annotations: {
408
- title: "Open Application",
409
- destructiveHint: true,
410
- },
411
- inputSchema: {
412
- type: "object",
413
- properties: {
414
- app_name: {
415
- type: "string",
416
- description: "The name of the application to open",
222
+ offset: {
223
+ type: "integer",
224
+ description: "Skip N results for pagination. Default: 0",
225
+ default: 0,
417
226
  },
418
227
  },
419
- required: ["app_name"],
420
228
  },
421
229
  },
422
230
  {
423
- name: "open-url",
424
- description: "Open a URL in a browser",
231
+ name: "get-ui-event-stats",
232
+ description:
233
+ "Get aggregated statistics of UI events by app and event type. " +
234
+ "Useful for understanding app usage patterns, productivity analysis, or finding which apps were used most.",
425
235
  annotations: {
426
- title: "Open URL",
427
- destructiveHint: true,
236
+ title: "UI Event Statistics",
237
+ readOnlyHint: true,
428
238
  },
429
239
  inputSchema: {
430
240
  type: "object",
431
241
  properties: {
432
- url: {
242
+ start_time: {
433
243
  type: "string",
434
- description: "The URL to open",
244
+ format: "date-time",
245
+ description: "ISO 8601 UTC start time for stats period",
435
246
  },
436
- browser: {
247
+ end_time: {
437
248
  type: "string",
438
- description: "The browser to use (optional)",
249
+ format: "date-time",
250
+ description: "ISO 8601 UTC end time for stats period",
439
251
  },
440
252
  },
441
- required: ["url"],
442
253
  },
443
254
  },
444
255
  ];
445
256
 
446
257
  // List tools handler
447
258
  server.setRequestHandler(ListToolsRequestSchema, async () => {
448
- const tools = [...BASE_TOOLS];
449
- if (IS_MACOS) {
450
- tools.push(...MACOS_TOOLS);
451
- }
452
- return { tools };
259
+ return { tools: BASE_TOOLS };
453
260
  });
454
261
 
455
262
  // MCP Resources - provide dynamic context data
@@ -518,18 +325,20 @@ server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
518
325
  mimeType: "text/markdown",
519
326
  text: `# Screenpipe Search Guide
520
327
 
328
+ ## Three Data Modalities
329
+
330
+ Screenpipe captures three types of data:
331
+ 1. **Vision (OCR)** - Screen text from screenshots
332
+ 2. **Audio** - Transcribed speech from microphone/system audio
333
+ 3. **UI Events (Accessibility)** - Keyboard input, mouse clicks, app switches, clipboard (macOS)
334
+
521
335
  ## Quick Start
522
336
  - **Get recent activity**: Call search-content with no parameters
523
337
  - **Search text**: \`{"q": "search term", "content_type": "ocr"}\`
524
- - **Time filter**: Use start_time/end_time with ISO 8601 UTC timestamps
338
+ - **Get keyboard input**: Use search-ui-events with \`event_type: "text"\`
339
+ - **Track app usage**: Use get-ui-event-stats for aggregated data
525
340
 
526
- ## Content Types
527
- - \`ocr\`: Screen text (what you see)
528
- - \`audio\`: Transcribed speech
529
- - \`ui\`: UI element interactions
530
- - \`all\`: Everything (default)
531
-
532
- ## Key Parameters
341
+ ## search-content (Vision + Audio)
533
342
  | Parameter | Description | Default |
534
343
  |-----------|-------------|---------|
535
344
  | q | Search query | (none - returns all) |
@@ -540,11 +349,27 @@ server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
540
349
  | app_name | Filter by app | (no filter) |
541
350
  | include_frames | Include screenshots | false |
542
351
 
352
+ ## search-ui-events (Accessibility Data)
353
+ | Parameter | Description | Default |
354
+ |-----------|-------------|---------|
355
+ | q | Search text content, app, window | (none) |
356
+ | event_type | click/text/scroll/key/app_switch/window_focus/clipboard | (all types) |
357
+ | app_name | Filter by application | (no filter) |
358
+ | limit | Max results | 50 |
359
+
360
+ ### Event Types
361
+ - \`text\`: Aggregated keyboard input (what was typed)
362
+ - \`click\`: Mouse clicks with element context (accessibility labels)
363
+ - \`app_switch\`: When user switched applications
364
+ - \`window_focus\`: When window focus changed
365
+ - \`clipboard\`: Copy/paste operations
366
+ - \`scroll\`: Scroll events with delta values
367
+
543
368
  ## Tips
544
369
  1. Read screenpipe://context first to get current timestamps
545
- 2. Omit \`q\` to get all content (useful for "what was I doing?")
546
- 3. Use \`limit: 50-100\` for comprehensive searches
547
- 4. Combine app_name + time filters for focused results`,
370
+ 2. Use search-ui-events for "what did I type?" queries
371
+ 3. Use get-ui-event-stats to understand app usage patterns
372
+ 4. Combine search-content (what was on screen) with search-ui-events (what was done)`,
548
373
  },
549
374
  ],
550
375
  };
@@ -750,27 +575,6 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
750
575
  throw new Error("Missing arguments");
751
576
  }
752
577
 
753
- // Check if the tool is MacOS-only and we're not on MacOS
754
- const macosOnlyTools = [
755
- "click-element",
756
- "fill-element",
757
- "find-elements",
758
- "scroll-element",
759
- "open-application",
760
- "open-url",
761
- ];
762
-
763
- if (macosOnlyTools.includes(name) && !IS_MACOS) {
764
- return {
765
- content: [
766
- {
767
- type: "text",
768
- text: `The '${name}' tool is only available on MacOS. Current platform: ${CURRENT_OS}`,
769
- },
770
- ],
771
- };
772
- }
773
-
774
578
  try {
775
579
  switch (name) {
776
580
  case "search-content": {
@@ -860,50 +664,6 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
860
664
  return { content: contentItems };
861
665
  }
862
666
 
863
- case "pixel-control": {
864
- const action = {
865
- type: args.action_type,
866
- data: args.data,
867
- };
868
-
869
- const response = await fetchAPI("/experimental/operator/pixel", {
870
- method: "POST",
871
- body: JSON.stringify({ action }),
872
- });
873
-
874
- if (!response.ok) {
875
- throw new Error(`HTTP error: ${response.status}`);
876
- }
877
-
878
- const data = await response.json();
879
- if (!data.success) {
880
- return {
881
- content: [
882
- {
883
- type: "text",
884
- text: `Failed to perform input control: ${data.error || "unknown error"}`,
885
- },
886
- ],
887
- };
888
- }
889
-
890
- let resultText = "Successfully performed input control action";
891
- if (args.action_type === "WriteText") {
892
- resultText = `Successfully typed text: '${args.data}'`;
893
- } else if (args.action_type === "KeyPress") {
894
- resultText = `Successfully pressed key: '${args.data}'`;
895
- } else if (args.action_type === "MouseMove") {
896
- const coords = args.data as { x: number; y: number };
897
- resultText = `Successfully moved mouse to coordinates: x=${coords.x}, y=${coords.y}`;
898
- } else if (args.action_type === "MouseClick") {
899
- resultText = `Successfully clicked ${args.data} mouse button`;
900
- }
901
-
902
- return {
903
- content: [{ type: "text", text: resultText }],
904
- };
905
- }
906
-
907
667
  case "export-video": {
908
668
  const startTime = args.start_time as string;
909
669
  const endTime = args.end_time as string;
@@ -976,7 +736,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
976
736
  frameIds.sort((a, b) => a - b);
977
737
 
978
738
  // Step 2: Connect to WebSocket and export video
979
- const wsUrl = `ws://localhost:${port}/frames/export?frame_ids=${frameIds.join(",")}&fps=${fps}`;
739
+ // Send frame_ids in message body to avoid URL length limits
740
+ const wsUrl = `ws://localhost:${port}/frames/export?fps=${fps}`;
980
741
 
981
742
  const exportResult = await new Promise<{
982
743
  success: boolean;
@@ -995,6 +756,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
995
756
  }
996
757
  }, 5 * 60 * 1000); // 5 minute timeout
997
758
 
759
+ ws.on("open", () => {
760
+ // Send frame_ids in message body to avoid URL length limits
761
+ ws.send(JSON.stringify({ frame_ids: frameIds }));
762
+ });
763
+
998
764
  ws.on("error", (error) => {
999
765
  if (!resolved) {
1000
766
  resolved = true;
@@ -1070,259 +836,131 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
1070
836
  }
1071
837
  }
1072
838
 
1073
- case "click-element": {
1074
- const selector = {
1075
- app_name: args.app,
1076
- window_name: args.window,
1077
- locator: `#${args.id}`,
1078
- use_background_apps: args.use_background_apps ?? true,
1079
- activate_app: args.activate_app ?? true,
1080
- };
1081
-
1082
- const response = await fetchAPI("/experimental/operator/click", {
1083
- method: "POST",
1084
- body: JSON.stringify({ selector }),
1085
- });
1086
-
1087
- if (!response.ok) {
1088
- throw new Error(`HTTP error: ${response.status}`);
1089
- }
1090
-
1091
- const data = await response.json();
1092
- if (!data.success) {
1093
- return {
1094
- content: [
1095
- {
1096
- type: "text",
1097
- text: `Failed to click element: ${data.error || "unknown error"}`,
1098
- },
1099
- ],
1100
- };
1101
- }
1102
-
1103
- const result = data.result || {};
1104
- const method = result.method || "unknown";
1105
- const details = result.details || "click operation completed";
1106
-
1107
- return {
1108
- content: [
1109
- {
1110
- type: "text",
1111
- text: `Successfully clicked element using ${method}. ${details}`,
1112
- },
1113
- ],
1114
- };
1115
- }
1116
-
1117
- case "fill-element": {
1118
- const selector = {
1119
- app_name: args.app,
1120
- window_name: args.window,
1121
- locator: `#${args.id}`,
1122
- use_background_apps: args.use_background_apps ?? true,
1123
- activate_app: args.activate_app ?? true,
1124
- };
1125
-
1126
- const response = await fetchAPI("/experimental/operator/type", {
1127
- method: "POST",
1128
- body: JSON.stringify({ selector, text: args.text || "" }),
1129
- });
1130
-
1131
- if (!response.ok) {
1132
- throw new Error(`HTTP error: ${response.status}`);
1133
- }
1134
-
1135
- const data = await response.json();
1136
- if (!data.success) {
1137
- return {
1138
- content: [
1139
- {
1140
- type: "text",
1141
- text: `Failed to fill element: ${data.error || "unknown error"}`,
1142
- },
1143
- ],
1144
- };
839
+ case "search-ui-events": {
840
+ const params = new URLSearchParams();
841
+ for (const [key, value] of Object.entries(args)) {
842
+ if (value !== null && value !== undefined) {
843
+ // Map event_type to the API parameter
844
+ params.append(key, String(value));
845
+ }
1145
846
  }
1146
847
 
1147
- return {
1148
- content: [
1149
- { type: "text", text: "Successfully filled element with text" },
1150
- ],
1151
- };
1152
- }
1153
-
1154
- case "find-elements": {
1155
- const selector = {
1156
- app_name: args.app,
1157
- window_name: args.window,
1158
- locator: args.role || "",
1159
- use_background_apps: args.use_background_apps ?? true,
1160
- activate_app: args.activate_app ?? true,
1161
- };
1162
-
1163
- const response = await fetchAPI("/experimental/operator", {
1164
- method: "POST",
1165
- body: JSON.stringify({
1166
- selector,
1167
- max_results: args.max_results || 10,
1168
- max_depth: args.max_depth,
1169
- }),
1170
- });
1171
-
848
+ const response = await fetchAPI(`/ui-events?${params.toString()}`);
1172
849
  if (!response.ok) {
1173
850
  throw new Error(`HTTP error: ${response.status}`);
1174
851
  }
1175
852
 
1176
853
  const data = await response.json();
1177
- if (!data.success) {
1178
- return {
1179
- content: [
1180
- {
1181
- type: "text",
1182
- text: `Failed to find elements: ${data.error || "unknown error"}`,
1183
- },
1184
- ],
1185
- };
1186
- }
854
+ const events = data.data || [];
855
+ const pagination = data.pagination || {};
1187
856
 
1188
- const elements = data.data || [];
1189
- if (elements.length === 0) {
857
+ if (events.length === 0) {
1190
858
  return {
1191
859
  content: [
1192
860
  {
1193
861
  type: "text",
1194
- text: `No elements found matching role '${args.role}' in app '${args.app}'`,
862
+ text: "No UI events found. This feature requires:\n" +
863
+ "1. macOS with Accessibility permissions granted\n" +
864
+ "2. UI Events enabled in screenpipe settings\n" +
865
+ "Try: broader time range or different event_type filter.",
1195
866
  },
1196
867
  ],
1197
868
  };
1198
869
  }
1199
870
 
1200
- let resultText = `Found ${elements.length} elements matching role '${args.role}' in app '${args.app}':\n\n`;
1201
- elements.forEach((element: any, i: number) => {
1202
- resultText +=
1203
- `Element ${i + 1}:\n` +
1204
- `ID: ${element.id || "N/A"}\n` +
1205
- `Role: ${element.role || "N/A"}\n` +
1206
- `Text: ${element.text || "N/A"}\n` +
1207
- `Description: ${element.description || "N/A"}\n` +
1208
- "---\n";
1209
- });
1210
-
1211
- return {
1212
- content: [{ type: "text", text: resultText }],
1213
- };
1214
- }
1215
-
1216
- case "scroll-element": {
1217
- const selector = {
1218
- app_name: args.app,
1219
- window_name: args.window,
1220
- locator: `#${args.id}`,
1221
- use_background_apps: args.use_background_apps ?? true,
1222
- activate_app: args.activate_app ?? true,
1223
- };
1224
-
1225
- const response = await fetchAPI("/experimental/operator/scroll", {
1226
- method: "POST",
1227
- body: JSON.stringify({
1228
- selector,
1229
- direction: args.direction,
1230
- amount: args.amount,
1231
- }),
1232
- });
871
+ const formattedEvents: string[] = [];
872
+ for (const event of events) {
873
+ const parts = [
874
+ `[${event.event_type?.toUpperCase() || "?"}]`,
875
+ event.app_name || "?",
876
+ event.window_title ? `| ${event.window_title}` : "",
877
+ ];
878
+
879
+ let details = "";
880
+ if (event.event_type === "text" && event.text_content) {
881
+ details = `Text: "${event.text_content}"`;
882
+ } else if (event.event_type === "click") {
883
+ details = `Click at (${event.x || 0}, ${event.y || 0})`;
884
+ if (event.element?.label) {
885
+ details += ` on "${event.element.label}"`;
886
+ }
887
+ } else if (event.event_type === "clipboard" && event.text_content) {
888
+ details = `Clipboard: "${event.text_content.substring(0, 100)}${event.text_content.length > 100 ? "..." : ""}"`;
889
+ } else if (event.event_type === "app_switch" || event.event_type === "window_focus") {
890
+ details = `Switched to: ${event.app_name}${event.window_title ? ` - ${event.window_title}` : ""}`;
891
+ } else if (event.event_type === "scroll") {
892
+ details = `Scroll: dx=${event.delta_x || 0}, dy=${event.delta_y || 0}`;
893
+ }
1233
894
 
1234
- if (!response.ok) {
1235
- throw new Error(`HTTP error: ${response.status}`);
895
+ formattedEvents.push(
896
+ `${parts.join(" ")}\n` +
897
+ `${event.timestamp || ""}\n` +
898
+ `${details}`
899
+ );
1236
900
  }
1237
901
 
1238
- const data = await response.json();
1239
- if (!data.success) {
1240
- return {
1241
- content: [
1242
- {
1243
- type: "text",
1244
- text: `Failed to scroll element: ${data.error || "unknown error"}`,
1245
- },
1246
- ],
1247
- };
1248
- }
902
+ const header = `UI Events: ${events.length}/${pagination.total || "?"}` +
903
+ (pagination.total > events.length ? ` (use offset=${(pagination.offset || 0) + events.length} for more)` : "");
1249
904
 
1250
905
  return {
1251
906
  content: [
1252
907
  {
1253
908
  type: "text",
1254
- text: `Successfully scrolled element ${args.direction} by ${args.amount} pixels`,
909
+ text: header + "\n\n" + formattedEvents.join("\n---\n"),
1255
910
  },
1256
911
  ],
1257
912
  };
1258
913
  }
1259
914
 
1260
- case "open-application": {
1261
- const response = await fetchAPI(
1262
- "/experimental/operator/open-application",
1263
- {
1264
- method: "POST",
1265
- body: JSON.stringify({ app_name: args.app_name || "" }),
1266
- }
1267
- );
915
+ case "get-ui-event-stats": {
916
+ const params = new URLSearchParams();
917
+ if (args.start_time) params.append("start_time", String(args.start_time));
918
+ if (args.end_time) params.append("end_time", String(args.end_time));
1268
919
 
920
+ const response = await fetchAPI(`/ui-events/stats?${params.toString()}`);
1269
921
  if (!response.ok) {
1270
922
  throw new Error(`HTTP error: ${response.status}`);
1271
923
  }
1272
924
 
1273
- const data = await response.json();
1274
- if (!data.success) {
925
+ const stats = await response.json();
926
+
927
+ if (!stats || stats.length === 0) {
1275
928
  return {
1276
929
  content: [
1277
930
  {
1278
931
  type: "text",
1279
- text: `Failed to open application: ${data.error || "unknown error"}`,
932
+ text: "No UI event statistics available. UI Events may not be enabled or no events have been captured yet.",
1280
933
  },
1281
934
  ],
1282
935
  };
1283
936
  }
1284
937
 
1285
- return {
1286
- content: [
1287
- {
1288
- type: "text",
1289
- text: `Successfully opened application '${args.app_name}'`,
1290
- },
1291
- ],
1292
- };
1293
- }
1294
-
1295
- case "open-url": {
1296
- const response = await fetchAPI("/experimental/operator/open-url", {
1297
- method: "POST",
1298
- body: JSON.stringify({
1299
- url: args.url || "",
1300
- browser: args.browser,
1301
- }),
1302
- });
1303
-
1304
- if (!response.ok) {
1305
- throw new Error(`HTTP error: ${response.status}`);
938
+ // Group by app
939
+ const byApp: Record<string, { app: string; events: Record<string, number>; total: number }> = {};
940
+ for (const stat of stats) {
941
+ const app = stat.app_name || "Unknown";
942
+ if (!byApp[app]) {
943
+ byApp[app] = { app, events: {}, total: 0 };
944
+ }
945
+ byApp[app].events[stat.event_type] = stat.count;
946
+ byApp[app].total += stat.count;
1306
947
  }
1307
948
 
1308
- const data = await response.json();
1309
- if (!data.success) {
1310
- return {
1311
- content: [
1312
- {
1313
- type: "text",
1314
- text: `Failed to open URL: ${data.error || "unknown error"}`,
1315
- },
1316
- ],
1317
- };
1318
- }
949
+ // Sort by total events
950
+ const sorted = Object.values(byApp).sort((a, b) => b.total - a.total);
951
+
952
+ const lines = sorted.map(({ app, events, total }) => {
953
+ const eventDetails = Object.entries(events)
954
+ .map(([type, count]) => `${type}: ${count}`)
955
+ .join(", ");
956
+ return `${app}: ${total} events (${eventDetails})`;
957
+ });
1319
958
 
1320
- const browserInfo = args.browser ? ` using ${args.browser}` : "";
1321
959
  return {
1322
960
  content: [
1323
961
  {
1324
962
  type: "text",
1325
- text: `Successfully opened URL '${args.url}'${browserInfo}`,
963
+ text: `UI Event Statistics:\n\n${lines.join("\n")}`,
1326
964
  },
1327
965
  ],
1328
966
  };