screenpipe-mcp 0.6.0 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -64,32 +64,42 @@ npx @modelcontextprotocol/inspector npx screenpipe-mcp
64
64
 
65
65
  ## Available Tools
66
66
 
67
- ### Cross-Platform
68
-
69
- - **search-content** - Search through recorded screen content, audio transcriptions, and UI elements
70
- - Full text search with content type filtering (OCR/Audio/UI)
71
- - Time range and app/window filtering
72
- - Pagination support
73
-
74
- - **pixel-control** - Control mouse and keyboard
75
- - Type text, press keys, move mouse, click
76
-
77
- ### macOS Only
78
-
79
- - **find-elements** - Find UI elements in applications by role
80
- - **click-element** - Click UI elements by accessibility ID
81
- - **fill-element** - Type text into UI elements
82
- - **scroll-element** - Scroll UI elements
83
- - **open-application** - Open applications by name
84
- - **open-url** - Open URLs in default browser
67
+ ### search-content
68
+ Search through recorded screen content (OCR) and audio transcriptions:
69
+ - Full text search with content type filtering (OCR/Audio/UI)
70
+ - Time range and app/window filtering
71
+ - Speaker filtering (by ID or name)
72
+ - Pagination support
73
+
74
+ ### search-ui-events (macOS)
75
+ Search UI input events captured via accessibility APIs. This is the third data modality alongside vision and audio:
76
+ - **Event types**: `click`, `text`, `scroll`, `key`, `app_switch`, `window_focus`, `clipboard`
77
+ - Filter by app, window, time range
78
+ - `text` events show aggregated keyboard input (what was typed)
79
+ - `click` events include accessibility element labels
80
+ - `clipboard` events show copy/paste content
81
+
82
+ ### get-ui-event-stats (macOS)
83
+ Get aggregated statistics of UI events:
84
+ - Event counts grouped by app and event type
85
+ - Useful for productivity analysis and app usage tracking
86
+
87
+ ### export-video
88
+ Export screen recordings as video files:
89
+ - Specify time range with start/end times
90
+ - Configurable FPS for output video
85
91
 
86
92
  ## Example Queries in Claude
87
93
 
88
94
  - "Search for any mentions of 'rust' in my screen recordings"
89
95
  - "Find audio transcriptions from the last hour"
90
96
  - "Show me what was on my screen in VSCode yesterday"
91
- - "Open Safari and go to github.com"
92
- - "Find the search button in Chrome and click it"
97
+ - "Export a video of my screen from 2-3pm today"
98
+ - "Find what John said in our meeting about the database"
99
+ - "What did I type in Slack today?" (uses search-ui-events)
100
+ - "Show me my app usage statistics for the past 3 hours"
101
+ - "What did I copy to clipboard recently?"
102
+ - "Which apps did I switch between most today?"
93
103
 
94
104
  ## Requirements
95
105
 
package/dist/index.js CHANGED
@@ -66,7 +66,7 @@ const SCREENPIPE_API = `http://localhost:${port}`;
66
66
  // Initialize server
67
67
  const server = new index_js_1.Server({
68
68
  name: "screenpipe",
69
- version: "0.5.0",
69
+ version: "0.7.0",
70
70
  }, {
71
71
  capabilities: {
72
72
  tools: {},
@@ -96,7 +96,7 @@ const BASE_TOOLS = [
96
96
  content_type: {
97
97
  type: "string",
98
98
  enum: ["all", "ocr", "audio", "ui"],
99
- description: "Content type filter. Default: 'all'",
99
+ description: "Content type filter: 'ocr' (screen text), 'audio' (transcriptions), 'ui' (legacy UI monitoring), 'all'. Default: 'all'. For keyboard/mouse/accessibility events, use search-ui-events tool instead.",
100
100
  default: "all",
101
101
  },
102
102
  limit: {
@@ -140,6 +140,14 @@ const BASE_TOOLS = [
140
140
  description: "Include base64 screenshots (OCR only). Default: false",
141
141
  default: false,
142
142
  },
143
+ speaker_ids: {
144
+ type: "string",
145
+ description: "Comma-separated speaker IDs to filter audio results (e.g., '1,2,3')",
146
+ },
147
+ speaker_name: {
148
+ type: "string",
149
+ description: "Filter audio by speaker name (case-insensitive partial match)",
150
+ },
143
151
  },
144
152
  },
145
153
  },
@@ -177,6 +185,84 @@ const BASE_TOOLS = [
177
185
  required: ["start_time", "end_time"],
178
186
  },
179
187
  },
188
+ {
189
+ name: "search-ui-events",
190
+ description: "Search UI input events captured via accessibility APIs (macOS). " +
191
+ "This is the third modality alongside vision (OCR) and audio. " +
192
+ "Captures: mouse clicks, keyboard text input, scroll events, app/window switches, clipboard operations. " +
193
+ "Events include app context, element info (accessibility labels), and precise timestamps. " +
194
+ "Great for understanding user workflow, what was typed, clicked, or copied.",
195
+ annotations: {
196
+ title: "Search UI Events (Accessibility)",
197
+ readOnlyHint: true,
198
+ },
199
+ inputSchema: {
200
+ type: "object",
201
+ properties: {
202
+ q: {
203
+ type: "string",
204
+ description: "Search query for text content, app name, window title. Optional - omit to return recent events.",
205
+ },
206
+ event_type: {
207
+ type: "string",
208
+ enum: ["click", "text", "scroll", "key", "app_switch", "window_focus", "clipboard"],
209
+ description: "Filter by event type. 'text' = aggregated keyboard input, 'click' = mouse clicks with element context, 'app_switch'/'window_focus' = app usage tracking, 'clipboard' = copy/paste events.",
210
+ },
211
+ app_name: {
212
+ type: "string",
213
+ description: "Filter by application name (e.g., 'Google Chrome', 'Slack', 'Code')",
214
+ },
215
+ window_name: {
216
+ type: "string",
217
+ description: "Filter by window title",
218
+ },
219
+ start_time: {
220
+ type: "string",
221
+ format: "date-time",
222
+ description: "ISO 8601 UTC start time (e.g., 2024-01-15T10:00:00Z)",
223
+ },
224
+ end_time: {
225
+ type: "string",
226
+ format: "date-time",
227
+ description: "ISO 8601 UTC end time (e.g., 2024-01-15T18:00:00Z)",
228
+ },
229
+ limit: {
230
+ type: "integer",
231
+ description: "Max results. Default: 50",
232
+ default: 50,
233
+ },
234
+ offset: {
235
+ type: "integer",
236
+ description: "Skip N results for pagination. Default: 0",
237
+ default: 0,
238
+ },
239
+ },
240
+ },
241
+ },
242
+ {
243
+ name: "get-ui-event-stats",
244
+ description: "Get aggregated statistics of UI events by app and event type. " +
245
+ "Useful for understanding app usage patterns, productivity analysis, or finding which apps were used most.",
246
+ annotations: {
247
+ title: "UI Event Statistics",
248
+ readOnlyHint: true,
249
+ },
250
+ inputSchema: {
251
+ type: "object",
252
+ properties: {
253
+ start_time: {
254
+ type: "string",
255
+ format: "date-time",
256
+ description: "ISO 8601 UTC start time for stats period",
257
+ },
258
+ end_time: {
259
+ type: "string",
260
+ format: "date-time",
261
+ description: "ISO 8601 UTC end time for stats period",
262
+ },
263
+ },
264
+ },
265
+ },
180
266
  ];
181
267
  // List tools handler
182
268
  server.setRequestHandler(types_js_1.ListToolsRequestSchema, async () => {
@@ -244,18 +330,20 @@ server.setRequestHandler(types_js_1.ReadResourceRequestSchema, async (request) =
244
330
  mimeType: "text/markdown",
245
331
  text: `# Screenpipe Search Guide
246
332
 
333
+ ## Three Data Modalities
334
+
335
+ Screenpipe captures three types of data:
336
+ 1. **Vision (OCR)** - Screen text from screenshots
337
+ 2. **Audio** - Transcribed speech from microphone/system audio
338
+ 3. **UI Events (Accessibility)** - Keyboard input, mouse clicks, app switches, clipboard (macOS)
339
+
247
340
  ## Quick Start
248
341
  - **Get recent activity**: Call search-content with no parameters
249
342
  - **Search text**: \`{"q": "search term", "content_type": "ocr"}\`
250
- - **Time filter**: Use start_time/end_time with ISO 8601 UTC timestamps
251
-
252
- ## Content Types
253
- - \`ocr\`: Screen text (what you see)
254
- - \`audio\`: Transcribed speech
255
- - \`ui\`: UI element interactions
256
- - \`all\`: Everything (default)
343
+ - **Get keyboard input**: Use search-ui-events with \`event_type: "text"\`
344
+ - **Track app usage**: Use get-ui-event-stats for aggregated data
257
345
 
258
- ## Key Parameters
346
+ ## search-content (Vision + Audio)
259
347
  | Parameter | Description | Default |
260
348
  |-----------|-------------|---------|
261
349
  | q | Search query | (none - returns all) |
@@ -266,11 +354,27 @@ server.setRequestHandler(types_js_1.ReadResourceRequestSchema, async (request) =
266
354
  | app_name | Filter by app | (no filter) |
267
355
  | include_frames | Include screenshots | false |
268
356
 
357
+ ## search-ui-events (Accessibility Data)
358
+ | Parameter | Description | Default |
359
+ |-----------|-------------|---------|
360
+ | q | Search text content, app, window | (none) |
361
+ | event_type | click/text/scroll/key/app_switch/window_focus/clipboard | (all types) |
362
+ | app_name | Filter by application | (no filter) |
363
+ | limit | Max results | 50 |
364
+
365
+ ### Event Types
366
+ - \`text\`: Aggregated keyboard input (what was typed)
367
+ - \`click\`: Mouse clicks with element context (accessibility labels)
368
+ - \`app_switch\`: When user switched applications
369
+ - \`window_focus\`: When window focus changed
370
+ - \`clipboard\`: Copy/paste operations
371
+ - \`scroll\`: Scroll events with delta values
372
+
269
373
  ## Tips
270
374
  1. Read screenpipe://context first to get current timestamps
271
- 2. Omit \`q\` to get all content (useful for "what was I doing?")
272
- 3. Use \`limit: 50-100\` for comprehensive searches
273
- 4. Combine app_name + time filters for focused results`,
375
+ 2. Use search-ui-events for "what did I type?" queries
376
+ 3. Use get-ui-event-stats to understand app usage patterns
377
+ 4. Combine search-content (what was on screen) with search-ui-events (what was done)`,
274
378
  },
275
379
  ],
276
380
  };
@@ -592,7 +696,8 @@ server.setRequestHandler(types_js_1.CallToolRequestSchema, async (request) => {
592
696
  // Sort frame IDs
593
697
  frameIds.sort((a, b) => a - b);
594
698
  // Step 2: Connect to WebSocket and export video
595
- const wsUrl = `ws://localhost:${port}/frames/export?frame_ids=${frameIds.join(",")}&fps=${fps}`;
699
+ // frame_ids are left out of the URL and sent as a WebSocket message once the connection opens, to avoid URL length limits
700
+ const wsUrl = `ws://localhost:${port}/frames/export?fps=${fps}`;
596
701
  const exportResult = await new Promise((resolve) => {
597
702
  const ws = new ws_1.WebSocket(wsUrl);
598
703
  let resolved = false;
@@ -603,6 +708,10 @@ server.setRequestHandler(types_js_1.CallToolRequestSchema, async (request) => {
603
708
  resolve({ success: false, error: "Export timed out after 5 minutes" });
604
709
  }
605
710
  }, 5 * 60 * 1000); // 5 minute timeout
711
+ ws.on("open", () => {
712
+ // Send frame_ids in message body to avoid URL length limits
713
+ ws.send(JSON.stringify({ frame_ids: frameIds }));
714
+ });
606
715
  ws.on("error", (error) => {
607
716
  if (!resolved) {
608
717
  resolved = true;
@@ -674,6 +783,123 @@ server.setRequestHandler(types_js_1.CallToolRequestSchema, async (request) => {
674
783
  };
675
784
  }
676
785
  }
786
+ case "search-ui-events": {
787
+ const params = new URLSearchParams();
788
+ for (const [key, value] of Object.entries(args)) {
789
+ if (value !== null && value !== undefined) {
790
+ // Forward every provided (non-null) argument unchanged as a query parameter; no key renaming is done
791
+ params.append(key, String(value));
792
+ }
793
+ }
794
+ const response = await fetchAPI(`/ui-events?${params.toString()}`);
795
+ if (!response.ok) {
796
+ throw new Error(`HTTP error: ${response.status}`);
797
+ }
798
+ const data = await response.json();
799
+ const events = data.data || [];
800
+ const pagination = data.pagination || {};
801
+ if (events.length === 0) {
802
+ return {
803
+ content: [
804
+ {
805
+ type: "text",
806
+ text: "No UI events found. This feature requires:\n" +
807
+ "1. macOS with Accessibility permissions granted\n" +
808
+ "2. UI Events enabled in screenpipe settings\n" +
809
+ "Try: broader time range or different event_type filter.",
810
+ },
811
+ ],
812
+ };
813
+ }
814
+ const formattedEvents = [];
815
+ for (const event of events) {
816
+ const parts = [
817
+ `[${event.event_type?.toUpperCase() || "?"}]`,
818
+ event.app_name || "?",
819
+ event.window_title ? `| ${event.window_title}` : "",
820
+ ];
821
+ let details = "";
822
+ if (event.event_type === "text" && event.text_content) {
823
+ details = `Text: "${event.text_content}"`;
824
+ }
825
+ else if (event.event_type === "click") {
826
+ details = `Click at (${event.x || 0}, ${event.y || 0})`;
827
+ if (event.element?.label) {
828
+ details += ` on "${event.element.label}"`;
829
+ }
830
+ }
831
+ else if (event.event_type === "clipboard" && event.text_content) {
832
+ details = `Clipboard: "${event.text_content.substring(0, 100)}${event.text_content.length > 100 ? "..." : ""}"`;
833
+ }
834
+ else if (event.event_type === "app_switch" || event.event_type === "window_focus") {
835
+ details = `Switched to: ${event.app_name}${event.window_title ? ` - ${event.window_title}` : ""}`;
836
+ }
837
+ else if (event.event_type === "scroll") {
838
+ details = `Scroll: dx=${event.delta_x || 0}, dy=${event.delta_y || 0}`;
839
+ }
840
+ formattedEvents.push(`${parts.join(" ")}\n` +
841
+ `${event.timestamp || ""}\n` +
842
+ `${details}`);
843
+ }
844
+ const header = `UI Events: ${events.length}/${pagination.total || "?"}` +
845
+ (pagination.total > events.length ? ` (use offset=${(pagination.offset || 0) + events.length} for more)` : "");
846
+ return {
847
+ content: [
848
+ {
849
+ type: "text",
850
+ text: header + "\n\n" + formattedEvents.join("\n---\n"),
851
+ },
852
+ ],
853
+ };
854
+ }
855
+ case "get-ui-event-stats": {
856
+ const params = new URLSearchParams();
857
+ if (args.start_time)
858
+ params.append("start_time", String(args.start_time));
859
+ if (args.end_time)
860
+ params.append("end_time", String(args.end_time));
861
+ const response = await fetchAPI(`/ui-events/stats?${params.toString()}`);
862
+ if (!response.ok) {
863
+ throw new Error(`HTTP error: ${response.status}`);
864
+ }
865
+ const stats = await response.json();
866
+ if (!stats || stats.length === 0) {
867
+ return {
868
+ content: [
869
+ {
870
+ type: "text",
871
+ text: "No UI event statistics available. UI Events may not be enabled or no events have been captured yet.",
872
+ },
873
+ ],
874
+ };
875
+ }
876
+ // Group by app
877
+ const byApp = {};
878
+ for (const stat of stats) {
879
+ const app = stat.app_name || "Unknown";
880
+ if (!byApp[app]) {
881
+ byApp[app] = { app, events: {}, total: 0 };
882
+ }
883
+ byApp[app].events[stat.event_type] = stat.count;
884
+ byApp[app].total += stat.count;
885
+ }
886
+ // Sort by total events
887
+ const sorted = Object.values(byApp).sort((a, b) => b.total - a.total);
888
+ const lines = sorted.map(({ app, events, total }) => {
889
+ const eventDetails = Object.entries(events)
890
+ .map(([type, count]) => `${type}: ${count}`)
891
+ .join(", ");
892
+ return `${app}: ${total} events (${eventDetails})`;
893
+ });
894
+ return {
895
+ content: [
896
+ {
897
+ type: "text",
898
+ text: `UI Event Statistics:\n\n${lines.join("\n")}`,
899
+ },
900
+ ],
901
+ };
902
+ }
677
903
  default:
678
904
  throw new Error(`Unknown tool: ${name}`);
679
905
  }
package/manifest.json CHANGED
@@ -3,8 +3,8 @@
3
3
  "name": "screenpipe",
4
4
  "display_name": "Screenpipe",
5
5
  "version": "0.5.0",
6
- "description": "Search your screen recordings, audio transcriptions, and control your computer with AI",
7
- "long_description": "Screenpipe is a 24/7 screen and audio recorder that lets you search everything you've seen or heard. This extension connects Claude to your local screenpipe instance, enabling AI-powered search through your digital memory and computer control capabilities.",
6
+ "description": "Search your screen recordings and audio transcriptions with AI",
7
+ "long_description": "Screenpipe is a 24/7 screen and audio recorder that lets you search everything you've seen or heard. This extension connects Claude to your local screenpipe instance, enabling AI-powered search through your digital memory.",
8
8
  "author": {
9
9
  "name": "screenpipe",
10
10
  "url": "https://screenpi.pe"
@@ -28,39 +28,11 @@
28
28
  "tools": [
29
29
  {
30
30
  "name": "search-content",
31
- "description": "Search through recorded screen content, audio transcriptions, and UI elements"
31
+ "description": "Search through recorded screen content, audio transcriptions, and UI elements, with optional speaker filtering for audio"
32
32
  },
33
33
  {
34
34
  "name": "export-video",
35
35
  "description": "Export screen recordings as MP4 video for a specific time range"
36
- },
37
- {
38
- "name": "pixel-control",
39
- "description": "Control mouse and keyboard (type text, press keys, move mouse, click)"
40
- },
41
- {
42
- "name": "find-elements",
43
- "description": "Find UI elements in applications by role (macOS only)"
44
- },
45
- {
46
- "name": "click-element",
47
- "description": "Click UI elements by ID (macOS only)"
48
- },
49
- {
50
- "name": "fill-element",
51
- "description": "Type text into UI elements (macOS only)"
52
- },
53
- {
54
- "name": "scroll-element",
55
- "description": "Scroll UI elements (macOS only)"
56
- },
57
- {
58
- "name": "open-application",
59
- "description": "Open applications by name (macOS only)"
60
- },
61
- {
62
- "name": "open-url",
63
- "description": "Open URLs in browser (macOS only)"
64
36
  }
65
37
  ],
66
38
  "compatibility": {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "screenpipe-mcp",
3
- "version": "0.6.0",
3
+ "version": "0.7.0",
4
4
  "description": "MCP server for screenpipe - search your screen recordings and audio transcriptions",
5
5
  "main": "dist/index.js",
6
6
  "bin": {
package/src/index.ts CHANGED
@@ -45,7 +45,7 @@ const SCREENPIPE_API = `http://localhost:${port}`;
45
45
  const server = new Server(
46
46
  {
47
47
  name: "screenpipe",
48
- version: "0.5.0",
48
+ version: "0.7.0",
49
49
  },
50
50
  {
51
51
  capabilities: {
@@ -79,7 +79,7 @@ const BASE_TOOLS: Tool[] = [
79
79
  content_type: {
80
80
  type: "string",
81
81
  enum: ["all", "ocr", "audio", "ui"],
82
- description: "Content type filter. Default: 'all'",
82
+ description: "Content type filter: 'ocr' (screen text), 'audio' (transcriptions), 'ui' (legacy UI monitoring), 'all'. Default: 'all'. For keyboard/mouse/accessibility events, use search-ui-events tool instead.",
83
83
  default: "all",
84
84
  },
85
85
  limit: {
@@ -123,6 +123,14 @@ const BASE_TOOLS: Tool[] = [
123
123
  description: "Include base64 screenshots (OCR only). Default: false",
124
124
  default: false,
125
125
  },
126
+ speaker_ids: {
127
+ type: "string",
128
+ description: "Comma-separated speaker IDs to filter audio results (e.g., '1,2,3')",
129
+ },
130
+ speaker_name: {
131
+ type: "string",
132
+ description: "Filter audio by speaker name (case-insensitive partial match)",
133
+ },
126
134
  },
127
135
  },
128
136
  },
@@ -164,6 +172,86 @@ const BASE_TOOLS: Tool[] = [
164
172
  required: ["start_time", "end_time"],
165
173
  },
166
174
  },
175
+ {
176
+ name: "search-ui-events",
177
+ description:
178
+ "Search UI input events captured via accessibility APIs (macOS). " +
179
+ "This is the third modality alongside vision (OCR) and audio. " +
180
+ "Captures: mouse clicks, keyboard text input, scroll events, app/window switches, clipboard operations. " +
181
+ "Events include app context, element info (accessibility labels), and precise timestamps. " +
182
+ "Great for understanding user workflow, what was typed, clicked, or copied.",
183
+ annotations: {
184
+ title: "Search UI Events (Accessibility)",
185
+ readOnlyHint: true,
186
+ },
187
+ inputSchema: {
188
+ type: "object",
189
+ properties: {
190
+ q: {
191
+ type: "string",
192
+ description: "Search query for text content, app name, window title. Optional - omit to return recent events.",
193
+ },
194
+ event_type: {
195
+ type: "string",
196
+ enum: ["click", "text", "scroll", "key", "app_switch", "window_focus", "clipboard"],
197
+ description: "Filter by event type. 'text' = aggregated keyboard input, 'click' = mouse clicks with element context, 'app_switch'/'window_focus' = app usage tracking, 'clipboard' = copy/paste events.",
198
+ },
199
+ app_name: {
200
+ type: "string",
201
+ description: "Filter by application name (e.g., 'Google Chrome', 'Slack', 'Code')",
202
+ },
203
+ window_name: {
204
+ type: "string",
205
+ description: "Filter by window title",
206
+ },
207
+ start_time: {
208
+ type: "string",
209
+ format: "date-time",
210
+ description: "ISO 8601 UTC start time (e.g., 2024-01-15T10:00:00Z)",
211
+ },
212
+ end_time: {
213
+ type: "string",
214
+ format: "date-time",
215
+ description: "ISO 8601 UTC end time (e.g., 2024-01-15T18:00:00Z)",
216
+ },
217
+ limit: {
218
+ type: "integer",
219
+ description: "Max results. Default: 50",
220
+ default: 50,
221
+ },
222
+ offset: {
223
+ type: "integer",
224
+ description: "Skip N results for pagination. Default: 0",
225
+ default: 0,
226
+ },
227
+ },
228
+ },
229
+ },
230
+ {
231
+ name: "get-ui-event-stats",
232
+ description:
233
+ "Get aggregated statistics of UI events by app and event type. " +
234
+ "Useful for understanding app usage patterns, productivity analysis, or finding which apps were used most.",
235
+ annotations: {
236
+ title: "UI Event Statistics",
237
+ readOnlyHint: true,
238
+ },
239
+ inputSchema: {
240
+ type: "object",
241
+ properties: {
242
+ start_time: {
243
+ type: "string",
244
+ format: "date-time",
245
+ description: "ISO 8601 UTC start time for stats period",
246
+ },
247
+ end_time: {
248
+ type: "string",
249
+ format: "date-time",
250
+ description: "ISO 8601 UTC end time for stats period",
251
+ },
252
+ },
253
+ },
254
+ },
167
255
  ];
168
256
 
169
257
  // List tools handler
@@ -237,18 +325,20 @@ server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
237
325
  mimeType: "text/markdown",
238
326
  text: `# Screenpipe Search Guide
239
327
 
328
+ ## Three Data Modalities
329
+
330
+ Screenpipe captures three types of data:
331
+ 1. **Vision (OCR)** - Screen text from screenshots
332
+ 2. **Audio** - Transcribed speech from microphone/system audio
333
+ 3. **UI Events (Accessibility)** - Keyboard input, mouse clicks, app switches, clipboard (macOS)
334
+
240
335
  ## Quick Start
241
336
  - **Get recent activity**: Call search-content with no parameters
242
337
  - **Search text**: \`{"q": "search term", "content_type": "ocr"}\`
243
- - **Time filter**: Use start_time/end_time with ISO 8601 UTC timestamps
244
-
245
- ## Content Types
246
- - \`ocr\`: Screen text (what you see)
247
- - \`audio\`: Transcribed speech
248
- - \`ui\`: UI element interactions
249
- - \`all\`: Everything (default)
338
+ - **Get keyboard input**: Use search-ui-events with \`event_type: "text"\`
339
+ - **Track app usage**: Use get-ui-event-stats for aggregated data
250
340
 
251
- ## Key Parameters
341
+ ## search-content (Vision + Audio)
252
342
  | Parameter | Description | Default |
253
343
  |-----------|-------------|---------|
254
344
  | q | Search query | (none - returns all) |
@@ -259,11 +349,27 @@ server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
259
349
  | app_name | Filter by app | (no filter) |
260
350
  | include_frames | Include screenshots | false |
261
351
 
352
+ ## search-ui-events (Accessibility Data)
353
+ | Parameter | Description | Default |
354
+ |-----------|-------------|---------|
355
+ | q | Search text content, app, window | (none) |
356
+ | event_type | click/text/scroll/key/app_switch/window_focus/clipboard | (all types) |
357
+ | app_name | Filter by application | (no filter) |
358
+ | limit | Max results | 50 |
359
+
360
+ ### Event Types
361
+ - \`text\`: Aggregated keyboard input (what was typed)
362
+ - \`click\`: Mouse clicks with element context (accessibility labels)
363
+ - \`app_switch\`: When user switched applications
364
+ - \`window_focus\`: When window focus changed
365
+ - \`clipboard\`: Copy/paste operations
366
+ - \`scroll\`: Scroll events with delta values
367
+
262
368
  ## Tips
263
369
  1. Read screenpipe://context first to get current timestamps
264
- 2. Omit \`q\` to get all content (useful for "what was I doing?")
265
- 3. Use \`limit: 50-100\` for comprehensive searches
266
- 4. Combine app_name + time filters for focused results`,
370
+ 2. Use search-ui-events for "what did I type?" queries
371
+ 3. Use get-ui-event-stats to understand app usage patterns
372
+ 4. Combine search-content (what was on screen) with search-ui-events (what was done)`,
267
373
  },
268
374
  ],
269
375
  };
@@ -630,7 +736,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
630
736
  frameIds.sort((a, b) => a - b);
631
737
 
632
738
  // Step 2: Connect to WebSocket and export video
633
- const wsUrl = `ws://localhost:${port}/frames/export?frame_ids=${frameIds.join(",")}&fps=${fps}`;
739
+ // frame_ids are left out of the URL and sent as a WebSocket message once the connection opens, to avoid URL length limits
740
+ const wsUrl = `ws://localhost:${port}/frames/export?fps=${fps}`;
634
741
 
635
742
  const exportResult = await new Promise<{
636
743
  success: boolean;
@@ -649,6 +756,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
649
756
  }
650
757
  }, 5 * 60 * 1000); // 5 minute timeout
651
758
 
759
+ ws.on("open", () => {
760
+ // Send frame_ids in message body to avoid URL length limits
761
+ ws.send(JSON.stringify({ frame_ids: frameIds }));
762
+ });
763
+
652
764
  ws.on("error", (error) => {
653
765
  if (!resolved) {
654
766
  resolved = true;
@@ -724,6 +836,136 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
724
836
  }
725
837
  }
726
838
 
839
+ case "search-ui-events": {
840
+ const params = new URLSearchParams();
841
+ for (const [key, value] of Object.entries(args)) {
842
+ if (value !== null && value !== undefined) {
843
+ // Forward every provided (non-null) argument unchanged as a query parameter; no key renaming is done
844
+ params.append(key, String(value));
845
+ }
846
+ }
847
+
848
+ const response = await fetchAPI(`/ui-events?${params.toString()}`);
849
+ if (!response.ok) {
850
+ throw new Error(`HTTP error: ${response.status}`);
851
+ }
852
+
853
+ const data = await response.json();
854
+ const events = data.data || [];
855
+ const pagination = data.pagination || {};
856
+
857
+ if (events.length === 0) {
858
+ return {
859
+ content: [
860
+ {
861
+ type: "text",
862
+ text: "No UI events found. This feature requires:\n" +
863
+ "1. macOS with Accessibility permissions granted\n" +
864
+ "2. UI Events enabled in screenpipe settings\n" +
865
+ "Try: broader time range or different event_type filter.",
866
+ },
867
+ ],
868
+ };
869
+ }
870
+
871
+ const formattedEvents: string[] = [];
872
+ for (const event of events) {
873
+ const parts = [
874
+ `[${event.event_type?.toUpperCase() || "?"}]`,
875
+ event.app_name || "?",
876
+ event.window_title ? `| ${event.window_title}` : "",
877
+ ];
878
+
879
+ let details = "";
880
+ if (event.event_type === "text" && event.text_content) {
881
+ details = `Text: "${event.text_content}"`;
882
+ } else if (event.event_type === "click") {
883
+ details = `Click at (${event.x || 0}, ${event.y || 0})`;
884
+ if (event.element?.label) {
885
+ details += ` on "${event.element.label}"`;
886
+ }
887
+ } else if (event.event_type === "clipboard" && event.text_content) {
888
+ details = `Clipboard: "${event.text_content.substring(0, 100)}${event.text_content.length > 100 ? "..." : ""}"`;
889
+ } else if (event.event_type === "app_switch" || event.event_type === "window_focus") {
890
+ details = `Switched to: ${event.app_name}${event.window_title ? ` - ${event.window_title}` : ""}`;
891
+ } else if (event.event_type === "scroll") {
892
+ details = `Scroll: dx=${event.delta_x || 0}, dy=${event.delta_y || 0}`;
893
+ }
894
+
895
+ formattedEvents.push(
896
+ `${parts.join(" ")}\n` +
897
+ `${event.timestamp || ""}\n` +
898
+ `${details}`
899
+ );
900
+ }
901
+
902
+ const header = `UI Events: ${events.length}/${pagination.total || "?"}` +
903
+ (pagination.total > events.length ? ` (use offset=${(pagination.offset || 0) + events.length} for more)` : "");
904
+
905
+ return {
906
+ content: [
907
+ {
908
+ type: "text",
909
+ text: header + "\n\n" + formattedEvents.join("\n---\n"),
910
+ },
911
+ ],
912
+ };
913
+ }
914
+
915
+ case "get-ui-event-stats": {
916
+ const params = new URLSearchParams();
917
+ if (args.start_time) params.append("start_time", String(args.start_time));
918
+ if (args.end_time) params.append("end_time", String(args.end_time));
919
+
920
+ const response = await fetchAPI(`/ui-events/stats?${params.toString()}`);
921
+ if (!response.ok) {
922
+ throw new Error(`HTTP error: ${response.status}`);
923
+ }
924
+
925
+ const stats = await response.json();
926
+
927
+ if (!stats || stats.length === 0) {
928
+ return {
929
+ content: [
930
+ {
931
+ type: "text",
932
+ text: "No UI event statistics available. UI Events may not be enabled or no events have been captured yet.",
933
+ },
934
+ ],
935
+ };
936
+ }
937
+
938
+ // Group by app
939
+ const byApp: Record<string, { app: string; events: Record<string, number>; total: number }> = {};
940
+ for (const stat of stats) {
941
+ const app = stat.app_name || "Unknown";
942
+ if (!byApp[app]) {
943
+ byApp[app] = { app, events: {}, total: 0 };
944
+ }
945
+ byApp[app].events[stat.event_type] = stat.count;
946
+ byApp[app].total += stat.count;
947
+ }
948
+
949
+ // Sort by total events
950
+ const sorted = Object.values(byApp).sort((a, b) => b.total - a.total);
951
+
952
+ const lines = sorted.map(({ app, events, total }) => {
953
+ const eventDetails = Object.entries(events)
954
+ .map(([type, count]) => `${type}: ${count}`)
955
+ .join(", ");
956
+ return `${app}: ${total} events (${eventDetails})`;
957
+ });
958
+
959
+ return {
960
+ content: [
961
+ {
962
+ type: "text",
963
+ text: `UI Event Statistics:\n\n${lines.join("\n")}`,
964
+ },
965
+ ],
966
+ };
967
+ }
968
+
727
969
  default:
728
970
  throw new Error(`Unknown tool: ${name}`);
729
971
  }