screenpipe-mcp 0.6.0 → 0.8.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # Screenpipe MCP Server
2
2
 
3
- <a href="https://www.pulsemcp.com/servers/mediar-ai-screenpipe"><img src="https://www.pulsemcp.com/badge/top-pick/mediar-ai-screenpipe" width="400" alt="PulseMCP Badge"></a>
3
+ <a href="https://www.pulsemcp.com/servers/screenpipe-screenpipe"><img src="https://www.pulsemcp.com/badge/top-pick/screenpipe-screenpipe" width="400" alt="PulseMCP Badge"></a>
4
4
 
5
5
  <br/>
6
6
 
@@ -33,8 +33,8 @@ The easiest way to use screenpipe-mcp is with npx. Edit your Claude Desktop conf
33
33
  Clone and build from source:
34
34
 
35
35
  ```bash
36
- git clone https://github.com/mediar-ai/screenpipe
37
- cd screenpipe/screenpipe-integrations/screenpipe-mcp
36
+ git clone https://github.com/screenpipe/screenpipe
37
+ cd screenpipe/crates/screenpipe-integrations/screenpipe-mcp
38
38
  npm install
39
39
  npm run build
40
40
  ```
@@ -64,32 +64,42 @@ npx @modelcontextprotocol/inspector npx screenpipe-mcp
64
64
 
65
65
  ## Available Tools
66
66
 
67
- ### Cross-Platform
68
-
69
- - **search-content** - Search through recorded screen content, audio transcriptions, and UI elements
70
- - Full text search with content type filtering (OCR/Audio/UI)
71
- - Time range and app/window filtering
72
- - Pagination support
73
-
74
- - **pixel-control** - Control mouse and keyboard
75
- - Type text, press keys, move mouse, click
76
-
77
- ### macOS Only
78
-
79
- - **find-elements** - Find UI elements in applications by role
80
- - **click-element** - Click UI elements by accessibility ID
81
- - **fill-element** - Type text into UI elements
82
- - **scroll-element** - Scroll UI elements
83
- - **open-application** - Open applications by name
84
- - **open-url** - Open URLs in default browser
67
+ ### search-content
68
+ Search through recorded screen content (OCR) and audio transcriptions:
69
+ - Full text search with content type filtering (OCR/Audio/UI)
70
+ - Time range and app/window filtering
71
+ - Speaker filtering (by ID or name)
72
+ - Pagination support
73
+
74
+ ### search-ui-events (macOS)
75
+ Search UI input events captured via accessibility APIs. This is the third data modality alongside vision and audio:
76
+ - **Event types**: `click`, `text`, `scroll`, `key`, `app_switch`, `window_focus`, `clipboard`
77
+ - Filter by app, window, time range
78
+ - `text` events show aggregated keyboard input (what was typed)
79
+ - `click` events include accessibility element labels
80
+ - `clipboard` events show copy/paste content
81
+
82
+ ### get-ui-event-stats (macOS)
83
+ Get aggregated statistics of UI events:
84
+ - Event counts grouped by app and event type
85
+ - Useful for productivity analysis and app usage tracking
86
+
87
+ ### export-video
88
+ Export screen recordings as video files:
89
+ - Specify time range with start/end times
90
+ - Configurable FPS for output video
85
91
 
86
92
  ## Example Queries in Claude
87
93
 
88
94
  - "Search for any mentions of 'rust' in my screen recordings"
89
95
  - "Find audio transcriptions from the last hour"
90
96
  - "Show me what was on my screen in VSCode yesterday"
91
- - "Open Safari and go to github.com"
92
- - "Find the search button in Chrome and click it"
97
+ - "Export a video of my screen from 2-3pm today"
98
+ - "Find what John said in our meeting about the database"
99
+ - "What did I type in Slack today?" (uses search-ui-events)
100
+ - "Show me my app usage statistics for the past 3 hours"
101
+ - "What did I copy to clipboard recently?"
102
+ - "Which apps did I switch between most today?"
93
103
 
94
104
  ## Requirements
95
105
 
package/dist/index.js CHANGED
@@ -66,7 +66,7 @@ const SCREENPIPE_API = `http://localhost:${port}`;
66
66
  // Initialize server
67
67
  const server = new index_js_1.Server({
68
68
  name: "screenpipe",
69
- version: "0.5.0",
69
+ version: "0.7.0",
70
70
  }, {
71
71
  capabilities: {
72
72
  tools: {},
@@ -81,7 +81,11 @@ const BASE_TOOLS = [
81
81
  description: "Search screenpipe's recorded content: screen text (OCR), audio transcriptions, and UI elements. " +
82
82
  "Returns timestamped results with app context. " +
83
83
  "Call with no parameters to get recent activity. " +
84
- "Use the 'screenpipe://context' resource for current time when building time-based queries.",
84
+ "Use the 'screenpipe://context' resource for current time when building time-based queries.\n\n" +
85
+ "DEEP LINKS: When referencing specific moments in results, create clickable timeline links:\n" +
86
+ "Format: [readable time](screenpipe://timeline?timestamp=ISO8601_TIMESTAMP)\n" +
87
+ "Example: [10:30 AM](screenpipe://timeline?timestamp=2024-01-15T18:30:00Z)\n" +
88
+ "Users can click these links to jump directly to that moment in their timeline.",
85
89
  annotations: {
86
90
  title: "Search Content",
87
91
  readOnlyHint: true,
@@ -96,7 +100,7 @@ const BASE_TOOLS = [
96
100
  content_type: {
97
101
  type: "string",
98
102
  enum: ["all", "ocr", "audio", "ui"],
99
- description: "Content type filter. Default: 'all'",
103
+ description: "Content type filter: 'ocr' (screen text), 'audio' (transcriptions), 'ui' (legacy UI monitoring), 'all'. Default: 'all'. For keyboard/mouse/accessibility events, use search-ui-events tool instead.",
100
104
  default: "all",
101
105
  },
102
106
  limit: {
@@ -140,6 +144,14 @@ const BASE_TOOLS = [
140
144
  description: "Include base64 screenshots (OCR only). Default: false",
141
145
  default: false,
142
146
  },
147
+ speaker_ids: {
148
+ type: "string",
149
+ description: "Comma-separated speaker IDs to filter audio results (e.g., '1,2,3')",
150
+ },
151
+ speaker_name: {
152
+ type: "string",
153
+ description: "Filter audio by speaker name (case-insensitive partial match)",
154
+ },
143
155
  },
144
156
  },
145
157
  },
@@ -177,6 +189,84 @@ const BASE_TOOLS = [
177
189
  required: ["start_time", "end_time"],
178
190
  },
179
191
  },
192
+ {
193
+ name: "search-ui-events",
194
+ description: "Search UI input events captured via accessibility APIs (macOS). " +
195
+ "This is the third modality alongside vision (OCR) and audio. " +
196
+ "Captures: mouse clicks, keyboard text input, scroll events, app/window switches, clipboard operations. " +
197
+ "Events include app context, element info (accessibility labels), and precise timestamps. " +
198
+ "Great for understanding user workflow, what was typed, clicked, or copied.",
199
+ annotations: {
200
+ title: "Search UI Events (Accessibility)",
201
+ readOnlyHint: true,
202
+ },
203
+ inputSchema: {
204
+ type: "object",
205
+ properties: {
206
+ q: {
207
+ type: "string",
208
+ description: "Search query for text content, app name, window title. Optional - omit to return recent events.",
209
+ },
210
+ event_type: {
211
+ type: "string",
212
+ enum: ["click", "text", "scroll", "key", "app_switch", "window_focus", "clipboard"],
213
+ description: "Filter by event type. 'text' = aggregated keyboard input, 'click' = mouse clicks with element context, 'app_switch'/'window_focus' = app usage tracking, 'clipboard' = copy/paste events.",
214
+ },
215
+ app_name: {
216
+ type: "string",
217
+ description: "Filter by application name (e.g., 'Google Chrome', 'Slack', 'Code')",
218
+ },
219
+ window_name: {
220
+ type: "string",
221
+ description: "Filter by window title",
222
+ },
223
+ start_time: {
224
+ type: "string",
225
+ format: "date-time",
226
+ description: "ISO 8601 UTC start time (e.g., 2024-01-15T10:00:00Z)",
227
+ },
228
+ end_time: {
229
+ type: "string",
230
+ format: "date-time",
231
+ description: "ISO 8601 UTC end time (e.g., 2024-01-15T18:00:00Z)",
232
+ },
233
+ limit: {
234
+ type: "integer",
235
+ description: "Max results. Default: 50",
236
+ default: 50,
237
+ },
238
+ offset: {
239
+ type: "integer",
240
+ description: "Skip N results for pagination. Default: 0",
241
+ default: 0,
242
+ },
243
+ },
244
+ },
245
+ },
246
+ {
247
+ name: "get-ui-event-stats",
248
+ description: "Get aggregated statistics of UI events by app and event type. " +
249
+ "Useful for understanding app usage patterns, productivity analysis, or finding which apps were used most.",
250
+ annotations: {
251
+ title: "UI Event Statistics",
252
+ readOnlyHint: true,
253
+ },
254
+ inputSchema: {
255
+ type: "object",
256
+ properties: {
257
+ start_time: {
258
+ type: "string",
259
+ format: "date-time",
260
+ description: "ISO 8601 UTC start time for stats period",
261
+ },
262
+ end_time: {
263
+ type: "string",
264
+ format: "date-time",
265
+ description: "ISO 8601 UTC end time for stats period",
266
+ },
267
+ },
268
+ },
269
+ },
180
270
  ];
181
271
  // List tools handler
182
272
  server.setRequestHandler(types_js_1.ListToolsRequestSchema, async () => {
@@ -244,18 +334,20 @@ server.setRequestHandler(types_js_1.ReadResourceRequestSchema, async (request) =
244
334
  mimeType: "text/markdown",
245
335
  text: `# Screenpipe Search Guide
246
336
 
337
+ ## Three Data Modalities
338
+
339
+ Screenpipe captures three types of data:
340
+ 1. **Vision (OCR)** - Screen text from screenshots
341
+ 2. **Audio** - Transcribed speech from microphone/system audio
342
+ 3. **UI Events (Accessibility)** - Keyboard input, mouse clicks, app switches, clipboard (macOS)
343
+
247
344
  ## Quick Start
248
345
  - **Get recent activity**: Call search-content with no parameters
249
346
  - **Search text**: \`{"q": "search term", "content_type": "ocr"}\`
250
- - **Time filter**: Use start_time/end_time with ISO 8601 UTC timestamps
251
-
252
- ## Content Types
253
- - \`ocr\`: Screen text (what you see)
254
- - \`audio\`: Transcribed speech
255
- - \`ui\`: UI element interactions
256
- - \`all\`: Everything (default)
347
+ - **Get keyboard input**: Use search-ui-events with \`event_type: "text"\`
348
+ - **Track app usage**: Use get-ui-event-stats for aggregated data
257
349
 
258
- ## Key Parameters
350
+ ## search-content (Vision + Audio)
259
351
  | Parameter | Description | Default |
260
352
  |-----------|-------------|---------|
261
353
  | q | Search query | (none - returns all) |
@@ -266,11 +358,39 @@ server.setRequestHandler(types_js_1.ReadResourceRequestSchema, async (request) =
266
358
  | app_name | Filter by app | (no filter) |
267
359
  | include_frames | Include screenshots | false |
268
360
 
361
+ ## search-ui-events (Accessibility Data)
362
+ | Parameter | Description | Default |
363
+ |-----------|-------------|---------|
364
+ | q | Search text content, app, window | (none) |
365
+ | event_type | click/text/scroll/key/app_switch/window_focus/clipboard | (all types) |
366
+ | app_name | Filter by application | (no filter) |
367
+ | limit | Max results | 50 |
368
+
369
+ ### Event Types
370
+ - \`text\`: Aggregated keyboard input (what was typed)
371
+ - \`click\`: Mouse clicks with element context (accessibility labels)
372
+ - \`app_switch\`: When user switched applications
373
+ - \`window_focus\`: When window focus changed
374
+ - \`clipboard\`: Copy/paste operations
375
+ - \`scroll\`: Scroll events with delta values
376
+
269
377
  ## Tips
270
378
  1. Read screenpipe://context first to get current timestamps
271
- 2. Omit \`q\` to get all content (useful for "what was I doing?")
272
- 3. Use \`limit: 50-100\` for comprehensive searches
273
- 4. Combine app_name + time filters for focused results`,
379
+ 2. Use search-ui-events for "what did I type?" queries
380
+ 3. Use get-ui-event-stats to understand app usage patterns
381
+ 4. Combine search-content (what was on screen) with search-ui-events (what was done)
382
+
383
+ ## Timeline Deep Links
384
+ When showing search results to users, create clickable links to specific moments:
385
+
386
+ **Format:** \`[readable time](screenpipe://timeline?timestamp=ISO8601_TIMESTAMP)\`
387
+
388
+ **Examples:**
389
+ - \`[10:30 AM](screenpipe://timeline?timestamp=2024-01-15T18:30:00Z)\`
390
+ - \`[yesterday at 3pm](screenpipe://timeline?timestamp=2024-01-14T15:00:00Z)\`
391
+
392
+ Users can click these links to jump directly to that moment in their screenpipe timeline.
393
+ Always use the exact timestamp from search results when creating these links.`,
274
394
  },
275
395
  ],
276
396
  };
@@ -592,7 +712,8 @@ server.setRequestHandler(types_js_1.CallToolRequestSchema, async (request) => {
592
712
  // Sort frame IDs
593
713
  frameIds.sort((a, b) => a - b);
594
714
  // Step 2: Connect to WebSocket and export video
595
- const wsUrl = `ws://localhost:${port}/frames/export?frame_ids=${frameIds.join(",")}&fps=${fps}`;
715
+ // Send frame_ids in message body to avoid URL length limits
716
+ const wsUrl = `ws://localhost:${port}/frames/export?fps=${fps}`;
596
717
  const exportResult = await new Promise((resolve) => {
597
718
  const ws = new ws_1.WebSocket(wsUrl);
598
719
  let resolved = false;
@@ -603,6 +724,10 @@ server.setRequestHandler(types_js_1.CallToolRequestSchema, async (request) => {
603
724
  resolve({ success: false, error: "Export timed out after 5 minutes" });
604
725
  }
605
726
  }, 5 * 60 * 1000); // 5 minute timeout
727
+ ws.on("open", () => {
728
+ // Send frame_ids in message body to avoid URL length limits
729
+ ws.send(JSON.stringify({ frame_ids: frameIds }));
730
+ });
606
731
  ws.on("error", (error) => {
607
732
  if (!resolved) {
608
733
  resolved = true;
@@ -674,6 +799,123 @@ server.setRequestHandler(types_js_1.CallToolRequestSchema, async (request) => {
674
799
  };
675
800
  }
676
801
  }
802
+ case "search-ui-events": {
803
+ const params = new URLSearchParams();
804
+ for (const [key, value] of Object.entries(args)) {
805
+ if (value !== null && value !== undefined) {
806
+ // Map event_type to the API parameter
807
+ params.append(key, String(value));
808
+ }
809
+ }
810
+ const response = await fetchAPI(`/ui-events?${params.toString()}`);
811
+ if (!response.ok) {
812
+ throw new Error(`HTTP error: ${response.status}`);
813
+ }
814
+ const data = await response.json();
815
+ const events = data.data || [];
816
+ const pagination = data.pagination || {};
817
+ if (events.length === 0) {
818
+ return {
819
+ content: [
820
+ {
821
+ type: "text",
822
+ text: "No UI events found. This feature requires:\n" +
823
+ "1. macOS with Accessibility permissions granted\n" +
824
+ "2. UI Events enabled in screenpipe settings\n" +
825
+ "Try: broader time range or different event_type filter.",
826
+ },
827
+ ],
828
+ };
829
+ }
830
+ const formattedEvents = [];
831
+ for (const event of events) {
832
+ const parts = [
833
+ `[${event.event_type?.toUpperCase() || "?"}]`,
834
+ event.app_name || "?",
835
+ event.window_title ? `| ${event.window_title}` : "",
836
+ ];
837
+ let details = "";
838
+ if (event.event_type === "text" && event.text_content) {
839
+ details = `Text: "${event.text_content}"`;
840
+ }
841
+ else if (event.event_type === "click") {
842
+ details = `Click at (${event.x || 0}, ${event.y || 0})`;
843
+ if (event.element?.label) {
844
+ details += ` on "${event.element.label}"`;
845
+ }
846
+ }
847
+ else if (event.event_type === "clipboard" && event.text_content) {
848
+ details = `Clipboard: "${event.text_content.substring(0, 100)}${event.text_content.length > 100 ? "..." : ""}"`;
849
+ }
850
+ else if (event.event_type === "app_switch" || event.event_type === "window_focus") {
851
+ details = `Switched to: ${event.app_name}${event.window_title ? ` - ${event.window_title}` : ""}`;
852
+ }
853
+ else if (event.event_type === "scroll") {
854
+ details = `Scroll: dx=${event.delta_x || 0}, dy=${event.delta_y || 0}`;
855
+ }
856
+ formattedEvents.push(`${parts.join(" ")}\n` +
857
+ `${event.timestamp || ""}\n` +
858
+ `${details}`);
859
+ }
860
+ const header = `UI Events: ${events.length}/${pagination.total || "?"}` +
861
+ (pagination.total > events.length ? ` (use offset=${(pagination.offset || 0) + events.length} for more)` : "");
862
+ return {
863
+ content: [
864
+ {
865
+ type: "text",
866
+ text: header + "\n\n" + formattedEvents.join("\n---\n"),
867
+ },
868
+ ],
869
+ };
870
+ }
871
+ case "get-ui-event-stats": {
872
+ const params = new URLSearchParams();
873
+ if (args.start_time)
874
+ params.append("start_time", String(args.start_time));
875
+ if (args.end_time)
876
+ params.append("end_time", String(args.end_time));
877
+ const response = await fetchAPI(`/ui-events/stats?${params.toString()}`);
878
+ if (!response.ok) {
879
+ throw new Error(`HTTP error: ${response.status}`);
880
+ }
881
+ const stats = await response.json();
882
+ if (!stats || stats.length === 0) {
883
+ return {
884
+ content: [
885
+ {
886
+ type: "text",
887
+ text: "No UI event statistics available. UI Events may not be enabled or no events have been captured yet.",
888
+ },
889
+ ],
890
+ };
891
+ }
892
+ // Group by app
893
+ const byApp = {};
894
+ for (const stat of stats) {
895
+ const app = stat.app_name || "Unknown";
896
+ if (!byApp[app]) {
897
+ byApp[app] = { app, events: {}, total: 0 };
898
+ }
899
+ byApp[app].events[stat.event_type] = stat.count;
900
+ byApp[app].total += stat.count;
901
+ }
902
+ // Sort by total events
903
+ const sorted = Object.values(byApp).sort((a, b) => b.total - a.total);
904
+ const lines = sorted.map(({ app, events, total }) => {
905
+ const eventDetails = Object.entries(events)
906
+ .map(([type, count]) => `${type}: ${count}`)
907
+ .join(", ");
908
+ return `${app}: ${total} events (${eventDetails})`;
909
+ });
910
+ return {
911
+ content: [
912
+ {
913
+ type: "text",
914
+ text: `UI Event Statistics:\n\n${lines.join("\n")}`,
915
+ },
916
+ ],
917
+ };
918
+ }
677
919
  default:
678
920
  throw new Error(`Unknown tool: ${name}`);
679
921
  }
package/manifest.json CHANGED
@@ -2,20 +2,20 @@
2
2
  "manifest_version": "0.3",
3
3
  "name": "screenpipe",
4
4
  "display_name": "Screenpipe",
5
- "version": "0.5.0",
6
- "description": "Search your screen recordings, audio transcriptions, and control your computer with AI",
7
- "long_description": "Screenpipe is a 24/7 screen and audio recorder that lets you search everything you've seen or heard. This extension connects Claude to your local screenpipe instance, enabling AI-powered search through your digital memory and computer control capabilities.",
5
+ "version": "0.8.0",
6
+ "description": "Search your screen recordings and audio transcriptions with AI",
7
+ "long_description": "Screenpipe is a 24/7 screen and audio recorder that lets you search everything you've seen or heard. This extension connects Claude to your local screenpipe instance, enabling AI-powered search through your digital memory.",
8
8
  "author": {
9
9
  "name": "screenpipe",
10
10
  "url": "https://screenpi.pe"
11
11
  },
12
12
  "repository": {
13
13
  "type": "git",
14
- "url": "https://github.com/mediar-ai/screenpipe"
14
+ "url": "https://github.com/screenpipe/screenpipe"
15
15
  },
16
16
  "homepage": "https://screenpi.pe",
17
- "documentation": "https://github.com/mediar-ai/screenpipe/tree/main/screenpipe-integrations/screenpipe-mcp",
18
- "support": "https://github.com/mediar-ai/screenpipe/issues",
17
+ "documentation": "https://github.com/screenpipe/screenpipe/tree/main/crates/screenpipe-integrations/screenpipe-mcp",
18
+ "support": "https://github.com/screenpipe/screenpipe/issues",
19
19
  "license": "MIT",
20
20
  "server": {
21
21
  "type": "node",
@@ -28,39 +28,11 @@
28
28
  "tools": [
29
29
  {
30
30
  "name": "search-content",
31
- "description": "Search through recorded screen content, audio transcriptions, and UI elements"
31
+ "description": "Search through recorded screen content, audio transcriptions, and UI elements with speaker filtering"
32
32
  },
33
33
  {
34
34
  "name": "export-video",
35
35
  "description": "Export screen recordings as MP4 video for a specific time range"
36
- },
37
- {
38
- "name": "pixel-control",
39
- "description": "Control mouse and keyboard (type text, press keys, move mouse, click)"
40
- },
41
- {
42
- "name": "find-elements",
43
- "description": "Find UI elements in applications by role (macOS only)"
44
- },
45
- {
46
- "name": "click-element",
47
- "description": "Click UI elements by ID (macOS only)"
48
- },
49
- {
50
- "name": "fill-element",
51
- "description": "Type text into UI elements (macOS only)"
52
- },
53
- {
54
- "name": "scroll-element",
55
- "description": "Scroll UI elements (macOS only)"
56
- },
57
- {
58
- "name": "open-application",
59
- "description": "Open applications by name (macOS only)"
60
- },
61
- {
62
- "name": "open-url",
63
- "description": "Open URLs in browser (macOS only)"
64
36
  }
65
37
  ],
66
38
  "compatibility": {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "screenpipe-mcp",
3
- "version": "0.6.0",
3
+ "version": "0.8.0",
4
4
  "description": "MCP server for screenpipe - search your screen recordings and audio transcriptions",
5
5
  "main": "dist/index.js",
6
6
  "bin": {
package/src/index.ts CHANGED
@@ -45,7 +45,7 @@ const SCREENPIPE_API = `http://localhost:${port}`;
45
45
  const server = new Server(
46
46
  {
47
47
  name: "screenpipe",
48
- version: "0.5.0",
48
+ version: "0.7.0",
49
49
  },
50
50
  {
51
51
  capabilities: {
@@ -64,7 +64,11 @@ const BASE_TOOLS: Tool[] = [
64
64
  "Search screenpipe's recorded content: screen text (OCR), audio transcriptions, and UI elements. " +
65
65
  "Returns timestamped results with app context. " +
66
66
  "Call with no parameters to get recent activity. " +
67
- "Use the 'screenpipe://context' resource for current time when building time-based queries.",
67
+ "Use the 'screenpipe://context' resource for current time when building time-based queries.\n\n" +
68
+ "DEEP LINKS: When referencing specific moments in results, create clickable timeline links:\n" +
69
+ "Format: [readable time](screenpipe://timeline?timestamp=ISO8601_TIMESTAMP)\n" +
70
+ "Example: [10:30 AM](screenpipe://timeline?timestamp=2024-01-15T18:30:00Z)\n" +
71
+ "Users can click these links to jump directly to that moment in their timeline.",
68
72
  annotations: {
69
73
  title: "Search Content",
70
74
  readOnlyHint: true,
@@ -79,7 +83,7 @@ const BASE_TOOLS: Tool[] = [
79
83
  content_type: {
80
84
  type: "string",
81
85
  enum: ["all", "ocr", "audio", "ui"],
82
- description: "Content type filter. Default: 'all'",
86
+ description: "Content type filter: 'ocr' (screen text), 'audio' (transcriptions), 'ui' (legacy UI monitoring), 'all'. Default: 'all'. For keyboard/mouse/accessibility events, use search-ui-events tool instead.",
83
87
  default: "all",
84
88
  },
85
89
  limit: {
@@ -123,6 +127,14 @@ const BASE_TOOLS: Tool[] = [
123
127
  description: "Include base64 screenshots (OCR only). Default: false",
124
128
  default: false,
125
129
  },
130
+ speaker_ids: {
131
+ type: "string",
132
+ description: "Comma-separated speaker IDs to filter audio results (e.g., '1,2,3')",
133
+ },
134
+ speaker_name: {
135
+ type: "string",
136
+ description: "Filter audio by speaker name (case-insensitive partial match)",
137
+ },
126
138
  },
127
139
  },
128
140
  },
@@ -164,6 +176,86 @@ const BASE_TOOLS: Tool[] = [
164
176
  required: ["start_time", "end_time"],
165
177
  },
166
178
  },
179
+ {
180
+ name: "search-ui-events",
181
+ description:
182
+ "Search UI input events captured via accessibility APIs (macOS). " +
183
+ "This is the third modality alongside vision (OCR) and audio. " +
184
+ "Captures: mouse clicks, keyboard text input, scroll events, app/window switches, clipboard operations. " +
185
+ "Events include app context, element info (accessibility labels), and precise timestamps. " +
186
+ "Great for understanding user workflow, what was typed, clicked, or copied.",
187
+ annotations: {
188
+ title: "Search UI Events (Accessibility)",
189
+ readOnlyHint: true,
190
+ },
191
+ inputSchema: {
192
+ type: "object",
193
+ properties: {
194
+ q: {
195
+ type: "string",
196
+ description: "Search query for text content, app name, window title. Optional - omit to return recent events.",
197
+ },
198
+ event_type: {
199
+ type: "string",
200
+ enum: ["click", "text", "scroll", "key", "app_switch", "window_focus", "clipboard"],
201
+ description: "Filter by event type. 'text' = aggregated keyboard input, 'click' = mouse clicks with element context, 'app_switch'/'window_focus' = app usage tracking, 'clipboard' = copy/paste events.",
202
+ },
203
+ app_name: {
204
+ type: "string",
205
+ description: "Filter by application name (e.g., 'Google Chrome', 'Slack', 'Code')",
206
+ },
207
+ window_name: {
208
+ type: "string",
209
+ description: "Filter by window title",
210
+ },
211
+ start_time: {
212
+ type: "string",
213
+ format: "date-time",
214
+ description: "ISO 8601 UTC start time (e.g., 2024-01-15T10:00:00Z)",
215
+ },
216
+ end_time: {
217
+ type: "string",
218
+ format: "date-time",
219
+ description: "ISO 8601 UTC end time (e.g., 2024-01-15T18:00:00Z)",
220
+ },
221
+ limit: {
222
+ type: "integer",
223
+ description: "Max results. Default: 50",
224
+ default: 50,
225
+ },
226
+ offset: {
227
+ type: "integer",
228
+ description: "Skip N results for pagination. Default: 0",
229
+ default: 0,
230
+ },
231
+ },
232
+ },
233
+ },
234
+ {
235
+ name: "get-ui-event-stats",
236
+ description:
237
+ "Get aggregated statistics of UI events by app and event type. " +
238
+ "Useful for understanding app usage patterns, productivity analysis, or finding which apps were used most.",
239
+ annotations: {
240
+ title: "UI Event Statistics",
241
+ readOnlyHint: true,
242
+ },
243
+ inputSchema: {
244
+ type: "object",
245
+ properties: {
246
+ start_time: {
247
+ type: "string",
248
+ format: "date-time",
249
+ description: "ISO 8601 UTC start time for stats period",
250
+ },
251
+ end_time: {
252
+ type: "string",
253
+ format: "date-time",
254
+ description: "ISO 8601 UTC end time for stats period",
255
+ },
256
+ },
257
+ },
258
+ },
167
259
  ];
168
260
 
169
261
  // List tools handler
@@ -237,18 +329,20 @@ server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
237
329
  mimeType: "text/markdown",
238
330
  text: `# Screenpipe Search Guide
239
331
 
332
+ ## Three Data Modalities
333
+
334
+ Screenpipe captures three types of data:
335
+ 1. **Vision (OCR)** - Screen text from screenshots
336
+ 2. **Audio** - Transcribed speech from microphone/system audio
337
+ 3. **UI Events (Accessibility)** - Keyboard input, mouse clicks, app switches, clipboard (macOS)
338
+
240
339
  ## Quick Start
241
340
  - **Get recent activity**: Call search-content with no parameters
242
341
  - **Search text**: \`{"q": "search term", "content_type": "ocr"}\`
243
- - **Time filter**: Use start_time/end_time with ISO 8601 UTC timestamps
244
-
245
- ## Content Types
246
- - \`ocr\`: Screen text (what you see)
247
- - \`audio\`: Transcribed speech
248
- - \`ui\`: UI element interactions
249
- - \`all\`: Everything (default)
342
+ - **Get keyboard input**: Use search-ui-events with \`event_type: "text"\`
343
+ - **Track app usage**: Use get-ui-event-stats for aggregated data
250
344
 
251
- ## Key Parameters
345
+ ## search-content (Vision + Audio)
252
346
  | Parameter | Description | Default |
253
347
  |-----------|-------------|---------|
254
348
  | q | Search query | (none - returns all) |
@@ -259,11 +353,39 @@ server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
259
353
  | app_name | Filter by app | (no filter) |
260
354
  | include_frames | Include screenshots | false |
261
355
 
356
+ ## search-ui-events (Accessibility Data)
357
+ | Parameter | Description | Default |
358
+ |-----------|-------------|---------|
359
+ | q | Search text content, app, window | (none) |
360
+ | event_type | click/text/scroll/key/app_switch/window_focus/clipboard | (all types) |
361
+ | app_name | Filter by application | (no filter) |
362
+ | limit | Max results | 50 |
363
+
364
+ ### Event Types
365
+ - \`text\`: Aggregated keyboard input (what was typed)
366
+ - \`click\`: Mouse clicks with element context (accessibility labels)
367
+ - \`app_switch\`: When user switched applications
368
+ - \`window_focus\`: When window focus changed
369
+ - \`clipboard\`: Copy/paste operations
370
+ - \`scroll\`: Scroll events with delta values
371
+
262
372
  ## Tips
263
373
  1. Read screenpipe://context first to get current timestamps
264
- 2. Omit \`q\` to get all content (useful for "what was I doing?")
265
- 3. Use \`limit: 50-100\` for comprehensive searches
266
- 4. Combine app_name + time filters for focused results`,
374
+ 2. Use search-ui-events for "what did I type?" queries
375
+ 3. Use get-ui-event-stats to understand app usage patterns
376
+ 4. Combine search-content (what was on screen) with search-ui-events (what was done)
377
+
378
+ ## Timeline Deep Links
379
+ When showing search results to users, create clickable links to specific moments:
380
+
381
+ **Format:** \`[readable time](screenpipe://timeline?timestamp=ISO8601_TIMESTAMP)\`
382
+
383
+ **Examples:**
384
+ - \`[10:30 AM](screenpipe://timeline?timestamp=2024-01-15T18:30:00Z)\`
385
+ - \`[yesterday at 3pm](screenpipe://timeline?timestamp=2024-01-14T15:00:00Z)\`
386
+
387
+ Users can click these links to jump directly to that moment in their screenpipe timeline.
388
+ Always use the exact timestamp from search results when creating these links.`,
267
389
  },
268
390
  ],
269
391
  };
@@ -630,7 +752,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
630
752
  frameIds.sort((a, b) => a - b);
631
753
 
632
754
  // Step 2: Connect to WebSocket and export video
633
- const wsUrl = `ws://localhost:${port}/frames/export?frame_ids=${frameIds.join(",")}&fps=${fps}`;
755
+ // Send frame_ids in message body to avoid URL length limits
756
+ const wsUrl = `ws://localhost:${port}/frames/export?fps=${fps}`;
634
757
 
635
758
  const exportResult = await new Promise<{
636
759
  success: boolean;
@@ -649,6 +772,11 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
649
772
  }
650
773
  }, 5 * 60 * 1000); // 5 minute timeout
651
774
 
775
+ ws.on("open", () => {
776
+ // Send frame_ids in message body to avoid URL length limits
777
+ ws.send(JSON.stringify({ frame_ids: frameIds }));
778
+ });
779
+
652
780
  ws.on("error", (error) => {
653
781
  if (!resolved) {
654
782
  resolved = true;
@@ -724,6 +852,136 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
724
852
  }
725
853
  }
726
854
 
855
+ case "search-ui-events": {
856
+ const params = new URLSearchParams();
857
+ for (const [key, value] of Object.entries(args)) {
858
+ if (value !== null && value !== undefined) {
859
+ // Map event_type to the API parameter
860
+ params.append(key, String(value));
861
+ }
862
+ }
863
+
864
+ const response = await fetchAPI(`/ui-events?${params.toString()}`);
865
+ if (!response.ok) {
866
+ throw new Error(`HTTP error: ${response.status}`);
867
+ }
868
+
869
+ const data = await response.json();
870
+ const events = data.data || [];
871
+ const pagination = data.pagination || {};
872
+
873
+ if (events.length === 0) {
874
+ return {
875
+ content: [
876
+ {
877
+ type: "text",
878
+ text: "No UI events found. This feature requires:\n" +
879
+ "1. macOS with Accessibility permissions granted\n" +
880
+ "2. UI Events enabled in screenpipe settings\n" +
881
+ "Try: broader time range or different event_type filter.",
882
+ },
883
+ ],
884
+ };
885
+ }
886
+
887
+ const formattedEvents: string[] = [];
888
+ for (const event of events) {
889
+ const parts = [
890
+ `[${event.event_type?.toUpperCase() || "?"}]`,
891
+ event.app_name || "?",
892
+ event.window_title ? `| ${event.window_title}` : "",
893
+ ];
894
+
895
+ let details = "";
896
+ if (event.event_type === "text" && event.text_content) {
897
+ details = `Text: "${event.text_content}"`;
898
+ } else if (event.event_type === "click") {
899
+ details = `Click at (${event.x || 0}, ${event.y || 0})`;
900
+ if (event.element?.label) {
901
+ details += ` on "${event.element.label}"`;
902
+ }
903
+ } else if (event.event_type === "clipboard" && event.text_content) {
904
+ details = `Clipboard: "${event.text_content.substring(0, 100)}${event.text_content.length > 100 ? "..." : ""}"`;
905
+ } else if (event.event_type === "app_switch" || event.event_type === "window_focus") {
906
+ details = `Switched to: ${event.app_name}${event.window_title ? ` - ${event.window_title}` : ""}`;
907
+ } else if (event.event_type === "scroll") {
908
+ details = `Scroll: dx=${event.delta_x || 0}, dy=${event.delta_y || 0}`;
909
+ }
910
+
911
+ formattedEvents.push(
912
+ `${parts.join(" ")}\n` +
913
+ `${event.timestamp || ""}\n` +
914
+ `${details}`
915
+ );
916
+ }
917
+
918
+ const header = `UI Events: ${events.length}/${pagination.total || "?"}` +
919
+ (pagination.total > events.length ? ` (use offset=${(pagination.offset || 0) + events.length} for more)` : "");
920
+
921
+ return {
922
+ content: [
923
+ {
924
+ type: "text",
925
+ text: header + "\n\n" + formattedEvents.join("\n---\n"),
926
+ },
927
+ ],
928
+ };
929
+ }
930
+
931
+ case "get-ui-event-stats": {
932
+ const params = new URLSearchParams();
933
+ if (args.start_time) params.append("start_time", String(args.start_time));
934
+ if (args.end_time) params.append("end_time", String(args.end_time));
935
+
936
+ const response = await fetchAPI(`/ui-events/stats?${params.toString()}`);
937
+ if (!response.ok) {
938
+ throw new Error(`HTTP error: ${response.status}`);
939
+ }
940
+
941
+ const stats = await response.json();
942
+
943
+ if (!stats || stats.length === 0) {
944
+ return {
945
+ content: [
946
+ {
947
+ type: "text",
948
+ text: "No UI event statistics available. UI Events may not be enabled or no events have been captured yet.",
949
+ },
950
+ ],
951
+ };
952
+ }
953
+
954
+ // Group by app
955
+ const byApp: Record<string, { app: string; events: Record<string, number>; total: number }> = {};
956
+ for (const stat of stats) {
957
+ const app = stat.app_name || "Unknown";
958
+ if (!byApp[app]) {
959
+ byApp[app] = { app, events: {}, total: 0 };
960
+ }
961
+ byApp[app].events[stat.event_type] = stat.count;
962
+ byApp[app].total += stat.count;
963
+ }
964
+
965
+ // Sort by total events
966
+ const sorted = Object.values(byApp).sort((a, b) => b.total - a.total);
967
+
968
+ const lines = sorted.map(({ app, events, total }) => {
969
+ const eventDetails = Object.entries(events)
970
+ .map(([type, count]) => `${type}: ${count}`)
971
+ .join(", ");
972
+ return `${app}: ${total} events (${eventDetails})`;
973
+ });
974
+
975
+ return {
976
+ content: [
977
+ {
978
+ type: "text",
979
+ text: `UI Event Statistics:\n\n${lines.join("\n")}`,
980
+ },
981
+ ],
982
+ };
983
+ }
984
+
727
985
  default:
728
986
  throw new Error(`Unknown tool: ${name}`);
729
987
  }