screenpipe-mcp 0.8.2 → 0.8.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -69,7 +69,7 @@ const SCREENPIPE_API = `http://localhost:${port}`;
69
69
  // Initialize server
70
70
  const server = new index_js_1.Server({
71
71
  name: "screenpipe",
72
- version: "0.8.2",
72
+ version: "0.8.3",
73
73
  }, {
74
74
  capabilities: {
75
75
  tools: {},
@@ -81,10 +81,14 @@ const server = new index_js_1.Server({
81
81
  const BASE_TOOLS = [
82
82
  {
83
83
  name: "search-content",
84
- description: "Search screenpipe's recorded content: screen text (OCR), audio transcriptions, and UI elements. " +
84
+ description: "Search screenpipe's recorded content: screen text (accessibility APIs, with OCR fallback), audio transcriptions, and UI elements. " +
85
85
  "Returns timestamped results with app context. " +
86
86
  "Call with no parameters to get recent activity. " +
87
87
  "Use the 'screenpipe://context' resource for current time when building time-based queries.\n\n" +
88
+ "SEARCH STRATEGY: First search with ONLY time params (start_time/end_time) — no q, no app_name, no content_type. " +
89
+ "This gives ground truth of what's recorded. Scan results to find correct app_name values, then narrow with filters using exact observed values. " +
90
+ "App names are case-sensitive and may differ from user input (e.g. 'Discord' vs 'Discord.exe'). " +
91
+ "The q param searches captured text (accessibility/OCR), NOT app names. NEVER report 'no data' after one filtered search — verify with unfiltered time-only search first.\n\n" +
88
92
  "DEEP LINKS: When referencing specific moments, create clickable links using IDs from search results:\n" +
89
93
  "- OCR results (PREFERRED): [10:30 AM — Chrome](screenpipe://frame/12345) — use content.frame_id from the result\n" +
90
94
  "- Audio results: [meeting at 3pm](screenpipe://timeline?timestamp=2024-01-15T15:00:00Z) — use exact timestamp from result\n" +
@@ -103,7 +107,7 @@ const BASE_TOOLS = [
103
107
  content_type: {
104
108
  type: "string",
105
109
  enum: ["all", "ocr", "audio", "input", "accessibility"],
106
- description: "Content type filter: 'ocr' (screen text), 'audio' (transcriptions), 'input' (clicks, keystrokes, clipboard, app switches), 'accessibility' (accessibility tree text), 'all'. Default: 'all'.",
110
+ description: "Content type filter: 'ocr' (screen text via OCR, legacy fallback), 'audio' (transcriptions), 'input' (clicks, keystrokes, clipboard, app switches), 'accessibility' (accessibility tree text, preferred for screen content), 'all'. Default: 'all'.",
107
111
  default: "all",
108
112
  },
109
113
  limit: {
@@ -192,6 +196,150 @@ const BASE_TOOLS = [
192
196
  required: ["start_time", "end_time"],
193
197
  },
194
198
  },
199
+ {
200
+ name: "list-meetings",
201
+ description: "List detected meetings with duration, app, and attendees. " +
202
+ "Returns meetings detected via app focus (Zoom, Meet, Teams) and audio. " +
203
+ "Only available when screenpipe runs in smart transcription mode.",
204
+ annotations: {
205
+ title: "List Meetings",
206
+ readOnlyHint: true,
207
+ },
208
+ inputSchema: {
209
+ type: "object",
210
+ properties: {
211
+ start_time: {
212
+ type: "string",
213
+ format: "date-time",
214
+ description: "ISO 8601 UTC start filter (e.g., 2024-01-15T10:00:00Z)",
215
+ },
216
+ end_time: {
217
+ type: "string",
218
+ format: "date-time",
219
+ description: "ISO 8601 UTC end filter (e.g., 2024-01-15T18:00:00Z)",
220
+ },
221
+ limit: {
222
+ type: "integer",
223
+ description: "Max results. Default: 20",
224
+ default: 20,
225
+ },
226
+ offset: {
227
+ type: "integer",
228
+ description: "Skip N results for pagination. Default: 0",
229
+ default: 0,
230
+ },
231
+ },
232
+ },
233
+ },
234
+ {
235
+ name: "activity-summary",
236
+ description: "Get a lightweight compressed activity overview for a time range (~200-500 tokens). " +
237
+ "Returns app usage (name, frame count, minutes), recent accessibility texts, and audio speaker summary. " +
238
+ "Use this FIRST for broad questions like 'what was I doing?' before drilling into search-content or search-elements. " +
239
+ "Much cheaper than search-content for getting an overview.",
240
+ annotations: {
241
+ title: "Activity Summary",
242
+ readOnlyHint: true,
243
+ },
244
+ inputSchema: {
245
+ type: "object",
246
+ properties: {
247
+ start_time: {
248
+ type: "string",
249
+ format: "date-time",
250
+ description: "Start of time range in ISO 8601 UTC (e.g., 2024-01-15T10:00:00Z)",
251
+ },
252
+ end_time: {
253
+ type: "string",
254
+ format: "date-time",
255
+ description: "End of time range in ISO 8601 UTC (e.g., 2024-01-15T18:00:00Z)",
256
+ },
257
+ app_name: {
258
+ type: "string",
259
+ description: "Optional app name filter (e.g., 'Google Chrome', 'VS Code')",
260
+ },
261
+ },
262
+ required: ["start_time", "end_time"],
263
+ },
264
+ },
265
+ {
266
+ name: "search-elements",
267
+ description: "Search structured UI elements (accessibility tree nodes and OCR text blocks). " +
268
+ "Returns ~100-500 bytes per element — much lighter than search-content for targeted lookups. " +
269
+ "Each element has: id, frame_id, source (accessibility/ocr), role (AXButton, AXStaticText, AXLink, etc.), text, bounds, depth.\n\n" +
270
+ "Use for: finding specific buttons, links, text fields, or UI components. " +
271
+ "Prefer this over search-content when you need structural UI detail rather than full screen text.",
272
+ annotations: {
273
+ title: "Search Elements",
274
+ readOnlyHint: true,
275
+ },
276
+ inputSchema: {
277
+ type: "object",
278
+ properties: {
279
+ q: {
280
+ type: "string",
281
+ description: "Full-text search query across element text. Optional.",
282
+ },
283
+ frame_id: {
284
+ type: "integer",
285
+ description: "Filter to elements from a specific frame",
286
+ },
287
+ source: {
288
+ type: "string",
289
+ enum: ["accessibility", "ocr"],
290
+ description: "Filter by element source: 'accessibility' (structured tree) or 'ocr' (text blocks)",
291
+ },
292
+ role: {
293
+ type: "string",
294
+ description: "Filter by element role (e.g., 'AXButton', 'AXStaticText', 'AXLink', 'AXTextField', 'line')",
295
+ },
296
+ start_time: {
297
+ type: "string",
298
+ format: "date-time",
299
+ description: "ISO 8601 UTC start time",
300
+ },
301
+ end_time: {
302
+ type: "string",
303
+ format: "date-time",
304
+ description: "ISO 8601 UTC end time",
305
+ },
306
+ app_name: {
307
+ type: "string",
308
+ description: "Filter by app name",
309
+ },
310
+ limit: {
311
+ type: "integer",
312
+ description: "Max results. Default: 50",
313
+ default: 50,
314
+ },
315
+ offset: {
316
+ type: "integer",
317
+ description: "Skip N results for pagination. Default: 0",
318
+ default: 0,
319
+ },
320
+ },
321
+ },
322
+ },
323
+ {
324
+ name: "frame-context",
325
+ description: "Get accessibility text, parsed tree nodes, and extracted URLs for a specific frame. " +
326
+ "Falls back to OCR data for legacy frames without accessibility data. " +
327
+ "Use after finding a frame_id from search-content or search-elements to get full structural detail and URLs.",
328
+ annotations: {
329
+ title: "Frame Context",
330
+ readOnlyHint: true,
331
+ },
332
+ inputSchema: {
333
+ type: "object",
334
+ properties: {
335
+ frame_id: {
336
+ type: "integer",
337
+ description: "The frame ID to get context for (from search results)",
338
+ },
339
+ },
340
+ required: ["frame_id"],
341
+ },
342
+ },
195
343
  ];
196
344
  // List tools handler
197
345
  server.setRequestHandler(types_js_1.ListToolsRequestSchema, async () => {
@@ -262,16 +410,16 @@ server.setRequestHandler(types_js_1.ReadResourceRequestSchema, async (request) =
262
410
  ## Data Modalities
263
411
 
264
412
  Screenpipe captures four types of data:
265
- 1. **OCR** - Screen text from screenshots
266
- 2. **Audio** - Transcribed speech from microphone/system audio
267
- 3. **Input** - Keyboard input, mouse clicks, app switches, clipboard (macOS)
268
- 4. **Accessibility** - Accessibility tree text
413
+ 1. **Accessibility** - Screen text via accessibility APIs (primary, preferred for screen content)
414
+ 2. **OCR** - Screen text from screenshots (legacy fallback for apps without accessibility support)
415
+ 3. **Audio** - Transcribed speech from microphone/system audio
416
+ 4. **Input** - Keyboard input, mouse clicks, app switches, clipboard (macOS)
269
417
 
270
418
  ## Quick Start
271
419
  - **Get recent activity**: Call search-content with no parameters
272
- - **Search text**: \`{"q": "search term", "content_type": "ocr"}\`
420
+ - **Search screen text**: \`{"q": "search term", "content_type": "all"}\`
273
421
  - **Get keyboard input**: \`{"content_type": "input"}\`
274
- - **Get accessibility text**: \`{"content_type": "accessibility"}\`
422
+ - **Get audio only**: \`{"content_type": "audio"}\`
275
423
 
276
424
  ## search-content
277
425
  | Parameter | Description | Default |
@@ -284,11 +432,27 @@ Screenpipe captures four types of data:
284
432
  | app_name | Filter by app | (no filter) |
285
433
  | include_frames | Include screenshots | false |
286
434
 
435
+ ## Search Strategy (MANDATORY)
436
+ 1. First search: ONLY use time params (start_time/end_time). No q, no app_name, no content_type. This gives ground truth of what's recorded.
437
+ 2. Scan results to find correct app_name values and content patterns.
438
+ 3. Only THEN narrow with filters using exact observed values. App names are case-sensitive and may differ from user input (e.g. "Discord" vs "Discord.exe").
439
+ 4. The q param searches captured text (accessibility/OCR), NOT app names — an app can be visible without its name in the captured text.
440
+ 5. NEVER report "no data found" after one filtered search. Verify with unfiltered time-only search first.
441
+
442
+ ## Progressive Disclosure (Token-Efficient Strategy)
443
+ 1. **Start with activity-summary** (~200 tokens) for broad questions ("what was I doing?")
444
+ 2. **Narrow with search-content** (~500-1000 tokens) using filters from step 1
445
+ 3. **Drill into search-elements** (~200 tokens each) for structural UI detail (buttons, links)
446
+ 4. **Fetch frame-context** for URLs and accessibility tree of specific frames
447
+ 5. **Screenshots** (include_frames=true) only when text isn't enough
448
+
287
449
  ## Tips
288
450
  1. Read screenpipe://context first to get current timestamps
289
- 2. Use content_type=input for "what did I type?" queries
290
- 3. Use content_type=accessibility for accessibility tree text
291
- 4. For large aggregations (e.g. "what apps did I use today?"), paginate with offset or suggest the user run raw SQL via \`curl -X POST http://localhost:3030/raw_sql\` for efficient GROUP BY queries
451
+ 2. Use activity-summary before search-content for broad overview questions
452
+ 3. Use search-elements instead of search-content for targeted UI lookups (10x lighter)
453
+ 4. Use content_type=input for "what did I type?" queries
454
+ 5. Use content_type=accessibility for accessibility tree text
455
+ 6. For large aggregations (e.g. "what apps did I use today?"), paginate with offset or suggest the user run raw SQL via \`curl -X POST http://localhost:3030/raw_sql\` for efficient GROUP BY queries
292
456
 
293
457
  ## Deep Links (Clickable References)
294
458
  When showing search results to users, create clickable links so they can jump to that exact moment.
@@ -434,7 +598,7 @@ Current time: ${dateInfo.isoDate}
434
598
  Use search-content with:
435
599
  - app_name: "${app}"
436
600
  ${query ? `- q: "${query}"` : "- No query filter"}
437
- - content_type: "ocr"
601
+ - content_type: "all"
438
602
  - limit: 50`,
439
603
  },
440
604
  },
@@ -712,6 +876,151 @@ server.setRequestHandler(types_js_1.CallToolRequestSchema, async (request) => {
712
876
  };
713
877
  }
714
878
  }
879
+ case "list-meetings": {
880
+ const params = new URLSearchParams();
881
+ for (const [key, value] of Object.entries(args)) {
882
+ if (value !== null && value !== undefined) {
883
+ params.append(key, String(value));
884
+ }
885
+ }
886
+ const response = await fetchAPI(`/meetings?${params.toString()}`);
887
+ if (!response.ok) {
888
+ throw new Error(`HTTP error: ${response.status}`);
889
+ }
890
+ const meetings = await response.json();
891
+ if (!Array.isArray(meetings) || meetings.length === 0) {
892
+ return {
893
+ content: [
894
+ {
895
+ type: "text",
896
+ text: "No meetings found. Make sure screenpipe is running in smart transcription mode.",
897
+ },
898
+ ],
899
+ };
900
+ }
901
+ const formatted = meetings.map((m) => {
902
+ const start = m.meeting_start;
903
+ const end = m.meeting_end || "ongoing";
904
+ const app = m.meeting_app;
905
+ const title = m.title ? ` — ${m.title}` : "";
906
+ const attendees = m.attendees ? `\nAttendees: ${m.attendees}` : "";
907
+ return `[${m.detection_source}] ${app}${title}\n ${start} → ${end}${attendees}`;
908
+ });
909
+ return {
910
+ content: [
911
+ {
912
+ type: "text",
913
+ text: `Meetings: ${meetings.length}\n\n${formatted.join("\n---\n")}`,
914
+ },
915
+ ],
916
+ };
917
+ }
918
+ case "activity-summary": {
919
+ const params = new URLSearchParams();
920
+ for (const [key, value] of Object.entries(args)) {
921
+ if (value !== null && value !== undefined) {
922
+ params.append(key, String(value));
923
+ }
924
+ }
925
+ const response = await fetchAPI(`/activity-summary?${params.toString()}`);
926
+ if (!response.ok) {
927
+ throw new Error(`HTTP error: ${response.status}`);
928
+ }
929
+ const data = await response.json();
930
+ // Format apps
931
+ const appsLines = (data.apps || []).map((a) => ` ${a.name}: ${a.minutes} min (${a.frame_count} frames)`);
932
+ // Format audio
933
+ const speakerLines = (data.audio_summary?.speakers || []).map((s) => ` ${s.name}: ${s.segment_count} segments`);
934
+ // Format recent texts
935
+ const textLines = (data.recent_texts || []).map((t) => ` [${t.app_name}] ${t.text}`);
936
+ const summary = [
937
+ `Activity Summary (${data.time_range?.start} → ${data.time_range?.end})`,
938
+ `Total frames: ${data.total_frames}`,
939
+ "",
940
+ "Apps:",
941
+ ...(appsLines.length ? appsLines : [" (none)"]),
942
+ "",
943
+ `Audio: ${data.audio_summary?.segment_count || 0} segments`,
944
+ ...(speakerLines.length ? speakerLines : []),
945
+ "",
946
+ "Recent texts:",
947
+ ...(textLines.length ? textLines.slice(0, 10) : [" (none)"]),
948
+ ].join("\n");
949
+ return { content: [{ type: "text", text: summary }] };
950
+ }
951
+ case "search-elements": {
952
+ const params = new URLSearchParams();
953
+ for (const [key, value] of Object.entries(args)) {
954
+ if (value !== null && value !== undefined) {
955
+ params.append(key, String(value));
956
+ }
957
+ }
958
+ const response = await fetchAPI(`/elements?${params.toString()}`);
959
+ if (!response.ok) {
960
+ throw new Error(`HTTP error: ${response.status}`);
961
+ }
962
+ const data = await response.json();
963
+ const elements = data.data || [];
964
+ const pagination = data.pagination || {};
965
+ if (elements.length === 0) {
966
+ return {
967
+ content: [
968
+ {
969
+ type: "text",
970
+ text: "No elements found. Try: broader search, different role/source, or wider time range.",
971
+ },
972
+ ],
973
+ };
974
+ }
975
+ const formatted = elements.map((e) => {
976
+ const boundsStr = e.bounds
977
+ ? ` [${e.bounds.left.toFixed(2)},${e.bounds.top.toFixed(2)} ${e.bounds.width.toFixed(2)}x${e.bounds.height.toFixed(2)}]`
978
+ : "";
979
+ return `[${e.source}] ${e.role} (frame:${e.frame_id}, depth:${e.depth})${boundsStr}\n ${e.text || "(no text)"}`;
980
+ });
981
+ const header = `Elements: ${elements.length}/${pagination.total || "?"}` +
982
+ (pagination.total > elements.length
983
+ ? ` (use offset=${(pagination.offset || 0) + elements.length} for more)`
984
+ : "");
985
+ return {
986
+ content: [{ type: "text", text: header + "\n\n" + formatted.join("\n---\n") }],
987
+ };
988
+ }
989
+ case "frame-context": {
990
+ const frameId = args.frame_id;
991
+ if (!frameId) {
992
+ return {
993
+ content: [{ type: "text", text: "Error: frame_id is required" }],
994
+ };
995
+ }
996
+ const response = await fetchAPI(`/frames/${frameId}/context`);
997
+ if (!response.ok) {
998
+ throw new Error(`HTTP error: ${response.status}`);
999
+ }
1000
+ const data = await response.json();
1001
+ const lines = [
1002
+ `Frame ${data.frame_id} (source: ${data.text_source})`,
1003
+ ];
1004
+ if (data.urls?.length) {
1005
+ lines.push("", "URLs:", ...data.urls.map((u) => ` ${u}`));
1006
+ }
1007
+ if (data.nodes?.length) {
1008
+ lines.push("", `Nodes: ${data.nodes.length}`);
1009
+ for (const node of data.nodes.slice(0, 50)) {
1010
+ const indent = " ".repeat(Math.min(node.depth, 5));
1011
+ lines.push(`${indent}[${node.role}] ${node.text}`);
1012
+ }
1013
+ if (data.nodes.length > 50) {
1014
+ lines.push(` ... and ${data.nodes.length - 50} more nodes`);
1015
+ }
1016
+ }
1017
+ if (data.text) {
1018
+ // Truncate to avoid massive outputs
1019
+ const truncated = data.text.length > 2000 ? data.text.substring(0, 2000) + "..." : data.text;
1020
+ lines.push("", "Full text:", truncated);
1021
+ }
1022
+ return { content: [{ type: "text", text: lines.join("\n") }] };
1023
+ }
715
1024
  default:
716
1025
  throw new Error(`Unknown tool: ${name}`);
717
1026
  }
package/manifest.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "manifest_version": "0.3",
3
3
  "name": "screenpipe",
4
4
  "display_name": "Screenpipe",
5
- "version": "0.8.0",
5
+ "version": "0.8.3",
6
6
  "description": "Search your screen recordings and audio transcriptions with AI",
7
7
  "long_description": "Screenpipe is a 24/7 screen and audio recorder that lets you search everything you've seen or heard. This extension connects Claude to your local screenpipe instance, enabling AI-powered search through your digital memory.",
8
8
  "author": {
@@ -33,6 +33,22 @@
33
33
  {
34
34
  "name": "export-video",
35
35
  "description": "Export screen recordings as MP4 video for a specific time range"
36
+ },
37
+ {
38
+ "name": "list-meetings",
39
+ "description": "List detected meetings with duration, app, and attendees"
40
+ },
41
+ {
42
+ "name": "activity-summary",
43
+ "description": "Lightweight compressed activity overview for a time range (apps, recent texts, audio summary)"
44
+ },
45
+ {
46
+ "name": "search-elements",
47
+ "description": "Search structured UI elements (accessibility tree nodes and OCR text blocks)"
48
+ },
49
+ {
50
+ "name": "frame-context",
51
+ "description": "Get accessibility text, parsed tree nodes, and extracted URLs for a specific frame"
36
52
  }
37
53
  ],
38
54
  "compatibility": {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "screenpipe-mcp",
3
- "version": "0.8.2",
3
+ "version": "0.8.3",
4
4
  "description": "MCP server for screenpipe - search your screen recordings and audio transcriptions",
5
5
  "main": "dist/index.js",
6
6
  "bin": {
package/src/index.ts CHANGED
@@ -48,7 +48,7 @@ const SCREENPIPE_API = `http://localhost:${port}`;
48
48
  const server = new Server(
49
49
  {
50
50
  name: "screenpipe",
51
- version: "0.8.2",
51
+ version: "0.8.3",
52
52
  },
53
53
  {
54
54
  capabilities: {
@@ -64,10 +64,14 @@ const BASE_TOOLS: Tool[] = [
64
64
  {
65
65
  name: "search-content",
66
66
  description:
67
- "Search screenpipe's recorded content: screen text (OCR), audio transcriptions, and UI elements. " +
67
+ "Search screenpipe's recorded content: screen text (accessibility APIs, with OCR fallback), audio transcriptions, and UI elements. " +
68
68
  "Returns timestamped results with app context. " +
69
69
  "Call with no parameters to get recent activity. " +
70
70
  "Use the 'screenpipe://context' resource for current time when building time-based queries.\n\n" +
71
+ "SEARCH STRATEGY: First search with ONLY time params (start_time/end_time) — no q, no app_name, no content_type. " +
72
+ "This gives ground truth of what's recorded. Scan results to find correct app_name values, then narrow with filters using exact observed values. " +
73
+ "App names are case-sensitive and may differ from user input (e.g. 'Discord' vs 'Discord.exe'). " +
74
+ "The q param searches captured text (accessibility/OCR), NOT app names. NEVER report 'no data' after one filtered search — verify with unfiltered time-only search first.\n\n" +
71
75
  "DEEP LINKS: When referencing specific moments, create clickable links using IDs from search results:\n" +
72
76
  "- OCR results (PREFERRED): [10:30 AM — Chrome](screenpipe://frame/12345) — use content.frame_id from the result\n" +
73
77
  "- Audio results: [meeting at 3pm](screenpipe://timeline?timestamp=2024-01-15T15:00:00Z) — use exact timestamp from result\n" +
@@ -86,7 +90,7 @@ const BASE_TOOLS: Tool[] = [
86
90
  content_type: {
87
91
  type: "string",
88
92
  enum: ["all", "ocr", "audio", "input", "accessibility"],
89
- description: "Content type filter: 'ocr' (screen text), 'audio' (transcriptions), 'input' (clicks, keystrokes, clipboard, app switches), 'accessibility' (accessibility tree text), 'all'. Default: 'all'.",
93
+ description: "Content type filter: 'ocr' (screen text via OCR, legacy fallback), 'audio' (transcriptions), 'input' (clicks, keystrokes, clipboard, app switches), 'accessibility' (accessibility tree text, preferred for screen content), 'all'. Default: 'all'.",
90
94
  default: "all",
91
95
  },
92
96
  limit: {
@@ -179,6 +183,154 @@ const BASE_TOOLS: Tool[] = [
179
183
  required: ["start_time", "end_time"],
180
184
  },
181
185
  },
186
+ {
187
+ name: "list-meetings",
188
+ description:
189
+ "List detected meetings with duration, app, and attendees. " +
190
+ "Returns meetings detected via app focus (Zoom, Meet, Teams) and audio. " +
191
+ "Only available when screenpipe runs in smart transcription mode.",
192
+ annotations: {
193
+ title: "List Meetings",
194
+ readOnlyHint: true,
195
+ },
196
+ inputSchema: {
197
+ type: "object",
198
+ properties: {
199
+ start_time: {
200
+ type: "string",
201
+ format: "date-time",
202
+ description: "ISO 8601 UTC start filter (e.g., 2024-01-15T10:00:00Z)",
203
+ },
204
+ end_time: {
205
+ type: "string",
206
+ format: "date-time",
207
+ description: "ISO 8601 UTC end filter (e.g., 2024-01-15T18:00:00Z)",
208
+ },
209
+ limit: {
210
+ type: "integer",
211
+ description: "Max results. Default: 20",
212
+ default: 20,
213
+ },
214
+ offset: {
215
+ type: "integer",
216
+ description: "Skip N results for pagination. Default: 0",
217
+ default: 0,
218
+ },
219
+ },
220
+ },
221
+ },
222
+ {
223
+ name: "activity-summary",
224
+ description:
225
+ "Get a lightweight compressed activity overview for a time range (~200-500 tokens). " +
226
+ "Returns app usage (name, frame count, minutes), recent accessibility texts, and audio speaker summary. " +
227
+ "Use this FIRST for broad questions like 'what was I doing?' before drilling into search-content or search-elements. " +
228
+ "Much cheaper than search-content for getting an overview.",
229
+ annotations: {
230
+ title: "Activity Summary",
231
+ readOnlyHint: true,
232
+ },
233
+ inputSchema: {
234
+ type: "object",
235
+ properties: {
236
+ start_time: {
237
+ type: "string",
238
+ format: "date-time",
239
+ description: "Start of time range in ISO 8601 UTC (e.g., 2024-01-15T10:00:00Z)",
240
+ },
241
+ end_time: {
242
+ type: "string",
243
+ format: "date-time",
244
+ description: "End of time range in ISO 8601 UTC (e.g., 2024-01-15T18:00:00Z)",
245
+ },
246
+ app_name: {
247
+ type: "string",
248
+ description: "Optional app name filter (e.g., 'Google Chrome', 'VS Code')",
249
+ },
250
+ },
251
+ required: ["start_time", "end_time"],
252
+ },
253
+ },
254
+ {
255
+ name: "search-elements",
256
+ description:
257
+ "Search structured UI elements (accessibility tree nodes and OCR text blocks). " +
258
+ "Returns ~100-500 bytes per element — much lighter than search-content for targeted lookups. " +
259
+ "Each element has: id, frame_id, source (accessibility/ocr), role (AXButton, AXStaticText, AXLink, etc.), text, bounds, depth.\n\n" +
260
+ "Use for: finding specific buttons, links, text fields, or UI components. " +
261
+ "Prefer this over search-content when you need structural UI detail rather than full screen text.",
262
+ annotations: {
263
+ title: "Search Elements",
264
+ readOnlyHint: true,
265
+ },
266
+ inputSchema: {
267
+ type: "object",
268
+ properties: {
269
+ q: {
270
+ type: "string",
271
+ description: "Full-text search query across element text. Optional.",
272
+ },
273
+ frame_id: {
274
+ type: "integer",
275
+ description: "Filter to elements from a specific frame",
276
+ },
277
+ source: {
278
+ type: "string",
279
+ enum: ["accessibility", "ocr"],
280
+ description: "Filter by element source: 'accessibility' (structured tree) or 'ocr' (text blocks)",
281
+ },
282
+ role: {
283
+ type: "string",
284
+ description: "Filter by element role (e.g., 'AXButton', 'AXStaticText', 'AXLink', 'AXTextField', 'line')",
285
+ },
286
+ start_time: {
287
+ type: "string",
288
+ format: "date-time",
289
+ description: "ISO 8601 UTC start time",
290
+ },
291
+ end_time: {
292
+ type: "string",
293
+ format: "date-time",
294
+ description: "ISO 8601 UTC end time",
295
+ },
296
+ app_name: {
297
+ type: "string",
298
+ description: "Filter by app name",
299
+ },
300
+ limit: {
301
+ type: "integer",
302
+ description: "Max results. Default: 50",
303
+ default: 50,
304
+ },
305
+ offset: {
306
+ type: "integer",
307
+ description: "Skip N results for pagination. Default: 0",
308
+ default: 0,
309
+ },
310
+ },
311
+ },
312
+ },
313
+ {
314
+ name: "frame-context",
315
+ description:
316
+ "Get accessibility text, parsed tree nodes, and extracted URLs for a specific frame. " +
317
+ "Falls back to OCR data for legacy frames without accessibility data. " +
318
+ "Use after finding a frame_id from search-content or search-elements to get full structural detail and URLs.",
319
+ annotations: {
320
+ title: "Frame Context",
321
+ readOnlyHint: true,
322
+ },
323
+ inputSchema: {
324
+ type: "object",
325
+ properties: {
326
+ frame_id: {
327
+ type: "integer",
328
+ description: "The frame ID to get context for (from search results)",
329
+ },
330
+ },
331
+ required: ["frame_id"],
332
+ },
333
+ },
182
334
  ];
183
335
 
184
336
  // List tools handler
@@ -255,16 +407,16 @@ server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
255
407
  ## Data Modalities
256
408
 
257
409
  Screenpipe captures four types of data:
258
- 1. **OCR** - Screen text from screenshots
259
- 2. **Audio** - Transcribed speech from microphone/system audio
260
- 3. **Input** - Keyboard input, mouse clicks, app switches, clipboard (macOS)
261
- 4. **Accessibility** - Accessibility tree text
410
+ 1. **Accessibility** - Screen text via accessibility APIs (primary, preferred for screen content)
411
+ 2. **OCR** - Screen text from screenshots (legacy fallback for apps without accessibility support)
412
+ 3. **Audio** - Transcribed speech from microphone/system audio
413
+ 4. **Input** - Keyboard input, mouse clicks, app switches, clipboard (macOS)
262
414
 
263
415
  ## Quick Start
264
416
  - **Get recent activity**: Call search-content with no parameters
265
- - **Search text**: \`{"q": "search term", "content_type": "ocr"}\`
417
+ - **Search screen text**: \`{"q": "search term", "content_type": "all"}\`
266
418
  - **Get keyboard input**: \`{"content_type": "input"}\`
267
- - **Get accessibility text**: \`{"content_type": "accessibility"}\`
419
+ - **Get audio only**: \`{"content_type": "audio"}\`
268
420
 
269
421
  ## search-content
270
422
  | Parameter | Description | Default |
@@ -277,11 +429,27 @@ Screenpipe captures four types of data:
277
429
  | app_name | Filter by app | (no filter) |
278
430
  | include_frames | Include screenshots | false |
279
431
 
432
+ ## Search Strategy (MANDATORY)
433
+ 1. First search: ONLY use time params (start_time/end_time). No q, no app_name, no content_type. This gives ground truth of what's recorded.
434
+ 2. Scan results to find correct app_name values and content patterns.
435
+ 3. Only THEN narrow with filters using exact observed values. App names are case-sensitive and may differ from user input (e.g. "Discord" vs "Discord.exe").
436
+ 4. The q param searches captured text (accessibility/OCR), NOT app names — an app can be visible without its name in the captured text.
437
+ 5. NEVER report "no data found" after one filtered search. Verify with unfiltered time-only search first.
438
+
439
+ ## Progressive Disclosure (Token-Efficient Strategy)
440
+ 1. **Start with activity-summary** (~200 tokens) for broad questions ("what was I doing?")
441
+ 2. **Narrow with search-content** (~500-1000 tokens) using filters from step 1
442
+ 3. **Drill into search-elements** (~200 tokens each) for structural UI detail (buttons, links)
443
+ 4. **Fetch frame-context** for URLs and accessibility tree of specific frames
444
+ 5. **Screenshots** (include_frames=true) only when text isn't enough
445
+
280
446
  ## Tips
281
447
  1. Read screenpipe://context first to get current timestamps
282
- 2. Use content_type=input for "what did I type?" queries
283
- 3. Use content_type=accessibility for accessibility tree text
284
- 4. For large aggregations (e.g. "what apps did I use today?"), paginate with offset or suggest the user run raw SQL via \`curl -X POST http://localhost:3030/raw_sql\` for efficient GROUP BY queries
448
+ 2. Use activity-summary before search-content for broad overview questions
449
+ 3. Use search-elements instead of search-content for targeted UI lookups (10x lighter)
450
+ 4. Use content_type=input for "what did I type?" queries
451
+ 5. Use content_type=accessibility for accessibility tree text
452
+ 6. For large aggregations (e.g. "what apps did I use today?"), paginate with offset or suggest the user run raw SQL via \`curl -X POST http://localhost:3030/raw_sql\` for efficient GROUP BY queries
285
453
 
286
454
  ## Deep Links (Clickable References)
287
455
  When showing search results to users, create clickable links so they can jump to that exact moment.
@@ -435,7 +603,7 @@ Current time: ${dateInfo.isoDate}
435
603
  Use search-content with:
436
604
  - app_name: "${app}"
437
605
  ${query ? `- q: "${query}"` : "- No query filter"}
438
- - content_type: "ocr"
606
+ - content_type: "all"
439
607
  - limit: 50`,
440
608
  },
441
609
  },
@@ -763,6 +931,200 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
763
931
  }
764
932
  }
765
933
 
934
// Tool "list-meetings": forwards the provided arguments as query parameters to
// GET /meetings and renders the returned meetings as a plain-text list.
case "list-meetings": {
  // Only arguments that are neither null nor undefined become query parameters.
  const query = new URLSearchParams();
  Object.entries(args).forEach(([argName, argValue]) => {
    if (argValue === null || argValue === undefined) {
      return;
    }
    query.append(argName, String(argValue));
  });

  const res = await fetchAPI(`/meetings?${query.toString()}`);
  if (!res.ok) {
    throw new Error(`HTTP error: ${res.status}`);
  }

  const meetings = await res.json();

  // A non-array payload is treated the same as an empty list.
  const hasMeetings = Array.isArray(meetings) && meetings.length > 0;
  if (!hasMeetings) {
    return {
      content: [
        {
          type: "text",
          text: "No meetings found. Make sure screenpipe is running in smart transcription mode.",
        },
      ],
    };
  }

  // One text entry per meeting: "[source] app — title", then the time span,
  // then an optional attendees line.
  const entries: string[] = [];
  for (const meeting of meetings as Record<string, unknown>[]) {
    const startedAt = meeting.meeting_start as string;
    // A missing/empty end time is rendered as "ongoing".
    const endedAt = (meeting.meeting_end as string) || "ongoing";
    const appName = meeting.meeting_app as string;
    const titlePart = meeting.title ? ` — ${meeting.title}` : "";
    const attendeesPart = meeting.attendees ? `\nAttendees: ${meeting.attendees}` : "";
    entries.push(
      `[${meeting.detection_source}] ${appName}${titlePart}\n ${startedAt} → ${endedAt}${attendeesPart}`
    );
  }

  const body = entries.join("\n---\n");
  return {
    content: [
      {
        type: "text",
        text: `Meetings: ${meetings.length}\n\n${body}`,
      },
    ],
  };
}
978
+
979
// Tool "activity-summary": forwards the provided arguments as query parameters
// to GET /activity-summary and renders apps, audio speakers and recent texts
// as a compact plain-text report.
case "activity-summary": {
  // Only arguments that are neither null nor undefined become query parameters.
  const query = new URLSearchParams();
  Object.entries(args).forEach(([argName, argValue]) => {
    if (argValue === null || argValue === undefined) {
      return;
    }
    query.append(argName, String(argValue));
  });

  const res = await fetchAPI(`/activity-summary?${query.toString()}`);
  if (!res.ok) {
    throw new Error(`HTTP error: ${res.status}`);
  }

  const data = await res.json();

  // Per-app usage lines.
  const appEntries: string[] = (data.apps || []).map(
    (appInfo: { name: string; frame_count: number; minutes: number }) =>
      ` ${appInfo.name}: ${appInfo.minutes} min (${appInfo.frame_count} frames)`
  );

  // Per-speaker audio lines.
  const speakerEntries: string[] = (data.audio_summary?.speakers || []).map(
    (speaker: { name: string; segment_count: number }) =>
      ` ${speaker.name}: ${speaker.segment_count} segments`
  );

  // Recent text lines, tagged with the app they came from.
  const textEntries: string[] = (data.recent_texts || []).map(
    (entry: { text: string; app_name: string; timestamp: string }) =>
      ` [${entry.app_name}] ${entry.text}`
  );

  // Assemble the report section by section.
  const report: string[] = [];
  report.push(`Activity Summary (${data.time_range?.start} → ${data.time_range?.end})`);
  report.push(`Total frames: ${data.total_frames}`);
  report.push("");

  report.push("Apps:");
  if (appEntries.length) {
    report.push(...appEntries);
  } else {
    report.push(" (none)");
  }
  report.push("");

  report.push(`Audio: ${data.audio_summary?.segment_count || 0} segments`);
  if (speakerEntries.length) {
    report.push(...speakerEntries);
  }
  report.push("");

  report.push("Recent texts:");
  if (textEntries.length) {
    // At most the first 10 text lines are shown.
    report.push(...textEntries.slice(0, 10));
  } else {
    report.push(" (none)");
  }

  const summary = report.join("\n");
  return { content: [{ type: "text", text: summary }] };
}
1028
+
1029
// Tool "search-elements": forwards the provided arguments as query parameters
// to GET /elements and renders the matching UI elements as plain text, with a
// header that tells the caller how to request the next page when one exists.
case "search-elements": {
  // Only arguments that are neither null nor undefined become query parameters.
  const params = new URLSearchParams();
  for (const [key, value] of Object.entries(args)) {
    if (value !== null && value !== undefined) {
      params.append(key, String(value));
    }
  }

  const response = await fetchAPI(`/elements?${params.toString()}`);
  if (!response.ok) {
    throw new Error(`HTTP error: ${response.status}`);
  }

  const data = await response.json();
  const elements = data.data || [];
  const pagination = data.pagination || {};

  if (elements.length === 0) {
    return {
      content: [
        {
          type: "text",
          text: "No elements found. Try: broader search, different role/source, or wider time range.",
        },
      ],
    };
  }

  const formatted = elements.map(
    (e: {
      id: number;
      frame_id: number;
      source: string;
      role: string;
      text: string | null;
      depth: number;
      bounds: { left: number; top: number; width: number; height: number } | null;
    }) => {
      // Bounds are optional; when present they are printed as "[left,top WxH]".
      const boundsStr = e.bounds
        ? ` [${e.bounds.left.toFixed(2)},${e.bounds.top.toFixed(2)} ${e.bounds.width.toFixed(2)}x${e.bounds.height.toFixed(2)}]`
        : "";
      return `[${e.source}] ${e.role} (frame:${e.frame_id}, depth:${e.depth})${boundsStr}\n ${e.text || "(no text)"}`;
    }
  );

  // Offset of the first element that has NOT been returned yet. The "more
  // results" hint must account for the current offset: comparing the total
  // against the page size alone would keep advertising another page even
  // after the last one has been fetched.
  const nextOffset = (pagination.offset || 0) + elements.length;
  const hasMore = pagination.total > nextOffset;

  const header =
    `Elements: ${elements.length}/${pagination.total || "?"}` +
    (hasMore ? ` (use offset=${nextOffset} for more)` : "");

  return {
    content: [{ type: "text", text: header + "\n\n" + formatted.join("\n---\n") }],
  };
}
1084
+
1085
// Tool "frame-context": fetches GET /frames/{frame_id}/context and renders the
// frame's URLs, up to 50 nodes and (truncated) full text as plain text.
case "frame-context": {
  const frameId = args.frame_id as number;
  if (!frameId) {
    return {
      content: [{ type: "text", text: "Error: frame_id is required" }],
    };
  }

  // The `as number` cast above is compile-time only, so the value is encoded
  // before being placed in the URL path. For a numeric id this is a no-op.
  const response = await fetchAPI(`/frames/${encodeURIComponent(String(frameId))}/context`);
  if (!response.ok) {
    throw new Error(`HTTP error: ${response.status}`);
  }

  const data = await response.json();

  const lines = [
    `Frame ${data.frame_id} (source: ${data.text_source})`,
  ];

  if (data.urls?.length) {
    lines.push("", "URLs:", ...data.urls.map((u: string) => ` ${u}`));
  }

  if (data.nodes?.length) {
    lines.push("", `Nodes: ${data.nodes.length}`);
    // Only the first 50 nodes are listed; indentation is capped at depth 5.
    for (const node of data.nodes.slice(0, 50)) {
      const indent = " ".repeat(Math.min(node.depth, 5));
      // Nodes without text get the same "(no text)" placeholder that the
      // search-elements output uses, instead of printing "null"/"undefined".
      lines.push(`${indent}[${node.role}] ${node.text || "(no text)"}`);
    }
    if (data.nodes.length > 50) {
      lines.push(` ... and ${data.nodes.length - 50} more nodes`);
    }
  }

  if (data.text) {
    // Truncate to avoid massive outputs
    const truncated = data.text.length > 2000 ? data.text.substring(0, 2000) + "..." : data.text;
    lines.push("", "Full text:", truncated);
  }

  return { content: [{ type: "text", text: lines.join("\n") }] };
}
1127
+
766
1128
  default:
767
1129
  throw new Error(`Unknown tool: ${name}`);
768
1130
  }