computeruseprotocol 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cup/__init__.py ADDED
@@ -0,0 +1,548 @@
1
+ """
2
+ CUP -- Computer Use Protocol.
3
+
4
+ Cross-platform accessibility tree capture in a unified format.
5
+
6
+ Quick start::
7
+
8
+ import cup
9
+
10
+ # Session is the primary API — snapshot + actions
11
+ session = cup.Session()
12
+ tree = session.snapshot(scope="overview") # window list only
13
+ tree = session.snapshot(scope="foreground") # foreground tree + window header
14
+ tree = session.snapshot(scope="desktop") # desktop items
15
+ result = session.action("e14", "click")
16
+ tree = session.snapshot(scope="foreground") # re-snapshot after action
17
+
18
+ # Convenience functions (use a default session internally)
19
+ text = cup.snapshot() # foreground compact text (the default)
20
+ text = cup.snapshot("full") # all windows compact text
21
+ raw = cup.snapshot_raw() # foreground as CUP envelope dict
22
+ raw = cup.snapshot_raw("full") # all windows as CUP envelope dict
23
+ text = cup.overview() # lightweight window list
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ from typing import Any, Literal
29
+
30
+ from cup._router import detect_platform, get_adapter
31
+ from cup.actions import ActionExecutor, ActionResult
32
+ from cup.format import (
33
+ Detail,
34
+ build_envelope,
35
+ prune_tree,
36
+ serialize_compact,
37
+ serialize_overview,
38
+ )
39
+
40
+ # Capture scopes accepted by the ``scope`` argument of the snapshot APIs below.
+ Scope = Literal["overview", "foreground", "desktop", "full"]
41
+
42
+ # Public names exported by ``from cup import *``.
+ __all__ = [
43
+ "snapshot",
44
+ "snapshot_raw",
45
+ "overview",
46
+ "Session",
47
+ "Scope",
48
+ "ActionResult",
49
+ # Advanced / building blocks
50
+ "get_adapter",
51
+ "detect_platform",
52
+ "build_envelope",
53
+ "serialize_compact",
54
+ "serialize_overview",
55
+ "prune_tree",
56
+ ]
57
+
58
+
59
+ # ---------------------------------------------------------------------------
60
+ # Default session — used by the convenience functions below
61
+ # ---------------------------------------------------------------------------
62
+
63
_default_session: Session | None = None


def _get_default_session() -> Session:
    """Return the shared module-level session, building it on first use.

    The convenience functions in this module all go through this accessor,
    so a single ``Session`` is created lazily and then reused.
    """
    global _default_session
    session = _default_session
    if session is None:
        session = Session()
        _default_session = session
    return session
71
+
72
+
73
+ # ---------------------------------------------------------------------------
74
+ # Convenience functions (thin wrappers around Session)
75
+ # ---------------------------------------------------------------------------
76
+
77
+
78
def snapshot(scope: Scope = "foreground", *, max_depth: int = 999) -> str:
    """Return the requested scope as compact, LLM-oriented text.

    Thin wrapper that forwards to the default session with ``compact=True``.

    Args:
        scope: What to capture — "foreground" (default), "full", "desktop", or "overview".
        max_depth: Maximum tree depth.
    """
    session = _get_default_session()
    return session.snapshot(scope=scope, max_depth=max_depth, compact=True)
90
+
91
+
92
def snapshot_raw(scope: Scope = "foreground", *, max_depth: int = 999) -> dict:
    """Return the requested scope as a structured CUP envelope dict.

    Thin wrapper that forwards to the default session with ``compact=False``.

    Args:
        scope: What to capture — "foreground" (default), "full", "desktop", or "overview".
        max_depth: Maximum tree depth.
    """
    session = _get_default_session()
    return session.snapshot(scope=scope, max_depth=max_depth, compact=False)
104
+
105
+
106
def overview() -> str:
    """Return the open-window list as compact text (no tree walking). Near-instant."""
    session = _get_default_session()
    return session.snapshot(scope="overview", compact=True)
109
+
110
+
111
+ # ---------------------------------------------------------------------------
112
+ # Session — stateful tree capture with action execution
113
+ # ---------------------------------------------------------------------------
114
+
115
+
116
class Session:
    """A CUP session that captures trees with element references for action execution.

    Element IDs (e.g., "e14") are ephemeral — they are only valid for the
    most recent tree capture. After executing any action, re-capture the
    tree to get fresh IDs.

    Example::

        session = cup.Session()
        overview = session.snapshot(scope="overview")   # what's running?
        tree = session.snapshot(scope="foreground")     # interact with app
        result = session.action("e7", "click")
        tree = session.snapshot(scope="foreground")     # fresh IDs after action
    """

    def __init__(self, *, platform: str | None = None) -> None:
        """Create a session bound to a platform adapter.

        Args:
            platform: Passed straight to ``get_adapter``; ``None`` is the default.
        """
        self._adapter = get_adapter(platform)
        self._executor = ActionExecutor(self._adapter)
        # Pruned tree from the most recent tree-walking snapshot.
        self._last_tree: list[dict] | None = None
        # Unpruned tree from the most recent tree-walking snapshot (used by find()).
        self._last_raw_tree: list[dict] | None = None

    def snapshot(
        self,
        *,
        scope: Scope = "foreground",
        app: str | None = None,
        max_depth: int = 999,
        compact: bool = True,
        detail: Detail = "compact",
    ) -> str | dict:
        """Capture the accessibility tree.

        Args:
            scope: Capture scope:
                "overview"   — Window list only (no tree walking, near-instant)
                "foreground" — Foreground window tree + window list in header
                "desktop"    — Desktop surface tree only
                "full"       — All windows tree
            app: Filter windows by title (only for scope="full").
            max_depth: Maximum tree depth.
            compact: If True, return compact LLM text; if False, return
                the full CUP envelope dict.
            detail: Pruning level ("compact" or "full").

        Returns:
            Compact text string or CUP envelope dict. For scope="desktop" on
            an adapter that reports no desktop window, the overview result is
            returned instead.
        """
        sw, sh, scale = self._adapter.get_screen_info()

        # --- overview scope: no tree walking ---
        if scope == "overview":
            return self._overview_result(sw, sh, compact=compact)

        # --- scopes that require tree walking ---
        window_list = None

        if scope == "foreground":
            win = self._adapter.get_foreground_window()
            windows = [win]
            app_name = win["title"]
            app_pid = win["pid"]
            app_bundle_id = win.get("bundle_id")
            # Get window list for header awareness
            window_list = self._adapter.get_window_list()
        elif scope == "desktop":
            desktop_win = self._adapter.get_desktop_window()
            if desktop_win is None:
                # Fallback: return overview for platforms without desktop
                return self._overview_result(sw, sh, compact=compact)
            windows = [desktop_win]
            app_name = "Desktop"
            app_pid = desktop_win.get("pid")
            app_bundle_id = desktop_win.get("bundle_id")
        else:  # "full"
            windows = self._adapter.get_all_windows()
            if app:
                # Case-insensitive substring match on the window title.
                app_lower = app.lower()
                windows = [w for w in windows if app_lower in (w.get("title") or "").lower()]
            app_name = None
            app_pid = None
            app_bundle_id = None

        tree, stats, refs = self._adapter.capture_tree(
            windows,
            max_depth=max_depth,
        )
        # Hand the fresh element references to the executor so that
        # subsequent action() calls resolve IDs from this capture.
        self._executor.set_refs(refs)

        # Adapters may optionally expose get_last_tools(); empty results become None.
        tools = None
        if hasattr(self._adapter, "get_last_tools"):
            tools = self._adapter.get_last_tools() or None

        envelope = build_envelope(
            tree,
            platform=self._adapter.platform_name,
            scope=scope,
            screen_w=sw,
            screen_h=sh,
            screen_scale=scale,
            app_name=app_name,
            app_pid=app_pid,
            app_bundle_id=app_bundle_id,
            tools=tools,
        )

        # Store raw tree for semantic search + pruned tree for compact output
        self._last_raw_tree = envelope["tree"]
        self._last_tree = prune_tree(envelope["tree"], detail=detail)

        if compact:
            return serialize_compact(
                envelope,
                window_list=window_list,
                detail=detail,
            )
        return envelope

    def _overview_result(self, screen_w: int, screen_h: int, *, compact: bool) -> str | dict:
        """Build the window-list-only result shared by the overview paths.

        Args:
            screen_w: Screen width as reported by the adapter.
            screen_h: Screen height as reported by the adapter.
            compact: If True, return serialized overview text; otherwise a dict.
        """
        window_list = self._adapter.get_window_list()
        if compact:
            return serialize_overview(
                window_list,
                platform=self._adapter.platform_name,
                screen_w=screen_w,
                screen_h=screen_h,
            )
        return {
            "version": "0.1.0",
            "platform": self._adapter.platform_name,
            "screen": {"w": screen_w, "h": screen_h},
            "scope": "overview",
            "tree": [],
            "windows": window_list,
        }

    def action(
        self,
        element_id: str,
        action: str,
        **params: Any,
    ) -> ActionResult:
        """Perform an action on an element from the last snapshot.

        Args:
            element_id: Element ID from the tree (e.g., "e14").
            action: CUP canonical action (click, type, toggle, etc.).
            **params: Action parameters (value, direction, etc.).
        """
        return self._executor.action(element_id, action, params)

    def press(self, combo: str) -> ActionResult:
        """Send a keyboard shortcut to the focused window.

        Args:
            combo: Key combination (e.g., "ctrl+s", "enter", "alt+f4").
        """
        return self._executor.press(combo)

    def open_app(self, name: str) -> ActionResult:
        """Open an application by name.

        Fuzzy-matches against installed apps (e.g., "chrome" matches
        "Google Chrome", "code" matches "Visual Studio Code").
        Waits for the app window to appear before returning.

        Args:
            name: Application name (fuzzy matched).
        """
        return self._executor.open_app(name)

    # -- find ---------------------------------------------------------------

    def find(
        self,
        *,
        query: str | None = None,
        role: str | None = None,
        name: str | None = None,
        state: str | None = None,
        limit: int = 5,
    ) -> list[dict]:
        """Search the last captured tree for matching elements.

        Searches the full unpruned tree with semantic role matching,
        fuzzy name matching, and relevance ranking. If no tree has been
        captured yet, a foreground snapshot is taken first.

        Args:
            query: Freeform semantic query (e.g., "play button", "search input").
            role: Role filter — exact CUP role or synonym (e.g., "search bar").
            name: Name filter — fuzzy token matching.
            state: State filter — exact match (e.g., "focused", "disabled").
            limit: Maximum results to return (default 5).

        Returns:
            List of matching CUP node dicts (without children), ranked by relevance.
        """
        if self._last_raw_tree is None:
            self.snapshot(scope="foreground", compact=True)

        # Imported lazily so the search module is only loaded when find() is used.
        from cup.search import search_tree

        results = search_tree(
            self._last_raw_tree,
            query=query,
            role=role,
            name=name,
            state=state,
            limit=limit,
        )
        return [r.node for r in results]

    # -- batch --------------------------------------------------------------

    def batch(
        self,
        actions: list[dict[str, Any]],
    ) -> list[ActionResult]:
        """Execute a sequence of actions, stopping on first failure.

        Each action spec is a dict with either:
            {"element_id": "e14", "action": "click"}
            {"element_id": "e5", "action": "type", "value": "hello"}
            {"action": "press", "keys": "ctrl+s"}
            {"action": "wait", "ms": 500}

        Returns:
            List of ActionResults — one per executed action.
            If an action fails, the list stops at that failure. Malformed
            specs (missing 'keys' / 'element_id', non-numeric 'ms') are
            reported as a failed ActionResult rather than raised, so the
            results of earlier steps are never lost.
        """
        results: list[ActionResult] = []
        for spec in actions:
            result = self._run_batch_step(spec)
            results.append(result)
            if not result.success:
                break
        return results

    def _run_batch_step(self, spec: dict[str, Any]) -> ActionResult:
        """Validate and execute a single batch spec, returning its result."""
        import time

        action = spec.get("action", "")

        if action == "wait":
            ms = self._parse_wait_ms(spec.get("ms", 500))
            if ms is None:
                return ActionResult(
                    success=False,
                    message="",
                    error="wait action requires a numeric 'ms' parameter",
                )
            time.sleep(ms / 1000)
            return ActionResult(success=True, message=f"Waited {ms}ms")

        if action == "press":
            keys = spec.get("keys", "")
            if not keys:
                return ActionResult(
                    success=False,
                    message="",
                    error="press action requires 'keys' parameter",
                )
            return self.press(keys)

        # Anything else is treated as an element action.
        element_id = spec.get("element_id", "")
        if not element_id:
            return ActionResult(
                success=False,
                message="",
                error=f"Element action '{action}' requires 'element_id' parameter",
            )
        # Every remaining key of the spec is forwarded as an action parameter.
        params = {k: v for k, v in spec.items() if k not in ("element_id", "action")}
        return self.action(element_id, action, **params)

    @staticmethod
    def _parse_wait_ms(value: Any) -> int | None:
        """Convert a 'wait' duration to milliseconds clamped to 50..5000.

        Returns None when *value* cannot be converted to an integer.
        """
        try:
            requested = int(value)
        except (TypeError, ValueError, OverflowError):
            return None
        return max(50, min(requested, 5000))

    # -- screenshot --------------------------------------------------------

    def screenshot(
        self,
        *,
        region: dict[str, int] | None = None,
    ) -> bytes:
        """Capture a screenshot and return PNG bytes.

        On macOS, uses the ``screencapture`` system utility and checks
        Screen Recording permission upfront — raises RuntimeError with
        a clear message if the permission is missing.

        On other platforms, requires the ``mss`` package:
        ``pip install cup[screenshot]``

        Args:
            region: Optional capture region {"x", "y", "w", "h"} in pixels.
                If None, captures the full primary monitor.

        Returns:
            PNG image bytes.

        Raises:
            RuntimeError: On macOS if Screen Recording permission is not
                granted (System Settings > Privacy & Security > Screen Recording).
            ImportError: On other platforms if ``mss`` is not installed.
        """
        import sys

        if sys.platform == "darwin":
            return self._screenshot_macos(region)

        return self._screenshot_mss(region)

    def _screenshot_macos(self, region: dict[str, int] | None) -> bytes:
        """macOS screenshot via the ``screencapture`` system utility.

        All macOS screenshot APIs (mss, Quartz CGWindowListCreateImage,
        and screencapture) return only the desktop wallpaper when the
        calling process lacks Screen Recording permission.  We detect
        this upfront and raise a clear error instead of returning a
        useless desktop-only image.

        Raises:
            RuntimeError: If permission is missing, ``screencapture`` exits
                non-zero, or it produces an empty file.
        """
        self._check_macos_screen_recording_permission()

        import os
        import subprocess
        import tempfile

        # screencapture writes to a path, so reserve a temp file and close our
        # handle to it before the external process writes.
        fd, tmp_path = tempfile.mkstemp(suffix=".png")
        os.close(fd)

        try:
            cmd = ["screencapture", "-x"]  # -x = no sound

            if region is not None:
                cmd.extend(
                    [
                        "-R",
                        f"{region['x']},{region['y']},{region['w']},{region['h']}",
                    ]
                )

            cmd.append(tmp_path)

            result = subprocess.run(cmd, capture_output=True, timeout=10)
            if result.returncode != 0:
                stderr = result.stderr.decode(errors="replace").strip()
                raise RuntimeError(f"screencapture failed (exit {result.returncode}): {stderr}")

            with open(tmp_path, "rb") as f:
                data = f.read()

            if not data:
                raise RuntimeError("screencapture produced an empty file")

            return data
        finally:
            # Best-effort cleanup; a failed unlink must not mask the real outcome.
            try:
                os.unlink(tmp_path)
            except OSError:
                pass

    @staticmethod
    def _check_macos_screen_recording_permission() -> None:
        """Check if this process has Screen Recording permission.

        Without it, all screenshot APIs silently return only the desktop
        wallpaper with no application windows visible.  We detect this by
        checking if CGWindowListCopyWindowInfo returns any window names —
        macOS strips them when the process lacks permission.

        If permission is missing, we call CGRequestScreenCaptureAccess()
        to trigger the system prompt and raise a clear error.

        Raises:
            RuntimeError: If no on-screen window reports a name.
        """
        from Quartz import (
            CGWindowListCopyWindowInfo,
            kCGNullWindowID,
            kCGWindowListOptionOnScreenOnly,
        )

        windows = CGWindowListCopyWindowInfo(
            kCGWindowListOptionOnScreenOnly,
            kCGNullWindowID,
        )

        # If any window has a name, we have permission
        has_permission = any(w.get("kCGWindowName") for w in (windows or []))

        if not has_permission:
            # Trigger the macOS permission prompt
            try:
                from Quartz import CGRequestScreenCaptureAccess

                CGRequestScreenCaptureAccess()
            except ImportError:
                # The prompt is optional; the error below is still raised.
                pass

            raise RuntimeError(
                "Screen Recording permission is required for screenshots. "
                "Grant it to this app in: System Settings > Privacy & Security "
                "> Screen Recording. You may need to restart the app after granting."
            )

    def _screenshot_mss(self, region: dict[str, int] | None) -> bytes:
        """Fallback screenshot via mss (Windows/Linux).

        Raises:
            ImportError: If the ``mss`` package is not installed.
        """
        try:
            import mss
            import mss.tools
        except ImportError:
            raise ImportError(
                "Screenshot support requires the 'mss' package. "
                "Install it with: pip install cup[screenshot]"
            ) from None

        with mss.mss() as sct:
            if region is not None:
                # Translate the CUP region keys into the names mss expects.
                monitor = {
                    "left": region["x"],
                    "top": region["y"],
                    "width": region["w"],
                    "height": region["h"],
                }
            else:
                monitor = sct.monitors[1]  # primary monitor

            img = sct.grab(monitor)
            return mss.tools.to_png(img.rgb, img.size)