unchainedsky-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ """unchainedsky-cli — browser automation over local Chrome CDP."""
2
+
3
+ __version__ = "0.1.0"  # package version string
@@ -0,0 +1,4 @@
"""Support `python -m unchained_cli`."""
import sys

from .cli import main

# Hand main()'s return value to sys.exit() so that `python -m unchained_cli`
# reports an exit status the same way a `sys.exit(main())` console-script
# wrapper does. A None return still exits with status 0, so a main() that
# returns nothing behaves exactly as before.
# NOTE(review): assumes main() returns None or an int status — confirm in cli.py.
sys.exit(main())
unchained_cli/agent.py ADDED
@@ -0,0 +1,487 @@
1
+ """Interactive Claude browser agent — local browsing with DDM-first methodology.
2
+
3
+ Usage:
4
+ unchained agent # Start interactive agent
5
+ unchained agent "search for X" # Start with an initial task
6
+ unchained agent --model sonnet # Use a specific model
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import io
11
+ import json
12
+ import os
13
+ import sys
14
+ import textwrap
15
+ from typing import Any
16
+
17
+ try:
18
+ import anthropic
19
+ except ImportError:
20
+ print(
21
+ "The 'anthropic' package is required for the agent.\n"
22
+ "Install it with: pip install anthropic\n"
23
+ "Or: uv pip install anthropic",
24
+ file=sys.stderr,
25
+ )
26
+ sys.exit(1)
27
+
28
+ from .chrome import ChromeClient, CDPError
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # System prompt — DDM-first methodology
32
+ # ---------------------------------------------------------------------------
33
+
34
+ SYSTEM_PROMPT = textwrap.dedent("""\
35
+ You are a browser automation agent. You control a local Chrome browser through
36
+ CDP (Chrome DevTools Protocol) tools. Your job is to help the user accomplish
37
+ browsing tasks by navigating pages, clicking elements, filling forms, and
38
+ extracting information.
39
+
40
+ ## Critical Rules
41
+
42
+ 1. **DDM First, Always** — Use `ddm` for orientation (~500 tokens). Only use
43
+ `screenshot` as last resort for CAPTCHAs or visual-only content (~2100 tokens).
44
+
45
+ 2. **Navigate and Click return DDM inline** — After `navigate` or `click`, the
46
+ page layout is already in the response. Do NOT call `ddm` separately after them.
47
+ Only call `ddm` separately after `type`, or for `--text`, `--at`, `--find`.
48
+
49
+ 3. **Click to focus before typing** — Always click the target input first.
50
+ Key events go to whichever element has focus.
51
+
52
+ 4. **Probe on first visit** — On the first page of a new domain, the Intel probe
53
+ runs automatically with DDM. Check the strategy line to decide extraction method:
54
+ - `js_global > 50%` → use `intel --stores` → `intel --find-paths` → `js`
55
+ - `host_attrs > 50%` → use `intel --extract --strategy host_attrs`
56
+ - `data_testid > 40%` → use `intel --extract --strategy data_testid`
57
+ - Otherwise → stick with DDM (`--text`, `--at`, JS `querySelectorAll`)
58
+
59
+ ## Workflow
60
+
61
+ 1. **ORIENT** — Read the DDM layout from navigate/click output
62
+ 2. **IDENTIFY** — `ddm --at x,y` on elements you need details about
63
+ 3. **ACT** — Click coordinates, type text, or run JS
64
+ 4. **VERIFY** — Check the layout changed after actions
65
+ 5. **EXTRACT** — Use `ddm --text`, `intel --extract`, or `js` based on page type
66
+
67
+ ## Tool Tips
68
+
69
+ - Coordinates come from DDM: `label@x,y` format. Use those x,y for clicks.
70
+ - `ddm --text --find "keyword"` searches page text and shows nearby elements.
71
+ - `ddm --tabs` lists open tabs. Use `--tab <id>` on any tool for multi-tab work.
72
+ - `js` can run arbitrary JavaScript for complex extraction or interaction.
73
+ - `intel --stores` reveals JS data stores (YouTube, Next.js, Nuxt, etc.)
74
+ """)
75
+
76
+ # ---------------------------------------------------------------------------
77
+ # Tool definitions for Claude
78
+ # ---------------------------------------------------------------------------
79
+
80
+ TOOLS = [
81
+ {
82
+ "name": "navigate",
83
+ "description": "Navigate to a URL. Returns page layout with interactive elements and Intel probe.",
84
+ "input_schema": {
85
+ "type": "object",
86
+ "properties": {
87
+ "url": {"type": "string", "description": "URL to navigate to"},
88
+ "tab": {"type": "string", "description": "Tab ID (optional, default: current tab)"},
89
+ },
90
+ "required": ["url"],
91
+ },
92
+ },
93
+ {
94
+ "name": "click",
95
+ "description": "Click at pixel coordinates or a CSS selector. Returns updated page layout.",
96
+ "input_schema": {
97
+ "type": "object",
98
+ "properties": {
99
+ "x": {"type": "integer", "description": "X pixel coordinate"},
100
+ "y": {"type": "integer", "description": "Y pixel coordinate"},
101
+ "selector": {"type": "string", "description": "CSS selector (alternative to x,y)"},
102
+ },
103
+ },
104
+ },
105
+ {
106
+ "name": "type_text",
107
+ "description": "Type text into the currently focused element. Click the input first!",
108
+ "input_schema": {
109
+ "type": "object",
110
+ "properties": {
111
+ "text": {"type": "string", "description": "Text to type"},
112
+ },
113
+ "required": ["text"],
114
+ },
115
+ },
116
+ {
117
+ "name": "press_enter",
118
+ "description": "Press Enter key (submit form, confirm input, etc.)",
119
+ "input_schema": {"type": "object", "properties": {}},
120
+ },
121
+ {
122
+ "name": "key_press",
123
+ "description": "Press a keyboard key (Escape, Tab, ArrowDown, etc.)",
124
+ "input_schema": {
125
+ "type": "object",
126
+ "properties": {
127
+ "key": {"type": "string", "description": "Key name"},
128
+ "modifiers": {"type": "integer", "description": "Modifier bitmask: 1=Alt 2=Ctrl 4=Meta 8=Shift", "default": 0},
129
+ },
130
+ "required": ["key"],
131
+ },
132
+ },
133
+ {
134
+ "name": "scroll",
135
+ "description": "Scroll the page in a direction.",
136
+ "input_schema": {
137
+ "type": "object",
138
+ "properties": {
139
+ "direction": {"type": "string", "enum": ["up", "down", "left", "right"], "default": "down"},
140
+ "amount": {"type": "integer", "description": "Pixels to scroll (default: 500)", "default": 500},
141
+ },
142
+ },
143
+ },
144
+ {
145
+ "name": "ddm",
146
+ "description": "DOM Density Map — text-based page layout for orientation. Use flags: --text (extract text), --find <keyword> (search text), --at <x>,<y> (element details at coords), --tabs (list tabs), --sparse (compressed), --interactive (elements only).",
147
+ "input_schema": {
148
+ "type": "object",
149
+ "properties": {
150
+ "flags": {
151
+ "type": "array",
152
+ "items": {"type": "string"},
153
+ "description": "DDM flags, e.g. ['--text', '--find', 'price'] or ['--at', '694,584']",
154
+ "default": [],
155
+ },
156
+ },
157
+ },
158
+ },
159
+ {
160
+ "name": "intel",
161
+ "description": "Page intelligence — fingerprint page and rank extraction strategies. Use flags: --probe (fingerprint), --extract (full pipeline), --stores (JS data stores), --shape <global> (variable structure), --find-paths <global> <pattern> (search globals).",
162
+ "input_schema": {
163
+ "type": "object",
164
+ "properties": {
165
+ "flags": {
166
+ "type": "array",
167
+ "items": {"type": "string"},
168
+ "description": "Intel flags, e.g. ['--probe'] or ['--extract', '--strategy', 'host_attrs']",
169
+ "default": [],
170
+ },
171
+ },
172
+ },
173
+ },
174
+ {
175
+ "name": "js",
176
+ "description": "Execute JavaScript on the page and return the result.",
177
+ "input_schema": {
178
+ "type": "object",
179
+ "properties": {
180
+ "expression": {"type": "string", "description": "JavaScript expression to evaluate"},
181
+ },
182
+ "required": ["expression"],
183
+ },
184
+ },
185
+ {
186
+ "name": "screenshot",
187
+ "description": "Take a screenshot (last resort — costs ~2100 tokens). Use DDM first.",
188
+ "input_schema": {
189
+ "type": "object",
190
+ "properties": {
191
+ "output": {"type": "string", "description": "Output file path", "default": "/tmp/unchained_screenshot.png"},
192
+ },
193
+ },
194
+ },
195
+ {
196
+ "name": "create_tab",
197
+ "description": "Open a new browser tab, optionally navigating to a URL.",
198
+ "input_schema": {
199
+ "type": "object",
200
+ "properties": {
201
+ "url": {"type": "string", "description": "URL to open", "default": "about:blank"},
202
+ },
203
+ },
204
+ },
205
+ {
206
+ "name": "close_tab",
207
+ "description": "Close a browser tab by ID.",
208
+ "input_schema": {
209
+ "type": "object",
210
+ "properties": {
211
+ "tab_id": {"type": "string", "description": "Tab ID to close"},
212
+ },
213
+ "required": ["tab_id"],
214
+ },
215
+ },
216
+ ]
217
+
218
+ # ---------------------------------------------------------------------------
219
+ # Tool execution
220
+ # ---------------------------------------------------------------------------
221
+
222
+ def _capture_ddm(port: int, tab_id: str, flags: list[str]) -> str:
223
+ """Run DDM and capture stdout."""
224
+ from . import ddm as _ddm
225
+ old_stdout, old_stderr = sys.stdout, sys.stderr
226
+ captured = io.StringIO()
227
+ err = io.StringIO()
228
+ sys.stdout = captured
229
+ sys.stderr = err
230
+ try:
231
+ _ddm.run_ddm(port, tab_id, flags)
232
+ except SystemExit:
233
+ pass
234
+ finally:
235
+ sys.stdout, sys.stderr = old_stdout, old_stderr
236
+ output = captured.getvalue()
237
+ errors = err.getvalue()
238
+ if errors and not output:
239
+ return f"Error: {errors.strip()}"
240
+ return output.strip()
241
+
242
+
243
+ def _capture_intel(port: int, tab_id: str, flags: list[str]) -> str:
244
+ """Run Intel and capture stdout."""
245
+ from . import intel as _intel
246
+ old_stdout, old_stderr = sys.stdout, sys.stderr
247
+ captured = io.StringIO()
248
+ err = io.StringIO()
249
+ sys.stdout = captured
250
+ sys.stderr = err
251
+ try:
252
+ _intel.run_intel(port, tab_id, flags)
253
+ except SystemExit:
254
+ pass
255
+ finally:
256
+ sys.stdout, sys.stderr = old_stdout, old_stderr
257
+ output = captured.getvalue()
258
+ errors = err.getvalue()
259
+ if errors and not output:
260
+ return f"Error: {errors.strip()}"
261
+ return output.strip()
262
+
263
+
264
+ def execute_tool(client: ChromeClient, tool_name: str, tool_input: dict) -> str:
265
+ """Execute a tool call and return the result string."""
266
+ port = client.port
267
+ tab = tool_input.get("tab", "auto")
268
+
269
+ try:
270
+ if tool_name == "navigate":
271
+ tab_id = client.resolve_tab(tab)
272
+ client.navigate(tab_id, tool_input["url"])
273
+ final = client.js_eval(tab_id, "window.location.href") or tool_input["url"]
274
+ result = f"Navigated → {final}\n"
275
+ # Inline DDM + Intel probe
276
+ ddm_out = _capture_ddm(port, tab_id, ["--llm-2pass", "--cols", "60"])
277
+ if ddm_out:
278
+ result += f"\n{ddm_out}"
279
+ return result
280
+
281
+ elif tool_name == "click":
282
+ tab_id = client.resolve_tab(tab)
283
+ if "selector" in tool_input and tool_input["selector"]:
284
+ pos = client.click_selector(tab_id, tool_input["selector"])
285
+ click_desc = f"Clicked {tool_input['selector']!r} at ({int(pos['x'])}, {int(pos['y'])})"
286
+ else:
287
+ x, y = tool_input.get("x", 0), tool_input.get("y", 0)
288
+ client.click(tab_id, x, y)
289
+ click_desc = f"Clicked ({x}, {y})"
290
+ # Inline DDM after click (no probe — same domain)
291
+ ddm_out = _capture_ddm(port, tab_id, ["--llm-2pass", "--cols", "60", "--no-probe"])
292
+ result = click_desc
293
+ if ddm_out:
294
+ result += f"\n\n{ddm_out}"
295
+ return result
296
+
297
+ elif tool_name == "type_text":
298
+ tab_id = client.resolve_tab(tab)
299
+ client.type_text(tab_id, tool_input["text"])
300
+ preview = tool_input["text"][:40]
301
+ return f"Typed: {preview!r}"
302
+
303
+ elif tool_name == "press_enter":
304
+ tab_id = client.resolve_tab(tab)
305
+ client.key_press(tab_id, "Enter")
306
+ # Inline DDM after enter (may submit form / navigate)
307
+ ddm_out = _capture_ddm(port, tab_id, ["--llm-2pass", "--cols", "60", "--no-probe"])
308
+ result = "Pressed Enter"
309
+ if ddm_out:
310
+ result += f"\n\n{ddm_out}"
311
+ return result
312
+
313
+ elif tool_name == "key_press":
314
+ tab_id = client.resolve_tab(tab)
315
+ client.key_press(tab_id, tool_input["key"], tool_input.get("modifiers", 0))
316
+ return f"Pressed {tool_input['key']}"
317
+
318
+ elif tool_name == "scroll":
319
+ tab_id = client.resolve_tab(tab)
320
+ direction = tool_input.get("direction", "down")
321
+ amount = tool_input.get("amount", 500)
322
+ client.scroll(tab_id, direction, amount)
323
+ # DDM after scroll to show new viewport
324
+ ddm_out = _capture_ddm(port, tab_id, ["--llm-2pass", "--cols", "60", "--no-probe"])
325
+ result = f"Scrolled {direction} {amount}px"
326
+ if ddm_out:
327
+ result += f"\n\n{ddm_out}"
328
+ return result
329
+
330
+ elif tool_name == "ddm":
331
+ tab_id = client.resolve_tab(tab)
332
+ flags = tool_input.get("flags", [])
333
+ return _capture_ddm(port, tab_id, flags)
334
+
335
+ elif tool_name == "intel":
336
+ tab_id = client.resolve_tab(tab)
337
+ flags = tool_input.get("flags", ["--probe"])
338
+ return _capture_intel(port, tab_id, flags)
339
+
340
+ elif tool_name == "js":
341
+ tab_id = client.resolve_tab(tab)
342
+ result = client.js_eval(tab_id, tool_input["expression"])
343
+ if isinstance(result, (dict, list)):
344
+ return json.dumps(result, indent=2)
345
+ return str(result) if result is not None else "(undefined)"
346
+
347
+ elif tool_name == "screenshot":
348
+ tab_id = client.resolve_tab(tab)
349
+ png = client.screenshot(tab_id)
350
+ output = tool_input.get("output", "/tmp/unchained_screenshot.png")
351
+ with open(output, "wb") as f:
352
+ f.write(png)
353
+ return f"Screenshot saved → {output} ({len(png):,} bytes)"
354
+
355
+ elif tool_name == "create_tab":
356
+ url = tool_input.get("url", "about:blank")
357
+ info = client.create_tab(url)
358
+ return f"Created tab [{info.get('id', '?')}] → {info.get('url', url)}"
359
+
360
+ elif tool_name == "close_tab":
361
+ client.close_tab(tool_input["tab_id"])
362
+ return f"Closed tab {tool_input['tab_id']}"
363
+
364
+ else:
365
+ return f"Unknown tool: {tool_name}"
366
+
367
+ except CDPError as e:
368
+ return f"Error: {e}"
369
+ except Exception as e:
370
+ return f"Error: {type(e).__name__}: {e}"
371
+
372
+
373
+ # ---------------------------------------------------------------------------
374
+ # Agent loop
375
+ # ---------------------------------------------------------------------------
376
+
377
+ MODEL_ALIASES = {
378
+ "sonnet": "claude-sonnet-4-20250514",
379
+ "opus": "claude-opus-4-20250514",
380
+ "haiku": "claude-haiku-4-5-20251001",
381
+ }
382
+
383
+
384
+ def run_agent(port: int = 9222, model: str = "sonnet", initial_task: str | None = None):
385
+ """Run the interactive Claude browser agent."""
386
+ model_id = MODEL_ALIASES.get(model, model)
387
+ api_key = os.environ.get("ANTHROPIC_API_KEY")
388
+ if not api_key:
389
+ print("Error: ANTHROPIC_API_KEY not set.", file=sys.stderr)
390
+ print("Set it with: export ANTHROPIC_API_KEY=sk-ant-...", file=sys.stderr)
391
+ sys.exit(1)
392
+
393
+ sdk = anthropic.Anthropic(api_key=api_key)
394
+ client = ChromeClient(port=port)
395
+
396
+ # Verify Chrome is reachable
397
+ try:
398
+ client.browser_version()
399
+ except CDPError:
400
+ print(f"Chrome not reachable on port {port}.")
401
+ print(f"Start it with: unchained launch")
402
+ sys.exit(1)
403
+
404
+ messages: list[dict] = []
405
+
406
+ print(f"Unchained Agent — model: {model_id}, port: {port}")
407
+ print("Type your task, or 'quit' to exit.\n")
408
+
409
+ # If initial task provided, use it as first message
410
+ if initial_task:
411
+ print(f"You: {initial_task}\n")
412
+ messages.append({"role": "user", "content": initial_task})
413
+ else:
414
+ try:
415
+ user_input = input("You: ").strip()
416
+ except (EOFError, KeyboardInterrupt):
417
+ print()
418
+ return
419
+ if not user_input or user_input.lower() in ("quit", "exit", "q"):
420
+ return
421
+ messages.append({"role": "user", "content": user_input})
422
+
423
+ while True:
424
+ # Call Claude
425
+ try:
426
+ response = sdk.messages.create(
427
+ model=model_id,
428
+ max_tokens=4096,
429
+ system=SYSTEM_PROMPT,
430
+ tools=TOOLS,
431
+ messages=messages,
432
+ )
433
+ except anthropic.APIError as e:
434
+ print(f"\nAPI Error: {e}")
435
+ break
436
+
437
+ # Process response
438
+ assistant_content = response.content
439
+ messages.append({"role": "assistant", "content": assistant_content})
440
+
441
+ # Print text blocks and collect tool uses
442
+ tool_uses = []
443
+ for block in assistant_content:
444
+ if block.type == "text":
445
+ print(f"\nAgent: {block.text}")
446
+ elif block.type == "tool_use":
447
+ tool_uses.append(block)
448
+
449
+ # If no tool calls, wait for next user input
450
+ if response.stop_reason == "end_turn" or not tool_uses:
451
+ print()
452
+ try:
453
+ user_input = input("You: ").strip()
454
+ except (EOFError, KeyboardInterrupt):
455
+ print()
456
+ return
457
+ if not user_input or user_input.lower() in ("quit", "exit", "q"):
458
+ return
459
+ messages.append({"role": "user", "content": user_input})
460
+ continue
461
+
462
+ # Execute tool calls
463
+ tool_results = []
464
+ for tool_use in tool_uses:
465
+ name = tool_use.name
466
+ inp = tool_use.input
467
+ print(f"\n [{name}] {json.dumps(inp, separators=(',', ':'))[:120]}")
468
+
469
+ result = execute_tool(client, name, inp)
470
+
471
+ # Truncate very long results
472
+ if len(result) > 8000:
473
+ result = result[:7900] + f"\n... (truncated, {len(result)} chars total)"
474
+
475
+ # Show brief preview
476
+ preview = result[:200].replace('\n', ' ')
477
+ if len(result) > 200:
478
+ preview += "..."
479
+ print(f" → {preview}")
480
+
481
+ tool_results.append({
482
+ "type": "tool_result",
483
+ "tool_use_id": tool_use.id,
484
+ "content": result,
485
+ })
486
+
487
+ messages.append({"role": "user", "content": tool_results})