spider-browser 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34):
  1. spider_browser-0.1.0/.gitignore +22 -0
  2. spider_browser-0.1.0/PKG-INFO +12 -0
  3. spider_browser-0.1.0/pyproject.toml +27 -0
  4. spider_browser-0.1.0/spider_browser/__init__.py +41 -0
  5. spider_browser-0.1.0/spider_browser/ai/__init__.py +0 -0
  6. spider_browser-0.1.0/spider_browser/ai/act.py +41 -0
  7. spider_browser-0.1.0/spider_browser/ai/agent.py +440 -0
  8. spider_browser-0.1.0/spider_browser/ai/extract.py +69 -0
  9. spider_browser-0.1.0/spider_browser/ai/llm_provider.py +64 -0
  10. spider_browser-0.1.0/spider_browser/ai/observe.py +107 -0
  11. spider_browser-0.1.0/spider_browser/ai/prompts.py +133 -0
  12. spider_browser-0.1.0/spider_browser/ai/providers/__init__.py +0 -0
  13. spider_browser-0.1.0/spider_browser/ai/providers/anthropic_provider.py +92 -0
  14. spider_browser-0.1.0/spider_browser/ai/providers/openai_provider.py +55 -0
  15. spider_browser-0.1.0/spider_browser/events/__init__.py +0 -0
  16. spider_browser-0.1.0/spider_browser/events/emitter.py +40 -0
  17. spider_browser-0.1.0/spider_browser/events/types.py +5 -0
  18. spider_browser-0.1.0/spider_browser/page.py +611 -0
  19. spider_browser-0.1.0/spider_browser/protocol/__init__.py +0 -0
  20. spider_browser-0.1.0/spider_browser/protocol/bidi_session.py +194 -0
  21. spider_browser-0.1.0/spider_browser/protocol/cdp_session.py +353 -0
  22. spider_browser-0.1.0/spider_browser/protocol/protocol_adapter.py +286 -0
  23. spider_browser-0.1.0/spider_browser/protocol/transport.py +304 -0
  24. spider_browser-0.1.0/spider_browser/protocol/types.py +27 -0
  25. spider_browser-0.1.0/spider_browser/retry/__init__.py +0 -0
  26. spider_browser-0.1.0/spider_browser/retry/browser_selector.py +66 -0
  27. spider_browser-0.1.0/spider_browser/retry/failure_tracker.py +73 -0
  28. spider_browser-0.1.0/spider_browser/retry/retry_engine.py +419 -0
  29. spider_browser-0.1.0/spider_browser/spider_browser.py +278 -0
  30. spider_browser-0.1.0/spider_browser/utils/__init__.py +0 -0
  31. spider_browser-0.1.0/spider_browser/utils/dom.py +55 -0
  32. spider_browser-0.1.0/spider_browser/utils/errors.py +75 -0
  33. spider_browser-0.1.0/spider_browser/utils/html.py +10 -0
  34. spider_browser-0.1.0/spider_browser/utils/logger.py +22 -0
@@ -0,0 +1,22 @@
1
+ node_modules/
2
+ dist/
3
+ *.tsbuildinfo
4
+ .env
5
+ .env.*
6
+
7
+ # Python
8
+ __pycache__/
9
+ *.pyc
10
+ *.pyo
11
+ *.egg-info/
12
+ build/
13
+ *.egg
14
+
15
+ # Rust
16
+ target/
17
+
18
+ # Test data (synced to spider-browser-dataset)
19
+ stealth-*.csv
20
+ stealth-*.json
21
+ automation-showcase*.csv
22
+ .claude/
@@ -0,0 +1,12 @@
1
+ Metadata-Version: 2.4
2
+ Name: spider-browser
3
+ Version: 0.1.0
4
+ Summary: Browser automation client for Spider's pre-warmed browser fleet with smart retry and browser switching
5
+ License-Expression: MIT
6
+ Requires-Python: >=3.10
7
+ Requires-Dist: httpx>=0.27.0
8
+ Requires-Dist: pydantic>=2.0
9
+ Requires-Dist: websockets>=12.0
10
+ Provides-Extra: dev
11
+ Requires-Dist: pytest-asyncio>=0.23; extra == 'dev'
12
+ Requires-Dist: pytest>=8.0; extra == 'dev'
@@ -0,0 +1,27 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "spider-browser"
7
+ version = "0.1.0"
8
+ description = "Browser automation client for Spider's pre-warmed browser fleet with smart retry and browser switching"
9
+ requires-python = ">=3.10"
10
+ license = "MIT"
11
+ dependencies = [
12
+ "websockets>=12.0",
13
+ "pydantic>=2.0",
14
+ "httpx>=0.27.0",
15
+ ]
16
+
17
+ [project.optional-dependencies]
18
+ dev = [
19
+ "pytest>=8.0",
20
+ "pytest-asyncio>=0.23",
21
+ ]
22
+
23
+ [tool.hatch.build.targets.sdist]
24
+ include = ["spider_browser/", "LICENSE"]
25
+
26
+ [tool.hatch.build.targets.wheel]
27
+ packages = ["spider_browser"]
@@ -0,0 +1,41 @@
1
+ """spider-browser — Python browser automation client for Spider's pre-warmed browser fleet."""
2
+
3
+ from .spider_browser import SpiderBrowser, SpiderBrowserOptions
4
+ from .page import SpiderPage
5
+ from .events.types import BrowserType
6
+ from .events.emitter import SpiderEventEmitter
7
+ from .utils.errors import (
8
+ SpiderError,
9
+ ConnectionError as SpiderConnectionError,
10
+ AuthError,
11
+ RateLimitError,
12
+ BlockedError,
13
+ BackendUnavailableError,
14
+ TimeoutError as SpiderTimeoutError,
15
+ ProtocolError,
16
+ LLMError,
17
+ )
18
+ from .ai.llm_provider import LLMConfig, create_provider
19
+ from .ai.agent import Agent, AgentOptions, AgentResult
20
+
21
+ __all__ = [
22
+ "SpiderBrowser",
23
+ "SpiderBrowserOptions",
24
+ "SpiderPage",
25
+ "BrowserType",
26
+ "SpiderEventEmitter",
27
+ "SpiderError",
28
+ "SpiderConnectionError",
29
+ "AuthError",
30
+ "RateLimitError",
31
+ "BlockedError",
32
+ "BackendUnavailableError",
33
+ "SpiderTimeoutError",
34
+ "ProtocolError",
35
+ "LLMError",
36
+ "LLMConfig",
37
+ "create_provider",
38
+ "Agent",
39
+ "AgentOptions",
40
+ "AgentResult",
41
+ ]
File without changes
@@ -0,0 +1,41 @@
1
+ """act() — single action from natural language."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+
7
+ from ..protocol.protocol_adapter import ProtocolAdapter
8
+ from .llm_provider import LLMProvider
9
+ from .prompts import SYSTEM_PROMPT, build_user_message
10
+ from .agent import execute_action
11
+
12
+
13
async def act(
    adapter: ProtocolAdapter,
    llm: LLMProvider,
    instruction: str,
) -> None:
    """
    Perform one natural-language instruction on the current page.

    The screenshot, HTML, URL and title are fetched from the adapter
    concurrently, handed to the LLM together with the instruction, and
    every step of the returned plan is run through ``execute_action``.
    A plan that is not a dict, or that has no ``steps``, results in no
    actions. Errors raised while executing a step propagate to the caller.
    """
    page_probes = (
        adapter.capture_screenshot(),
        adapter.get_html(),
        adapter.evaluate("window.location.href"),
        adapter.evaluate("document.title"),
    )
    shot, markup, page_url, page_title = await asyncio.gather(*page_probes)

    task_context = f"Task: {instruction}\nPAGE TITLE: {page_title}"
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": build_user_message(
                str(page_url), str(markup), str(shot), task_context
            ),
        },
    ]
    plan = await llm.chat_json(messages)

    # Anything other than a dict carries no executable steps.
    if not isinstance(plan, dict):
        return

    for step in plan.get("steps", []):
        await execute_action(adapter, step)
        # Short pause after each action before running the next one.
        await asyncio.sleep(0.2)
@@ -0,0 +1,440 @@
1
+ """Autonomous multi-step agent."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ from dataclasses import dataclass, field
8
+ from typing import Any, Dict, List, Optional
9
+
10
+ from ..protocol.protocol_adapter import ProtocolAdapter
11
+ from ..events.emitter import SpiderEventEmitter
12
+ from ..utils.errors import TimeoutError
13
+ from ..utils.logger import logger
14
+ from .llm_provider import LLMProvider
15
+ from .prompts import SYSTEM_PROMPT, build_user_message
16
+
17
+
18
+ @dataclass
19
+ class AgentOptions:
20
+ """Agent configuration."""
21
+
22
+ max_rounds: int = 30
23
+ step_delay_ms: int = 1500
24
+ instruction: Optional[str] = None
25
+
26
+
27
+ @dataclass
28
+ class AgentResult:
29
+ """Agent execution result."""
30
+
31
+ done: bool
32
+ rounds: int
33
+ extracted: Any = None
34
+ label: str = ""
35
+
36
+
37
+ class Agent:
38
+ """
39
+ Autonomous multi-step agent.
40
+
41
+ Uses the same action vocabulary and system prompt as Spider's
42
+ server-side captcha solver.
43
+
44
+ Loop: screenshot -> HTML -> LLM -> parse plan -> execute actions -> repeat.
45
+ """
46
+
47
+ def __init__(
48
+ self,
49
+ adapter: ProtocolAdapter,
50
+ llm: LLMProvider,
51
+ emitter: SpiderEventEmitter,
52
+ options: Optional[AgentOptions] = None,
53
+ ) -> None:
54
+ self._adapter = adapter
55
+ self._llm = llm
56
+ self._emitter = emitter
57
+ opts = options or AgentOptions()
58
+ self._max_rounds = opts.max_rounds
59
+ self._step_delay_s = opts.step_delay_ms / 1000.0
60
+ self._instruction = opts.instruction
61
+
62
+ async def execute(self, instruction: str) -> AgentResult:
63
+ """Execute the agent loop until done or max rounds reached."""
64
+ extracted: Any = None
65
+ last_label = ""
66
+
67
+ await asyncio.sleep(0.5)
68
+
69
+ for round_num in range(self._max_rounds):
70
+ # 1. Screenshot
71
+ try:
72
+ screenshot = await self._adapter.capture_screenshot()
73
+ except Exception as err:
74
+ logger.warning(f"agent: screenshot failed round {round_num}: {err}")
75
+ break
76
+
77
+ # 2. HTML
78
+ try:
79
+ html = await self._adapter.get_html()
80
+ except Exception as err:
81
+ logger.warning(f"agent: get HTML failed round {round_num}: {err}")
82
+ break
83
+
84
+ # 3. URL and title
85
+ try:
86
+ url = str(await self._adapter.evaluate("window.location.href"))
87
+ except Exception:
88
+ url = "unknown"
89
+ try:
90
+ title = str(await self._adapter.evaluate("document.title"))
91
+ except Exception:
92
+ title = ""
93
+
94
+ # 4. Call LLM
95
+ context = f"Round {round_num + 1}/{self._max_rounds}. Task: {instruction}\nPAGE TITLE: {title}"
96
+
97
+ try:
98
+ plan = await self._llm.chat_json([
99
+ {"role": "system", "content": SYSTEM_PROMPT},
100
+ {"role": "user", "content": build_user_message(url, html, screenshot, context)},
101
+ ])
102
+ except Exception as err:
103
+ logger.warning(f"agent: LLM call failed round {round_num}: {err}")
104
+ await asyncio.sleep(2.0)
105
+ continue
106
+
107
+ if not isinstance(plan, dict):
108
+ await asyncio.sleep(self._step_delay_s)
109
+ continue
110
+
111
+ last_label = plan.get("label", "")
112
+ if plan.get("extracted") is not None:
113
+ extracted = plan["extracted"]
114
+
115
+ steps = plan.get("steps", [])
116
+ logger.info(f"agent: round {round_num + 1} label={last_label} done={plan.get('done')} steps={len(steps)}")
117
+
118
+ self._emitter.emit("agent.step", {
119
+ "round": round_num + 1,
120
+ "label": last_label,
121
+ "stepsCount": len(steps),
122
+ })
123
+
124
+ # 5. Check if done
125
+ if plan.get("done"):
126
+ self._emitter.emit("agent.done", {"rounds": round_num + 1, "result": extracted})
127
+ return AgentResult(done=True, rounds=round_num + 1, extracted=extracted, label=last_label)
128
+
129
+ if not steps:
130
+ logger.info("agent: no steps, retrying")
131
+ await asyncio.sleep(self._step_delay_s)
132
+ continue
133
+
134
+ # 6. Execute steps
135
+ for i, action in enumerate(steps):
136
+ try:
137
+ await execute_action(self._adapter, action)
138
+ except Exception as err:
139
+ logger.warning(f"agent: action failed round {round_num} step {i}: {err}")
140
+ break
141
+ await asyncio.sleep(0.2)
142
+
143
+ # 7. Wait for page settle
144
+ await asyncio.sleep(self._step_delay_s)
145
+
146
+ logger.warning("agent: max rounds exceeded")
147
+ self._emitter.emit("agent.error", {"error": "max rounds exceeded", "round": self._max_rounds})
148
+ return AgentResult(done=False, rounds=self._max_rounds, extracted=extracted, label=last_label)
149
+
150
+
151
+ # -------------------------------------------------------------------
152
+ # Action executor — mirrors agent.rs execute_action()
153
+ # -------------------------------------------------------------------
154
+
155
+ async def execute_action(adapter: ProtocolAdapter, action: Dict[str, Any]) -> None:
156
+ """Execute a single agent action via the protocol adapter."""
157
+
158
+ # Click actions
159
+ if "Click" in action:
160
+ pos = await _get_element_center(adapter, action["Click"])
161
+ await adapter.click_point(pos["x"], pos["y"])
162
+ return
163
+
164
+ if "ClickAll" in action:
165
+ selector = action["ClickAll"]
166
+ points = await adapter.evaluate(f"""
167
+ (function() {{
168
+ const els = document.querySelectorAll({json.dumps(selector)});
169
+ return Array.from(els).map(el => {{
170
+ const r = el.getBoundingClientRect();
171
+ return {{ x: r.x + r.width / 2, y: r.y + r.height / 2 }};
172
+ }});
173
+ }})()
174
+ """)
175
+ if isinstance(points, list):
176
+ for pt in points:
177
+ if isinstance(pt, dict):
178
+ await adapter.click_point(pt["x"], pt["y"])
179
+ await asyncio.sleep(0.1)
180
+ return
181
+
182
+ if "ClickPoint" in action:
183
+ pt = action["ClickPoint"]
184
+ await adapter.click_point(pt["x"], pt["y"])
185
+ return
186
+
187
+ if "ClickHold" in action:
188
+ data = action["ClickHold"]
189
+ pos = await _get_element_center(adapter, data["selector"])
190
+ await adapter.click_hold_point(pos["x"], pos["y"], data.get("hold_ms", 500))
191
+ return
192
+
193
+ if "ClickHoldPoint" in action:
194
+ pt = action["ClickHoldPoint"]
195
+ await adapter.click_hold_point(pt["x"], pt["y"], pt.get("hold_ms", 500))
196
+ return
197
+
198
+ if "DoubleClick" in action:
199
+ pos = await _get_element_center(adapter, action["DoubleClick"])
200
+ await adapter.double_click_point(pos["x"], pos["y"])
201
+ return
202
+
203
+ if "DoubleClickPoint" in action:
204
+ pt = action["DoubleClickPoint"]
205
+ await adapter.double_click_point(pt["x"], pt["y"])
206
+ return
207
+
208
+ if "RightClick" in action:
209
+ pos = await _get_element_center(adapter, action["RightClick"])
210
+ await adapter.right_click_point(pos["x"], pos["y"])
211
+ return
212
+
213
+ if "RightClickPoint" in action:
214
+ pt = action["RightClickPoint"]
215
+ await adapter.right_click_point(pt["x"], pt["y"])
216
+ return
217
+
218
+ if "WaitForAndClick" in action:
219
+ selector = action["WaitForAndClick"]
220
+ await _wait_for_element(adapter, selector, 5000)
221
+ pos = await _get_element_center(adapter, selector)
222
+ await adapter.click_point(pos["x"], pos["y"])
223
+ return
224
+
225
+ # Drag actions
226
+ if "ClickDrag" in action:
227
+ data = action["ClickDrag"]
228
+ f = await _get_element_center(adapter, data["from"])
229
+ t = await _get_element_center(adapter, data["to"])
230
+ await adapter.drag_point(f["x"], f["y"], t["x"], t["y"])
231
+ return
232
+
233
+ if "ClickDragPoint" in action:
234
+ pt = action["ClickDragPoint"]
235
+ await adapter.drag_point(pt["from_x"], pt["from_y"], pt["to_x"], pt["to_y"])
236
+ return
237
+
238
+ # Input actions
239
+ if "Type" in action:
240
+ await adapter.insert_text(action["Type"]["value"])
241
+ return
242
+
243
+ if "Fill" in action:
244
+ data = action["Fill"]
245
+ selector = data["selector"]
246
+ value = data["value"]
247
+ await adapter.evaluate(f"""
248
+ (function() {{
249
+ const el = document.querySelector({json.dumps(selector)});
250
+ if (el) {{ el.focus(); el.value = ''; }}
251
+ }})()
252
+ """)
253
+ try:
254
+ pos = await _get_element_center(adapter, selector)
255
+ await adapter.click_point(pos["x"], pos["y"])
256
+ except Exception:
257
+ pass
258
+ await adapter.insert_text(value)
259
+ await adapter.evaluate(f"""
260
+ (function() {{
261
+ const el = document.querySelector({json.dumps(selector)});
262
+ if (el) {{
263
+ el.dispatchEvent(new Event('input', {{ bubbles: true }}));
264
+ el.dispatchEvent(new Event('change', {{ bubbles: true }}));
265
+ }}
266
+ }})()
267
+ """)
268
+ return
269
+
270
+ if "Clear" in action:
271
+ selector = action["Clear"]
272
+ await adapter.evaluate(f"document.querySelector({json.dumps(selector)}).value = ''")
273
+ return
274
+
275
+ if "Press" in action:
276
+ await adapter.press_key(action["Press"])
277
+ return
278
+
279
+ if "KeyDown" in action:
280
+ await adapter.key_down(action["KeyDown"])
281
+ return
282
+
283
+ if "KeyUp" in action:
284
+ await adapter.key_up(action["KeyUp"])
285
+ return
286
+
287
+ # Select & Focus
288
+ if "Select" in action:
289
+ data = action["Select"]
290
+ selector = data["selector"]
291
+ value = data["value"]
292
+ await adapter.evaluate(f"""
293
+ (function() {{
294
+ const el = document.querySelector({json.dumps(selector)});
295
+ if (el) {{
296
+ el.value = {json.dumps(value)};
297
+ el.dispatchEvent(new Event('change', {{ bubbles: true }}));
298
+ }}
299
+ }})()
300
+ """)
301
+ return
302
+
303
+ if "Focus" in action:
304
+ await adapter.evaluate(f"document.querySelector({json.dumps(action['Focus'])})?.focus()")
305
+ return
306
+
307
+ if "Blur" in action:
308
+ await adapter.evaluate(f"document.querySelector({json.dumps(action['Blur'])})?.blur()")
309
+ return
310
+
311
+ if "Hover" in action:
312
+ pos = await _get_element_center(adapter, action["Hover"])
313
+ await adapter.hover_point(pos["x"], pos["y"])
314
+ return
315
+
316
+ if "HoverPoint" in action:
317
+ pt = action["HoverPoint"]
318
+ await adapter.hover_point(pt["x"], pt["y"])
319
+ return
320
+
321
+ # Scroll actions
322
+ if "ScrollY" in action:
323
+ await adapter.evaluate(f"window.scrollBy(0, {action['ScrollY']})")
324
+ return
325
+
326
+ if "ScrollX" in action:
327
+ await adapter.evaluate(f"window.scrollBy({action['ScrollX']}, 0)")
328
+ return
329
+
330
+ if "ScrollTo" in action:
331
+ selector = action["ScrollTo"]["selector"] if isinstance(action["ScrollTo"], dict) else action["ScrollTo"]
332
+ await adapter.evaluate(
333
+ f"document.querySelector({json.dumps(selector)})?.scrollIntoView({{ behavior: 'smooth', block: 'center' }})"
334
+ )
335
+ return
336
+
337
+ if "ScrollToPoint" in action:
338
+ pt = action["ScrollToPoint"]
339
+ await adapter.evaluate(f"window.scrollTo({pt['x']}, {pt['y']})")
340
+ return
341
+
342
+ if "InfiniteScroll" in action:
343
+ count = action["InfiniteScroll"]
344
+ for _ in range(count):
345
+ await adapter.evaluate("window.scrollTo(0, document.body.scrollHeight)")
346
+ await asyncio.sleep(0.5)
347
+ return
348
+
349
+ # Wait actions
350
+ if "Wait" in action:
351
+ await asyncio.sleep(action["Wait"] / 1000.0)
352
+ return
353
+
354
+ if "WaitFor" in action:
355
+ await _wait_for_element(adapter, action["WaitFor"], 5000)
356
+ return
357
+
358
+ if "WaitForWithTimeout" in action:
359
+ data = action["WaitForWithTimeout"]
360
+ await _wait_for_element(adapter, data["selector"], data.get("timeout", 5000))
361
+ return
362
+
363
+ if "WaitForNavigation" in action:
364
+ await asyncio.sleep(1.0)
365
+ return
366
+
367
+ if "WaitForDom" in action:
368
+ timeout = action["WaitForDom"].get("timeout", 5000) if isinstance(action["WaitForDom"], dict) else 5000
369
+ await asyncio.sleep(timeout / 1000.0)
370
+ return
371
+
372
+ # Navigation actions
373
+ if "Navigate" in action:
374
+ await adapter.navigate(action["Navigate"])
375
+ return
376
+
377
+ if "GoBack" in action:
378
+ await adapter.evaluate("window.history.back()")
379
+ return
380
+
381
+ if "GoForward" in action:
382
+ await adapter.evaluate("window.history.forward()")
383
+ return
384
+
385
+ if "Reload" in action:
386
+ await adapter.evaluate("window.location.reload()")
387
+ return
388
+
389
+ # Viewport
390
+ if "SetViewport" in action:
391
+ data = action["SetViewport"]
392
+ await adapter.set_viewport(
393
+ data["width"],
394
+ data["height"],
395
+ data.get("device_scale_factor", 2.0),
396
+ data.get("mobile", False),
397
+ )
398
+ return
399
+
400
+ # JavaScript
401
+ if "Evaluate" in action:
402
+ await adapter.evaluate(action["Evaluate"])
403
+ return
404
+
405
+ # Screenshot (no-op — handled by agent loop)
406
+ if "Screenshot" in action:
407
+ return
408
+
409
+ logger.warning(f"agent: unknown action: {json.dumps(action)[:100]}")
410
+
411
+
412
+ # -------------------------------------------------------------------
413
+ # Helpers
414
+ # -------------------------------------------------------------------
415
+
416
+ async def _get_element_center(adapter: ProtocolAdapter, selector: str) -> Dict[str, float]:
417
+ result = await adapter.evaluate(f"""
418
+ (function() {{
419
+ const el = document.querySelector({json.dumps(selector)});
420
+ if (!el) return null;
421
+ el.scrollIntoView({{ block: 'center', behavior: 'instant' }});
422
+ const r = el.getBoundingClientRect();
423
+ return {{ x: r.x + r.width / 2, y: r.y + r.height / 2 }};
424
+ }})()
425
+ """)
426
+ if not result or not isinstance(result, dict):
427
+ raise RuntimeError(f"Element not found: {selector}")
428
+ return result
429
+
430
+
431
+ async def _wait_for_element(adapter: ProtocolAdapter, selector: str, timeout_ms: int) -> None:
432
+ interval = 0.1
433
+ max_iter = int(timeout_ms / 100)
434
+ check_js = f"!!document.querySelector({json.dumps(selector)})"
435
+ for _ in range(max_iter):
436
+ found = await adapter.evaluate(check_js)
437
+ if found:
438
+ return
439
+ await asyncio.sleep(interval)
440
+ raise TimeoutError(f"Timeout waiting for element: {selector}")
@@ -0,0 +1,69 @@
1
+ """extract() — structured data extraction from pages."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ from typing import Any, Dict, Optional, Type
8
+
9
+ from ..protocol.protocol_adapter import ProtocolAdapter
10
+ from ..utils.html import truncate_html
11
+ from .llm_provider import LLMProvider
12
+
13
+
14
+ async def extract(
15
+ adapter: ProtocolAdapter,
16
+ llm: LLMProvider,
17
+ instruction: str,
18
+ schema: Optional[Any] = None,
19
+ ) -> Any:
20
+ """
21
+ Extract structured data from the page.
22
+
23
+ Takes a screenshot + HTML, sends to LLM with the instruction
24
+ and optional Pydantic model, returns parsed data.
25
+ """
26
+ screenshot, html, url, title = await asyncio.gather(
27
+ adapter.capture_screenshot(),
28
+ adapter.get_html(),
29
+ adapter.evaluate("window.location.href"),
30
+ adapter.evaluate("document.title"),
31
+ )
32
+
33
+ truncated_html = truncate_html(str(html), 12000)
34
+
35
+ schema_desc = ""
36
+ if schema is not None:
37
+ try:
38
+ # Pydantic v2 model
39
+ if hasattr(schema, "model_json_schema"):
40
+ schema_desc = f"\n\nReturn data matching this JSON schema:\n{json.dumps(schema.model_json_schema(), indent=2)}"
41
+ else:
42
+ schema_desc = "\n\nReturn a JSON object matching the expected structure."
43
+ except Exception:
44
+ schema_desc = "\n\nReturn a JSON object matching the expected structure."
45
+
46
+ system_prompt = (
47
+ f"You are a data extraction agent. Given a webpage screenshot and HTML, "
48
+ f"extract the requested information as JSON.{schema_desc}\n\n"
49
+ f"Return ONLY a valid JSON object. No prose, no markdown."
50
+ )
51
+
52
+ user_text = f"URL: {url}\nTitle: {title}\nInstruction: {instruction}\n\nHTML (truncated):\n{truncated_html}"
53
+
54
+ result = await llm.chat_json([
55
+ {"role": "system", "content": system_prompt},
56
+ {
57
+ "role": "user",
58
+ "content": [
59
+ {"type": "text", "text": user_text},
60
+ {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{screenshot}"}},
61
+ ],
62
+ },
63
+ ])
64
+
65
+ # Validate with Pydantic if schema provided
66
+ if schema is not None and hasattr(schema, "model_validate"):
67
+ return schema.model_validate(result)
68
+
69
+ return result