cmdop 0.1.21__py3-none-any.whl → 0.1.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cmdop/__init__.py +1 -1
- cmdop/_generated/rpc_messages/browser_pb2.py +135 -85
- cmdop/_generated/rpc_messages/browser_pb2.pyi +270 -2
- cmdop/_generated/rpc_messages_pb2.pyi +25 -0
- cmdop/_generated/service_pb2.py +2 -2
- cmdop/_generated/service_pb2_grpc.py +345 -0
- cmdop/client.py +2 -8
- cmdop/services/browser/__init__.py +44 -31
- cmdop/services/browser/capabilities/__init__.py +17 -0
- cmdop/services/browser/capabilities/_base.py +28 -0
- cmdop/services/browser/capabilities/_helpers.py +16 -0
- cmdop/services/browser/capabilities/dom.py +76 -0
- cmdop/services/browser/capabilities/fetch.py +45 -0
- cmdop/services/browser/capabilities/input.py +49 -0
- cmdop/services/browser/capabilities/network.py +245 -0
- cmdop/services/browser/capabilities/scroll.py +147 -0
- cmdop/services/browser/capabilities/timing.py +66 -0
- cmdop/services/browser/js/__init__.py +6 -4
- cmdop/services/browser/js/interaction.py +34 -0
- cmdop/services/browser/models.py +103 -0
- cmdop/services/browser/service/__init__.py +5 -0
- cmdop/services/browser/service/aio.py +30 -0
- cmdop/services/browser/{sync/service.py → service/sync.py} +206 -6
- cmdop/services/browser/session.py +194 -0
- {cmdop-0.1.21.dist-info → cmdop-0.1.23.dist-info}/METADATA +107 -59
- {cmdop-0.1.21.dist-info → cmdop-0.1.23.dist-info}/RECORD +29 -24
- cmdop/services/browser/aio/__init__.py +0 -6
- cmdop/services/browser/aio/service.py +0 -420
- cmdop/services/browser/aio/session.py +0 -407
- cmdop/services/browser/base/__init__.py +0 -6
- cmdop/services/browser/base/session.py +0 -124
- cmdop/services/browser/sync/__init__.py +0 -6
- cmdop/services/browser/sync/session.py +0 -644
- /cmdop/services/browser/{base/service.py → service/_helpers.py} +0 -0
- {cmdop-0.1.21.dist-info → cmdop-0.1.23.dist-info}/WHEEL +0 -0
- {cmdop-0.1.21.dist-info → cmdop-0.1.23.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,644 +0,0 @@
|
|
|
1
|
-
"""Synchronous browser session."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import time
|
|
6
|
-
import threading
|
|
7
|
-
from typing import TYPE_CHECKING, Any, Callable, TypeVar
|
|
8
|
-
|
|
9
|
-
T = TypeVar("T")
|
|
10
|
-
|
|
11
|
-
from cmdop.logging import get_logger
|
|
12
|
-
from cmdop.services.browser.base.session import BaseSession
|
|
13
|
-
|
|
14
|
-
# Module-level logger for browser actions
|
|
15
|
-
_log = get_logger("cmdop.browser")
|
|
16
|
-
from cmdop.services.browser.models import (
|
|
17
|
-
BrowserCookie,
|
|
18
|
-
BrowserState,
|
|
19
|
-
PageInfo,
|
|
20
|
-
ScrollInfo,
|
|
21
|
-
ScrollResult,
|
|
22
|
-
InfiniteScrollResult,
|
|
23
|
-
)
|
|
24
|
-
from cmdop.services.browser.js import (
|
|
25
|
-
parse_json_result,
|
|
26
|
-
build_infinite_scroll_js,
|
|
27
|
-
)
|
|
28
|
-
from cmdop.services.browser.parsing import (
|
|
29
|
-
parse_html as _parse_html,
|
|
30
|
-
SoupWrapper,
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
if TYPE_CHECKING:
|
|
34
|
-
from bs4 import BeautifulSoup
|
|
35
|
-
from cmdop.services.browser.sync.service import BrowserService
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
class BrowserSession(BaseSession):
|
|
39
|
-
"""
|
|
40
|
-
Synchronous browser session with fluent API.
|
|
41
|
-
|
|
42
|
-
Usage:
|
|
43
|
-
with client.browser.create_session() as session:
|
|
44
|
-
session.navigate("https://example.com")
|
|
45
|
-
session.click("button.submit")
|
|
46
|
-
data = session.fetch_all(urls)  # accept header by default; pass credentials=True to include cookies
|
|
47
|
-
"""
|
|
48
|
-
|
|
49
|
-
_service: BrowserService
|
|
50
|
-
|
|
51
|
-
def _call_service(self, method: str, *args: Any, **kwargs: Any) -> Any:
    """Invoke the named service method on behalf of this session.

    The attribute ``method`` is looked up on ``self._service`` and called
    with this session's id as the first positional argument, followed by
    ``args`` and ``kwargs``. The call's return value is passed through.
    """
    handler = getattr(self._service, method)
    return handler(self._session_id, *args, **kwargs)
|
|
54
|
-
|
|
55
|
-
# === Navigation & Interaction ===
|
|
56
|
-
|
|
57
|
-
def navigate(self, url: str, timeout_ms: int = 30000) -> str:
    """Open ``url`` in this session and return the service's result.

    Args:
        url: Address to navigate to.
        timeout_ms: Timeout in milliseconds, forwarded to the service.

    Returns:
        The value returned by the service's ``navigate`` call (the final URL).
    """
    # Only the first 80 characters are logged so debug lines stay short.
    _log.debug("[navigate] %s", url[:80])
    final_url = self._service.navigate(self._session_id, url, timeout_ms)
    if final_url:
        logged = final_url[:80]
    else:
        logged = ""
    _log.debug("[navigate] → %s", logged)
    return final_url
|
|
63
|
-
|
|
64
|
-
def click(
    self,
    selector: str,
    timeout_ms: int = 5000,
    move_cursor: bool = False,
) -> None:
    """Click the element matched by a CSS selector.

    Args:
        selector: CSS selector of the element to click.
        timeout_ms: Timeout in milliseconds, forwarded to the service.
        move_cursor: When True, ask the service to move the cursor to the
            element before clicking (human-like).
    """
    cursor_note = " (move_cursor)" if move_cursor else ""
    _log.debug("[click] %s%s", selector[:60], cursor_note)
    self._service.click(self._session_id, selector, timeout_ms, move_cursor)
|
|
80
|
-
|
|
81
|
-
def type(
    self,
    selector: str,
    text: str,
    human_like: bool = False,
    clear_first: bool = True,
) -> None:
    """Type ``text`` into the element matched by ``selector``.

    Args:
        selector: CSS selector of the target element.
        text: Text to enter.
        human_like: Forwarded to the service unchanged.
        clear_first: Forwarded to the service unchanged.
    """
    # Log at most 30 characters of the text; longer values are cut to 27
    # characters plus an ellipsis.
    if len(text) <= 30:
        preview = text
    else:
        preview = text[:27] + "..."
    _log.debug("[type] %s → '%s'", selector[:40], preview)
    self._service.type(self._session_id, selector, text, human_like, clear_first)
|
|
91
|
-
|
|
92
|
-
def wait_for(self, selector: str, timeout_ms: int = 30000) -> bool:
    """Wait for an element matching ``selector`` to appear.

    Args:
        selector: CSS selector to wait for.
        timeout_ms: Timeout in milliseconds, forwarded to the service.

    Returns:
        The result of the service's ``wait_for`` call.
    """
    service = self._service
    return service.wait_for(self._session_id, selector, timeout_ms)
|
|
95
|
-
|
|
96
|
-
# === Extraction ===
|
|
97
|
-
|
|
98
|
-
def extract(
    self,
    selector: str,
    attr: str | None = None,
    limit: int = 100,
) -> list[str]:
    """Extract text or attribute values from matching elements.

    Args:
        selector: CSS selector of the elements to read.
        attr: Attribute name to read; None is forwarded as-is.
        limit: Maximum number of results, forwarded to the service.

    Returns:
        The list returned by the service's ``extract`` call.
    """
    service = self._service
    return service.extract(self._session_id, selector, attr, limit)
|
|
103
|
-
|
|
104
|
-
def extract_regex(
    self,
    pattern: str,
    from_html: bool = False,
    limit: int = 100,
) -> list[str]:
    """Extract data from the page with a regular expression.

    Args:
        pattern: Regex pattern, forwarded to the service.
        from_html: Forwarded to the service unchanged.
        limit: Maximum number of results, forwarded to the service.

    Returns:
        The list returned by the service's ``extract_regex`` call.
    """
    service = self._service
    return service.extract_regex(self._session_id, pattern, from_html, limit)
|
|
109
|
-
|
|
110
|
-
def get_html(self, selector: str | None = None) -> str:
    """Return page HTML, optionally for a single selector.

    Args:
        selector: CSS selector forwarded to the service; None is passed through.
    """
    service = self._service
    return service.get_html(self._session_id, selector)
|
|
113
|
-
|
|
114
|
-
def get_text(self, selector: str | None = None) -> str:
    """Return page text content, optionally for a single selector.

    Args:
        selector: CSS selector forwarded to the service; None is passed through.
    """
    service = self._service
    return service.get_text(self._session_id, selector)
|
|
117
|
-
|
|
118
|
-
def parse_html(self, html: str | None = None, selector: str | None = None) -> "BeautifulSoup":
    """Parse HTML into a BeautifulSoup object.

    Args:
        html: HTML string to parse. When None, the HTML is fetched from the
            page via ``get_html(selector)``.
        selector: CSS selector used only when ``html`` is None.

    Returns:
        The object produced by the parsing helper.

    Raises:
        ImportError: If beautifulsoup4 is not installed.

    Example:
        soup = browser.parse_html()                          # whole page
        soup = browser.parse_html(selector="[role='feed']")  # feed only

        for item in soup.select('.item'):
            title = item.select_one('h2').get_text(strip=True)
    """
    markup = self.get_html(selector) if html is None else html
    return _parse_html(markup)
|
|
142
|
-
|
|
143
|
-
def soup(self, selector: str | None = None) -> SoupWrapper:
    """Fetch page HTML and wrap it in a ``SoupWrapper``.

    Args:
        selector: Optional CSS selector limiting the HTML that is fetched.

    Returns:
        A ``SoupWrapper`` built from the fetched HTML.

    Example:
        links = browser.soup().links("a.product")
        titles = browser.soup("[role='feed']").texts("h2")

        for item in browser.soup().select(".item"):
            title = item.select_one("h2").text()
            url = item.attr("href")
    """
    return SoupWrapper(html=self.get_html(selector))
|
|
167
|
-
|
|
168
|
-
# === JavaScript Execution ===
|
|
169
|
-
|
|
170
|
-
def execute_script(self, script: str) -> str:
    """Run ``script`` through the service as-is, with no wrapper added here.

    Returns:
        The value returned by the service's ``execute_script`` call.
    """
    service = self._service
    return service.execute_script(self._session_id, script)
|
|
173
|
-
|
|
174
|
-
def execute_js(self, code: str, raw: bool = False) -> dict | list | str | None:
    """Execute JavaScript after wrapping it with ``_build_execute_js``.

    The wrapped script is run with ``execute_script`` and its output is
    handed to ``_parse_execute_js`` together with ``raw``.

    Args:
        code: JS code to execute (may use ``await``).
        raw: If True, return the raw JSON string instead of parsed data.

    Example:
        result = session.execute_js('''
            const resp = await fetch('/api/data');
            return await resp.json();
        ''')
    """
    wrapped = self._build_execute_js(code)
    output = self.execute_script(wrapped)
    return self._parse_execute_js(output, raw)
|
|
193
|
-
|
|
194
|
-
def fetch_json(self, url: str) -> dict | list | None:
    """Fetch JSON from ``url`` by running a generated script in the page.

    The script comes from ``_build_fetch_json``; its output is decoded with
    ``parse_json_result``.
    """
    output = self.execute_script(self._build_fetch_json(url))
    return parse_json_result(output)
|
|
199
|
-
|
|
200
|
-
def fetch_all(
    self,
    urls: dict[str, str],
    headers: dict[str, str] | None = None,
    credentials: bool = False,
) -> dict[str, Any]:
    """Fetch several URLs in one script execution.

    Args:
        urls: Mapping of {id: url} to fetch.
        headers: Optional request headers (accept: application/json by default).
        credentials: Include credentials/cookies (default False, may break CORS).

    Returns:
        Mapping of {id: {data: ..., error: ...}}; an empty dict when ``urls``
        is empty, in which case nothing is executed.
    """
    if urls:
        script = self._build_fetch_all(urls, headers, credentials)
        return self._parse_fetch_all(self.execute_js(script))
    return {}
|
|
222
|
-
|
|
223
|
-
# === State & Cookies ===
|
|
224
|
-
|
|
225
|
-
def screenshot(self, full_page: bool = False) -> bytes:
    """Take a screenshot via the service.

    Args:
        full_page: Forwarded to the service unchanged.
    """
    service = self._service
    return service.screenshot(self._session_id, full_page)
|
|
228
|
-
|
|
229
|
-
def get_state(self) -> BrowserState:
    """Return the current browser state as reported by the service."""
    service = self._service
    return service.get_state(self._session_id)
|
|
232
|
-
|
|
233
|
-
def set_cookies(self, cookies: list[BrowserCookie | dict]) -> None:
    """Set browser cookies for this session.

    Args:
        cookies: Cookie objects or dicts, forwarded to the service unchanged.
    """
    service = self._service
    service.set_cookies(self._session_id, cookies)
|
|
236
|
-
|
|
237
|
-
def get_cookies(self, domain: str = "") -> list[BrowserCookie]:
    """Return browser cookies from the service.

    Args:
        domain: Domain value forwarded to the service (empty string by default).
    """
    service = self._service
    return service.get_cookies(self._session_id, domain)
|
|
240
|
-
|
|
241
|
-
# === Parser helpers ===
|
|
242
|
-
|
|
243
|
-
def validate_selectors(self, item: str, fields: dict[str, str]) -> dict:
    """Validate CSS selectors against the current page.

    Args:
        item: Item selector, forwarded to the service.
        fields: Field-name to selector mapping, forwarded to the service.

    Returns:
        The dict returned by the service's ``validate_selectors`` call.
    """
    service = self._service
    return service.validate_selectors(self._session_id, item, fields)
|
|
246
|
-
|
|
247
|
-
def extract_data(self, item: str, fields_json: str, limit: int = 100) -> dict:
    """Extract structured data from the page.

    Args:
        item: Item selector, forwarded to the service.
        fields_json: Field specification string, forwarded to the service.
        limit: Maximum number of items, forwarded to the service.

    Returns:
        The dict returned by the service's ``extract_data`` call.
    """
    service = self._service
    return service.extract_data(self._session_id, item, fields_json, limit)
|
|
250
|
-
|
|
251
|
-
# === Mouse & Scroll (native, not JS) ===
|
|
252
|
-
|
|
253
|
-
def mouse_move(self, x: int, y: int, steps: int = 10) -> None:
    """Move the mouse to the given coordinates via the service.

    Args:
        x: Target X coordinate.
        y: Target Y coordinate.
        steps: Number of intermediate steps (1 = instant, >1 = smooth).

    Example:
        browser.mouse_move(500, 300)           # smooth move
        browser.mouse_move(100, 100, steps=1)  # instant move
    """
    _log.debug("[mouse_move] x=%d, y=%d, steps=%d", x, y, steps)
    service = self._service
    service.mouse_move(self._session_id, x, y, steps)
|
|
268
|
-
|
|
269
|
-
def scroll(
    self,
    direction: str = "down",
    amount: int = 500,
    selector: str | None = None,
    smooth: bool = True,
) -> ScrollResult:
    """Scroll the page through the service and report the new position.

    Args:
        direction: "up", "down", "left" or "right".
        amount: Pixels to scroll.
        selector: If provided, scroll that element into view instead.
        smooth: Use smooth scroll animation (default True).

    Returns:
        ScrollResult built from the service response. ``success`` is always
        True here; missing keys default to 0 / False.

    Example:
        browser.scroll("down", 800)
        browser.scroll(selector=".target-element")
    """
    if not selector:
        _log.debug("[scroll] %s %dpx", direction, amount)
    else:
        _log.debug("[scroll_to] %s", selector[:60])

    payload = self._service.scroll(self._session_id, direction, amount, selector, smooth)
    position = payload.get("scroll_y", 0)
    delta = payload.get("scrolled_by", 0)
    reached_bottom = payload.get("at_bottom", False)
    outcome = ScrollResult(
        success=True,
        scroll_y=position,
        scrolled_by=delta,
        at_bottom=reached_bottom,
    )
    _log.debug(
        "[scroll] → y=%d, by=%d, bottom=%s",
        outcome.scroll_y,
        outcome.scrolled_by,
        outcome.at_bottom,
    )
    return outcome
|
|
309
|
-
|
|
310
|
-
# def scroll_js(
|
|
311
|
-
# self,
|
|
312
|
-
# direction: str = "down",
|
|
313
|
-
# amount: int = 500,
|
|
314
|
-
# selector: str | None = None,
|
|
315
|
-
# smooth: bool = True,
|
|
316
|
-
# human_like: bool = False,
|
|
317
|
-
# container: str | None = None,
|
|
318
|
-
# ) -> ScrollResult:
|
|
319
|
-
# """Scroll the page (JS fallback)."""
|
|
320
|
-
# js = self._build_scroll(direction, amount, selector, smooth, human_like, container)
|
|
321
|
-
# result = self.execute_script(js)
|
|
322
|
-
# data = parse_json_result(result) or {}
|
|
323
|
-
# return ScrollResult(
|
|
324
|
-
# success=data.get("success", False),
|
|
325
|
-
# scroll_y=int(data.get("scrollY", 0)),
|
|
326
|
-
# scrolled_by=int(data.get("scrolledBy", 0)),
|
|
327
|
-
# at_bottom=data.get("atBottom", False),
|
|
328
|
-
# error=data.get("error"),
|
|
329
|
-
# )
|
|
330
|
-
|
|
331
|
-
def scroll_to(self, selector: str) -> ScrollResult:
    """Scroll the element matched by ``selector`` into view.

    Thin wrapper over ``scroll(selector=...)``; all other scroll arguments
    keep their defaults.
    """
    _log.debug("[scroll_to] %s", selector[:60])
    outcome = self.scroll(selector=selector)
    return outcome
|
|
335
|
-
|
|
336
|
-
def scroll_to_bottom(self) -> ScrollResult:
    """Scroll to the bottom of the page using a generated script.

    Returns:
        ScrollResult built from the script's JSON output. ``at_bottom`` is
        always reported as True, regardless of the ``success`` value.
    """
    output = self.execute_script(self._build_scroll_to_bottom())
    # An unparsable or empty result is treated as an empty mapping.
    info = parse_json_result(output) or {}
    succeeded = info.get("success", False)
    position = int(info.get("scrollY", 0))
    delta = int(info.get("scrolledBy", 0))
    return ScrollResult(
        success=succeeded,
        scroll_y=position,
        scrolled_by=delta,
        at_bottom=True,
    )
|
|
347
|
-
|
|
348
|
-
def get_scroll_info(self) -> ScrollInfo:
    """Read scroll position and page dimensions using a generated script.

    Returns:
        ScrollInfo built from the script's JSON output. Missing numeric
        fields default to 0, ``at_bottom`` to False and ``at_top`` to True.
    """
    output = self.execute_script(self._build_get_scroll_info())
    # An unparsable or empty result is treated as an empty mapping.
    info = parse_json_result(output) or {}

    def as_int(key: str) -> int:
        # Coerce a numeric field to int; absent keys count as 0.
        return int(info.get(key, 0))

    return ScrollInfo(
        scroll_x=as_int("scrollX"),
        scroll_y=as_int("scrollY"),
        page_height=as_int("pageHeight"),
        page_width=as_int("pageWidth"),
        viewport_height=as_int("viewportHeight"),
        viewport_width=as_int("viewportWidth"),
        at_bottom=info.get("atBottom", False),
        at_top=info.get("atTop", True),
    )
|
|
363
|
-
|
|
364
|
-
def get_page_info(self) -> PageInfo:
    """Return page info as reported by the service's ``get_page_info``."""
    service = self._service
    return service.get_page_info(self._session_id)
|
|
367
|
-
|
|
368
|
-
def scroll_and_collect(
    self,
    seen_keys: set[str],
    key_selector: str = "a[href]",
    key_attr: str = "href",
    container_selector: str = "body",
) -> InfiniteScrollResult:
    """Collect keys not seen before, for infinite-scroll pages.

    Args:
        seen_keys: Set of already seen keys. It is updated in place with the
            newly found keys.
        key_selector: CSS selector for elements carrying a key.
        key_attr: Attribute to use as the key.
        container_selector: Container to search in.

    Returns:
        InfiniteScrollResult with the new keys found. ``total_seen`` falls
        back to the size of ``seen_keys`` after the update when the script
        does not report it.
    """
    known = list(seen_keys)
    script = build_infinite_scroll_js(known, key_selector, key_attr, container_selector)
    # An unparsable or empty result is treated as an empty mapping.
    info = parse_json_result(self.execute_script(script)) or {}

    fresh = info.get("new_keys", [])
    seen_keys.update(fresh)

    reached_bottom = info.get("at_bottom", False)
    total = info.get("total_seen", len(seen_keys))
    return InfiniteScrollResult(
        new_keys=fresh,
        at_bottom=reached_bottom,
        total_seen=total,
        error=info.get("error"),
    )
|
|
402
|
-
|
|
403
|
-
def infinite_scroll(
    self,
    extract_fn: Callable[[], list[Any]],
    limit: int = 100,
    max_scrolls: int = 50,
    max_no_new: int = 3,
    scroll_amount: int = 800,
    delay: float = 1.0,
) -> list[Any]:
    """Repeatedly extract items and scroll down until a stop condition.

    Each round calls ``extract_fn`` first, then scrolls down and sleeps. The
    loop ends when ``limit`` items were collected, when ``max_no_new``
    consecutive rounds produced nothing, or after ``max_scrolls`` rounds.

    Args:
        extract_fn: Returns the new items for the current round
            (deduplication is the caller's responsibility).
        limit: Stop after collecting this many items.
        max_scrolls: Maximum number of rounds.
        max_no_new: Stop after this many consecutive empty rounds.
        scroll_amount: Pixels to scroll each time.
        delay: Seconds to sleep after each scroll.

    Returns:
        The collected items, truncated to ``limit``.
    """
    collected: list[Any] = []
    idle_rounds = 0

    for _round in range(max_scrolls):
        batch = extract_fn()

        if not batch:
            idle_rounds += 1
            if idle_rounds >= max_no_new:
                break
        else:
            collected.extend(batch)
            idle_rounds = 0  # any non-empty round resets the idle streak
            if len(collected) >= limit:
                break

        self.scroll("down", scroll_amount)
        time.sleep(delay)

    return collected[:limit]
|
|
446
|
-
|
|
447
|
-
# === UI Interaction Helpers ===
|
|
448
|
-
|
|
449
|
-
def hover(self, selector: str, timeout_ms: int = 5000) -> None:
    """Hover over the element matched by ``selector`` via the service.

    Args:
        selector: CSS selector of the element to hover.
        timeout_ms: Timeout in milliseconds, forwarded to the service.
    """
    _log.debug("[hover] %s", selector[:60])
    service = self._service
    service.hover(self._session_id, selector, timeout_ms)
|
|
453
|
-
|
|
454
|
-
# def hover_js(self, selector: str) -> bool:
|
|
455
|
-
# """Hover over element (JS fallback)."""
|
|
456
|
-
# js = self._build_hover(selector)
|
|
457
|
-
# result = self.execute_script(js)
|
|
458
|
-
# data = parse_json_result(result) or {}
|
|
459
|
-
# return data.get("success", False)
|
|
460
|
-
|
|
461
|
-
def select(
    self,
    selector: str,
    value: str | None = None,
    text: str | None = None,
) -> dict:
    """Select an option from a dropdown using a generated script.

    Args:
        selector: CSS selector for the <select> element.
        value: Option value to select.
        text: Option text to select (if value not provided).

    Returns:
        The parsed script result (documented upstream as containing
        selected_value and selected_text), or an empty dict when nothing
        could be parsed.
    """
    output = self.execute_script(self._build_select(selector, value, text))
    return parse_json_result(output) or {}
|
|
481
|
-
|
|
482
|
-
def close_modal(self, selectors: list[str] | None = None) -> bool:
    """Try to close a modal/dialog using a generated script.

    Args:
        selectors: Custom close-button selectors to try.

    Returns:
        The script's ``success`` value; False when it is missing or the
        result could not be parsed.
    """
    output = self.execute_script(self._build_close_modal(selectors))
    info = parse_json_result(output) or {}
    return info.get("success", False)
|
|
496
|
-
|
|
497
|
-
def wait(self, ms: int, jitter: float = 0.1) -> None:
    """Sleep for ``ms`` milliseconds with random jitter applied.

    Args:
        ms: Base wait time in milliseconds.
        jitter: Random variation ±jitter as a fraction of ``ms``
            (default 10%, so 1000ms becomes 900-1100ms).

    The computed duration is clamped at zero, so a negative ``ms`` or a
    jitter above 1.0 results in no wait instead of an error.
    """
    import random

    seconds = (ms / 1000) * (1 + random.uniform(-jitter, jitter))
    # time.sleep() raises ValueError for negative durations.
    time.sleep(max(0.0, seconds))
|
|
508
|
-
|
|
509
|
-
def wait_seconds(self, seconds: float, jitter: float = 0.1) -> None:
    """Sleep for ``seconds`` with random jitter applied.

    Converts to whole milliseconds and delegates to ``wait``.

    Args:
        seconds: Base wait time in seconds.
        jitter: Random variation ±jitter (default 10%, so 1.0s becomes 0.9-1.1s).
    """
    # round() rather than int(): float error makes 1.001 * 1000 evaluate to
    # 1000.9999999999999, which int() would truncate to 1000 ms.
    self.wait(round(seconds * 1000), jitter)
|
|
518
|
-
|
|
519
|
-
def wait_random(self, min_sec: float = 0.5, max_sec: float = 1.5) -> None:
    """Sleep for a random duration between ``min_sec`` and ``max_sec``.

    Args:
        min_sec: Lower bound in seconds.
        max_sec: Upper bound in seconds.
    """
    import random

    # uniform(a, b) is computed as a + (b - a) * random().
    duration = random.uniform(min_sec, max_sec)
    time.sleep(duration)
|
|
523
|
-
|
|
524
|
-
def with_timeout(
    self,
    fn: Callable[[], T],
    timeout_sec: float = 60.0,
    on_timeout: Callable[[], None] | None = None,
) -> tuple[T | None, bool]:
    """Run ``fn`` in a daemon thread and give up waiting after a timeout.

    Args:
        fn: Function to run (no arguments; use a lambda/closure for args).
        timeout_sec: Timeout in seconds (default 60).
        on_timeout: Optional cleanup function called when the timeout hits.
            Exceptions raised by it are logged at debug level and ignored.

    Returns:
        Tuple of (result, success). On timeout this is (None, False).

    Raises:
        Exception: Whatever ``fn`` raised, re-raised in the calling thread
            when ``fn`` finished within the timeout.

    Note:
        On timeout the worker thread is not stopped; ``fn`` keeps running in
        the background and its eventual result is discarded.

    Example:
        result, ok = browser.with_timeout(
            lambda: process_listing(browser, listing),
            timeout_sec=30,
        )
        if not ok:
            print("Skipped due to timeout")

        # With cleanup
        result, ok = browser.with_timeout(
            lambda: enrich_listing(browser, item),
            timeout_sec=60,
            on_timeout=lambda: browser.press_key('Escape'),
        )
    """
    result_box: list[T | None] = [None]
    error_box: list[Exception | None] = [None]
    finished = threading.Event()

    def worker() -> None:
        # Capture the outcome so the calling thread can return or re-raise it.
        try:
            result_box[0] = fn()
        except Exception as exc:
            error_box[0] = exc
        finally:
            finished.set()

    threading.Thread(target=worker, daemon=True).start()

    if finished.wait(timeout=timeout_sec):
        error = error_box[0]
        # Identity check: an exception class may define __bool__/__len__ and
        # be falsy, which must not turn a failure into a success.
        if error is not None:
            raise error
        return result_box[0], True

    # Timed out: run the optional cleanup on a best-effort basis.
    _log.warning("[timeout] Function timed out after %.1fs", timeout_sec)
    if on_timeout is not None:
        try:
            on_timeout()
        except Exception as exc:
            _log.debug("[timeout] Cleanup failed: %s", exc)
    return None, False
|
|
587
|
-
|
|
588
|
-
def click_all_by_text(self, text: str, role: str = "button") -> int:
    """Click all elements containing specific text, using a generated script.

    Args:
        text: Text to match (case-insensitive).
        role: Element role to filter (default: "button").

    Returns:
        The script's ``clicked`` count; 0 when it is missing or the result
        could not be parsed.

    Example:
        browser.click_all_by_text("See more")  # expand all posts
        browser.click_all_by_text("Load more", role="link")
    """
    _log.debug("[click_all_by_text] '%s' role=%s", text, role)
    output = self.execute_script(self._build_click_all_by_text(text, role))
    info = parse_json_result(output) or {}
    count = info.get("clicked", 0)
    _log.debug("[click_all_by_text] → clicked %d", count)
    return count
|
|
610
|
-
|
|
611
|
-
def press_key(self, key: str, selector: str | None = None) -> bool:
    """Press a keyboard key using a generated script.

    Args:
        key: Key to press (e.g., 'Escape', 'Enter', 'Tab', 'ArrowDown').
        selector: Optional CSS selector to target. If None, targets activeElement.

    Returns:
        The script's ``success`` value; False when it is missing or the
        result could not be parsed.

    Example:
        browser.press_key('Escape')                 # close modal
        browser.press_key('Enter', 'input.search')  # submit search
        browser.press_key('Tab')                    # move focus
    """
    # The target is only computed for the log line, truncated to 40 chars.
    if selector:
        logged_target = selector[:40]
    else:
        logged_target = "activeElement"
    _log.debug("[press_key] %s → %s", key, logged_target)
    output = self.execute_script(self._build_press_key(key, selector))
    info = parse_json_result(output) or {}
    return info.get("success", False)
|
|
633
|
-
|
|
634
|
-
# === Context Manager ===
|
|
635
|
-
|
|
636
|
-
def close(self) -> None:
    """Close this browser session via the service's ``close_session``."""
    service = self._service
    service.close_session(self._session_id)
|
|
639
|
-
|
|
640
|
-
def __enter__(self) -> "BrowserSession":
    """Enter the ``with`` block; the session itself is the managed object."""
    return self
|
|
642
|
-
|
|
643
|
-
def __exit__(self, *args: Any) -> None:
    """Leave the ``with`` block by closing the session.

    The exception details in ``args`` are ignored, and nothing is returned,
    so an exception raised inside the block is not suppressed.
    """
    self.close()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|