cmdop-0.1.22-py3-none-any.whl → cmdop-0.1.24-py3-none-any.whl

@@ -0,0 +1,349 @@
+ """Network analyzer for discovering API endpoints and creating request snapshots."""
+
+ from __future__ import annotations
+
+ import time
+ from typing import TYPE_CHECKING, Any
+
+ from pydantic import BaseModel, Field
+ from urllib.parse import urlparse, parse_qs
+
+ if TYPE_CHECKING:
+     from cmdop.services.browser.session import BrowserSession
+     from cmdop.services.browser.models import NetworkExchange
+
+
+ class RequestSnapshot(BaseModel):
+     """Complete snapshot of an API request for reproduction."""
+
+     # Request info
+     url: str
+     method: str = "GET"
+     headers: dict[str, str] = Field(default_factory=dict)
+     body: str = ""
+
+     # Response info
+     status: int | None = None
+     content_type: str = ""
+     size: int = 0
+
+     # Parsed URL parts
+     base_url: str = ""
+     path: str = ""
+     query_params: dict[str, list[str]] = Field(default_factory=dict)
+
+     # Data analysis
+     data_key: str | None = None
+     item_count: int | None = None
+     item_fields: list[str] = Field(default_factory=list)
+     sample_response: Any = None
+
+     # Session data
+     cookies: dict[str, str] = Field(default_factory=dict)
+
+     def to_curl(self) -> str:
+         """Generate curl command to reproduce request."""
+         parts = [f"curl -X {self.method}"]
+
+         # Add headers
+         for key, value in self.headers.items():
+             if key.lower() not in ("host", "content-length"):
+                 parts.append(f"-H '{key}: {value}'")
+
+         # Add cookies if not in headers
+         if self.cookies and "cookie" not in [k.lower() for k in self.headers]:
+             cookie_str = "; ".join(f"{k}={v}" for k, v in self.cookies.items())
+             parts.append(f"-H 'Cookie: {cookie_str}'")
+
+         # Add body
+         if self.body:
+             parts.append(f"-d '{self.body}'")
+
+         # Add URL
+         parts.append(f"'{self.url}'")
+
+         return " \\\n ".join(parts)
+
+     def to_httpx(self) -> str:
+         """Generate httpx Python code to reproduce request."""
+         lines = ["import httpx", ""]
+
+         # Headers
+         if self.headers:
+             lines.append("headers = {")
+             for key, value in self.headers.items():
+                 if key.lower() not in ("host", "content-length"):
+                     lines.append(f'    "{key}": "{value}",')
+             lines.append("}")
+         else:
+             lines.append("headers = {}")
+
+         # Cookies
+         if self.cookies:
+             lines.append("")
+             lines.append("cookies = {")
+             for key, value in self.cookies.items():
+                 lines.append(f'    "{key}": "{value}",')
+             lines.append("}")
+         else:
+             lines.append("cookies = {}")
+
+         # Request
+         lines.append("")
+         if self.method == "GET":
+             lines.append(f'response = httpx.get("{self.url}", headers=headers, cookies=cookies)')
+         elif self.method == "POST":
+             if self.body:
+                 lines.append(f'data = {repr(self.body)}')
+                 lines.append(f'response = httpx.post("{self.url}", headers=headers, cookies=cookies, content=data)')
+             else:
+                 lines.append(f'response = httpx.post("{self.url}", headers=headers, cookies=cookies)')
+         else:
+             lines.append(f'response = httpx.request("{self.method}", "{self.url}", headers=headers, cookies=cookies)')
+
+         lines.append("print(response.json())")
+
+         return "\n".join(lines)
+
+
+ class NetworkSnapshot(BaseModel):
+     """Complete snapshot of network activity for a site."""
+
+     url: str
+     timestamp: str = ""
+
+     # Session data
+     cookies: dict[str, str] = Field(default_factory=dict)
+     local_storage: dict[str, str] = Field(default_factory=dict)
+
+     # Captured requests
+     api_requests: list[RequestSnapshot] = Field(default_factory=list)
+     json_requests: list[RequestSnapshot] = Field(default_factory=list)
+     other_requests: list[dict] = Field(default_factory=list)
+
+     # Stats
+     total_requests: int = 0
+     total_bytes: int = 0
+
+     def to_json(self, indent: int = 2) -> str:
+         """Convert to JSON string."""
+         return self.model_dump_json(indent=indent)
+
+     def best_api(self) -> RequestSnapshot | None:
+         """Get the best data API (most items)."""
+         if not self.api_requests:
+             return None
+         return max(self.api_requests, key=lambda r: r.item_count or 0)
+
+
+ class NetworkAnalyzer:
+     """Analyze network requests to discover API endpoints.
+
+     Creates complete request snapshots including cookies, headers, and
+     all data needed to reproduce API calls.
+
+     Usage:
+         from cmdop import CMDOPClient
+         from cmdop.helpers import NetworkAnalyzer
+
+         client = CMDOPClient.local()
+         with client.browser.create_session(headless=False) as b:
+             analyzer = NetworkAnalyzer(b)
+
+             # Interactive mode - user clicks pagination
+             snapshot = analyzer.capture("https://example.com/cars", wait_seconds=30)
+
+             # Get best API endpoint
+             if snapshot.api_requests:
+                 best = snapshot.best_api()
+                 print(f"API: {best.url}")
+                 print(f"Curl: {best.to_curl()}")
+     """
+
+     # Common keys that contain data arrays
+     DATA_KEYS = [
+         "data", "items", "results", "list", "records",
+         "cars", "vehicles", "products", "listings", "entries",
+         "rows", "content", "objects", "elements", "collection",
+     ]
+
+     def __init__(self, session: "BrowserSession"):
+         """Initialize with browser session."""
+         self._session = session
+
+     def capture(
+         self,
+         url: str,
+         wait_seconds: int = 30,
+         url_pattern: str = "",
+         clear_initial: bool = True,
+         same_origin: bool = True,
+         min_size: int = 100,
+         max_size: int = 500_000,
+         countdown_message: str = "Click pagination!",
+     ) -> NetworkSnapshot:
+         """Capture network requests while user interacts with page.
+
+         Args:
+             url: Page URL to open
+             wait_seconds: Time to wait for user interactions
+             url_pattern: Optional regex filter for API URLs
+             clear_initial: Clear page load requests before capture
+             same_origin: Only capture requests to same domain (default True)
+             min_size: Min response size in bytes (filter tracking pixels)
+             max_size: Max response size in bytes (filter images/assets)
+             countdown_message: Message to show in countdown toast
+
+         Returns:
+             NetworkSnapshot with all captured requests and session data
+         """
+         from cmdop.services.browser.models import WaitUntil
+         from datetime import datetime
+
+         b = self._session
+         snapshot = NetworkSnapshot(
+             url=url,
+             timestamp=datetime.now().isoformat(),
+         )
+
+         # Extract base domain for filtering
+         base_domain = self._extract_base_domain(url)
+
+         # Enable network capture
+         b.network.enable(max_exchanges=500, max_response_size=5_000_000)
+
+         try:
+             print(f"Opening {url}...")
+             b.navigate(url, timeout_ms=90000, wait_until=WaitUntil.LOAD)
+
+             # Wait for page to be interactive
+             try:
+                 b.wait_for("body", timeout_ms=10000)
+             except Exception:
+                 pass
+             time.sleep(2)
+
+             if clear_initial:
+                 b.network.clear()
+
+             # Show countdown while user interacts
+             b.visual.countdown(wait_seconds, countdown_message)
+
+             # Get cookies
+             try:
+                 cookies = b.get_cookies()
+                 snapshot.cookies = {c.name: c.value for c in cookies}
+             except Exception:
+                 pass
+
+             # Get stats
+             stats = b.network.stats()
+             snapshot.total_requests = stats.total_captured
+             snapshot.total_bytes = stats.total_bytes
+
+             print(f"\nCaptured {stats.total_captured} requests ({stats.total_bytes} bytes)")
+
+             # Get XHR/Fetch calls
+             api_calls = b.network.filter(
+                 url_pattern=url_pattern,
+                 resource_types=["xhr", "fetch"],
+             )
+
+             # Filter by domain
+             if same_origin:
+                 api_calls = [
+                     call for call in api_calls
+                     if base_domain in urlparse(call.request.url).netloc
+                 ]
+
+             # Filter by response size (ignore tracking pixels and heavy assets)
+             api_calls = [
+                 call for call in api_calls
+                 if call.response and min_size <= call.response.size <= max_size
+             ]
+
+             print(f"Found {len(api_calls)} XHR/Fetch requests (domain: {base_domain}, {min_size}-{max_size} bytes)")
+
+             # Analyze calls
+             for call in api_calls:
+                 req = self._create_snapshot(call, snapshot.cookies)
+                 if req:
+                     if req.data_key or (req.item_count and req.item_count > 0):
+                         snapshot.api_requests.append(req)
+                     elif req.content_type and "json" in req.content_type:
+                         snapshot.json_requests.append(req)
+                     else:
+                         snapshot.other_requests.append({
+                             "url": call.request.url,
+                             "method": call.request.method,
+                             "status": call.response.status if call.response else None,
+                         })
+
+         finally:
+             b.network.disable()
+
+         return snapshot
+
+     def _extract_base_domain(self, url: str) -> str:
+         """Extract base domain from URL, handling country-code TLDs."""
+         parsed = urlparse(url)
+         host = parsed.netloc.replace("www.", "")
+         parts = host.split(".")
+
+         # Country-code second-level domains
+         cc_slds = {"co", "com", "net", "org", "ac", "go", "ne", "or"}
+
+         if len(parts) >= 3 and parts[-2] in cc_slds:
+             return ".".join(parts[-3:])  # bobaedream.co.kr
+         elif len(parts) >= 2:
+             return ".".join(parts[-2:])  # kcar.com
+         return host
+
+     def _create_snapshot(
+         self,
+         exchange: "NetworkExchange",
+         session_cookies: dict[str, str],
+     ) -> RequestSnapshot | None:
+         """Create request snapshot from network exchange."""
+         if not exchange.response:
+             return None
+
+         parsed = urlparse(exchange.request.url)
+
+         snapshot = RequestSnapshot(
+             url=exchange.request.url,
+             method=exchange.request.method,
+             headers=dict(exchange.request.headers),
+             body=exchange.request.body.decode("utf-8", errors="ignore") if exchange.request.body else "",
+             status=exchange.response.status,
+             content_type=exchange.response.content_type or "",
+             size=exchange.response.size,
+             base_url=f"{parsed.scheme}://{parsed.netloc}",
+             path=parsed.path,
+             query_params=parse_qs(parsed.query),
+             cookies=session_cookies,
+         )
+
+         # Parse JSON response
+         if "json" in snapshot.content_type.lower():
+             try:
+                 data = exchange.json_body()
+                 snapshot.sample_response = data
+
+                 if isinstance(data, list):
+                     snapshot.item_count = len(data)
+                     if data and isinstance(data[0], dict):
+                         snapshot.item_fields = list(data[0].keys())
+                 elif isinstance(data, dict):
+                     for key in self.DATA_KEYS:
+                         if key in data and isinstance(data[key], list):
+                             snapshot.data_key = key
+                             snapshot.item_count = len(data[key])
+                             if data[key] and isinstance(data[key][0], dict):
+                                 snapshot.item_fields = list(data[key][0].keys())
+                             break
+             except Exception:
+                 pass
+
+         return snapshot
+
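
Editor's note: the two generators above make a captured call replayable outside the browser. A minimal sketch of the output, assuming RequestSnapshot is importable from the same cmdop.helpers module named in the NetworkAnalyzer docstring (the URL, header, and cookie values are placeholders):

    from cmdop.helpers import RequestSnapshot  # import path assumed, see docstring above

    snap = RequestSnapshot(
        url="https://example.com/api/cars?page=2",
        method="GET",
        headers={"Accept": "application/json"},
        cookies={"session": "abc123"},
    )

    print(snap.to_curl())
    # curl -X GET \
    #  -H 'Accept: application/json' \
    #  -H 'Cookie: session=abc123' \
    #  'https://example.com/api/cars?page=2'

    print(snap.to_httpx())  # emits a standalone httpx script with headers/cookies dicts

Note that to_curl deliberately drops Host and Content-Length, since curl recomputes both.
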
@@ -5,6 +5,8 @@ from .input import InputCapability
  from .timing import TimingCapability
  from .dom import DOMCapability
  from .fetch import FetchCapability
+ from .network import NetworkCapability
+ from .visual import VisualCapability

  __all__ = [
      "ScrollCapability",
@@ -12,4 +14,6 @@ __all__ = [
      "TimingCapability",
      "DOMCapability",
      "FetchCapability",
+     "NetworkCapability",
+     "VisualCapability",
  ]
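
Editor's note: after this hunk both new capabilities are part of the package's public exports. Assuming the package path implied by the absolute imports elsewhere in this diff (not shown verbatim here), they resolve as:

    # Package path inferred from the `cmdop.services.browser.models` imports above.
    from cmdop.services.browser.capabilities import NetworkCapability, VisualCapability
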
@@ -35,8 +35,7 @@ class FetchCapability(BaseCapability):
          if not urls:
              return {}
          js = build_fetch_all_js(urls, headers, credentials)
-         # fetch_all returns via execute_js (async wrapper)
-         wrapped = build_async_js(js.replace("return ", ""))
+         wrapped = build_async_js(js)
          result = parse_json_result(self._js(wrapped))
          return result if isinstance(result, dict) else {}

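Editor's note: the removed line stripped the leading `return ` from the generated fetch script before wrapping it. One plausible reading of the fix, using invented stand-ins for cmdop's actual JS helpers: if the async wrapper relies on the body's own `return` to surface the fetch results, deleting it leaves an expression statement whose value is discarded.

    # Hypothetical stand-in for build_async_js: run the body inside an async
    # IIFE that resolves to whatever the body returns.
    def build_async_js_sketch(body: str) -> str:
        return "(async () => { " + body + " })()"

    script = "return await Promise.all(urls.map(u => fetch(u).then(r => r.json())))"

    # Old behaviour: with "return " stripped, the IIFE resolves to undefined.
    broken = build_async_js_sketch(script.replace("return ", ""))

    # New behaviour: the body keeps its return, so the results propagate.
    fixed = build_async_js_sketch(script)
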
@@ -0,0 +1,245 @@
+ """Network capture capability (v2.19.0)."""
+
+ from __future__ import annotations
+ from typing import Any
+
+ from cmdop.services.browser.models import (
+     NetworkExchange,
+     NetworkRequest,
+     NetworkResponse,
+     NetworkTiming,
+     NetworkStats,
+     NetworkFilter,
+ )
+
+ from ._base import BaseCapability
+
+
+ class NetworkCapability(BaseCapability):
+     """Network capture operations.
+
+     Captures HTTP requests/responses made by the browser.
+     Useful for:
+     - Intercepting API responses
+     - Debugging network issues
+     - Extracting data from XHR/Fetch calls
+
+     Usage:
+         # Enable capture
+         session.network.enable()
+
+         # Navigate and trigger requests
+         session.navigate("https://example.com")
+
+         # Get all captured exchanges
+         exchanges = session.network.get_all()
+
+         # Get last API response
+         api = session.network.last("/api/data")
+         data = api.json_body()
+
+         # Filter by criteria
+         xhr = session.network.filter(
+             url_pattern="/api/",
+             methods=["POST"],
+             status_codes=[200],
+         )
+
+         # Disable capture
+         session.network.disable()
+     """
+
+     def enable(self, max_exchanges: int = 1000, max_response_size: int = 10_000_000) -> None:
+         """Enable network capture.
+
+         Args:
+             max_exchanges: Max exchanges to keep in memory (FIFO eviction)
+             max_response_size: Max response body size in bytes
+         """
+         self._call("network_enable", max_exchanges, max_response_size)
+
+     def disable(self) -> None:
+         """Disable network capture."""
+         self._call("network_disable")
+
+     def get_all(self, limit: int = 0) -> list[NetworkExchange]:
+         """Get all captured exchanges.
+
+         Args:
+             limit: Max results (0 = unlimited)
+         """
+         return self.filter(limit=limit)
+
+     def filter(
+         self,
+         url_pattern: str = "",
+         methods: list[str] | None = None,
+         status_codes: list[int] | None = None,
+         resource_types: list[str] | None = None,
+         limit: int = 0,
+     ) -> list[NetworkExchange]:
+         """Get exchanges matching filter criteria.
+
+         Args:
+             url_pattern: Regex pattern for URL matching
+             methods: HTTP methods (GET, POST, etc.)
+             status_codes: HTTP status codes (200, 404, etc.)
+             resource_types: xhr, fetch, document, script, image, etc.
+             limit: Max results (0 = unlimited)
+         """
+         data = self._call(
+             "network_get_exchanges",
+             url_pattern,
+             methods or [],
+             status_codes or [],
+             resource_types or [],
+             limit,
+         )
+         return [self._parse_exchange(e) for e in data.get("exchanges", [])]
+
+     def get(self, exchange_id: str) -> NetworkExchange | None:
+         """Get specific exchange by ID."""
+         data = self._call("network_get_exchange", exchange_id)
+         exchange = data.get("exchange")
+         if exchange:
+             return self._parse_exchange(exchange)
+         return None
+
+     def last(self, url_pattern: str = "") -> NetworkExchange | None:
+         """Get most recent exchange matching URL pattern.
+
+         Args:
+             url_pattern: Regex pattern for URL (empty = any)
+         """
+         data = self._call("network_get_last", url_pattern)
+         exchange = data.get("exchange")
+         if exchange:
+             return self._parse_exchange(exchange)
+         return None
+
+     def clear(self) -> None:
+         """Clear all captured exchanges."""
+         self._call("network_clear")
+
+     def stats(self) -> NetworkStats:
+         """Get capture statistics."""
+         data = self._call("network_stats")
+         return NetworkStats(
+             enabled=data.get("enabled", False),
+             total_captured=data.get("total_captured", 0),
+             total_errors=data.get("total_errors", 0),
+             total_bytes=data.get("total_bytes", 0),
+             average_duration_ms=data.get("average_duration_ms", 0),
+         )
+
+     def export_har(
+         self,
+         url_pattern: str = "",
+         methods: list[str] | None = None,
+         status_codes: list[int] | None = None,
+         resource_types: list[str] | None = None,
+     ) -> bytes:
+         """Export captured exchanges to HAR format.
+
+         Args:
+             url_pattern: Regex pattern for URL matching
+             methods: HTTP methods filter
+             status_codes: HTTP status codes filter
+             resource_types: Resource types filter
+
+         Returns:
+             HAR JSON as bytes
+         """
+         data = self._call(
+             "network_export_har",
+             url_pattern,
+             methods or [],
+             status_codes or [],
+             resource_types or [],
+         )
+         return data.get("har_data", b"")
+
+     # === Convenience Methods ===
+
+     def api_calls(self, url_pattern: str = "/api/") -> list[NetworkExchange]:
+         """Get XHR/Fetch API calls matching pattern."""
+         return self.filter(
+             url_pattern=url_pattern,
+             resource_types=["xhr", "fetch"],
+         )
+
+     def last_json(self, url_pattern: str = "") -> Any:
+         """Get JSON body from most recent matching response."""
+         exchange = self.last(url_pattern)
+         if exchange:
+             return exchange.json_body()
+         return None
+
+     def wait_for(self, url_pattern: str, timeout_ms: int = 30000) -> NetworkExchange | None:
+         """Wait for a matching request to be captured.
+
+         Args:
+             url_pattern: Regex pattern for URL
+             timeout_ms: Timeout in milliseconds
+
+         Returns:
+             Matching exchange or None if timeout
+         """
+         import time
+         start = time.time()
+         timeout_sec = timeout_ms / 1000
+
+         while time.time() - start < timeout_sec:
+             exchange = self.last(url_pattern)
+             if exchange:
+                 return exchange
+             time.sleep(0.1)
+
+         return None
+
+     # === Internal ===
+
+     def _parse_exchange(self, data: dict[str, Any]) -> NetworkExchange:
+         """Parse exchange from dict."""
+         request_data = data.get("request", {})
+         response_data = data.get("response")
+         timing_data = data.get("timing", {})
+
+         request = NetworkRequest(
+             url=request_data.get("url", ""),
+             method=request_data.get("method", "GET"),
+             headers=request_data.get("headers", {}),
+             body=request_data.get("body", b""),
+             content_type=request_data.get("content_type", ""),
+             resource_type=request_data.get("resource_type", ""),
+         )
+
+         response = None
+         if response_data:
+             response = NetworkResponse(
+                 status=response_data.get("status", 0),
+                 status_text=response_data.get("status_text", ""),
+                 headers=response_data.get("headers", {}),
+                 body=response_data.get("body", b""),
+                 content_type=response_data.get("content_type", ""),
+                 size=response_data.get("size", 0),
+                 from_cache=response_data.get("from_cache", False),
+             )
+
+         timing = NetworkTiming(
+             started_at_ms=timing_data.get("started_at_ms", 0),
+             ended_at_ms=timing_data.get("ended_at_ms", 0),
+             duration_ms=timing_data.get("duration_ms", 0),
+             wait_time_ms=timing_data.get("wait_time_ms", 0),
+             receive_time_ms=timing_data.get("receive_time_ms", 0),
+         )
+
+         return NetworkExchange(
+             id=data.get("id", ""),
+             request=request,
+             response=response,
+             timing=timing,
+             error=data.get("error", ""),
+             frame_id=data.get("frame_id", ""),
+             initiator=data.get("initiator", ""),
+         )
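
Editor's note: end to end, the capability composes into a short capture loop. A sketch under the same assumptions as the class docstring (a live `session` object; the URLs are placeholders):

    session.network.enable(max_exchanges=500)
    session.navigate("https://example.com")

    # Poll until a matching XHR/Fetch call is captured, then read its JSON body.
    exchange = session.network.wait_for(r"/api/items", timeout_ms=10000)
    if exchange and exchange.response:
        print(exchange.response.status, exchange.json_body())

    # Persist the XHR/Fetch traffic as a HAR file for offline inspection.
    with open("capture.har", "wb") as f:
        f.write(session.network.export_har(resource_types=["xhr", "fetch"]))

    session.network.disable()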