web2cli 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web2cli/__init__.py +3 -0
- web2cli/__main__.py +5 -0
- web2cli/adapter/__init__.py +0 -0
- web2cli/adapter/lint.py +667 -0
- web2cli/adapter/loader.py +157 -0
- web2cli/adapter/validator.py +127 -0
- web2cli/adapters/discord.com/web2cli.yaml +476 -0
- web2cli/adapters/mail.google.com/parsers/inbox.py +200 -0
- web2cli/adapters/mail.google.com/web2cli.yaml +52 -0
- web2cli/adapters/news.ycombinator.com/web2cli.yaml +356 -0
- web2cli/adapters/reddit.com/web2cli.yaml +233 -0
- web2cli/adapters/slack.com/web2cli.yaml +445 -0
- web2cli/adapters/stackoverflow.com/web2cli.yaml +257 -0
- web2cli/adapters/x.com/providers/x_graphql.py +299 -0
- web2cli/adapters/x.com/web2cli.yaml +449 -0
- web2cli/auth/__init__.py +0 -0
- web2cli/auth/browser_login.py +820 -0
- web2cli/auth/manager.py +166 -0
- web2cli/auth/store.py +68 -0
- web2cli/cli.py +1286 -0
- web2cli/executor/__init__.py +0 -0
- web2cli/executor/http.py +113 -0
- web2cli/output/__init__.py +0 -0
- web2cli/output/formatter.py +116 -0
- web2cli/parser/__init__.py +0 -0
- web2cli/parser/custom.py +21 -0
- web2cli/parser/html_parser.py +111 -0
- web2cli/parser/transforms.py +127 -0
- web2cli/pipe.py +10 -0
- web2cli/providers/__init__.py +6 -0
- web2cli/providers/base.py +22 -0
- web2cli/providers/registry.py +86 -0
- web2cli/runtime/__init__.py +1 -0
- web2cli/runtime/cache.py +42 -0
- web2cli/runtime/engine.py +743 -0
- web2cli/runtime/parser.py +398 -0
- web2cli/runtime/template.py +52 -0
- web2cli/types.py +71 -0
- web2cli-0.2.0.dist-info/METADATA +467 -0
- web2cli-0.2.0.dist-info/RECORD +44 -0
- web2cli-0.2.0.dist-info/WHEEL +5 -0
- web2cli-0.2.0.dist-info/entry_points.txt +2 -0
- web2cli-0.2.0.dist-info/licenses/LICENSE +202 -0
- web2cli-0.2.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,820 @@
|
|
|
1
|
+
"""Browser-assisted auth capture for login command."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import re
|
|
8
|
+
import shutil
|
|
9
|
+
import socket
|
|
10
|
+
import subprocess
|
|
11
|
+
import sys
|
|
12
|
+
import tempfile
|
|
13
|
+
import time
|
|
14
|
+
from dataclasses import dataclass
|
|
15
|
+
from collections.abc import Callable
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from urllib.request import urlopen
|
|
18
|
+
from urllib.parse import parse_qs, urlparse
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class BrowserLoginError(RuntimeError):
    """Raised when any step of the browser-assisted login flow fails."""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class BrowserLoginCancelled(RuntimeError):
    """Raised when the user aborts the interactive browser login."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
class TokenCaptureRule:
    """Declarative rule for extracting token from browser network requests."""

    # Where to read the token from: "request.header" or "request.form".
    source: str  # request.header | request.form
    # Header name (case-insensitive) or form field name holding the token.
    key: str
    # Optional host filter; subdomains of this host also match.
    host: str | None = None
    # Optional regex the request path must satisfy (re.search semantics).
    path_regex: str | None = None
    # Optional HTTP method filter (compared after upper-casing).
    method: str | None = None
    # Optional prefix (e.g. "Bearer ") stripped from the captured value.
    strip_prefix: str | None = None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass(frozen=True)
class AutoCdpSession:
    """Process/session details for auto-started local Chrome over CDP."""

    # Base URL of the DevTools endpoint, e.g. "http://127.0.0.1:<port>".
    cdp_url: str
    # Handle to the launched Chrome process; terminated during cleanup.
    process: subprocess.Popen[str]
    # Throwaway profile directory; removed by _stop_auto_cdp_session.
    user_data_dir: Path
    # TCP port the remote-debugging endpoint listens on.
    port: int
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _emit(status_cb: Callable[[str], None] | None, message: str) -> None:
|
|
52
|
+
if status_cb:
|
|
53
|
+
status_cb(message)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _emit_debug(debug_cb: Callable[[str], None] | None, message: str) -> None:
|
|
57
|
+
if debug_cb:
|
|
58
|
+
debug_cb(message)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _run_command(cmd: list[str]) -> None:
|
|
62
|
+
proc = subprocess.run(cmd, capture_output=True, text=True)
|
|
63
|
+
if proc.returncode == 0:
|
|
64
|
+
return
|
|
65
|
+
detail = (proc.stderr or proc.stdout or "").strip()
|
|
66
|
+
raise BrowserLoginError(
|
|
67
|
+
f"Command failed: {' '.join(cmd)}"
|
|
68
|
+
+ (f"\n{detail}" if detail else "")
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _ensure_playwright_package(status_cb: Callable[[str], None] | None):
    """Import Playwright's async API, installing the package on demand.

    Returns the ``playwright.async_api`` module. On the first import failure
    it pip-installs ``playwright`` and retries once.

    Raises:
        BrowserLoginError: if the module still cannot be imported after the
            install attempt (the original import error is chained as the cause).
    """
    try:
        from playwright import async_api as playwright_async_api

        return playwright_async_api
    except Exception:
        _emit(status_cb, "Installing Playwright Python package...")
        _run_command([sys.executable, "-m", "pip", "install", "playwright"])
        try:
            from playwright import async_api as playwright_async_api

            return playwright_async_api
        except Exception as e:
            # Chain the import error so the root cause survives in tracebacks.
            raise BrowserLoginError(
                f"Failed to import Playwright after install: {e}"
            ) from e
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _is_missing_browser_error(exc: Exception) -> bool:
|
|
89
|
+
text = str(exc).lower()
|
|
90
|
+
return (
|
|
91
|
+
"executable doesn't exist" in text
|
|
92
|
+
or "chromium distribution 'chrome'" in text
|
|
93
|
+
or "cannot find chromium" in text
|
|
94
|
+
or ("playwright install" in text and "chromium" in text)
|
|
95
|
+
)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _install_chromium(status_cb: Callable[[str], None] | None) -> None:
    """Download the Playwright-managed Chromium build (one-time)."""
    _emit(status_cb, "Installing browser engine (one-time, ~50MB)...")
    install_cmd = [sys.executable, "-m", "playwright", "install", "chromium"]
    _run_command(install_cmd)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _pick_free_port() -> int:
|
|
104
|
+
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
|
|
105
|
+
sock.bind(("127.0.0.1", 0))
|
|
106
|
+
sock.listen(1)
|
|
107
|
+
return int(sock.getsockname()[1])
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _find_chrome_executable() -> str | None:
|
|
111
|
+
if sys.platform == "darwin":
|
|
112
|
+
candidates = [
|
|
113
|
+
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
|
114
|
+
"/Applications/Chromium.app/Contents/MacOS/Chromium",
|
|
115
|
+
]
|
|
116
|
+
for candidate in candidates:
|
|
117
|
+
if Path(candidate).exists():
|
|
118
|
+
return candidate
|
|
119
|
+
return None
|
|
120
|
+
|
|
121
|
+
if sys.platform.startswith("win"):
|
|
122
|
+
candidates = [
|
|
123
|
+
r"C:\Program Files\Google\Chrome\Application\chrome.exe",
|
|
124
|
+
r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
|
|
125
|
+
]
|
|
126
|
+
for candidate in candidates:
|
|
127
|
+
if Path(candidate).exists():
|
|
128
|
+
return candidate
|
|
129
|
+
return None
|
|
130
|
+
|
|
131
|
+
for cmd in ("google-chrome", "google-chrome-stable", "chromium", "chromium-browser"):
|
|
132
|
+
found = shutil.which(cmd)
|
|
133
|
+
if found:
|
|
134
|
+
return found
|
|
135
|
+
return None
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _wait_for_cdp_ready(port: int, timeout_seconds: float = 12.0) -> bool:
|
|
139
|
+
url = f"http://127.0.0.1:{port}/json/version"
|
|
140
|
+
deadline = time.monotonic() + timeout_seconds
|
|
141
|
+
while True:
|
|
142
|
+
try:
|
|
143
|
+
with urlopen(url, timeout=1.0) as resp:
|
|
144
|
+
payload = resp.read().decode("utf-8", errors="ignore")
|
|
145
|
+
data = json.loads(payload or "{}")
|
|
146
|
+
if isinstance(data, dict) and data.get("webSocketDebuggerUrl"):
|
|
147
|
+
return True
|
|
148
|
+
except Exception:
|
|
149
|
+
pass
|
|
150
|
+
|
|
151
|
+
now = time.monotonic()
|
|
152
|
+
if now >= deadline:
|
|
153
|
+
return False
|
|
154
|
+
time.sleep(0.2)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _stop_auto_cdp_session(session: AutoCdpSession) -> None:
|
|
158
|
+
proc = session.process
|
|
159
|
+
try:
|
|
160
|
+
if proc.poll() is None:
|
|
161
|
+
proc.terminate()
|
|
162
|
+
try:
|
|
163
|
+
proc.wait(timeout=2.0)
|
|
164
|
+
except Exception:
|
|
165
|
+
proc.kill()
|
|
166
|
+
except Exception:
|
|
167
|
+
pass
|
|
168
|
+
shutil.rmtree(session.user_data_dir, ignore_errors=True)
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def find_local_chrome_executable() -> str | None:
    """Public wrapper over the platform-specific Chrome lookup."""
    located = _find_chrome_executable()
    return located
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def probe_cdp_endpoint(cdp_url: str, timeout_seconds: float = 2.0) -> bool:
    """Best-effort probe for a running CDP endpoint."""
    version_url = f"{cdp_url.rstrip('/')}/json/version"
    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        parsed = None
        try:
            with urlopen(version_url, timeout=1.0) as resp:
                raw = resp.read().decode("utf-8", errors="ignore")
            parsed = json.loads(raw or "{}")
        except Exception:
            # Unreachable or malformed - keep polling until the deadline.
            parsed = None
        if isinstance(parsed, dict) and parsed.get("webSocketDebuggerUrl"):
            return True
        time.sleep(0.15)
    return False
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def start_auto_cdp_chrome(
    *,
    status_cb: Callable[[str], None] | None = None,
    debug_cb: Callable[[str], None] | None = None,
    chrome_path: str | None = None,
    port: int | None = None,
    headless: bool = False,
) -> AutoCdpSession:
    """Start local Chrome with CDP and return session metadata."""
    launch_kwargs = {
        "status_cb": status_cb,
        "debug_cb": debug_cb,
        "chrome_path": chrome_path,
        "port": port,
        "headless": headless,
    }
    return _start_auto_cdp_chrome(**launch_kwargs)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def stop_auto_cdp_chrome(session: AutoCdpSession) -> None:
    """Tear down a session created by start_auto_cdp_chrome (process + profile)."""
    _stop_auto_cdp_session(session)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _start_auto_cdp_chrome(
    *,
    status_cb: Callable[[str], None] | None,
    debug_cb: Callable[[str], None] | None,
    chrome_path: str | None = None,
    port: int | None = None,
    headless: bool = False,
) -> AutoCdpSession:
    """Launch a local Chrome with remote debugging enabled.

    Uses *chrome_path* when given, otherwise auto-detects a local Chrome.
    A fresh throwaway profile directory is created so the user's normal
    profile is untouched. Raises BrowserLoginError when no binary is found
    or the CDP endpoint never becomes reachable.
    """
    binary = chrome_path or _find_chrome_executable()
    if not binary:
        raise BrowserLoginError(
            "Could not find local Chrome executable for --browser-cdp-auto. "
            "Use --browser-cdp-url or --browser-chrome-path."
        )

    # Pick an ephemeral port unless the caller pinned one.
    cdp_port = int(port or _pick_free_port())
    user_data_dir = Path(tempfile.mkdtemp(prefix="web2cli-cdp-"))
    args = [
        binary,
        f"--remote-debugging-port={cdp_port}",
        f"--user-data-dir={str(user_data_dir)}",
        "--no-first-run",
        "--no-default-browser-check",
    ]
    if headless:
        args.append("--headless=new")

    _emit(status_cb, "Starting local browser...")
    proc = subprocess.Popen(
        args,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        text=True,
    )
    if debug_cb:
        _emit_debug(debug_cb, f"cdp auto: chrome={binary}")
        _emit_debug(debug_cb, f"cdp auto: port={cdp_port} profile={user_data_dir}")

    ready = _wait_for_cdp_ready(cdp_port, timeout_seconds=12.0)
    if not ready:
        # Clean up the half-started process/profile before failing.
        _stop_auto_cdp_session(
            AutoCdpSession(
                cdp_url=f"http://127.0.0.1:{cdp_port}",
                process=proc,
                user_data_dir=user_data_dir,
                port=cdp_port,
            )
        )
        raise BrowserLoginError(
            "Failed to start local Chrome CDP endpoint. "
            "Try --browser-cdp-url with an existing Chrome instance."
        )

    return AutoCdpSession(
        cdp_url=f"http://127.0.0.1:{cdp_port}",
        process=proc,
        user_data_dir=user_data_dir,
        port=cdp_port,
    )
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def _header_value(headers: dict[str, str], key: str) -> str | None:
|
|
278
|
+
wanted = key.lower()
|
|
279
|
+
for hkey, hval in headers.items():
|
|
280
|
+
if str(hkey).lower() == wanted:
|
|
281
|
+
return str(hval)
|
|
282
|
+
return None
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def _multipart_form_value(post_data: str, key: str) -> str | None:
|
|
286
|
+
lines = post_data.splitlines()
|
|
287
|
+
if not lines:
|
|
288
|
+
return None
|
|
289
|
+
first = lines[0].strip()
|
|
290
|
+
if not first.startswith("--") or len(first) <= 2:
|
|
291
|
+
return None
|
|
292
|
+
boundary = first[2:]
|
|
293
|
+
delimiter = f"--{boundary}"
|
|
294
|
+
|
|
295
|
+
for raw_part in post_data.split(delimiter):
|
|
296
|
+
part = raw_part.strip()
|
|
297
|
+
if not part or part == "--":
|
|
298
|
+
continue
|
|
299
|
+
if part.endswith("--"):
|
|
300
|
+
part = part[:-2].rstrip()
|
|
301
|
+
|
|
302
|
+
header_blob, sep, body = part.partition("\r\n\r\n")
|
|
303
|
+
if not sep:
|
|
304
|
+
header_blob, sep, body = part.partition("\n\n")
|
|
305
|
+
if not sep:
|
|
306
|
+
continue
|
|
307
|
+
|
|
308
|
+
header_lines = [h.strip() for h in header_blob.splitlines() if h.strip()]
|
|
309
|
+
disposition = ""
|
|
310
|
+
for line in header_lines:
|
|
311
|
+
if line.lower().startswith("content-disposition:"):
|
|
312
|
+
disposition = line
|
|
313
|
+
break
|
|
314
|
+
if not disposition:
|
|
315
|
+
continue
|
|
316
|
+
|
|
317
|
+
needle = f'name="{key}"'
|
|
318
|
+
if needle not in disposition:
|
|
319
|
+
continue
|
|
320
|
+
|
|
321
|
+
value = body.strip("\r\n")
|
|
322
|
+
return value or None
|
|
323
|
+
return None
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _form_value(post_data: str, key: str) -> str | None:
|
|
327
|
+
parsed = parse_qs(post_data, keep_blank_values=True)
|
|
328
|
+
values = parsed.get(key)
|
|
329
|
+
if values:
|
|
330
|
+
value = values[0]
|
|
331
|
+
if value is not None:
|
|
332
|
+
return str(value)
|
|
333
|
+
|
|
334
|
+
# Fallback for multipart/form-data payloads.
|
|
335
|
+
return _multipart_form_value(post_data, key)
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _request_headers_safe(request) -> dict[str, str]:
|
|
339
|
+
try:
|
|
340
|
+
raw_headers = request.headers
|
|
341
|
+
except Exception:
|
|
342
|
+
return {}
|
|
343
|
+
if callable(raw_headers):
|
|
344
|
+
try:
|
|
345
|
+
raw_headers = raw_headers()
|
|
346
|
+
except Exception:
|
|
347
|
+
return {}
|
|
348
|
+
if isinstance(raw_headers, dict):
|
|
349
|
+
return dict(raw_headers)
|
|
350
|
+
return {}
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
def _request_post_data_safe(request) -> str:
|
|
354
|
+
# Prefer raw bytes if exposed by current Playwright version.
|
|
355
|
+
try:
|
|
356
|
+
raw_post_data = request.post_data_buffer
|
|
357
|
+
if callable(raw_post_data):
|
|
358
|
+
raw_post_data = raw_post_data()
|
|
359
|
+
if isinstance(raw_post_data, (bytes, bytearray)):
|
|
360
|
+
return bytes(raw_post_data).decode("utf-8", errors="ignore")
|
|
361
|
+
except Exception:
|
|
362
|
+
pass
|
|
363
|
+
|
|
364
|
+
# Fallback to decoded post_data; some requests can raise decode errors.
|
|
365
|
+
try:
|
|
366
|
+
raw_post_data = request.post_data
|
|
367
|
+
except Exception:
|
|
368
|
+
return ""
|
|
369
|
+
if callable(raw_post_data):
|
|
370
|
+
try:
|
|
371
|
+
raw_post_data = raw_post_data()
|
|
372
|
+
except Exception:
|
|
373
|
+
return ""
|
|
374
|
+
if isinstance(raw_post_data, (bytes, bytearray)):
|
|
375
|
+
return bytes(raw_post_data).decode("utf-8", errors="ignore")
|
|
376
|
+
return str(raw_post_data or "")
|
|
377
|
+
|
|
378
|
+
|
|
379
|
+
def _short_url(url: str, max_len: int = 88) -> str:
|
|
380
|
+
if len(url) <= max_len:
|
|
381
|
+
return url
|
|
382
|
+
return f"{url[: max_len - 3]}..."
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def _request_label(request) -> str:
|
|
386
|
+
try:
|
|
387
|
+
method = str(request.method or "").upper()
|
|
388
|
+
except Exception:
|
|
389
|
+
method = "?"
|
|
390
|
+
try:
|
|
391
|
+
raw_url = str(request.url or "")
|
|
392
|
+
except Exception:
|
|
393
|
+
raw_url = ""
|
|
394
|
+
if not raw_url:
|
|
395
|
+
return method
|
|
396
|
+
parsed = urlparse(raw_url)
|
|
397
|
+
host = parsed.hostname or ""
|
|
398
|
+
path = parsed.path or "/"
|
|
399
|
+
if parsed.query:
|
|
400
|
+
path = f"{path}?..."
|
|
401
|
+
return f"{method} {host}{path}"
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
def _token_rule_label(rule: TokenCaptureRule) -> str:
|
|
405
|
+
bits = [f"{rule.source}:{rule.key}"]
|
|
406
|
+
if rule.method:
|
|
407
|
+
bits.append(rule.method.upper())
|
|
408
|
+
if rule.host:
|
|
409
|
+
bits.append(rule.host)
|
|
410
|
+
if rule.path_regex:
|
|
411
|
+
bits.append(f"path~{rule.path_regex}")
|
|
412
|
+
return " ".join(bits)
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
def _request_route_info(request) -> tuple[str, str, str] | None:
|
|
416
|
+
try:
|
|
417
|
+
parsed = urlparse(str(request.url))
|
|
418
|
+
host = (parsed.hostname or "").lower()
|
|
419
|
+
path = parsed.path or "/"
|
|
420
|
+
method = str(request.method or "").upper()
|
|
421
|
+
except Exception:
|
|
422
|
+
return None
|
|
423
|
+
return host, path, method
|
|
424
|
+
|
|
425
|
+
|
|
426
|
+
def _request_matches_any_rule(request, rules: list[TokenCaptureRule]) -> bool:
    """True when the request's route satisfies at least one capture rule."""
    route = _request_route_info(request)
    if route is None:
        return False
    host, path, method = route
    return any(
        _rule_matches_request(rule, host=host, path=path, method=method)
        for rule in rules
    )
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
async def _apply_stealth_init_script(context) -> None:
    """Inject JS into every new page to mask common automation signals.

    Failures are swallowed: stealth is an optimization, not a requirement.
    """
    # Best-effort JS patches to reduce obvious automation fingerprints.
    script = """
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    window.chrome = window.chrome || { runtime: {} };
    Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    """
    try:
        await context.add_init_script(script)
    except Exception:
        # Some contexts (e.g. attached CDP ones) may reject init scripts.
        return
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
async def _launch_browser_with_fallback(
    playwright_async_api,
    debug_cb: Callable[[str], None] | None,
):
    """Launch a Chromium-family browser, trying Chrome first, then Chromium.

    Returns ``(browser, profile_name)`` where profile_name is "chrome" or
    "chromium". Raises BrowserLoginError with all per-profile errors joined
    when every launch attempt fails.
    """
    common_args = [
        "--disable-features=PrivateNetworkAccessRespectPreflightResults,"
        "BlockInsecurePrivateNetworkRequests",
        "--disable-blink-features=AutomationControlled",
    ]

    # Prefer user-installed Chrome first (looks less synthetic than Playwright Chromium).
    launch_profiles = [
        (
            "chrome",
            {
                "channel": "chrome",
                "headless": False,
                "args": common_args,
                # Dropping these default flags hides the automation banner.
                "ignore_default_args": ["--enable-automation", "--no-sandbox"],
            },
        ),
        (
            "chromium",
            {
                "headless": False,
                "args": common_args,
                "ignore_default_args": ["--enable-automation", "--no-sandbox"],
            },
        ),
    ]

    errors: list[str] = []
    for profile_name, options in launch_profiles:
        try:
            browser = await playwright_async_api.chromium.launch(**options)
            if debug_cb:
                _emit_debug(debug_cb, f"browser profile: {profile_name}")
            return browser, profile_name
        except Exception as e:
            # Record and fall through to the next profile.
            errors.append(f"{profile_name}: {e}")
            if debug_cb:
                _emit_debug(debug_cb, f"browser profile failed: {profile_name}: {e}")

    detail = " | ".join(errors)
    raise BrowserLoginError(f"Failed to launch browser ({detail})")
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
async def _open_browser_and_context(
    playwright_async_api,
    *,
    debug_cb: Callable[[str], None] | None,
    cdp_url: str | None,
):
    """Obtain a browser and context, via CDP attach or a fresh launch.

    Returns ``(browser, context, profile_name, managed)`` where *managed*
    is True only when this function launched the browser itself (and the
    caller is therefore responsible for closing it).
    """
    if cdp_url:
        # Attach to an already-running Chrome; reuse its first context so
        # existing login state (cookies) is visible.
        browser = await playwright_async_api.chromium.connect_over_cdp(cdp_url)
        contexts = list(browser.contexts)
        if contexts:
            context = contexts[0]
        else:
            context = await browser.new_context(viewport={"width": 1280, "height": 900})
        if debug_cb:
            _emit_debug(debug_cb, f"browser profile: cdp ({cdp_url})")
            _emit_debug(
                debug_cb,
                f"cdp contexts={len(contexts)} tabs={len(getattr(context, 'pages', []))}",
            )
        return browser, context, "cdp", False

    # No CDP endpoint: launch our own browser (managed=True).
    browser, browser_profile = await _launch_browser_with_fallback(
        playwright_async_api,
        debug_cb=debug_cb,
    )
    context = await browser.new_context(
        viewport={"width": 1280, "height": 900},
    )
    return browser, context, browser_profile, True
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
def _rule_matches_request(rule: TokenCaptureRule, *, host: str, path: str, method: str) -> bool:
|
|
530
|
+
if rule.host:
|
|
531
|
+
normalized = rule.host.lower()
|
|
532
|
+
if host != normalized and not host.endswith(f".{normalized}"):
|
|
533
|
+
return False
|
|
534
|
+
|
|
535
|
+
if rule.method and method != rule.method.upper():
|
|
536
|
+
return False
|
|
537
|
+
|
|
538
|
+
if rule.path_regex:
|
|
539
|
+
try:
|
|
540
|
+
if re.search(rule.path_regex, path) is None:
|
|
541
|
+
return False
|
|
542
|
+
except re.error:
|
|
543
|
+
return False
|
|
544
|
+
|
|
545
|
+
return True
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
def _extract_token_from_request(
    request,
    rules: list[TokenCaptureRule],
) -> tuple[str, str] | None:
    """Try each capture rule against the request; return (token, source) on hit.

    The source string describes which rule fired and on which request, for
    debug output. Returns None when no rule yields a non-empty value.
    """
    if not rules:
        return None

    route = _request_route_info(request)
    if route is None:
        return None
    host, path, method = route

    # Lazily read headers/body at most once, and only if some rule needs them.
    cached_headers: dict[str, str] | None = None
    cached_body: str | None = None

    for rule in rules:
        if not _rule_matches_request(rule, host=host, path=path, method=method):
            continue

        kind = rule.source.lower()
        candidate: str | None = None
        if kind == "request.header":
            if cached_headers is None:
                cached_headers = _request_headers_safe(request)
            candidate = _header_value(cached_headers, rule.key)
        elif kind == "request.form":
            if cached_body is None:
                cached_body = _request_post_data_safe(request)
            candidate = _form_value(cached_body, rule.key)

        if candidate is None:
            continue
        if rule.strip_prefix and candidate.startswith(rule.strip_prefix):
            candidate = candidate[len(rule.strip_prefix):]
        if candidate:
            return candidate, f"{_token_rule_label(rule)} <= {_request_label(request)}"

    return None
|
|
587
|
+
|
|
588
|
+
|
|
589
|
+
async def _capture_auth_once(
    playwright_async_api,
    domain: str,
    required_cookies: list[str],
    token_rules: list[TokenCaptureRule],
    poll_seconds: float = 1.0,
    debug_cb: Callable[[str], None] | None = None,
    cdp_url: str | None = None,
) -> tuple[dict[str, str], str | None]:
    """Drive one interactive login and wait for all auth values to appear.

    Opens ``https://<domain>`` in a browser (launched or CDP-attached),
    listens to outgoing network requests for token captures, and polls the
    context's cookies until every required cookie exists and - when token
    rules are configured - a token has been captured.

    Returns ``(cookies, token)`` where *cookies* maps each required cookie
    name to its value and *token* is None when no token rules were given.

    Raises:
        BrowserLoginError: if neither cookies nor token rules are configured.
    """
    # Drop empty cookie names so they can't block completion forever.
    required = [c for c in required_cookies if c]
    if not required and not token_rules:
        raise BrowserLoginError(
            "No required cookie keys or token capture rules configured for browser login"
        )

    async with playwright_async_api.async_playwright() as p:
        # managed_browser=True means we launched it and must close it ourselves.
        browser, context, browser_profile, managed_browser = await _open_browser_and_context(
            p,
            debug_cb=debug_cb,
            cdp_url=cdp_url,
        )
        await _apply_stealth_init_script(context)
        if debug_cb:
            _emit_debug(debug_cb, f"context ready ({browser_profile}, viewport=1280x900)")

        # Mutable capture state shared with the request listener below.
        captured_token: str | None = None
        captured_token_source: str | None = None
        token_candidate_count = 0
        last_token_candidate: str | None = None
        last_debug_state: tuple | None = None

        if debug_cb:
            if required:
                _emit_debug(debug_cb, f"required cookies: {', '.join(required)}")
            else:
                _emit_debug(debug_cb, "required cookies: none")
            if token_rules:
                for idx, rule in enumerate(token_rules, start=1):
                    _emit_debug(debug_cb, f"token rule[{idx}]: {_token_rule_label(rule)}")
            else:
                _emit_debug(debug_cb, "token capture: none")

        def _handle_request(request) -> None:
            # Network listener: record rule candidates and capture the first token.
            nonlocal captured_token
            nonlocal captured_token_source
            nonlocal token_candidate_count
            nonlocal last_token_candidate
            if captured_token is not None:
                return
            try:
                if _request_matches_any_rule(request, token_rules):
                    token_candidate_count += 1
                    last_token_candidate = _request_label(request)
                    token_match = _extract_token_from_request(request, token_rules)
                    if token_match:
                        captured_token, captured_token_source = token_match
                        if debug_cb:
                            _emit_debug(debug_cb, f"captured token via {captured_token_source}")
            except Exception:
                # Never let Playwright request events crash the login flow.
                return

        context.on("request", _handle_request)
        page = await context.new_page()
        await page.goto(f"https://{domain}", wait_until="domcontentloaded")

        # Poll until every required cookie (and token, if configured) exists.
        while True:
            cookies = await context.cookies()
            by_name = {
                str(c.get("name", "")): str(c.get("value", ""))
                for c in cookies
                if c.get("name")
            }
            have_cookies = [name for name in required if name in by_name]
            missing_cookies = [name for name in required if name not in by_name]

            tab_urls: list[str] = []
            for pg in list(context.pages):
                try:
                    tab_urls.append(_short_url(str(pg.url or "")))
                except Exception:
                    tab_urls.append("<unknown>")

            if debug_cb:
                # Only emit a debug line when something observable changed.
                debug_state = (
                    tuple(have_cookies),
                    tuple(missing_cookies),
                    bool(captured_token),
                    captured_token_source or "",
                    token_candidate_count,
                    last_token_candidate or "",
                    tuple(tab_urls),
                )
                if debug_state != last_debug_state:
                    cookies_summary = (
                        f"cookies {len(have_cookies)}/{len(required)}"
                        if required
                        else "cookies n/a"
                    )
                    if required:
                        have_txt = ",".join(have_cookies) if have_cookies else "-"
                        missing_txt = ",".join(missing_cookies) if missing_cookies else "-"
                        cookies_summary = (
                            f"{cookies_summary} have=[{have_txt}] missing=[{missing_txt}]"
                        )
                    token_summary = (
                        f"token={'present' if captured_token else 'missing'}"
                        + (
                            f" ({captured_token_source})"
                            if captured_token and captured_token_source
                            else ""
                        )
                    )
                    if not captured_token:
                        token_summary = (
                            f"{token_summary} candidates={token_candidate_count}"
                            + (
                                f" last={last_token_candidate}"
                                if last_token_candidate
                                else ""
                            )
                        )
                    tabs_summary = (
                        f"tabs={len(tab_urls)} "
                        + "; ".join(tab_urls if tab_urls else ["<none>"])
                    )
                    _emit_debug(debug_cb, f"{cookies_summary} | {token_summary} | {tabs_summary}")
                    last_debug_state = debug_state

            cookies_ready = all(name in by_name for name in required)
            token_ready = (not token_rules) or (captured_token is not None)
            if cookies_ready and token_ready:
                if managed_browser:
                    await browser.close()
                else:
                    # Attached via CDP: close only our tab, leave the
                    # user's browser running.
                    try:
                        await page.close()
                    except Exception:
                        pass
                out_cookies = {name: by_name[name] for name in required}
                return out_cookies, captured_token
            await asyncio.sleep(poll_seconds)
|
|
731
|
+
|
|
732
|
+
|
|
733
|
+
def capture_auth_with_browser(
    domain: str,
    required_cookies: list[str],
    token_rules: list[TokenCaptureRule] | None = None,
    status_cb: Callable[[str], None] | None = None,
    debug_cb: Callable[[str], None] | None = None,
    cdp_url: str | None = None,
    cdp_auto: bool = False,
    cdp_port: int | None = None,
    chrome_path: str | None = None,
) -> tuple[dict[str, str], str | None]:
    """Open browser and wait until required auth values are present.

    Synchronous entry point: installs Playwright if needed, prefers a local
    Chrome over CDP (auto-started unless *cdp_url* is given), falls back to
    a Playwright-launched browser, and retries once after installing the
    Chromium engine when the failure looks like a missing browser.

    Returns ``(cookies, token)`` from :func:`_capture_auth_once`.

    Raises:
        BrowserLoginCancelled: when the user interrupts (Ctrl-C).
        BrowserLoginError: for any other failure.
    """
    normalized_rules = token_rules or []
    playwright_async_api = _ensure_playwright_package(status_cb)
    auto_session: AutoCdpSession | None = None

    try:
        # Default behavior: transparently prefer local Chrome via CDP when URL isn't explicit.
        prefer_auto_cdp = cdp_auto or (cdp_url is None)
        if prefer_auto_cdp and cdp_url is None:
            try:
                auto_session = _start_auto_cdp_chrome(
                    status_cb=status_cb,
                    debug_cb=debug_cb,
                    chrome_path=chrome_path,
                    port=cdp_port,
                )
                cdp_url = auto_session.cdp_url
                if debug_cb:
                    _emit_debug(debug_cb, f"cdp auto ready: {cdp_url}")
            except BrowserLoginError as e:
                # When --browser-cdp-auto was explicit, surface the failure.
                if cdp_auto:
                    raise
                # Silent fallback for default --browser mode.
                _emit(status_cb, "Local browser unavailable, falling back to embedded browser...")
                if debug_cb:
                    _emit_debug(debug_cb, f"cdp auto unavailable, fallback to playwright: {e}")
                cdp_url = None

        return asyncio.run(
            _capture_auth_once(
                playwright_async_api,
                domain,
                required_cookies,
                normalized_rules,
                debug_cb=debug_cb,
                cdp_url=cdp_url,
            )
        )
    except KeyboardInterrupt:
        raise BrowserLoginCancelled("Login cancelled by user")
    except Exception as e:
        if _is_missing_browser_error(e):
            # One-time Chromium download, then a single retry.
            _install_chromium(status_cb)
            try:
                return asyncio.run(
                    _capture_auth_once(
                        playwright_async_api,
                        domain,
                        required_cookies,
                        normalized_rules,
                        debug_cb=debug_cb,
                        cdp_url=cdp_url,
                    )
                )
            except KeyboardInterrupt:
                raise BrowserLoginCancelled("Login cancelled by user")
            except Exception as inner:
                raise BrowserLoginError(str(inner))
        raise BrowserLoginError(str(e))
    finally:
        # Always tear down an auto-started Chrome (process + temp profile).
        if auto_session is not None:
            _stop_auto_cdp_session(auto_session)
|
|
806
|
+
|
|
807
|
+
|
|
808
|
+
def capture_cookies_with_browser(
    domain: str,
    required_cookies: list[str],
    status_cb: Callable[[str], None] | None = None,
) -> dict[str, str]:
    """Backward-compatible wrapper returning only cookies."""
    captured, _token = capture_auth_with_browser(
        domain=domain,
        required_cookies=required_cookies,
        token_rules=[],
        status_cb=status_cb,
    )
    return captured
|