web2cli 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. web2cli/__init__.py +3 -0
  2. web2cli/__main__.py +5 -0
  3. web2cli/adapter/__init__.py +0 -0
  4. web2cli/adapter/lint.py +667 -0
  5. web2cli/adapter/loader.py +157 -0
  6. web2cli/adapter/validator.py +127 -0
  7. web2cli/adapters/discord.com/web2cli.yaml +476 -0
  8. web2cli/adapters/mail.google.com/parsers/inbox.py +200 -0
  9. web2cli/adapters/mail.google.com/web2cli.yaml +52 -0
  10. web2cli/adapters/news.ycombinator.com/web2cli.yaml +356 -0
  11. web2cli/adapters/reddit.com/web2cli.yaml +233 -0
  12. web2cli/adapters/slack.com/web2cli.yaml +445 -0
  13. web2cli/adapters/stackoverflow.com/web2cli.yaml +257 -0
  14. web2cli/adapters/x.com/providers/x_graphql.py +299 -0
  15. web2cli/adapters/x.com/web2cli.yaml +449 -0
  16. web2cli/auth/__init__.py +0 -0
  17. web2cli/auth/browser_login.py +820 -0
  18. web2cli/auth/manager.py +166 -0
  19. web2cli/auth/store.py +68 -0
  20. web2cli/cli.py +1286 -0
  21. web2cli/executor/__init__.py +0 -0
  22. web2cli/executor/http.py +113 -0
  23. web2cli/output/__init__.py +0 -0
  24. web2cli/output/formatter.py +116 -0
  25. web2cli/parser/__init__.py +0 -0
  26. web2cli/parser/custom.py +21 -0
  27. web2cli/parser/html_parser.py +111 -0
  28. web2cli/parser/transforms.py +127 -0
  29. web2cli/pipe.py +10 -0
  30. web2cli/providers/__init__.py +6 -0
  31. web2cli/providers/base.py +22 -0
  32. web2cli/providers/registry.py +86 -0
  33. web2cli/runtime/__init__.py +1 -0
  34. web2cli/runtime/cache.py +42 -0
  35. web2cli/runtime/engine.py +743 -0
  36. web2cli/runtime/parser.py +398 -0
  37. web2cli/runtime/template.py +52 -0
  38. web2cli/types.py +71 -0
  39. web2cli-0.2.0.dist-info/METADATA +467 -0
  40. web2cli-0.2.0.dist-info/RECORD +44 -0
  41. web2cli-0.2.0.dist-info/WHEEL +5 -0
  42. web2cli-0.2.0.dist-info/entry_points.txt +2 -0
  43. web2cli-0.2.0.dist-info/licenses/LICENSE +202 -0
  44. web2cli-0.2.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,820 @@
1
+ """Browser-assisted auth capture for login command."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ import re
8
+ import shutil
9
+ import socket
10
+ import subprocess
11
+ import sys
12
+ import tempfile
13
+ import time
14
+ from dataclasses import dataclass
15
+ from collections.abc import Callable
16
+ from pathlib import Path
17
+ from urllib.request import urlopen
18
+ from urllib.parse import parse_qs, urlparse
19
+
20
+
21
class BrowserLoginError(RuntimeError):
    """Signals that the browser-assisted login flow could not be completed."""
23
+
24
+
25
class BrowserLoginCancelled(RuntimeError):
    """Signals that the user aborted the browser login before it finished."""
27
+
28
+
29
@dataclass(frozen=True)
class TokenCaptureRule:
    """Immutable description of where a token can be found in a browser request."""

    # Part of the request to inspect: "request.header" or "request.form".
    source: str
    # Name of the header or form field that carries the token.
    key: str
    # Optional host filter; the request host must equal it or be a subdomain of it.
    host: str | None = None
    # Optional regular expression searched against the request path.
    path_regex: str | None = None
    # Optional HTTP method filter (compared case-insensitively).
    method: str | None = None
    # Optional prefix removed from the start of the captured value.
    strip_prefix: str | None = None
39
+
40
+
41
@dataclass(frozen=True)
class AutoCdpSession:
    """Handle describing a Chrome instance this module launched with CDP enabled."""

    # Base HTTP URL of the DevTools endpoint, e.g. "http://127.0.0.1:<port>".
    cdp_url: str
    # The spawned Chrome process.
    process: subprocess.Popen[str]
    # Temporary profile directory handed to Chrome via --user-data-dir.
    user_data_dir: Path
    # Remote-debugging port Chrome was started with.
    port: int
49
+
50
+
51
+ def _emit(status_cb: Callable[[str], None] | None, message: str) -> None:
52
+ if status_cb:
53
+ status_cb(message)
54
+
55
+
56
+ def _emit_debug(debug_cb: Callable[[str], None] | None, message: str) -> None:
57
+ if debug_cb:
58
+ debug_cb(message)
59
+
60
+
61
+ def _run_command(cmd: list[str]) -> None:
62
+ proc = subprocess.run(cmd, capture_output=True, text=True)
63
+ if proc.returncode == 0:
64
+ return
65
+ detail = (proc.stderr or proc.stdout or "").strip()
66
+ raise BrowserLoginError(
67
+ f"Command failed: {' '.join(cmd)}"
68
+ + (f"\n{detail}" if detail else "")
69
+ )
70
+
71
+
72
def _ensure_playwright_package(status_cb: Callable[[str], None] | None):
    """Return the ``playwright.async_api`` module, installing Playwright if needed.

    The import is attempted first. If it fails for any reason, Playwright is
    installed with ``pip`` into the running interpreter and the import is
    retried once.

    Args:
        status_cb: Optional callback that receives a progress message before
            the install starts.

    Returns:
        The imported ``playwright.async_api`` module.

    Raises:
        BrowserLoginError: if the pip install fails, or if the import still
            fails after a successful install.
    """
    try:
        from playwright import async_api as playwright_async_api

        return playwright_async_api
    except Exception:
        # Any import failure (missing or broken install) leads to the install
        # attempt below; the original error is deliberately not surfaced.
        pass

    _emit(status_cb, "Installing Playwright Python package...")
    _run_command([sys.executable, "-m", "pip", "install", "playwright"])

    # The package was installed by a child process while this interpreter was
    # already running; without this the import system's cached directory
    # listings may not include the freshly created package.
    import importlib

    importlib.invalidate_caches()

    try:
        from playwright import async_api as playwright_async_api

        return playwright_async_api
    except Exception as e:
        raise BrowserLoginError(f"Failed to import Playwright after install: {e}") from e
86
+
87
+
88
+ def _is_missing_browser_error(exc: Exception) -> bool:
89
+ text = str(exc).lower()
90
+ return (
91
+ "executable doesn't exist" in text
92
+ or "chromium distribution 'chrome'" in text
93
+ or "cannot find chromium" in text
94
+ or ("playwright install" in text and "chromium" in text)
95
+ )
96
+
97
+
98
def _install_chromium(status_cb: Callable[[str], None] | None) -> None:
    """Download Playwright's Chromium build using the current interpreter.

    Emits a status message first, then runs ``python -m playwright install
    chromium``. Any failure surfaces as whatever ``_run_command`` raises.
    """
    install_cmd = [sys.executable, "-m", "playwright", "install", "chromium"]
    _emit(status_cb, "Installing browser engine (one-time, ~50MB)...")
    _run_command(install_cmd)
101
+
102
+
103
+ def _pick_free_port() -> int:
104
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
105
+ sock.bind(("127.0.0.1", 0))
106
+ sock.listen(1)
107
+ return int(sock.getsockname()[1])
108
+
109
+
110
+ def _find_chrome_executable() -> str | None:
111
+ if sys.platform == "darwin":
112
+ candidates = [
113
+ "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
114
+ "/Applications/Chromium.app/Contents/MacOS/Chromium",
115
+ ]
116
+ for candidate in candidates:
117
+ if Path(candidate).exists():
118
+ return candidate
119
+ return None
120
+
121
+ if sys.platform.startswith("win"):
122
+ candidates = [
123
+ r"C:\Program Files\Google\Chrome\Application\chrome.exe",
124
+ r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
125
+ ]
126
+ for candidate in candidates:
127
+ if Path(candidate).exists():
128
+ return candidate
129
+ return None
130
+
131
+ for cmd in ("google-chrome", "google-chrome-stable", "chromium", "chromium-browser"):
132
+ found = shutil.which(cmd)
133
+ if found:
134
+ return found
135
+ return None
136
+
137
+
138
+ def _wait_for_cdp_ready(port: int, timeout_seconds: float = 12.0) -> bool:
139
+ url = f"http://127.0.0.1:{port}/json/version"
140
+ deadline = time.monotonic() + timeout_seconds
141
+ while True:
142
+ try:
143
+ with urlopen(url, timeout=1.0) as resp:
144
+ payload = resp.read().decode("utf-8", errors="ignore")
145
+ data = json.loads(payload or "{}")
146
+ if isinstance(data, dict) and data.get("webSocketDebuggerUrl"):
147
+ return True
148
+ except Exception:
149
+ pass
150
+
151
+ now = time.monotonic()
152
+ if now >= deadline:
153
+ return False
154
+ time.sleep(0.2)
155
+
156
+
157
+ def _stop_auto_cdp_session(session: AutoCdpSession) -> None:
158
+ proc = session.process
159
+ try:
160
+ if proc.poll() is None:
161
+ proc.terminate()
162
+ try:
163
+ proc.wait(timeout=2.0)
164
+ except Exception:
165
+ proc.kill()
166
+ except Exception:
167
+ pass
168
+ shutil.rmtree(session.user_data_dir, ignore_errors=True)
169
+
170
+
171
def find_local_chrome_executable() -> str | None:
    """Public entry point for locating Chrome/Chromium; ``None`` when not found."""
    executable = _find_chrome_executable()
    return executable
174
+
175
+
176
def probe_cdp_endpoint(cdp_url: str, timeout_seconds: float = 2.0) -> bool:
    """Best-effort probe for a running CDP endpoint.

    Requests ``<cdp_url>/json/version`` and reports success once the response
    is a JSON object with a non-empty ``webSocketDebuggerUrl``. Failed
    attempts are retried until *timeout_seconds* has elapsed.

    At least one attempt is always made, so a zero or negative timeout means
    "try exactly once" rather than "never try". No sleep is taken when the
    deadline would pass before the next attempt could start.

    Args:
        cdp_url: Base HTTP URL of the DevTools endpoint; a trailing slash is
            tolerated.
        timeout_seconds: Overall retry budget. Each request has its own
            1-second timeout, so the call can overrun the budget by up to
            that much.

    Returns:
        ``True`` if the endpoint answered as described, otherwise ``False``.
    """
    url = cdp_url.rstrip("/") + "/json/version"
    retry_delay = 0.15
    deadline = time.monotonic() + timeout_seconds
    while True:
        try:
            with urlopen(url, timeout=1.0) as resp:
                payload = resp.read().decode("utf-8", errors="ignore")
            data = json.loads(payload or "{}")
            if isinstance(data, dict) and data.get("webSocketDebuggerUrl"):
                return True
        except Exception:
            # Connection refused, bad JSON, etc. all mean "not available (yet)".
            pass

        # Stop instead of sleeping when the retry would start past the deadline.
        if time.monotonic() + retry_delay >= deadline:
            return False
        time.sleep(retry_delay)
191
+
192
+
193
def start_auto_cdp_chrome(
    *,
    status_cb: Callable[[str], None] | None = None,
    debug_cb: Callable[[str], None] | None = None,
    chrome_path: str | None = None,
    port: int | None = None,
    headless: bool = False,
) -> AutoCdpSession:
    """Launch a local Chrome with remote debugging and describe the session.

    Public wrapper that passes every argument through unchanged to the
    private implementation.
    """
    launch_options = {
        "status_cb": status_cb,
        "debug_cb": debug_cb,
        "chrome_path": chrome_path,
        "port": port,
        "headless": headless,
    }
    return _start_auto_cdp_chrome(**launch_options)
209
+
210
+
211
def stop_auto_cdp_chrome(session: AutoCdpSession) -> None:
    """Public wrapper: stop the auto-started Chrome and remove its temp profile."""
    _stop_auto_cdp_session(session)
    return None
214
+
215
+
216
def _start_auto_cdp_chrome(
    *,
    status_cb: Callable[[str], None] | None,
    debug_cb: Callable[[str], None] | None,
    chrome_path: str | None = None,
    port: int | None = None,
    headless: bool = False,
) -> AutoCdpSession:
    """Spawn Chrome with a throwaway profile and wait for its CDP endpoint.

    Args:
        status_cb: Optional callback for user-facing progress messages.
        debug_cb: Optional callback for diagnostic messages.
        chrome_path: Explicit browser binary; auto-detected when omitted.
        port: Remote-debugging port; a free port is picked when omitted (or 0).
        headless: When true, Chrome is started with ``--headless=new``.

    Returns:
        Session details for the running browser.

    Raises:
        BrowserLoginError: if no browser binary can be found, the process
            cannot be spawned, or the CDP endpoint does not come up within
            12 seconds. The temporary profile is removed in the last two cases.
    """
    binary = chrome_path or _find_chrome_executable()
    if not binary:
        raise BrowserLoginError(
            "Could not find local Chrome executable for --browser-cdp-auto. "
            "Use --browser-cdp-url or --browser-chrome-path."
        )

    cdp_port = int(port or _pick_free_port())
    user_data_dir = Path(tempfile.mkdtemp(prefix="web2cli-cdp-"))
    args = [
        binary,
        f"--remote-debugging-port={cdp_port}",
        f"--user-data-dir={str(user_data_dir)}",
        "--no-first-run",
        "--no-default-browser-check",
    ]
    if headless:
        args.append("--headless=new")

    _emit(status_cb, "Starting local browser...")
    try:
        proc = subprocess.Popen(
            args,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            text=True,
        )
    except OSError as e:
        # A bad or non-executable path must not leak the temp profile, and it
        # must surface as BrowserLoginError so callers' fallback logic applies.
        shutil.rmtree(user_data_dir, ignore_errors=True)
        raise BrowserLoginError(f"Failed to start local Chrome ({binary}): {e}") from e

    session = AutoCdpSession(
        cdp_url=f"http://127.0.0.1:{cdp_port}",
        process=proc,
        user_data_dir=user_data_dir,
        port=cdp_port,
    )
    _emit_debug(debug_cb, f"cdp auto: chrome={binary}")
    _emit_debug(debug_cb, f"cdp auto: port={cdp_port} profile={user_data_dir}")

    if not _wait_for_cdp_ready(cdp_port, timeout_seconds=12.0):
        # Endpoint never came up: tear down the process and the profile.
        _stop_auto_cdp_session(session)
        raise BrowserLoginError(
            "Failed to start local Chrome CDP endpoint. "
            "Try --browser-cdp-url with an existing Chrome instance."
        )

    return session
275
+
276
+
277
+ def _header_value(headers: dict[str, str], key: str) -> str | None:
278
+ wanted = key.lower()
279
+ for hkey, hval in headers.items():
280
+ if str(hkey).lower() == wanted:
281
+ return str(hval)
282
+ return None
283
+
284
+
285
+ def _multipart_form_value(post_data: str, key: str) -> str | None:
286
+ lines = post_data.splitlines()
287
+ if not lines:
288
+ return None
289
+ first = lines[0].strip()
290
+ if not first.startswith("--") or len(first) <= 2:
291
+ return None
292
+ boundary = first[2:]
293
+ delimiter = f"--{boundary}"
294
+
295
+ for raw_part in post_data.split(delimiter):
296
+ part = raw_part.strip()
297
+ if not part or part == "--":
298
+ continue
299
+ if part.endswith("--"):
300
+ part = part[:-2].rstrip()
301
+
302
+ header_blob, sep, body = part.partition("\r\n\r\n")
303
+ if not sep:
304
+ header_blob, sep, body = part.partition("\n\n")
305
+ if not sep:
306
+ continue
307
+
308
+ header_lines = [h.strip() for h in header_blob.splitlines() if h.strip()]
309
+ disposition = ""
310
+ for line in header_lines:
311
+ if line.lower().startswith("content-disposition:"):
312
+ disposition = line
313
+ break
314
+ if not disposition:
315
+ continue
316
+
317
+ needle = f'name="{key}"'
318
+ if needle not in disposition:
319
+ continue
320
+
321
+ value = body.strip("\r\n")
322
+ return value or None
323
+ return None
324
+
325
+
326
+ def _form_value(post_data: str, key: str) -> str | None:
327
+ parsed = parse_qs(post_data, keep_blank_values=True)
328
+ values = parsed.get(key)
329
+ if values:
330
+ value = values[0]
331
+ if value is not None:
332
+ return str(value)
333
+
334
+ # Fallback for multipart/form-data payloads.
335
+ return _multipart_form_value(post_data, key)
336
+
337
+
338
+ def _request_headers_safe(request) -> dict[str, str]:
339
+ try:
340
+ raw_headers = request.headers
341
+ except Exception:
342
+ return {}
343
+ if callable(raw_headers):
344
+ try:
345
+ raw_headers = raw_headers()
346
+ except Exception:
347
+ return {}
348
+ if isinstance(raw_headers, dict):
349
+ return dict(raw_headers)
350
+ return {}
351
+
352
+
353
+ def _request_post_data_safe(request) -> str:
354
+ # Prefer raw bytes if exposed by current Playwright version.
355
+ try:
356
+ raw_post_data = request.post_data_buffer
357
+ if callable(raw_post_data):
358
+ raw_post_data = raw_post_data()
359
+ if isinstance(raw_post_data, (bytes, bytearray)):
360
+ return bytes(raw_post_data).decode("utf-8", errors="ignore")
361
+ except Exception:
362
+ pass
363
+
364
+ # Fallback to decoded post_data; some requests can raise decode errors.
365
+ try:
366
+ raw_post_data = request.post_data
367
+ except Exception:
368
+ return ""
369
+ if callable(raw_post_data):
370
+ try:
371
+ raw_post_data = raw_post_data()
372
+ except Exception:
373
+ return ""
374
+ if isinstance(raw_post_data, (bytes, bytearray)):
375
+ return bytes(raw_post_data).decode("utf-8", errors="ignore")
376
+ return str(raw_post_data or "")
377
+
378
+
379
+ def _short_url(url: str, max_len: int = 88) -> str:
380
+ if len(url) <= max_len:
381
+ return url
382
+ return f"{url[: max_len - 3]}..."
383
+
384
+
385
+ def _request_label(request) -> str:
386
+ try:
387
+ method = str(request.method or "").upper()
388
+ except Exception:
389
+ method = "?"
390
+ try:
391
+ raw_url = str(request.url or "")
392
+ except Exception:
393
+ raw_url = ""
394
+ if not raw_url:
395
+ return method
396
+ parsed = urlparse(raw_url)
397
+ host = parsed.hostname or ""
398
+ path = parsed.path or "/"
399
+ if parsed.query:
400
+ path = f"{path}?..."
401
+ return f"{method} {host}{path}"
402
+
403
+
404
+ def _token_rule_label(rule: TokenCaptureRule) -> str:
405
+ bits = [f"{rule.source}:{rule.key}"]
406
+ if rule.method:
407
+ bits.append(rule.method.upper())
408
+ if rule.host:
409
+ bits.append(rule.host)
410
+ if rule.path_regex:
411
+ bits.append(f"path~{rule.path_regex}")
412
+ return " ".join(bits)
413
+
414
+
415
+ def _request_route_info(request) -> tuple[str, str, str] | None:
416
+ try:
417
+ parsed = urlparse(str(request.url))
418
+ host = (parsed.hostname or "").lower()
419
+ path = parsed.path or "/"
420
+ method = str(request.method or "").upper()
421
+ except Exception:
422
+ return None
423
+ return host, path, method
424
+
425
+
426
def _request_matches_any_rule(request, rules: list[TokenCaptureRule]) -> bool:
    """Tell whether the request's host/path/method satisfies at least one rule."""
    route = _request_route_info(request)
    if route is None:
        return False
    req_host, req_path, req_method = route
    return any(
        _rule_matches_request(rule, host=req_host, path=req_path, method=req_method)
        for rule in rules
    )
435
+
436
+
437
async def _apply_stealth_init_script(context) -> None:
    """Register an init script that hides common automation fingerprints.

    Best-effort: if the script cannot be registered the error is swallowed
    and the login flow continues without it.
    """
    # Overrides navigator.webdriver/languages/plugins and ensures window.chrome exists.
    stealth_js = """
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
    window.chrome = window.chrome || { runtime: {} };
    Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] });
    Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5] });
    """
    try:
        await context.add_init_script(stealth_js)
    except Exception:
        pass
449
+
450
+
451
async def _launch_browser_with_fallback(
    playwright_async_api,
    debug_cb: Callable[[str], None] | None,
):
    """Launch a headed browser, trying installed Chrome before bundled Chromium.

    Returns:
        ``(browser, profile_name)`` for the first profile that launches.

    Raises:
        BrowserLoginError: if every profile fails. The message lists each
            profile's error, joined by ``" | "``.
    """
    shared_args = [
        "--disable-features=PrivateNetworkAccessRespectPreflightResults,"
        "BlockInsecurePrivateNetworkRequests",
        "--disable-blink-features=AutomationControlled",
    ]

    def _profile(**extra):
        # Every profile is headed, shares the flags above, and drops two default args.
        return {
            **extra,
            "headless": False,
            "args": shared_args,
            "ignore_default_args": ["--enable-automation", "--no-sandbox"],
        }

    # User-installed Chrome goes first: it looks less synthetic than Playwright Chromium.
    attempts = [
        ("chrome", _profile(channel="chrome")),
        ("chromium", _profile()),
    ]

    failures: list[str] = []
    for label, launch_kwargs in attempts:
        try:
            browser = await playwright_async_api.chromium.launch(**launch_kwargs)
            _emit_debug(debug_cb, f"browser profile: {label}")
            return browser, label
        except Exception as exc:
            failures.append(f"{label}: {exc}")
            _emit_debug(debug_cb, f"browser profile failed: {label}: {exc}")

    raise BrowserLoginError(f"Failed to launch browser ({' | '.join(failures)})")
496
+
497
+
498
async def _open_browser_and_context(
    playwright_async_api,
    *,
    debug_cb: Callable[[str], None] | None,
    cdp_url: str | None,
):
    """Obtain a browser and a browser context for the login flow.

    With *cdp_url* set, attaches to that browser over CDP and reuses its first
    existing context (creating one only if there is none). Otherwise launches
    a new browser and creates a fresh context.

    Returns:
        ``(browser, context, profile_name, managed)``. ``managed`` is ``True``
        only when this function launched the browser itself.
    """
    viewport = {"width": 1280, "height": 900}

    if not cdp_url:
        browser, launched_profile = await _launch_browser_with_fallback(
            playwright_async_api,
            debug_cb=debug_cb,
        )
        context = await browser.new_context(viewport=viewport)
        return browser, context, launched_profile, True

    browser = await playwright_async_api.chromium.connect_over_cdp(cdp_url)
    existing = list(browser.contexts)
    context = existing[0] if existing else await browser.new_context(viewport=viewport)
    if debug_cb:
        tab_count = len(getattr(context, "pages", []))
        _emit_debug(debug_cb, f"browser profile: cdp ({cdp_url})")
        _emit_debug(debug_cb, f"cdp contexts={len(existing)} tabs={tab_count}")
    return browser, context, "cdp", False
527
+
528
+
529
+ def _rule_matches_request(rule: TokenCaptureRule, *, host: str, path: str, method: str) -> bool:
530
+ if rule.host:
531
+ normalized = rule.host.lower()
532
+ if host != normalized and not host.endswith(f".{normalized}"):
533
+ return False
534
+
535
+ if rule.method and method != rule.method.upper():
536
+ return False
537
+
538
+ if rule.path_regex:
539
+ try:
540
+ if re.search(rule.path_regex, path) is None:
541
+ return False
542
+ except re.error:
543
+ return False
544
+
545
+ return True
546
+
547
+
548
def _extract_token_from_request(
    request,
    rules: list[TokenCaptureRule],
) -> tuple[str, str] | None:
    """Apply *rules* in order and return the first non-empty token found.

    Returns:
        ``(token, description)`` where the description names the rule and the
        request it matched, or ``None`` when no rule yields a value.
    """
    if not rules:
        return None

    route = _request_route_info(request)
    if route is None:
        return None
    host, path, method = route

    # Headers and body are read at most once, and only if a rule needs them.
    cached_headers: dict[str, str] | None = None
    cached_body: str | None = None

    for rule in rules:
        if not _rule_matches_request(rule, host=host, path=path, method=method):
            continue

        kind = rule.source.lower()
        if kind == "request.header":
            if cached_headers is None:
                cached_headers = _request_headers_safe(request)
            raw = _header_value(cached_headers, rule.key)
        elif kind == "request.form":
            if cached_body is None:
                cached_body = _request_post_data_safe(request)
            raw = _form_value(cached_body, rule.key)
        else:
            # Unknown source kinds never produce a value.
            raw = None

        if raw is None:
            continue

        prefix = rule.strip_prefix
        token = raw[len(prefix) :] if prefix and raw.startswith(prefix) else raw
        if not token:
            continue
        return token, f"{_token_rule_label(rule)} <= {_request_label(request)}"

    return None
587
+
588
+
589
def _format_capture_debug_line(
    *,
    required: list[str],
    have_cookies: list[str],
    missing_cookies: list[str],
    captured_token: str | None,
    captured_token_source: str | None,
    token_candidate_count: int,
    last_token_candidate: str | None,
    tab_urls: list[str],
) -> str:
    """Render one progress line (cookies | token | tabs) for capture debug output."""
    if required:
        have_txt = ",".join(have_cookies) if have_cookies else "-"
        missing_txt = ",".join(missing_cookies) if missing_cookies else "-"
        cookies_summary = (
            f"cookies {len(have_cookies)}/{len(required)} "
            f"have=[{have_txt}] missing=[{missing_txt}]"
        )
    else:
        cookies_summary = "cookies n/a"

    if captured_token:
        token_summary = "token=present" + (
            f" ({captured_token_source})" if captured_token_source else ""
        )
    else:
        token_summary = f"token=missing candidates={token_candidate_count}" + (
            f" last={last_token_candidate}" if last_token_candidate else ""
        )

    tabs_summary = f"tabs={len(tab_urls)} " + "; ".join(tab_urls if tab_urls else ["<none>"])
    return f"{cookies_summary} | {token_summary} | {tabs_summary}"


async def _capture_auth_once(
    playwright_async_api,
    domain: str,
    required_cookies: list[str],
    token_rules: list[TokenCaptureRule],
    poll_seconds: float = 1.0,
    debug_cb: Callable[[str], None] | None = None,
    cdp_url: str | None = None,
) -> tuple[dict[str, str], str | None]:
    """Open ``https://<domain>`` and wait until the required auth values appear.

    A new page is opened in the browser context and every request made in
    that context is checked against *token_rules*. The context's cookies are
    polled every *poll_seconds*. The call returns once all required cookies
    exist and, if any rules were given, a token has been captured. There is
    no overall timeout; the caller is expected to cancel.

    Cleanup runs on every exit path, including errors and cancellation. A
    browser launched here is closed; for a CDP-attached browser only the page
    opened here is closed. Cleanup failures are ignored so they cannot
    discard credentials that were already captured.

    Args:
        playwright_async_api: The imported ``playwright.async_api`` module.
        domain: Host to open over HTTPS.
        required_cookies: Cookie names that must be present; empty names are
            ignored.
        token_rules: Rules for extracting a token from requests.
        poll_seconds: Delay between cookie polls.
        debug_cb: Optional callback for diagnostic messages.
        cdp_url: Attach to this CDP endpoint instead of launching a browser.

    Returns:
        ``(cookies, token)`` where ``cookies`` maps each required name to its
        value and ``token`` is ``None`` when no token rules were given.

    Raises:
        BrowserLoginError: if neither cookie names nor token rules are given.
    """
    required = [c for c in required_cookies if c]
    if not required and not token_rules:
        raise BrowserLoginError(
            "No required cookie keys or token capture rules configured for browser login"
        )

    async with playwright_async_api.async_playwright() as p:
        browser, context, browser_profile, managed_browser = await _open_browser_and_context(
            p,
            debug_cb=debug_cb,
            cdp_url=cdp_url,
        )
        page = None
        try:
            await _apply_stealth_init_script(context)
            if debug_cb:
                _emit_debug(debug_cb, f"context ready ({browser_profile}, viewport=1280x900)")

            captured_token: str | None = None
            captured_token_source: str | None = None
            token_candidate_count = 0
            last_token_candidate: str | None = None
            last_debug_state: tuple | None = None

            if debug_cb:
                if required:
                    _emit_debug(debug_cb, f"required cookies: {', '.join(required)}")
                else:
                    _emit_debug(debug_cb, "required cookies: none")
                if token_rules:
                    for idx, rule in enumerate(token_rules, start=1):
                        _emit_debug(debug_cb, f"token rule[{idx}]: {_token_rule_label(rule)}")
                else:
                    _emit_debug(debug_cb, "token capture: none")

            def _handle_request(request) -> None:
                nonlocal captured_token
                nonlocal captured_token_source
                nonlocal token_candidate_count
                nonlocal last_token_candidate
                # The first captured token wins; later requests are ignored.
                if captured_token is not None:
                    return
                try:
                    if _request_matches_any_rule(request, token_rules):
                        token_candidate_count += 1
                        last_token_candidate = _request_label(request)
                    token_match = _extract_token_from_request(request, token_rules)
                    if token_match:
                        captured_token, captured_token_source = token_match
                        if debug_cb:
                            _emit_debug(debug_cb, f"captured token via {captured_token_source}")
                except Exception:
                    # Never let Playwright request events crash the login flow.
                    return

            context.on("request", _handle_request)
            page = await context.new_page()
            await page.goto(f"https://{domain}", wait_until="domcontentloaded")

            while True:
                cookies = await context.cookies()
                by_name = {
                    str(c.get("name", "")): str(c.get("value", ""))
                    for c in cookies
                    if c.get("name")
                }
                have_cookies = [name for name in required if name in by_name]
                missing_cookies = [name for name in required if name not in by_name]

                if debug_cb:
                    tab_urls: list[str] = []
                    for pg in list(context.pages):
                        try:
                            tab_urls.append(_short_url(str(pg.url or "")))
                        except Exception:
                            tab_urls.append("<unknown>")

                    # Only emit a line when something observable has changed.
                    debug_state = (
                        tuple(have_cookies),
                        tuple(missing_cookies),
                        bool(captured_token),
                        captured_token_source or "",
                        token_candidate_count,
                        last_token_candidate or "",
                        tuple(tab_urls),
                    )
                    if debug_state != last_debug_state:
                        _emit_debug(
                            debug_cb,
                            _format_capture_debug_line(
                                required=required,
                                have_cookies=have_cookies,
                                missing_cookies=missing_cookies,
                                captured_token=captured_token,
                                captured_token_source=captured_token_source,
                                token_candidate_count=token_candidate_count,
                                last_token_candidate=last_token_candidate,
                                tab_urls=tab_urls,
                            ),
                        )
                        last_debug_state = debug_state

                token_ready = (not token_rules) or (captured_token is not None)
                if not missing_cookies and token_ready:
                    return {name: by_name[name] for name in required}, captured_token
                await asyncio.sleep(poll_seconds)
        finally:
            # Best-effort cleanup on success, failure and cancellation alike.
            try:
                if managed_browser:
                    await browser.close()
                elif page is not None:
                    await page.close()
            except Exception:
                pass
731
+
732
+
733
def capture_auth_with_browser(
    domain: str,
    required_cookies: list[str],
    token_rules: list[TokenCaptureRule] | None = None,
    status_cb: Callable[[str], None] | None = None,
    debug_cb: Callable[[str], None] | None = None,
    cdp_url: str | None = None,
    cdp_auto: bool = False,
    cdp_port: int | None = None,
    chrome_path: str | None = None,
) -> tuple[dict[str, str], str | None]:
    """Open browser and wait until required auth values are present.

    Browser selection:
      * With *cdp_url* given, that endpoint is used as-is.
      * Otherwise a local Chrome is auto-started over CDP. If that fails, the
        error is raised when *cdp_auto* is set; without it the flow falls
        back to a Playwright-launched browser.

    If the capture fails because the browser binary is missing, Playwright's
    Chromium is installed and the capture is retried once. Any auto-started
    Chrome is always stopped before returning.

    Args:
        domain: Host to open over HTTPS.
        required_cookies: Cookie names that must be present.
        token_rules: Optional rules for capturing a token from requests.
        status_cb: Optional callback for user-facing progress messages.
        debug_cb: Optional callback for diagnostic messages.
        cdp_url: Existing CDP endpoint to attach to.
        cdp_auto: Require the auto-started local Chrome (no fallback).
        cdp_port: Port for the auto-started Chrome.
        chrome_path: Browser binary for the auto-started Chrome.

    Returns:
        ``(cookies, token)`` as produced by the capture loop.

    Raises:
        BrowserLoginCancelled: on ``KeyboardInterrupt`` at any point of the
            flow, including during the Chromium install.
        BrowserLoginError: for every other failure; the original exception is
            attached as ``__cause__``.
    """
    normalized_rules = token_rules or []
    playwright_async_api = _ensure_playwright_package(status_cb)
    auto_session: AutoCdpSession | None = None

    def _run_capture(active_cdp_url: str | None) -> tuple[dict[str, str], str | None]:
        return asyncio.run(
            _capture_auth_once(
                playwright_async_api,
                domain,
                required_cookies,
                normalized_rules,
                debug_cb=debug_cb,
                cdp_url=active_cdp_url,
            )
        )

    try:
        # Default behavior: transparently prefer local Chrome via CDP when URL isn't explicit.
        if cdp_url is None:
            try:
                auto_session = _start_auto_cdp_chrome(
                    status_cb=status_cb,
                    debug_cb=debug_cb,
                    chrome_path=chrome_path,
                    port=cdp_port,
                )
            except BrowserLoginError as e:
                if cdp_auto:
                    raise
                # Fallback for default --browser mode.
                _emit(status_cb, "Local browser unavailable, falling back to embedded browser...")
                _emit_debug(debug_cb, f"cdp auto unavailable, fallback to playwright: {e}")
            else:
                cdp_url = auto_session.cdp_url
                _emit_debug(debug_cb, f"cdp auto ready: {cdp_url}")

        try:
            return _run_capture(cdp_url)
        except Exception as e:
            if not _is_missing_browser_error(e):
                raise
            # The browser binary is missing: install it once and retry.
            _install_chromium(status_cb)
            return _run_capture(cdp_url)
    except KeyboardInterrupt:
        raise BrowserLoginCancelled("Login cancelled by user") from None
    except BrowserLoginError:
        raise
    except Exception as e:
        raise BrowserLoginError(str(e)) from e
    finally:
        if auto_session is not None:
            _stop_auto_cdp_session(auto_session)
806
+
807
+
808
def capture_cookies_with_browser(
    domain: str,
    required_cookies: list[str],
    status_cb: Callable[[str], None] | None = None,
) -> dict[str, str]:
    """Cookie-only convenience wrapper kept for backward compatibility.

    Runs the full capture with no token rules and discards the token slot.
    """
    result = capture_auth_with_browser(
        domain=domain,
        required_cookies=required_cookies,
        token_rules=[],
        status_cb=status_cb,
    )
    return result[0]