@music-league-eras/local-runner 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,267 @@
1
+ import asyncio
2
+ import json
3
+ import re
4
+ from dataclasses import dataclass
5
+ from datetime import datetime
6
+ from pathlib import Path
7
+ from typing import Any, cast
8
+ from urllib.parse import urljoin, urlparse
9
+
10
+ from playwright.async_api import async_playwright
11
+
12
+ from ..services.viewer import extract_viewer_avatar_url, extract_viewer_user_id
13
+
14
+
15
@dataclass(frozen=True)
class CaptureResult:
    """Outcome of an interactive login-capture session.

    Bundles the Playwright storage state (cookies + origins) with the on-disk
    artifact paths and any viewer identity hints extracted from the page HTML.
    """

    # Playwright storage-state dict captured from the browser context.
    storage_state: dict[str, Any]
    # Where the storage-state JSON was written; None if persistence was skipped.
    storage_state_path: str | None
    # Where the full-page screenshot was written; None if not requested.
    screenshot_path: str | None
    # Viewer user id parsed from page HTML; None if extraction failed.
    viewer_user_id: str | None
    # Viewer avatar URL parsed from page HTML; None if extraction failed.
    viewer_avatar_url: str | None
22
+
23
+
24
+ def _ensure_parent(path: Path) -> None:
25
+ path.parent.mkdir(parents=True, exist_ok=True)
26
+
27
+
28
+ def _build_target(base_url: str) -> str:
29
+ return urljoin(base_url, "/completed/")
30
+
31
+
32
+ def _cookie_domain_matches(domain: str, host: str | None) -> bool:
33
+ if not domain or not host:
34
+ return False
35
+ normalized = domain.lstrip(".")
36
+ return host == normalized or host.endswith(f".{normalized}")
37
+
38
+
39
def _is_music_league_host(host: str | None, base_url: str) -> bool:
    """Return True when *host* belongs to Music League or to *base_url*'s domain."""
    if not host:
        return False
    configured = urlparse(base_url).hostname
    if configured and _cookie_domain_matches(configured, host):
        return True
    # Fall back to the well-known production domain.
    return host == "musicleague.com" or host.endswith(".musicleague.com")
46
+
47
+
48
def _has_base_cookie(storage_state: dict[str, Any], base_url: str) -> bool:
    """Return True when *storage_state* holds any cookie scoped to *base_url*'s host."""
    host = urlparse(base_url).hostname
    if not host:
        return False
    cookies = storage_state.get("cookies", [])
    return any(
        _cookie_domain_matches(str(cookie.get("domain") or ""), host)
        for cookie in cookies
    )
57
+
58
+
59
+ def _is_missing_playwright_browser_error(exc: BaseException) -> bool:
60
+ message = str(exc)
61
+ return (
62
+ "Executable doesn't exist" in message
63
+ or "Looks like Playwright was just installed or updated" in message
64
+ or "playwright install" in message
65
+ )
66
+
67
+
68
+ def _missing_playwright_browser_help() -> str:
69
+ return (
70
+ "Playwright browser binaries are not installed for this environment.\n\n"
71
+ "Fix (repo venv):\n"
72
+ " cd services/scraper\n"
73
+ " .venv/bin/python -m playwright install chromium\n"
74
+ "\n"
75
+ "If you prefer, this also works:\n"
76
+ " cd services/scraper\n"
77
+ " .venv/bin/playwright install chromium\n"
78
+ )
79
+
80
+
81
async def _is_login_prompt_visible(page) -> bool:
    """Heuristically decide whether *page* is currently showing a login prompt.

    Combines three signals: the current URL (Spotify OAuth domain or a known
    login path), visible login CTA buttons/links, and login-related wording in
    the rendered body text. Every Playwright/JS failure is treated as "signal
    absent" rather than an error, because this runs inside a polling loop
    while the page may be navigating.
    """
    # Button/link labels that indicate an unauthenticated state.
    login_ctas = [
        "Log in",
        "Login",
        "Sign in",
        "Sign up",
        "Create account",
        "Continue with Spotify",
        "Log in with Spotify",
        "Sign in with Spotify",
    ]
    # Keywords checked in the body text alongside the word "spotify".
    spotify_hints = [
        "spotify",
        "log in",
        "login",
        "sign in",
        "sign up",
        "continue",
        "create account",
    ]
    try:
        parsed = urlparse(page.url)
        # Being on any spotify.com host means we are mid-OAuth → login flow.
        if "spotify.com" in (parsed.hostname or ""):
            return True
        # Dedicated login paths on the app itself.
        if parsed.path in {"/login", "/sign-in", "/signin", "/auth"}:
            return True
    except Exception:
        pass
    for label in login_ctas:
        try:
            if await page.get_by_role("button", name=label).count():
                return True
        except Exception:
            pass
        try:
            if await page.get_by_role("link", name=label).count():
                return True
        except Exception:
            pass
    try:
        # Fallback: scan the rendered body text for login-ish wording.
        body_text = await page.evaluate("document.body ? document.body.innerText : ''")
        if isinstance(body_text, str):
            lowered = body_text.lower()
            # NOTE(review): "spotify" itself is in spotify_hints, so once the
            # left-hand check passes the any(...) is always True — confirm
            # whether a stricter second condition was intended.
            if "spotify" in lowered and any(hint in lowered for hint in spotify_hints):
                return True
    except Exception:
        pass
    return False
129
+
130
+
131
async def _is_capture_ready(page, base_url: str) -> bool:
    """True when *page* sits on a Music League host with no login UI showing."""
    try:
        current_host = urlparse(page.url).hostname
    except Exception:
        # page.url can be unavailable mid-navigation; treat as not ready.
        current_host = None
    if not _is_music_league_host(current_host, base_url):
        return False
    # A visible login prompt means the session is not authenticated yet.
    return not await _is_login_prompt_visible(page)
141
+
142
+
143
async def _has_logout_link(page) -> bool:
    """Best-effort check for a visible sign-out affordance on *page*."""
    for label in ("Sign Out", "Log out", "Logout"):
        try:
            if await page.get_by_role("link", name=label).count():
                return True
        except Exception:
            # Role queries can fail while the page is navigating; ignore.
            pass
    try:
        # Fallback: any anchor whose href mentions /logout.
        return bool(await page.locator('a[href*="/logout"]').count())
    except Exception:
        return False
157
+
158
+
159
async def _advance_login_flow(page) -> bool:
    """Click the most likely login/import CTA on *page*, if one exists.

    Tries a prioritized list of labels against buttons first, then against
    links (same priority order as before — the original duplicated the two
    loops verbatim; they are folded into one role loop here). Returns True
    when something was clicked, so the caller should poll again; False when
    no CTA was found. All Playwright errors are swallowed: this is a
    best-effort nudge inside a polling loop.
    """
    patterns = [
        re.compile(r"^import$", re.I),
        re.compile(r"import", re.I),
        re.compile(r"log in with spotify", re.I),
        re.compile(r"continue with spotify", re.I),
        re.compile(r"sign in with spotify", re.I),
        re.compile(r"log in", re.I),
        re.compile(r"sign in", re.I),
    ]
    try:
        # Buttons take priority over links, matching typical login layouts.
        for role in ("button", "link"):
            for pattern in patterns:
                locator = page.get_by_role(role, name=pattern)
                if await locator.count():
                    await locator.first.click()
                    try:
                        await page.wait_for_load_state("domcontentloaded", timeout=3000)
                    except Exception:
                        # The click may not navigate (e.g. SPA); keep going.
                        pass
                    return True
    except Exception:
        # Page may have navigated/closed mid-query; report "no progress".
        pass
    return False
191
+
192
+
193
async def capture_storage_state(
    *,
    base_url: str,
    headless: bool,
    storage_state_path: str | None,
    screenshot_path: str | None,
) -> CaptureResult:
    """Drive an interactive browser login and capture Playwright storage state.

    Opens ``<base_url>/completed/``, nudges the login flow along whenever a
    login prompt is visible, and polls until the page is on a Music League
    host with no login UI and the context holds a cookie for the base host.
    The state is then optionally persisted to *storage_state_path*, a full
    page screenshot optionally written to *screenshot_path*, and a
    CaptureResult returned.

    Raises RuntimeError with install instructions when the Playwright
    browser binaries are missing.
    """
    target = _build_target(base_url)
    async with async_playwright() as p:
        try:
            browser = await p.chromium.launch(headless=headless)
        except Exception as exc:
            # Translate the common "binaries not installed" failure into an
            # actionable message; re-raise anything else untouched.
            if _is_missing_playwright_browser_error(exc):
                raise RuntimeError(_missing_playwright_browser_help()) from exc
            raise
        context = await browser.new_context()
        page = await context.new_page()
        # Rate limiter: click login CTAs at most once per second so we do not
        # fight the user while they interact with the page.
        last_advance = 0.0
        try:
            await page.goto(target, wait_until="domcontentloaded")

            while True:
                if page.url.startswith(base_url) and await _is_login_prompt_visible(page):
                    now = asyncio.get_running_loop().time()
                    if now - last_advance > 1.0:
                        if await _advance_login_flow(page):
                            last_advance = now
                # Capture as soon as we have valid Music League cookies, even if the user was
                # already logged in and never hit the Spotify auth domain in this run.
                if await _is_capture_ready(page, base_url):
                    storage_state = cast(dict[str, Any], await context.storage_state())
                    if _has_base_cookie(storage_state, base_url):
                        if storage_state_path:
                            path = Path(storage_state_path)
                            _ensure_parent(path)
                            path.write_text(json.dumps(storage_state, indent=2), encoding="utf-8")
                        if screenshot_path:
                            shot_path = Path(screenshot_path)
                            _ensure_parent(shot_path)
                            await page.screenshot(path=str(shot_path), full_page=True)
                        viewer_user_id = None
                        viewer_avatar_url = None
                        try:
                            # Best-effort: viewer identity is a nice-to-have.
                            html = await page.content()
                            viewer_user_id = extract_viewer_user_id(html)
                            viewer_avatar_url = extract_viewer_avatar_url(html)
                        except Exception:
                            viewer_user_id = None
                            viewer_avatar_url = None
                        return CaptureResult(
                            storage_state=storage_state,
                            storage_state_path=storage_state_path,
                            screenshot_path=screenshot_path,
                            viewer_user_id=viewer_user_id,
                            viewer_avatar_url=viewer_avatar_url,
                        )
                # Poll interval while waiting for the user to finish login.
                await asyncio.sleep(0.5)
        finally:
            await context.close()
            await browser.close()

    # NOTE(review): the while-loop above only exits via return or an
    # exception (e.g. user closing the browser), so this raise looks
    # unreachable — confirm whether a `break` path was intended.
    raise RuntimeError("Login capture aborted before storage state was saved.")
255
+
256
+
257
def build_storage_state_path(base_dir: str, session_id: str) -> str:
    """Build a dated path for a session's storage-state JSON under *base_dir*.

    Shape: ``<base_dir>/<YYYY-MM-DD>/storage_state_<session_id>.json`` using
    the local date for the day bucket.
    """
    day_bucket = datetime.now().strftime("%Y-%m-%d")
    return str(Path(base_dir) / day_bucket / f"storage_state_{session_id}.json")
261
+
262
+
263
def build_screenshot_path(base_dir: str, session_id: str) -> str:
    """Build a dated, timestamped path for the login screenshot.

    Shape: ``<base_dir>/<YYYY-MM-DD>/screenshots/login_<session_id>_<ts>.png``.
    Both the timestamp and the date bucket come from a single clock read so
    the filename and its directory cannot disagree across a midnight boundary
    (the original called ``datetime.now()`` twice).
    """
    now = datetime.now()
    ts = now.strftime("%Y%m%d_%H%M%S")
    filename = f"login_{session_id}_{ts}.png"
    dated_dir = now.strftime("%Y-%m-%d")
    return str(Path(base_dir) / dated_dir / "screenshots" / filename)
@@ -0,0 +1,112 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import asyncio
5
+ import os
6
+ from datetime import datetime, timezone
7
+ from pathlib import Path
8
+
9
+ from .local_sync_runner import run_local_sync
10
+
11
+
12
+ def _env_bool(key: str, default: bool) -> bool:
13
+ raw = os.getenv(key)
14
+ if raw is None:
15
+ return default
16
+ return raw.strip().lower() == "true"
17
+
18
+
19
+ def _default_music_league_base_url() -> str:
20
+ return os.getenv("MUSIC_LEAGUE_BASE_URL", "https://app.musicleague.com")
21
+
22
+
23
def _default_capture_headless() -> bool:
    """Default for `--capture-headless`.

    Reads SCRAPER_CAPTURE_HEADLESS, falling back to SCRAPER_HEADLESS, then
    False — login capture is interactive by default.
    """
    # Keep parity with `services/scraper/app/core/config.py`.
    return _env_bool("SCRAPER_CAPTURE_HEADLESS", _env_bool("SCRAPER_HEADLESS", False))
26
+
27
+
28
def _default_scrape_headless() -> bool:
    """Default for `--scrape-headless`.

    Reads SCRAPER_SCRAPE_HEADLESS, falling back to SCRAPER_HEADLESS, then
    True — scraping needs no visible window by default.
    """
    # Keep parity with `services/scraper/app/core/config.py`.
    return _env_bool("SCRAPER_SCRAPE_HEADLESS", _env_bool("SCRAPER_HEADLESS", True))
31
+
32
+
33
+ def _default_runs_dir() -> str:
34
+ return os.getenv("SCRAPER_RUNS_DIR", "artifacts/scraper_runs")
35
+
36
+
37
def _default_runner_out_dir(sync_session_id: str) -> Path:
    """Default artifact directory: ``<runs_dir>/<UTC date>/<session id>``."""
    today_utc = datetime.now(tz=timezone.utc).strftime("%Y-%m-%d")
    return Path(_default_runs_dir()) / today_utc / sync_session_id
40
+
41
+
42
def build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser.

    Currently exposes a single required subcommand, `local-sync`, whose
    defaults mirror the scraper service's environment configuration.
    """
    parser = argparse.ArgumentParser(description="Music League Eras local runner (vendored entrypoint).")
    subparsers = parser.add_subparsers(dest="command", required=True)

    local_sync_parser = subparsers.add_parser(
        "local-sync",
        help="Run local login + scrape, then upload manifest to the API using a sync token.",
    )
    local_sync_parser.add_argument("--api-base-url", required=True, help="API base URL (e.g. http://localhost:8000)")
    local_sync_parser.add_argument("--sync-session-id", required=True, help="Local sync session id")
    local_sync_parser.add_argument("--sync-token", required=True, help="Local sync token (secret)")
    local_sync_parser.add_argument(
        "--music-league-base-url",
        default=_default_music_league_base_url(),
        help="Music League base URL",
    )
    # BooleanOptionalAction provides paired --flag / --no-flag switches.
    local_sync_parser.add_argument(
        "--capture-headless",
        default=_default_capture_headless(),
        action=argparse.BooleanOptionalAction,
        help="Run the login capture headless",
    )
    local_sync_parser.add_argument(
        "--scrape-headless",
        default=_default_scrape_headless(),
        action=argparse.BooleanOptionalAction,
        help="Run the scrape headless",
    )
    local_sync_parser.add_argument(
        "--out-dir",
        type=Path,
        default=None,
        help="Output directory for artifacts (defaults under SCRAPER_RUNS_DIR)",
    )
    local_sync_parser.add_argument(
        "--timeout-s",
        type=int,
        default=60,
        help="HTTP timeout seconds for upload (default: 60)",
    )

    return parser
84
+
85
+
86
def main(argv: list[str] | None = None) -> None:
    """CLI entry point: parse arguments and run the requested command."""
    args = build_parser().parse_args(argv)

    if args.command != "local-sync":
        # Defensive: subparsers are required, so argparse should prevent this.
        raise SystemExit(f"Unknown command: {args.command}")

    artifacts_dir = args.out_dir or _default_runner_out_dir(args.sync_session_id)
    result = asyncio.run(
        run_local_sync(
            api_base_url=args.api_base_url,
            sync_session_id=args.sync_session_id,
            sync_token=args.sync_token,
            music_league_base_url=args.music_league_base_url,
            capture_headless=args.capture_headless,
            scrape_headless=args.scrape_headless,
            out_dir=artifacts_dir,
            timeout_s=args.timeout_s,
        )
    )
    print(f"Manifest: {result.manifest_path}")
    print(f"Upload: {result.upload_response}")
108
+
109
+
110
if __name__ == "__main__":
    # Allow direct execution (`python -m ...`) of this vendored entrypoint.
    main()
112
+
@@ -0,0 +1,99 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import json
5
+ import urllib.error
6
+ import urllib.request
7
+ from dataclasses import dataclass
8
+ from datetime import datetime, timezone
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from .browser.session import capture_storage_state
13
+ from .services.scrape_manifest import build_manifest
14
+
15
+
16
@dataclass(frozen=True)
class LocalSyncRunnerResult:
    """Result of a completed local sync run."""

    # Path of the manifest JSON produced by the scrape step.
    manifest_path: str
    # Parsed JSON body returned by the API upload endpoint.
    upload_response: dict[str, Any]
20
+
21
+
22
+ def _now_iso() -> str:
23
+ return datetime.now(tz=timezone.utc).strftime("%Y-%m-%dT%H-%M-%SZ")
24
+
25
+
26
def _post_json(*, url: str, headers: dict[str, str], payload: dict[str, Any], timeout_s: int) -> dict[str, Any]:
    """POST *payload* as JSON to *url* and return the parsed JSON response.

    Extra *headers* are layered on top of content-type/accept. An empty
    response body yields {}; a non-dict JSON value is wrapped as {"_": value}.
    HTTP and network failures are re-raised as RuntimeError with a readable
    message (preferring the response body as detail).
    """
    encoded = json.dumps(payload).encode("utf-8")
    request = urllib.request.Request(url, method="POST", data=encoded)
    request.add_header("content-type", "application/json")
    request.add_header("accept", "application/json")
    for name, value in headers.items():
        request.add_header(name, value)
    try:
        with urllib.request.urlopen(request, timeout=timeout_s) as response:  # noqa: S310 - URL is user-provided
            text = response.read().decode("utf-8", errors="replace")
    except urllib.error.HTTPError as exc:
        try:
            detail = exc.read().decode("utf-8", errors="replace")
        except Exception:
            detail = ""
        message = detail or (exc.reason if isinstance(exc.reason, str) else "HTTP error")
        raise RuntimeError(f"Upload failed ({exc.code}): {message}") from exc
    except urllib.error.URLError as exc:
        raise RuntimeError(f"Upload failed (network): {exc.reason}") from exc
    if not text:
        return {}
    parsed = json.loads(text)
    return parsed if isinstance(parsed, dict) else {"_": parsed}
51
+
52
+
53
async def run_local_sync(
    *,
    api_base_url: str,
    sync_session_id: str,
    sync_token: str,
    music_league_base_url: str,
    capture_headless: bool,
    scrape_headless: bool,
    out_dir: Path,
    timeout_s: int,
) -> LocalSyncRunnerResult:
    """Run the full local sync pipeline: login capture → scrape → upload.

    Captures a Music League login interactively, builds the scrape manifest
    from the captured storage state, then POSTs the manifest to the API's
    local-sync upload endpoint authenticated by *sync_token*. Artifacts
    (storage state JSON, login screenshot, manifest) land under *out_dir*.

    Raises RuntimeError on upload failure (propagated from `_post_json`).
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    # Single clock read so both artifact names share one timestamp — the
    # original called _now_iso() twice and could straddle a second boundary,
    # yielding mismatched storage-state/screenshot names.
    stamp = _now_iso()
    storage_state_path = str(out_dir / f"storage_state_{stamp}.json")
    screenshot_path = str(out_dir / f"login_{stamp}.png")

    print("Local sync: opening a browser window to capture login (close it once the command finishes).")
    capture = await capture_storage_state(
        base_url=music_league_base_url,
        headless=capture_headless,
        storage_state_path=storage_state_path,
        screenshot_path=screenshot_path,
    )

    print("Local sync: login captured, scraping your Music League history…")
    manifest, manifest_path = await build_manifest(
        storage_state=capture.storage_state,
        storage_state_path=None,
        base_url=music_league_base_url,
        output_dir=str(out_dir),
        headless=scrape_headless,
    )

    print("Local sync: uploading manifest to the API…")
    upload_url = (
        api_base_url.rstrip("/")
        + f"/api/profile/music-league/local-sync/sessions/{sync_session_id}/upload"
    )
    upload_payload = {"manifest": manifest}

    # urllib is blocking; run it off the event loop.
    upload_response = await asyncio.to_thread(
        _post_json,
        url=upload_url,
        headers={"x-sync-token": sync_token},
        payload=upload_payload,
        timeout_s=timeout_s,
    )
    return LocalSyncRunnerResult(manifest_path=manifest_path, upload_response=upload_response)
File without changes