@music-league-eras/local-runner 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,774 @@
+ from __future__ import annotations
+
+ import asyncio
+ import hashlib
+ import json
+ import re
+ import time
+ import urllib.request
+ from datetime import datetime, timezone
+ from pathlib import Path
+ from typing import Any, Optional
+ from urllib.parse import urljoin, urlparse
+
+ from bs4 import BeautifulSoup
+ from playwright.async_api import async_playwright
+
+ from .viewer import extract_viewer_user_id
+
+ COMPLETED_LEAGUES_XHR = "/completed/-/completedLeagues"
+ DEFAULT_USER_AGENT = (
+     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+     "AppleWebKit/537.36 (KHTML, like Gecko) "
+     "Chrome/123.0.0.0 Safari/537.36"
+ )
+
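+ # ID/metadata patterns for Music League markup: league and user IDs are
+ # 32-char hex slugs in "/l/<id>/" and "/user/<id>/" hrefs (and in user-asset
+ # URLs), round numbers come from "ROUND <n>" labels, completion timestamps
+ # from inline dateFns.parse('...') calls, and ranks from ordinals like "1st".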
+ LEAGUE_HREF_RE = re.compile(r"^/l/([0-9a-f]{32})/?$", re.I)
+ ROUND_NO_RE = re.compile(r"ROUND\s+(\d+)", re.IGNORECASE)
+ ISO_TS_RE = re.compile(r"dateFns\.parse\('([^']+)'\)")
+ USER_HREF_RE = re.compile(r"^/user/([0-9a-f]{32})/?$", re.I)
+ USER_ASSET_RE = re.compile(r"/users/([0-9a-f]{32})/", re.I)
+ ORDINAL_RE = re.compile(r"^\d+(st|nd|rd|th)$", re.IGNORECASE)
+
+
+ def _abs_url(base_url: str, href: str) -> str:
+     return urljoin(base_url, href)
+
+
+ def _ensure_parent(path: Path) -> None:
+     path.parent.mkdir(parents=True, exist_ok=True)
+
+
+ def _write_text(path: Path, content: str) -> None:
+     _ensure_parent(path)
+     path.write_text(content, encoding="utf-8")
+
+
+ def _parse_user_id_from_href(href: Optional[str]) -> Optional[str]:
+     if not href:
+         return None
+     match = USER_HREF_RE.match(href.strip())
+     return match.group(1) if match else None
+
+
+ def _parse_user_id_from_img_src(src: Optional[str]) -> Optional[str]:
+     if not src:
+         return None
+     match = USER_ASSET_RE.search(src)
+     return match.group(1) if match else None
+
+
+ def _parse_int(value: Optional[str]) -> Optional[int]:
+     if value is None:
+         return None
+     cleaned = re.sub(r"[^\d\-]", "", str(value).strip())
+     if cleaned in ("", "-"):
+         return None
+     try:
+         return int(cleaned)
+     except ValueError:
+         return None
+
+
+ def _iso_to_date_only(ts: Optional[str]) -> Optional[str]:
+     return ts[:10] if ts and len(ts) >= 10 else None
+
+
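+ # Parse the completed-leagues fragment into a list of league dicts keyed by the
+ # 32-char league ID. Prefers ".league-tile" markup and falls back to bare
+ # "/l/<id>" anchors when the wrapper class is absent.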
+ def _extract_completed_leagues(fragment_html: str, base_url: str) -> list[dict[str, Any]]:
+     soup = BeautifulSoup(fragment_html, "html.parser")
+     leagues: dict[str, dict[str, Any]] = {}
+
+     for tile in soup.select(".league-tile"):
+         link = tile.select_one('a[href^="/l/"]')
+         if not link:
+             continue
+         href = (link.get("href") or "").strip()
+         match = LEAGUE_HREF_RE.match(href)
+         if not match:
+             continue
+
+         league_id = match.group(1)
+         title = (link.get_text(" ", strip=True) or "").strip()
+
+         img = tile.select_one('img[alt="League image"]')
+         src = (img.get("src") or "").strip() if img else ""
+         league_image_url = None
+         if src:
+             league_image_url = _abs_url(base_url, src) if src.startswith("/") else src
+
+         leagues[league_id] = {
+             "league_id": league_id,
+             "league_href": href if href.endswith("/") else f"{href}/",
+             "league_url": _abs_url(base_url, href),
+             "league_title": title,
+             "league_image_url": league_image_url,
+         }
+
+     if leagues:
+         return list(leagues.values())
+
+     # Fallback: older tiles without `.league-tile` wrapper.
+     for link in soup.select('a[href^="/l/"]'):
+         href = link.get("href", "")
+         match = LEAGUE_HREF_RE.match(href)
+         if not match:
+             continue
+         league_id = match.group(1)
+         title = (link.get_text(" ", strip=True) or "").strip()
+         leagues[league_id] = {
+             "league_id": league_id,
+             "league_href": href if href.endswith("/") else f"{href}/",
+             "league_url": _abs_url(base_url, href),
+             "league_title": title,
+             "league_image_url": None,
+         }
+
+     return list(leagues.values())
+
+
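+ # Parse a league's rounds fragment into round dicts: number, title, description,
+ # completion date, playlist link, and the per-round results XHR URL, sorted by
+ # round number (rounds without a number sort last).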
+ def _extract_completed_rounds(league_id: str, rounds_html: str, base_url: str) -> list[dict[str, Any]]:
+     soup = BeautifulSoup(rounds_html, "html.parser")
+
+     header = soup.find(
+         lambda tag: tag.name in ("h3", "h4", "h5")
+         and tag.get_text(strip=True).lower() == "completed rounds"
+     )
+     container = header.find_parent() if header else soup
+
+     rounds: list[dict[str, Any]] = []
+
+     for item in container.select(".league-round-item"):
+         round_id = (item.get("id") or "").strip()
+         if not round_id:
+             continue
+
+         round_no = None
+         span = item.find("span", string=lambda s: bool(s and "ROUND" in s.upper()))
+         if span:
+             match = ROUND_NO_RE.search(span.get_text(" ", strip=True))
+             if match:
+                 round_no = int(match.group(1))
+
+         title_tag = item.select_one("h5.card-title") or item.find("h5")
+         round_title = title_tag.get_text(" ", strip=True) if title_tag else ""
+
+         desc_tag = item.select_one("p[data-description]")
+         round_description = desc_tag.get("data-description") if desc_tag else None
+
+         ts = None
+         match = ISO_TS_RE.search(str(item))
+         if match:
+             ts = match.group(1)
+         completed_date_utc = _iso_to_date_only(ts)
+
+         playlist_tag = item.select_one('a[href^="https://open.spotify.com/playlist/"]')
+         playlist_url = playlist_tag.get("href") if playlist_tag else None
+
+         results_href = None
+         for link in item.select('a[href^="/l/"]'):
+             if "RESULTS" in (link.get_text(" ", strip=True) or "").upper():
+                 results_href = link.get("href")
+                 break
+         results_url = _abs_url(base_url, results_href) if results_href else None
+         results_xhr_url = _abs_url(base_url, f"/l/{league_id}/{round_id}/-/results")
+
+         rounds.append(
+             {
+                 "round_id": round_id,
+                 "round_no": round_no,
+                 "round_title": round_title,
+                 "round_description": round_description,
+                 "completed_date_utc": completed_date_utc,
+                 "playlist_url": playlist_url,
+                 "results_url": results_url,
+                 "results_xhr_url": results_xhr_url,
+             }
+         )
+
+     rounds.sort(key=lambda entry: (entry["round_no"] is None, entry["round_no"] or 0))
+     return rounds
+
+
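+ # Parse one round's results fragment. Each submission container is located via
+ # its Spotify track link (with an album-art based fallback walk up the tree),
+ # then track, score, rank, submitter, and per-voter vote rows are extracted.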
+ def _parse_round_results(results_html: str) -> list[dict[str, Any]]:
+     soup = BeautifulSoup(results_html, "html.parser")
+     body = soup.body or soup
+
+     containers: list[Any] = []
+     for div in body.find_all("div", recursive=False):
+         if div.select_one('a[href^="https://open.spotify.com/track/"]'):
+             containers.append(div)
+
+     if not containers:
+         seen = set()
+         for link in soup.select('a[href^="https://open.spotify.com/track/"]'):
+             node = link
+             while node and getattr(node, "name", None) not in ("body", None):
+                 if node.name == "div" and node.select_one('img[alt="Album art"]'):
+                     if id(node) not in seen:
+                         seen.add(id(node))
+                         containers.append(node)
+                     break
+                 node = node.parent
+
+     submissions: list[dict[str, Any]] = []
+
+     for cont in containers:
+         album_art_img = cont.select_one('img[alt="Album art"]')
+         album_art_url = album_art_img.get("src") if album_art_img else None
+
+         album_art_link = cont.select_one('a[title="Album art"][href^="/user/"]')
+         submitter_user_id = _parse_user_id_from_href(
+             album_art_link.get("href") if album_art_link else None
+         )
+
+         track_link = cont.select_one('a[href^="https://open.spotify.com/track/"]')
+         track_url = track_link.get("href") if track_link else None
+         song_title = track_link.get_text(" ", strip=True) if track_link else None
+
+         artist = None
+         album = None
+         if track_link:
+             h6 = track_link.find_parent("h6")
+             block = h6.find_parent() if h6 else track_link.find_parent()
+             ps = block.find_all("p") if block else []
+             if len(ps) > 0:
+                 artist = ps[0].get_text(" ", strip=True)
+             if len(ps) > 1:
+                 album = ps[1].get_text(" ", strip=True)
+
+         score_tag = cont.find("h3")
+         score_overridden = False
+         score_override_expected = None
+         score_effective = None
+
+         if score_tag:
+             strike_tag = score_tag.find("s", class_=lambda c: bool(c and "text-danger" in c))
+             if strike_tag:
+                 score_overridden = True
+                 score_override_expected = _parse_int(strike_tag.get_text(" ", strip=True))
+                 strike_tag.extract()
+                 score_effective = _parse_int(score_tag.get_text(" ", strip=True))
+             else:
+                 score_effective = _parse_int(score_tag.get_text(" ", strip=True))
+
+         voters_count = None
+         for p_tag in cont.find_all("p"):
+             if "voter" in (p_tag.get_text(" ", strip=True) or "").lower():
+                 voters_count = _parse_int(p_tag.get_text(" ", strip=True))
+                 break
+
+         rank_tag = cont.find("h6", string=lambda s: bool(s and ORDINAL_RE.match(s.strip())))
+         rank = rank_tag.get_text(" ", strip=True) if rank_tag else None
+
+         submitter_name = None
+         for h6 in cont.find_all("h6"):
+             if h6.find("a"):
+                 continue
+             text = h6.get_text(" ", strip=True)
+             if text and not ORDINAL_RE.match(text):
+                 submitter_name = text
+                 break
+
+         comment_span = cont.select_one("span.text-break.ws-pre-wrap")
+         submitter_comment = comment_span.get_text(" ", strip=True) if comment_span else None
+         if submitter_comment == "":
+             submitter_comment = None
+
+         submitter_avatar_url = None
+         for img in cont.select('img[src*="musicleague-user-assets"]'):
+             src = img.get("src", "")
+             if submitter_user_id and submitter_user_id in src:
+                 submitter_avatar_url = src
+                 break
+
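+         # Vote rows: each voter's <b> name anchors a nearby row that also holds
+         # the voter avatar, an optional comment, and the numeric vote. When the
+         # submission's total was overridden (struck-through score), raw votes
+         # are kept but marked as not effective.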
+         votes: list[dict[str, Any]] = []
+         for bold in cont.find_all("b"):
+             voter_name = bold.get_text(" ", strip=True)
+
+             node = bold
+             row = None
+             for _ in range(6):
+                 node = node.parent
+                 if not node:
+                     break
+                 if (
+                     getattr(node, "name", None) == "div"
+                     and node.select_one('img[src*="musicleague-user-assets"]')
+                 ):
+                     row = node
+                     break
+             if not row:
+                 continue
+
+             voter_avatar_img = row.select_one('img[src*="musicleague-user-assets"]')
+             voter_avatar_url = voter_avatar_img.get("src") if voter_avatar_img else None
+             voter_user_id = _parse_user_id_from_img_src(voter_avatar_url)
+
+             span = row.find("span")
+             comment = span.get_text(" ", strip=True) if span else None
+             if comment == "":
+                 comment = None
+
+             vote_val = None
+             for h6 in reversed(row.find_all("h6")):
+                 parsed = _parse_int(h6.get_text(" ", strip=True))
+                 if parsed is not None:
+                     vote_val = parsed
+                     break
+
+             if score_overridden:
+                 vote_effective = None
+                 is_effective = False
+             else:
+                 vote_effective = vote_val
+                 is_effective = True
+
+             votes.append(
+                 {
+                     "voter_user_id": voter_user_id,
+                     "voter_name": voter_name,
+                     "vote": vote_val,
+                     "vote_raw": vote_val,
+                     "vote_effective": vote_effective,
+                     "is_effective": is_effective,
+                     "comment": comment,
+                     "voter_avatar_url": voter_avatar_url,
+                 }
+             )
+
+         submissions.append(
+             {
+                 "track_url": track_url,
+                 "song_title": song_title,
+                 "artist": artist,
+                 "album": album,
+                 "album_art_url": album_art_url,
+                 "score": score_effective,
+                 "score_effective": score_effective,
+                 "score_overridden": score_overridden,
+                 "score_override_expected": score_override_expected,
+                 "voters_count": voters_count,
+                 "rank": rank,
+                 "submitter": {
+                     "user_id": submitter_user_id,
+                     "name": submitter_name,
+                     "avatar_url": submitter_avatar_url,
+                     "comment": submitter_comment,
+                 },
+                 "votes": votes,
+             }
+         )
+
+     return submissions
+
+
+ async def _fetch_html(page, url: str) -> str:
+     await page.goto(url, wait_until="domcontentloaded")
+     return await page.content()
+
+
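+ # Cookie/session helpers: the scraper reuses a Playwright storage_state both for
+ # the browser context and for building a raw Cookie header for direct HTTP
+ # fragment fetches against the same host.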
+ def _cookie_matches(domain: str, host: str | None) -> bool:
+     if not domain or not host:
+         return False
+     normalized = domain.lstrip(".")
+     return host == normalized or host.endswith(f".{normalized}")
+
+
+ def _has_host_cookie(storage_state: dict[str, Any] | None, host: str | None) -> bool:
+     if not storage_state or not host:
+         return False
+     for cookie in storage_state.get("cookies", []):
+         domain = str(cookie.get("domain") or "")
+         if _cookie_matches(domain, host):
+             return True
+     return False
+
+
+ def _cookie_header_for_host(storage_state: dict[str, Any] | None, host: str | None) -> str | None:
+     if not storage_state or not host:
+         return None
+     cookies = []
+     for cookie in storage_state.get("cookies", []):
+         domain = str(cookie.get("domain") or "")
+         if _cookie_matches(domain, host):
+             name = cookie.get("name")
+             value = cookie.get("value")
+             if name and value is not None:
+                 cookies.append(f"{name}={value}")
+     return "; ".join(cookies) if cookies else None
+
+
+ def _fingerprint_storage_state(storage_state: dict[str, Any]) -> str:
+     # Stable hash for debugging (do not log/store cookies themselves in artifacts/DB).
+     payload = json.dumps(storage_state, sort_keys=True, separators=(",", ":")).encode("utf-8")
+     return hashlib.sha256(payload).hexdigest()[:12]
+
+
+ async def _cookie_header_from_context(context, base_url: str) -> str | None:
+     try:
+         cookies = await context.cookies(base_url)
+     except Exception:
+         return None
+     if not cookies:
+         return None
+     parts: list[str] = []
+     for cookie in cookies:
+         name = cookie.get("name")
+         value = cookie.get("value")
+         if name and value is not None:
+             parts.append(f"{name}={value}")
+     return "; ".join(parts) if parts else None
+
+
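+ # Headers for the fragment endpoints: X-Requested-With / HX-Request mark the
+ # request as an HTMX/XHR call, with referer and session cookie added when known.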
+ def _build_xhr_headers(referer: str | None, cookie_header: str | None) -> dict[str, str]:
+     headers = {
+         "x-requested-with": "XMLHttpRequest",
+         "HX-Request": "true",
+         "accept": "text/html, */*; q=0.01",
+         "user-agent": DEFAULT_USER_AGENT,
+     }
+     if referer:
+         headers["referer"] = referer
+         headers["HX-Current-URL"] = referer
+         headers["HX-Trigger"] = "load"
+     if cookie_header:
+         headers["cookie"] = cookie_header
+     return headers
+
+
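+ # Three fragment-fetch strategies, tried in this order by build_manifest: a
+ # plain urllib request with the session cookie, Playwright's APIRequestContext,
+ # and finally an in-page fetch() using the page's own credentials. Each retries
+ # with a short backoff and can demand an expected marker string in the response.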
+ def _fetch_xhr_html_via_http(
+     url: str,
+     *,
+     headers: dict[str, str],
+     expected_marker: str | None = None,
+     retries: int = 2,
+     return_last_on_marker_miss: bool = False,
+ ) -> str:
+     last_text = ""
+     for attempt in range(retries + 1):
+         req = urllib.request.Request(url, headers=headers, method="GET")
+         try:
+             with urllib.request.urlopen(req, timeout=20) as resp:
+                 text = resp.read().decode("utf-8", errors="replace")
+         except Exception:
+             text = ""
+         last_text = text or ""
+         if last_text.strip() and (expected_marker is None or expected_marker in last_text):
+             return last_text
+         if attempt < retries:
+             time.sleep(0.4 * (attempt + 1))
+     if expected_marker and expected_marker not in last_text and not return_last_on_marker_miss:
+         return ""
+     return last_text
+
+
+ async def _fetch_xhr_html_in_page(
+     page,
+     url: str,
+     *,
+     referer: str | None = None,
+     expected_marker: str | None = None,
+     retries: int = 2,
+ ) -> str:
+     headers = {"x-requested-with": "XMLHttpRequest", "HX-Request": "true"}
+     last_text = ""
+     for attempt in range(retries + 1):
+         text = await page.evaluate(
+             """async ({ url, headers, referer }) => {
+                 const res = await fetch(url, {
+                     method: 'GET',
+                     credentials: 'include',
+                     headers,
+                     referrer: referer || undefined
+                 });
+                 return await res.text();
+             }""",
+             {"url": url, "headers": headers, "referer": referer},
+         )
+         last_text = text or ""
+         if last_text.strip() and (expected_marker is None or expected_marker in last_text):
+             return last_text
+         if attempt < retries:
+             await asyncio.sleep(0.4 * (attempt + 1))
+     return last_text
+
+
+ async def _fetch_xhr_html(
+     request,
+     url: str,
+     *,
+     referer: str | None = None,
+     expected_marker: str | None = None,
+     retries: int = 2,
+     cookie_header: str | None = None,
+     return_last_on_marker_miss: bool = False,
+ ) -> str:
+     headers = _build_xhr_headers(referer, cookie_header)
+     last_text = ""
+     for attempt in range(retries + 1):
+         response = await request.get(url, headers=headers)
+         text = await response.text()
+         last_text = text or ""
+         if last_text.strip() and (expected_marker is None or expected_marker in last_text):
+             return last_text
+         if attempt < retries:
+             await asyncio.sleep(0.4 * (attempt + 1))
+     if expected_marker and expected_marker not in last_text and not return_last_on_marker_miss:
+         return ""
+     return last_text
+
+
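+ # Crawl every completed league visible to the signed-in session and write raw
+ # fragments, parsed per-round JSON, and a top-level manifest.json to output_dir.
+ # Example invocation (hypothetical URL/paths, from an asyncio entry point):
+ #
+ #     manifest, manifest_path = await build_manifest(
+ #         storage_state=None,
+ #         storage_state_path="auth/storage_state.json",
+ #         base_url="https://example.invalid",
+ #         output_dir="scrape-output",
+ #         headless=True,
+ #     )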
+ async def build_manifest(
+     *,
+     storage_state: dict[str, Any] | None,
+     storage_state_path: str | None,
+     base_url: str,
+     output_dir: str,
+     headless: bool,
+ ) -> tuple[dict[str, Any], str]:
+     out_dir = Path(output_dir)
+     out_dir.mkdir(parents=True, exist_ok=True)
+
+     manifest: dict[str, Any] = {
+         "schema_version": 2,
+         "generated_at_local": datetime.now().isoformat(timespec="seconds"),
+         "generated_at_utc": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
+         "base_url": base_url,
+         "out_dir": str(out_dir.resolve()),
+         "viewer": None,
+         "leagues": [],
+         "errors": [],
+     }
+
+     storage_state_arg: dict[str, Any] | str | None = None
+     storage_state_payload: dict[str, Any] | None = None
+     storage_state_source: str | None = None
+     storage_state_path_missing: str | None = None
+     if storage_state_path and Path(storage_state_path).exists():
+         storage_state_arg = storage_state_path
+         storage_state_payload = json.loads(Path(storage_state_path).read_text())
+         storage_state_source = "path"
+     elif storage_state_path:
+         storage_state_path_missing = storage_state_path
+     elif storage_state:
+         storage_state_arg = storage_state
+         storage_state_payload = storage_state
+         storage_state_source = "inline"
+
+     if not storage_state_arg:
+         raise RuntimeError("Missing storage state for session scrape.")
+
+     host = urlparse(base_url).hostname
+     if not _has_host_cookie(storage_state_payload, host):
+         raise RuntimeError(
+             "Storage state missing Music League cookies. Complete login before scraping."
+         )
+     initial_cookie_header = _cookie_header_for_host(storage_state_payload, host)
+     storage_state_ref = {
+         "source": storage_state_source,
+         "path_used": storage_state_path if storage_state_source == "path" else None,
+         "path_missing": storage_state_path_missing,
+         "fingerprint": _fingerprint_storage_state(storage_state_payload),
+         "cookies_total": len(storage_state_payload.get("cookies", [])) if storage_state_payload else 0,
+         "host_cookie_present": True,
+     }
+     manifest["storage_state_ref"] = storage_state_ref
+
+     async with async_playwright() as playwright:
+         browser = await playwright.chromium.launch(headless=headless)
+         context = await browser.new_context(storage_state=storage_state_arg)
+         page = await context.new_page()
+         try:
+             completed_page_url = _abs_url(base_url, "/completed/")
+             response = await page.goto(completed_page_url, wait_until="domcontentloaded")
+             completed_final_url = page.url
+             completed_page_html = await page.content()
+             _write_text(out_dir / "completed.page.html", completed_page_html)
+             manifest["debug"] = {
+                 "completed_url": completed_page_url,
+                 "completed_final_url": completed_final_url,
+                 "completed_status": response.status if response else None,
+             }
+
+             cookie_header = await _cookie_header_from_context(context, base_url) or initial_cookie_header
+
+             viewer_id = extract_viewer_user_id(completed_page_html)
+             manifest["viewer"] = {"user_id": viewer_id} if viewer_id else None
+
+             completed_url = _abs_url(base_url, COMPLETED_LEAGUES_XHR)
+             completed_debug: dict[str, Any] = {
+                 "url": completed_url,
+                 "expected_marker": "league-tile",
+                 "attempts": [],
+             }
+
+             completed_html = await asyncio.to_thread(
+                 _fetch_xhr_html_via_http,
+                 completed_url,
+                 headers=_build_xhr_headers(_abs_url(base_url, "/completed/"), cookie_header),
+                 expected_marker="league-tile",
+                 return_last_on_marker_miss=True,
+             )
+             completed_marker = bool(completed_html.strip()) and ("league-tile" in completed_html)
+             completed_debug["attempts"].append(
+                 {"method": "http", "bytes": len(completed_html), "marker_found": completed_marker}
+             )
+
+             if not completed_marker:
+                 resp2 = await context.request.get(
+                     completed_url,
+                     headers=_build_xhr_headers(_abs_url(base_url, "/completed/"), cookie_header),
+                 )
+                 completed_html = await resp2.text()
+                 completed_marker = bool(completed_html.strip()) and ("league-tile" in completed_html)
+                 completed_debug["attempts"].append(
+                     {
+                         "method": "context_request",
+                         "status": resp2.status,
+                         "final_url": resp2.url,
+                         "bytes": len(completed_html),
+                         "marker_found": completed_marker,
+                     }
+                 )
+
+             if not completed_marker:
+                 completed_html = await _fetch_xhr_html_in_page(
+                     page,
+                     completed_url,
+                     referer=_abs_url(base_url, "/completed/"),
+                 )
+                 completed_marker = bool(completed_html.strip()) and ("league-tile" in completed_html)
+                 completed_debug["attempts"].append(
+                     {"method": "in_page", "bytes": len(completed_html), "marker_found": completed_marker}
+                 )
+
+             _write_text(out_dir / "completedLeagues.html", completed_html)
+             _write_text(out_dir / "scrape.debug.json", json.dumps(
+                 {
+                     "storage_state_ref": storage_state_ref,
+                     "cookie_header_source": "context" if cookie_header != initial_cookie_header else "storage_state",
+                     "completed": manifest.get("debug"),
+                     "completedLeagues": completed_debug,
+                 },
+                 indent=2,
+             ))
+             leagues = _extract_completed_leagues(completed_html, base_url)
+             if not leagues:
+                 leagues = _extract_completed_leagues(completed_page_html, base_url)
+
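+             # Per league: save rounds.html, then per round save results.html and
+             # results.parsed.json under leagues/<league_id>/rounds/<round_id>/,
+             # plus a league.summary.json; failures are recorded in
+             # manifest["errors"] per league rather than aborting the whole scrape.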
+             for league_meta in leagues:
+                 league_id = league_meta["league_id"]
+                 league_dir = out_dir / "leagues" / league_id
+                 league_dir.mkdir(parents=True, exist_ok=True)
+
+                 try:
+                     rounds_xhr_url = _abs_url(base_url, f"/l/{league_id}/-/rounds")
+                     rounds_html = await asyncio.to_thread(
+                         _fetch_xhr_html_via_http,
+                         rounds_xhr_url,
+                         headers=_build_xhr_headers(league_meta["league_url"], cookie_header),
+                         expected_marker="league-round-item",
+                         return_last_on_marker_miss=True,
+                     )
+                     rounds_marker = bool(rounds_html.strip()) and ("league-round-item" in rounds_html)
+                     if not rounds_marker:
+                         rounds_html = await _fetch_xhr_html(
+                             context.request,
+                             rounds_xhr_url,
+                             referer=league_meta["league_url"],
+                             expected_marker="league-round-item",
+                             cookie_header=cookie_header,
+                             return_last_on_marker_miss=True,
+                         )
+                         rounds_marker = bool(rounds_html.strip()) and ("league-round-item" in rounds_html)
+                     if not rounds_marker:
+                         rounds_html = await _fetch_xhr_html_in_page(
+                             page,
+                             rounds_xhr_url,
+                             referer=league_meta["league_url"],
+                         )
+                     _write_text(league_dir / "rounds.html", rounds_html)
+
+                     rounds = _extract_completed_rounds(league_id, rounds_html, base_url)
+
+                     for rd in rounds:
+                         round_id = rd["round_id"]
+                         round_dir = league_dir / "rounds" / round_id
+                         round_dir.mkdir(parents=True, exist_ok=True)
+
+                         results_html = await asyncio.to_thread(
+                             _fetch_xhr_html_via_http,
+                             rd["results_xhr_url"],
+                             headers=_build_xhr_headers(league_meta["league_url"], cookie_header),
+                             expected_marker="open.spotify.com/track/",
+                             return_last_on_marker_miss=True,
+                         )
+                         results_marker = bool(results_html.strip()) and (
+                             "open.spotify.com/track/" in results_html
+                         )
+                         if not results_marker:
+                             results_html = await _fetch_xhr_html(
+                                 context.request,
+                                 rd["results_xhr_url"],
+                                 referer=league_meta["league_url"],
+                                 expected_marker="open.spotify.com/track/",
+                                 cookie_header=cookie_header,
+                                 return_last_on_marker_miss=True,
+                             )
+                             results_marker = bool(results_html.strip()) and (
+                                 "open.spotify.com/track/" in results_html
+                             )
+                         if not results_marker:
+                             results_html = await _fetch_xhr_html_in_page(
+                                 page,
+                                 rd["results_xhr_url"],
+                                 referer=league_meta["league_url"],
+                             )
+                         _write_text(round_dir / "results.html", results_html)
+
+                         submissions = _parse_round_results(results_html)
+                         rd["submissions_total"] = len(submissions)
+                         rd["votes_total"] = sum(len(s.get("votes", [])) for s in submissions)
+                         rd["submissions"] = submissions
+
+                         _write_text(
+                             round_dir / "results.parsed.json",
+                             json.dumps({"round_id": round_id, "submissions": submissions}, indent=2),
+                         )
+
+                     dates = [
+                         rd.get("completed_date_utc")
+                         for rd in rounds
+                         if rd.get("completed_date_utc")
+                     ]
+                     league_completed_date_utc = max(dates) if dates else None
+
+                     league_obj = {
+                         **league_meta,
+                         "league_completed_date_utc": league_completed_date_utc,
+                         "rounds_total": len(rounds),
+                         "rounds": rounds,
+                     }
+
+                     _write_text(
+                         league_dir / "league.summary.json", json.dumps(league_obj, indent=2)
+                     )
+                     manifest["leagues"].append(league_obj)
+                 except Exception as exc:
+                     manifest["errors"].append(
+                         {
+                             "type": type(exc).__name__,
+                             "message": str(exc),
+                             "league_id": league_id,
+                         }
+                     )
+
+         finally:
+             await context.close()
+             await browser.close()
+
+     manifest_path = out_dir / "manifest.json"
+     _write_text(manifest_path, json.dumps(manifest, indent=2))
+     return manifest, str(manifest_path.resolve())