@music-league-eras/local-runner 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +71 -0
- package/bin/ml-eras-local-runner.js +6 -0
- package/package.json +28 -0
- package/scripts/prepack.mjs +54 -0
- package/src/args.js +64 -0
- package/src/cli.js +99 -0
- package/src/log.js +16 -0
- package/src/python/bootstrap.js +127 -0
- package/src/python/run.js +50 -0
- package/src/syncToken.js +37 -0
- package/src/ttyPrompt.js +50 -0
- package/vendor/python/app/__init__.py +0 -0
- package/vendor/python/app/browser/__init__.py +0 -0
- package/vendor/python/app/browser/session.py +267 -0
- package/vendor/python/app/local_runner_cli.py +112 -0
- package/vendor/python/app/local_sync_runner.py +99 -0
- package/vendor/python/app/services/__init__.py +0 -0
- package/vendor/python/app/services/scrape_manifest.py +774 -0
- package/vendor/python/app/services/viewer.py +58 -0
- package/vendor/python/requirements.txt +3 -0
--- /dev/null
+++ package/vendor/python/app/services/scrape_manifest.py
@@ -0,0 +1,774 @@
+from __future__ import annotations
+
+import asyncio
+import hashlib
+import json
+import re
+import time
+import urllib.request
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Optional
+from urllib.parse import urljoin, urlparse
+
+from bs4 import BeautifulSoup
+from playwright.async_api import async_playwright
+
+from .viewer import extract_viewer_user_id
+
+COMPLETED_LEAGUES_XHR = "/completed/-/completedLeagues"
+DEFAULT_USER_AGENT = (
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/123.0.0.0 Safari/537.36"
+)
+
+LEAGUE_HREF_RE = re.compile(r"^/l/([0-9a-f]{32})/?$", re.I)
+ROUND_NO_RE = re.compile(r"ROUND\s+(\d+)", re.IGNORECASE)
+ISO_TS_RE = re.compile(r"dateFns\.parse\('([^']+)'\)")
+USER_HREF_RE = re.compile(r"^/user/([0-9a-f]{32})/?$", re.I)
+USER_ASSET_RE = re.compile(r"/users/([0-9a-f]{32})/", re.I)
+ORDINAL_RE = re.compile(r"^\d+(st|nd|rd|th)$", re.IGNORECASE)
+
+
+def _abs_url(base_url: str, href: str) -> str:
+    return urljoin(base_url, href)
+
+
+def _ensure_parent(path: Path) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+
+def _write_text(path: Path, content: str) -> None:
+    _ensure_parent(path)
+    path.write_text(content, encoding="utf-8")
+
+
+def _parse_user_id_from_href(href: Optional[str]) -> Optional[str]:
+    if not href:
+        return None
+    match = USER_HREF_RE.match(href.strip())
+    return match.group(1) if match else None
+
+
+def _parse_user_id_from_img_src(src: Optional[str]) -> Optional[str]:
+    if not src:
+        return None
+    match = USER_ASSET_RE.search(src)
+    return match.group(1) if match else None
+
+
+def _parse_int(value: Optional[str]) -> Optional[int]:
+    if value is None:
+        return None
+    cleaned = re.sub(r"[^\d\-]", "", str(value).strip())
+    if cleaned in ("", "-"):
+        return None
+    try:
+        return int(cleaned)
+    except ValueError:
+        return None
+
+
+def _iso_to_date_only(ts: Optional[str]) -> Optional[str]:
+    return ts[:10] if ts and len(ts) >= 10 else None
+
+
+def _extract_completed_leagues(fragment_html: str, base_url: str) -> list[dict[str, Any]]:
+    soup = BeautifulSoup(fragment_html, "html.parser")
+    leagues: dict[str, dict[str, Any]] = {}
+
+    for tile in soup.select(".league-tile"):
+        link = tile.select_one('a[href^="/l/"]')
+        if not link:
+            continue
+        href = (link.get("href") or "").strip()
+        match = LEAGUE_HREF_RE.match(href)
+        if not match:
+            continue
+
+        league_id = match.group(1)
+        title = (link.get_text(" ", strip=True) or "").strip()
+
+        img = tile.select_one('img[alt="League image"]')
+        src = (img.get("src") or "").strip() if img else ""
+        league_image_url = None
+        if src:
+            league_image_url = _abs_url(base_url, src) if src.startswith("/") else src
+
+        leagues[league_id] = {
+            "league_id": league_id,
+            "league_href": href if href.endswith("/") else f"{href}/",
+            "league_url": _abs_url(base_url, href),
+            "league_title": title,
+            "league_image_url": league_image_url,
+        }
+
+    if leagues:
+        return list(leagues.values())
+
+    # Fallback: older tiles without `.league-tile` wrapper.
+    for link in soup.select('a[href^="/l/"]'):
+        href = link.get("href", "")
+        match = LEAGUE_HREF_RE.match(href)
+        if not match:
+            continue
+        league_id = match.group(1)
+        title = (link.get_text(" ", strip=True) or "").strip()
+        leagues[league_id] = {
+            "league_id": league_id,
+            "league_href": href if href.endswith("/") else f"{href}/",
+            "league_url": _abs_url(base_url, href),
+            "league_title": title,
+            "league_image_url": None,
+        }
+
+    return list(leagues.values())
+
+
+def _extract_completed_rounds(league_id: str, rounds_html: str, base_url: str) -> list[dict[str, Any]]:
+    soup = BeautifulSoup(rounds_html, "html.parser")
+
+    header = soup.find(
+        lambda tag: tag.name in ("h3", "h4", "h5")
+        and tag.get_text(strip=True).lower() == "completed rounds"
+    )
+    container = header.find_parent() if header else soup
+
+    rounds: list[dict[str, Any]] = []
+
+    for item in container.select(".league-round-item"):
+        round_id = (item.get("id") or "").strip()
+        if not round_id:
+            continue
+
+        round_no = None
+        span = item.find("span", string=lambda s: bool(s and "ROUND" in s.upper()))
+        if span:
+            match = ROUND_NO_RE.search(span.get_text(" ", strip=True))
+            if match:
+                round_no = int(match.group(1))
+
+        title_tag = item.select_one("h5.card-title") or item.find("h5")
+        round_title = title_tag.get_text(" ", strip=True) if title_tag else ""
+
+        desc_tag = item.select_one("p[data-description]")
+        round_description = desc_tag.get("data-description") if desc_tag else None
+
+        ts = None
+        match = ISO_TS_RE.search(str(item))
+        if match:
+            ts = match.group(1)
+        completed_date_utc = _iso_to_date_only(ts)
+
+        playlist_tag = item.select_one('a[href^="https://open.spotify.com/playlist/"]')
+        playlist_url = playlist_tag.get("href") if playlist_tag else None
+
+        results_href = None
+        for link in item.select('a[href^="/l/"]'):
+            if "RESULTS" in (link.get_text(" ", strip=True) or "").upper():
+                results_href = link.get("href")
+                break
+        results_url = _abs_url(base_url, results_href) if results_href else None
+        results_xhr_url = _abs_url(base_url, f"/l/{league_id}/{round_id}/-/results")
+
+        rounds.append(
+            {
+                "round_id": round_id,
+                "round_no": round_no,
+                "round_title": round_title,
+                "round_description": round_description,
+                "completed_date_utc": completed_date_utc,
+                "playlist_url": playlist_url,
+                "results_url": results_url,
+                "results_xhr_url": results_xhr_url,
+            }
+        )
+
+    rounds.sort(key=lambda entry: (entry["round_no"] is None, entry["round_no"] or 0))
+    return rounds
+
+
+def _parse_round_results(results_html: str) -> list[dict[str, Any]]:
+    soup = BeautifulSoup(results_html, "html.parser")
+    body = soup.body or soup
+
+    containers: list[Any] = []
+    for div in body.find_all("div", recursive=False):
+        if div.select_one('a[href^="https://open.spotify.com/track/"]'):
+            containers.append(div)
+
+    if not containers:
+        seen = set()
+        for link in soup.select('a[href^="https://open.spotify.com/track/"]'):
+            node = link
+            while node and getattr(node, "name", None) not in ("body", None):
+                if node.name == "div" and node.select_one('img[alt="Album art"]'):
+                    if id(node) not in seen:
+                        seen.add(id(node))
+                        containers.append(node)
+                    break
+                node = node.parent
+
+    submissions: list[dict[str, Any]] = []
+
+    for cont in containers:
+        album_art_img = cont.select_one('img[alt="Album art"]')
+        album_art_url = album_art_img.get("src") if album_art_img else None
+
+        album_art_link = cont.select_one('a[title="Album art"][href^="/user/"]')
+        submitter_user_id = _parse_user_id_from_href(
+            album_art_link.get("href") if album_art_link else None
+        )
+
+        track_link = cont.select_one('a[href^="https://open.spotify.com/track/"]')
+        track_url = track_link.get("href") if track_link else None
+        song_title = track_link.get_text(" ", strip=True) if track_link else None
+
+        artist = None
+        album = None
+        if track_link:
+            h6 = track_link.find_parent("h6")
+            block = h6.find_parent() if h6 else track_link.find_parent()
+            ps = block.find_all("p") if block else []
+            if len(ps) > 0:
+                artist = ps[0].get_text(" ", strip=True)
+            if len(ps) > 1:
+                album = ps[1].get_text(" ", strip=True)
+
+        score_tag = cont.find("h3")
+        score_overridden = False
+        score_override_expected = None
+        score_effective = None
+
+        if score_tag:
+            strike_tag = score_tag.find("s", class_=lambda c: bool(c and "text-danger" in c))
+            if strike_tag:
+                score_overridden = True
+                score_override_expected = _parse_int(strike_tag.get_text(" ", strip=True))
+                strike_tag.extract()
+                score_effective = _parse_int(score_tag.get_text(" ", strip=True))
+            else:
+                score_effective = _parse_int(score_tag.get_text(" ", strip=True))
+
+        voters_count = None
+        for p_tag in cont.find_all("p"):
+            if "voter" in (p_tag.get_text(" ", strip=True) or "").lower():
+                voters_count = _parse_int(p_tag.get_text(" ", strip=True))
+                break
+
+        rank_tag = cont.find("h6", string=lambda s: bool(s and ORDINAL_RE.match(s.strip())))
+        rank = rank_tag.get_text(" ", strip=True) if rank_tag else None
+
+        submitter_name = None
+        for h6 in cont.find_all("h6"):
+            if h6.find("a"):
+                continue
+            text = h6.get_text(" ", strip=True)
+            if text and not ORDINAL_RE.match(text):
+                submitter_name = text
+                break
+
+        comment_span = cont.select_one("span.text-break.ws-pre-wrap")
+        submitter_comment = comment_span.get_text(" ", strip=True) if comment_span else None
+        if submitter_comment == "":
+            submitter_comment = None
+
+        submitter_avatar_url = None
+        for img in cont.select('img[src*="musicleague-user-assets"]'):
+            src = img.get("src", "")
+            if submitter_user_id and submitter_user_id in src:
+                submitter_avatar_url = src
+                break
+
+        votes: list[dict[str, Any]] = []
+        for bold in cont.find_all("b"):
+            voter_name = bold.get_text(" ", strip=True)
+
+            node = bold
+            row = None
+            for _ in range(6):
+                node = node.parent
+                if not node:
+                    break
+                if (
+                    getattr(node, "name", None) == "div"
+                    and node.select_one('img[src*="musicleague-user-assets"]')
+                ):
+                    row = node
+                    break
+            if not row:
+                continue
+
+            voter_avatar_img = row.select_one('img[src*="musicleague-user-assets"]')
+            voter_avatar_url = voter_avatar_img.get("src") if voter_avatar_img else None
+            voter_user_id = _parse_user_id_from_img_src(voter_avatar_url)
+
+            span = row.find("span")
+            comment = span.get_text(" ", strip=True) if span else None
+            if comment == "":
+                comment = None
+
+            vote_val = None
+            for h6 in reversed(row.find_all("h6")):
+                parsed = _parse_int(h6.get_text(" ", strip=True))
+                if parsed is not None:
+                    vote_val = parsed
+                    break
+
+            if score_overridden:
+                vote_effective = None
+                is_effective = False
+            else:
+                vote_effective = vote_val
+                is_effective = True
+
+            votes.append(
+                {
+                    "voter_user_id": voter_user_id,
+                    "voter_name": voter_name,
+                    "vote": vote_val,
+                    "vote_raw": vote_val,
+                    "vote_effective": vote_effective,
+                    "is_effective": is_effective,
+                    "comment": comment,
+                    "voter_avatar_url": voter_avatar_url,
+                }
+            )
+
+        submissions.append(
+            {
+                "track_url": track_url,
+                "song_title": song_title,
+                "artist": artist,
+                "album": album,
+                "album_art_url": album_art_url,
+                "score": score_effective,
+                "score_effective": score_effective,
+                "score_overridden": score_overridden,
+                "score_override_expected": score_override_expected,
+                "voters_count": voters_count,
+                "rank": rank,
+                "submitter": {
+                    "user_id": submitter_user_id,
+                    "name": submitter_name,
+                    "avatar_url": submitter_avatar_url,
+                    "comment": submitter_comment,
+                },
+                "votes": votes,
+            }
+        )
+
+    return submissions
+
+
+async def _fetch_html(page, url: str) -> str:
+    await page.goto(url, wait_until="domcontentloaded")
+    return await page.content()
+
+
+def _cookie_matches(domain: str, host: str | None) -> bool:
+    if not domain or not host:
+        return False
+    normalized = domain.lstrip(".")
+    return host == normalized or host.endswith(f".{normalized}")
+
+
+def _has_host_cookie(storage_state: dict[str, Any] | None, host: str | None) -> bool:
+    if not storage_state or not host:
+        return False
+    for cookie in storage_state.get("cookies", []):
+        domain = str(cookie.get("domain") or "")
+        if _cookie_matches(domain, host):
+            return True
+    return False
+
+
+def _cookie_header_for_host(storage_state: dict[str, Any] | None, host: str | None) -> str | None:
+    if not storage_state or not host:
+        return None
+    cookies = []
+    for cookie in storage_state.get("cookies", []):
+        domain = str(cookie.get("domain") or "")
+        if _cookie_matches(domain, host):
+            name = cookie.get("name")
+            value = cookie.get("value")
+            if name and value is not None:
+                cookies.append(f"{name}={value}")
+    return "; ".join(cookies) if cookies else None
+
+
+def _fingerprint_storage_state(storage_state: dict[str, Any]) -> str:
+    # Stable hash for debugging (do not log/store cookies themselves in artifacts/DB).
+    payload = json.dumps(storage_state, sort_keys=True, separators=(",", ":")).encode("utf-8")
+    return hashlib.sha256(payload).hexdigest()[:12]
+
+
+async def _cookie_header_from_context(context, base_url: str) -> str | None:
+    try:
+        cookies = await context.cookies(base_url)
+    except Exception:
+        return None
+    if not cookies:
+        return None
+    parts: list[str] = []
+    for cookie in cookies:
+        name = cookie.get("name")
+        value = cookie.get("value")
+        if name and value is not None:
+            parts.append(f"{name}={value}")
+    return "; ".join(parts) if parts else None
+
+
+def _build_xhr_headers(referer: str | None, cookie_header: str | None) -> dict[str, str]:
+    headers = {
+        "x-requested-with": "XMLHttpRequest",
+        "HX-Request": "true",
+        "accept": "text/html, */*; q=0.01",
+        "user-agent": DEFAULT_USER_AGENT,
+    }
+    if referer:
+        headers["referer"] = referer
+        headers["HX-Current-URL"] = referer
+        headers["HX-Trigger"] = "load"
+    if cookie_header:
+        headers["cookie"] = cookie_header
+    return headers
+
+
+def _fetch_xhr_html_via_http(
+    url: str,
+    *,
+    headers: dict[str, str],
+    expected_marker: str | None = None,
+    retries: int = 2,
+    return_last_on_marker_miss: bool = False,
+) -> str:
+    last_text = ""
+    for attempt in range(retries + 1):
+        req = urllib.request.Request(url, headers=headers, method="GET")
+        try:
+            with urllib.request.urlopen(req, timeout=20) as resp:
+                text = resp.read().decode("utf-8", errors="replace")
+        except Exception:
+            text = ""
+        last_text = text or ""
+        if last_text.strip() and (expected_marker is None or expected_marker in last_text):
+            return last_text
+        if attempt < retries:
+            time.sleep(0.4 * (attempt + 1))
+    if expected_marker and expected_marker not in last_text and not return_last_on_marker_miss:
+        return ""
+    return last_text
+
+
+async def _fetch_xhr_html_in_page(
+    page,
+    url: str,
+    *,
+    referer: str | None = None,
+    expected_marker: str | None = None,
+    retries: int = 2,
+) -> str:
+    headers = {"x-requested-with": "XMLHttpRequest", "HX-Request": "true"}
+    last_text = ""
+    for attempt in range(retries + 1):
+        text = await page.evaluate(
+            """async ({ url, headers, referer }) => {
+                const res = await fetch(url, {
+                    method: 'GET',
+                    credentials: 'include',
+                    headers,
+                    referrer: referer || undefined
+                });
+                return await res.text();
+            }""",
+            {"url": url, "headers": headers, "referer": referer},
+        )
+        last_text = text or ""
+        if last_text.strip() and (expected_marker is None or expected_marker in last_text):
+            return last_text
+        if attempt < retries:
+            await asyncio.sleep(0.4 * (attempt + 1))
+    return last_text
+
+
+async def _fetch_xhr_html(
+    request,
+    url: str,
+    *,
+    referer: str | None = None,
+    expected_marker: str | None = None,
+    retries: int = 2,
+    cookie_header: str | None = None,
+    return_last_on_marker_miss: bool = False,
+) -> str:
+    headers = _build_xhr_headers(referer, cookie_header)
+    last_text = ""
+    for attempt in range(retries + 1):
+        response = await request.get(url, headers=headers)
+        text = await response.text()
+        last_text = text or ""
+        if last_text.strip() and (expected_marker is None or expected_marker in last_text):
+            return last_text
+        if attempt < retries:
+            await asyncio.sleep(0.4 * (attempt + 1))
+    if expected_marker and expected_marker not in last_text and not return_last_on_marker_miss:
+        return ""
+    return last_text
+
+
+async def build_manifest(
+    *,
+    storage_state: dict[str, Any] | None,
+    storage_state_path: str | None,
+    base_url: str,
+    output_dir: str,
+    headless: bool,
+) -> tuple[dict[str, Any], str]:
+    out_dir = Path(output_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+
+    manifest: dict[str, Any] = {
+        "schema_version": 2,
+        "generated_at_local": datetime.now().isoformat(timespec="seconds"),
+        "generated_at_utc": datetime.now(timezone.utc).isoformat().replace("+00:00", "Z"),
+        "base_url": base_url,
+        "out_dir": str(out_dir.resolve()),
+        "viewer": None,
+        "leagues": [],
+        "errors": [],
+    }
+
+    storage_state_arg: dict[str, Any] | str | None = None
+    storage_state_payload: dict[str, Any] | None = None
+    storage_state_source: str | None = None
+    storage_state_path_missing: str | None = None
+    if storage_state_path and Path(storage_state_path).exists():
+        storage_state_arg = storage_state_path
+        storage_state_payload = json.loads(Path(storage_state_path).read_text())
+        storage_state_source = "path"
+    elif storage_state_path:
+        storage_state_path_missing = storage_state_path
+    elif storage_state:
+        storage_state_arg = storage_state
+        storage_state_payload = storage_state
+        storage_state_source = "inline"
+
+    if not storage_state_arg:
+        raise RuntimeError("Missing storage state for session scrape.")
+
+    host = urlparse(base_url).hostname
+    if not _has_host_cookie(storage_state_payload, host):
+        raise RuntimeError(
+            "Storage state missing Music League cookies. Complete login before scraping."
+        )
+    initial_cookie_header = _cookie_header_for_host(storage_state_payload, host)
+    storage_state_ref = {
+        "source": storage_state_source,
+        "path_used": storage_state_path if storage_state_source == "path" else None,
+        "path_missing": storage_state_path_missing,
+        "fingerprint": _fingerprint_storage_state(storage_state_payload),
+        "cookies_total": len(storage_state_payload.get("cookies", [])) if storage_state_payload else 0,
+        "host_cookie_present": True,
+    }
+    manifest["storage_state_ref"] = storage_state_ref
+
+    async with async_playwright() as playwright:
+        browser = await playwright.chromium.launch(headless=headless)
+        context = await browser.new_context(storage_state=storage_state_arg)
+        page = await context.new_page()
+        try:
+            completed_page_url = _abs_url(base_url, "/completed/")
+            response = await page.goto(completed_page_url, wait_until="domcontentloaded")
+            completed_final_url = page.url
+            completed_page_html = await page.content()
+            _write_text(out_dir / "completed.page.html", completed_page_html)
+            manifest["debug"] = {
+                "completed_url": completed_page_url,
+                "completed_final_url": completed_final_url,
+                "completed_status": response.status if response else None,
+            }
+
+            cookie_header = await _cookie_header_from_context(context, base_url) or initial_cookie_header
+
+            viewer_id = extract_viewer_user_id(completed_page_html)
+            manifest["viewer"] = {"user_id": viewer_id} if viewer_id else None
+
+            completed_url = _abs_url(base_url, COMPLETED_LEAGUES_XHR)
+            completed_debug: dict[str, Any] = {
+                "url": completed_url,
+                "expected_marker": "league-tile",
+                "attempts": [],
+            }
+
+            completed_html = await asyncio.to_thread(
+                _fetch_xhr_html_via_http,
+                completed_url,
+                headers=_build_xhr_headers(_abs_url(base_url, "/completed/"), cookie_header),
+                expected_marker="league-tile",
+                return_last_on_marker_miss=True,
+            )
+            completed_marker = bool(completed_html.strip()) and ("league-tile" in completed_html)
+            completed_debug["attempts"].append(
+                {"method": "http", "bytes": len(completed_html), "marker_found": completed_marker}
+            )
+
+            if not completed_marker:
+                resp2 = await context.request.get(
+                    completed_url,
+                    headers=_build_xhr_headers(_abs_url(base_url, "/completed/"), cookie_header),
+                )
+                completed_html = await resp2.text()
+                completed_marker = bool(completed_html.strip()) and ("league-tile" in completed_html)
+                completed_debug["attempts"].append(
+                    {
+                        "method": "context_request",
+                        "status": resp2.status,
+                        "final_url": resp2.url,
+                        "bytes": len(completed_html),
+                        "marker_found": completed_marker,
+                    }
+                )
+
+            if not completed_marker:
+                completed_html = await _fetch_xhr_html_in_page(
+                    page,
+                    completed_url,
+                    referer=_abs_url(base_url, "/completed/"),
+                )
+                completed_marker = bool(completed_html.strip()) and ("league-tile" in completed_html)
+                completed_debug["attempts"].append(
+                    {"method": "in_page", "bytes": len(completed_html), "marker_found": completed_marker}
+                )
+
+            _write_text(out_dir / "completedLeagues.html", completed_html)
+            _write_text(out_dir / "scrape.debug.json", json.dumps(
+                {
+                    "storage_state_ref": storage_state_ref,
+                    "cookie_header_source": "context" if cookie_header != initial_cookie_header else "storage_state",
+                    "completed": manifest.get("debug"),
+                    "completedLeagues": completed_debug,
+                },
+                indent=2,
+            ))
+            leagues = _extract_completed_leagues(completed_html, base_url)
+            if not leagues:
+                leagues = _extract_completed_leagues(completed_page_html, base_url)
+
+            for league_meta in leagues:
+                league_id = league_meta["league_id"]
+                league_dir = out_dir / "leagues" / league_id
+                league_dir.mkdir(parents=True, exist_ok=True)
+
+                try:
+                    rounds_xhr_url = _abs_url(base_url, f"/l/{league_id}/-/rounds")
+                    rounds_html = await asyncio.to_thread(
+                        _fetch_xhr_html_via_http,
+                        rounds_xhr_url,
+                        headers=_build_xhr_headers(league_meta["league_url"], cookie_header),
+                        expected_marker="league-round-item",
+                        return_last_on_marker_miss=True,
+                    )
+                    rounds_marker = bool(rounds_html.strip()) and ("league-round-item" in rounds_html)
+                    if not rounds_marker:
+                        rounds_html = await _fetch_xhr_html(
+                            context.request,
+                            rounds_xhr_url,
+                            referer=league_meta["league_url"],
+                            expected_marker="league-round-item",
+                            cookie_header=cookie_header,
+                            return_last_on_marker_miss=True,
+                        )
+                        rounds_marker = bool(rounds_html.strip()) and ("league-round-item" in rounds_html)
+                    if not rounds_marker:
+                        rounds_html = await _fetch_xhr_html_in_page(
+                            page,
+                            rounds_xhr_url,
+                            referer=league_meta["league_url"],
+                        )
+                    _write_text(league_dir / "rounds.html", rounds_html)
+
+                    rounds = _extract_completed_rounds(league_id, rounds_html, base_url)
+
+                    for rd in rounds:
+                        round_id = rd["round_id"]
+                        round_dir = league_dir / "rounds" / round_id
+                        round_dir.mkdir(parents=True, exist_ok=True)
+
+                        results_html = await asyncio.to_thread(
+                            _fetch_xhr_html_via_http,
+                            rd["results_xhr_url"],
+                            headers=_build_xhr_headers(league_meta["league_url"], cookie_header),
+                            expected_marker="open.spotify.com/track/",
+                            return_last_on_marker_miss=True,
+                        )
+                        results_marker = bool(results_html.strip()) and (
+                            "open.spotify.com/track/" in results_html
+                        )
+                        if not results_marker:
+                            results_html = await _fetch_xhr_html(
+                                context.request,
+                                rd["results_xhr_url"],
+                                referer=league_meta["league_url"],
+                                expected_marker="open.spotify.com/track/",
+                                cookie_header=cookie_header,
+                                return_last_on_marker_miss=True,
+                            )
+                            results_marker = bool(results_html.strip()) and (
+                                "open.spotify.com/track/" in results_html
+                            )
+                        if not results_marker:
+                            results_html = await _fetch_xhr_html_in_page(
+                                page,
+                                rd["results_xhr_url"],
+                                referer=league_meta["league_url"],
+                            )
+                        _write_text(round_dir / "results.html", results_html)
+
+                        submissions = _parse_round_results(results_html)
+                        rd["submissions_total"] = len(submissions)
+                        rd["votes_total"] = sum(len(s.get("votes", [])) for s in submissions)
+                        rd["submissions"] = submissions
+
+                        _write_text(
+                            round_dir / "results.parsed.json",
+                            json.dumps({"round_id": round_id, "submissions": submissions}, indent=2),
+                        )
+
+                    dates = [
+                        rd.get("completed_date_utc")
+                        for rd in rounds
+                        if rd.get("completed_date_utc")
+                    ]
+                    league_completed_date_utc = max(dates) if dates else None
+
+                    league_obj = {
+                        **league_meta,
+                        "league_completed_date_utc": league_completed_date_utc,
+                        "rounds_total": len(rounds),
+                        "rounds": rounds,
+                    }
+
+                    _write_text(
+                        league_dir / "league.summary.json", json.dumps(league_obj, indent=2)
+                    )
+                    manifest["leagues"].append(league_obj)
+                except Exception as exc:
+                    manifest["errors"].append(
+                        {
+                            "type": type(exc).__name__,
+                            "message": str(exc),
+                            "league_id": league_id,
+                        }
+                    )
+
+        finally:
+            await context.close()
+            await browser.close()
+
+    manifest_path = out_dir / "manifest.json"
+    _write_text(manifest_path, json.dumps(manifest, indent=2))
+    return manifest, str(manifest_path.resolve())
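
For orientation, build_manifest is the only public entry point in this file; the vendored local_sync_runner.py presumably invokes it with a Playwright storage state captured at login. Below is a minimal, hypothetical driver sketch, not part of the package: the base URL, file paths, and the assumption that vendor/python is on sys.path are illustrative only, while the keyword arguments match the signature shown in the diff above.

# Hypothetical driver for build_manifest; not shipped with the package.
# Assumes vendor/python is on sys.path and a storage_state.json captured after login exists.
import asyncio

from app.services.scrape_manifest import build_manifest


async def main() -> None:
    manifest, manifest_path = await build_manifest(
        storage_state=None,                                  # or pass a Playwright storage-state dict inline
        storage_state_path="./session/storage_state.json",   # hypothetical path
        base_url="https://app.musicleague.com",              # assumed Music League base URL
        output_dir="./scrape-out",                           # hypothetical output directory
        headless=True,
    )
    print(f"{manifest_path}: {len(manifest['leagues'])} leagues, {len(manifest['errors'])} errors")


if __name__ == "__main__":
    asyncio.run(main())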