@music-league-eras/local-runner 0.1.2 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -9,7 +9,10 @@ from urllib.parse import urljoin, urlparse
|
|
|
9
9
|
|
|
10
10
|
from playwright.async_api import async_playwright
|
|
11
11
|
|
|
12
|
-
from ..services.viewer import
|
|
12
|
+
from ..services.viewer import (
|
|
13
|
+
extract_viewer_avatar_url,
|
|
14
|
+
resolve_viewer_user_id,
|
|
15
|
+
)
|
|
13
16
|
|
|
14
17
|
|
|
15
18
|
@dataclass(frozen=True)
|
|
@@ -233,9 +236,19 @@ async def capture_storage_state(
|
|
|
233
236
|
viewer_user_id = None
|
|
234
237
|
viewer_avatar_url = None
|
|
235
238
|
try:
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
+
html_candidates: list[str] = [await page.content()]
|
|
240
|
+
try:
|
|
241
|
+
home_url = urljoin(base_url.rstrip("/") + "/", "home/")
|
|
242
|
+
resp = await context.request.get(home_url)
|
|
243
|
+
if resp.ok:
|
|
244
|
+
html_candidates.append(await resp.text())
|
|
245
|
+
except Exception:
|
|
246
|
+
pass
|
|
247
|
+
viewer_user_id = resolve_viewer_user_id(
|
|
248
|
+
html_candidates=html_candidates,
|
|
249
|
+
storage_state=storage_state,
|
|
250
|
+
)
|
|
251
|
+
viewer_avatar_url = extract_viewer_avatar_url(html_candidates[0])
|
|
239
252
|
except Exception:
|
|
240
253
|
viewer_user_id = None
|
|
241
254
|
viewer_avatar_url = None
|
|
@@ -8,13 +8,13 @@ import time
|
|
|
8
8
|
import urllib.request
|
|
9
9
|
from datetime import datetime, timezone
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import Any, Optional
|
|
11
|
+
from typing import Any, Optional, cast
|
|
12
12
|
from urllib.parse import urljoin, urlparse
|
|
13
13
|
|
|
14
14
|
from bs4 import BeautifulSoup
|
|
15
15
|
from playwright.async_api import async_playwright
|
|
16
16
|
|
|
17
|
-
from .viewer import
|
|
17
|
+
from .viewer import resolve_viewer_user_id
|
|
18
18
|
|
|
19
19
|
COMPLETED_LEAGUES_XHR = "/completed/-/completedLeagues"
|
|
20
20
|
DEFAULT_USER_AGENT = (
|
|
@@ -559,6 +559,7 @@ async def build_manifest(
|
|
|
559
559
|
|
|
560
560
|
if not storage_state_arg:
|
|
561
561
|
raise RuntimeError("Missing storage state for session scrape.")
|
|
562
|
+
assert storage_state_payload is not None
|
|
562
563
|
|
|
563
564
|
host = urlparse(base_url).hostname
|
|
564
565
|
if not _has_host_cookie(storage_state_payload, host):
|
|
@@ -578,7 +579,7 @@ async def build_manifest(
|
|
|
578
579
|
|
|
579
580
|
async with async_playwright() as playwright:
|
|
580
581
|
browser = await playwright.chromium.launch(headless=headless)
|
|
581
|
-
context = await browser.new_context(storage_state=storage_state_arg)
|
|
582
|
+
context = await browser.new_context(storage_state=cast(Any, storage_state_arg))
|
|
582
583
|
page = await context.new_page()
|
|
583
584
|
try:
|
|
584
585
|
completed_page_url = _abs_url(base_url, "/completed/")
|
|
@@ -586,15 +587,54 @@ async def build_manifest(
|
|
|
586
587
|
completed_final_url = page.url
|
|
587
588
|
completed_page_html = await page.content()
|
|
588
589
|
_write_text(out_dir / "completed.page.html", completed_page_html)
|
|
590
|
+
|
|
591
|
+
# The completed leagues list is loaded via htmx; give it a moment to hydrate so our
|
|
592
|
+
# fallback parser can still find league links even if the XHR helpers fail.
|
|
593
|
+
try:
|
|
594
|
+
await page.wait_for_selector(".league-tile, a[href^=\"/l/\"]", timeout=6_000)
|
|
595
|
+
completed_page_html = await page.content()
|
|
596
|
+
_write_text(out_dir / "completed.page.after.html", completed_page_html)
|
|
597
|
+
except Exception:
|
|
598
|
+
pass
|
|
589
599
|
manifest["debug"] = {
|
|
590
600
|
"completed_url": completed_page_url,
|
|
591
601
|
"completed_final_url": completed_final_url,
|
|
592
602
|
"completed_status": response.status if response else None,
|
|
593
603
|
}
|
|
594
604
|
|
|
595
|
-
cookie_header =
|
|
596
|
-
|
|
597
|
-
|
|
605
|
+
cookie_header = (
|
|
606
|
+
await _cookie_header_from_context(context, base_url)
|
|
607
|
+
) or initial_cookie_header
|
|
608
|
+
|
|
609
|
+
# Music League occasionally serves a shell DOM to Playwright's `page.content()` that
|
|
610
|
+
# does not include the `/user/<id>` link. Fall back to server-rendered HTML fetched
|
|
611
|
+
# via the authenticated browser context.
|
|
612
|
+
html_candidates: list[str] = [completed_page_html]
|
|
613
|
+
try:
|
|
614
|
+
resp_completed = await context.request.get(completed_page_url)
|
|
615
|
+
if resp_completed.ok:
|
|
616
|
+
html_candidates.append(await resp_completed.text())
|
|
617
|
+
except Exception:
|
|
618
|
+
pass
|
|
619
|
+
try:
|
|
620
|
+
home_url = _abs_url(base_url, "/home/")
|
|
621
|
+
resp_home = await context.request.get(home_url)
|
|
622
|
+
if resp_home.ok:
|
|
623
|
+
html_candidates.append(await resp_home.text())
|
|
624
|
+
except Exception:
|
|
625
|
+
pass
|
|
626
|
+
try:
|
|
627
|
+
settings_url = _abs_url(base_url, "/settings/")
|
|
628
|
+
resp_settings = await context.request.get(settings_url)
|
|
629
|
+
if resp_settings.ok:
|
|
630
|
+
html_candidates.append(await resp_settings.text())
|
|
631
|
+
except Exception:
|
|
632
|
+
pass
|
|
633
|
+
|
|
634
|
+
viewer_id = resolve_viewer_user_id(
|
|
635
|
+
html_candidates=html_candidates,
|
|
636
|
+
storage_state=storage_state_payload,
|
|
637
|
+
)
|
|
598
638
|
manifest["viewer"] = {"user_id": viewer_id} if viewer_id else None
|
|
599
639
|
|
|
600
640
|
completed_url = _abs_url(base_url, COMPLETED_LEAGUES_XHR)
|
|
@@ -738,11 +778,11 @@ async def build_manifest(
|
|
|
738
778
|
json.dumps({"round_id": round_id, "submissions": submissions}, indent=2),
|
|
739
779
|
)
|
|
740
780
|
|
|
741
|
-
dates = [
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
if
|
|
745
|
-
|
|
781
|
+
dates: list[str] = []
|
|
782
|
+
for rd in rounds:
|
|
783
|
+
completed = rd.get("completed_date_utc")
|
|
784
|
+
if isinstance(completed, str) and completed:
|
|
785
|
+
dates.append(completed)
|
|
746
786
|
league_completed_date_utc = max(dates) if dates else None
|
|
747
787
|
|
|
748
788
|
league_obj = {
|
|
@@ -1,17 +1,41 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import re
|
|
4
|
-
from typing import Optional
|
|
4
|
+
from typing import Any, Optional
|
|
5
5
|
|
|
6
6
|
from bs4 import BeautifulSoup
|
|
7
7
|
|
|
8
8
|
USER_HREF_RE = re.compile(r"^/user/([0-9a-f]{32})/?$", re.I)
|
|
9
9
|
VIEW_PROFILE_LABEL = "view profile"
|
|
10
|
+
HEX_32_RE = re.compile(r"\b([0-9a-f]{32})\b", re.I)
|
|
11
|
+
USER_ASSET_PROFILE_RE = re.compile(r"/users/([0-9a-f]{32})/images/profile\b", re.I)
|
|
12
|
+
BEACON_USER_ID_RE = re.compile(
|
|
13
|
+
r"Beacon\(\s*['\"]session-data['\"]\s*,\s*\{[^}]*"
|
|
14
|
+
r"['\"]User ID['\"]\s*:\s*['\"]([0-9a-f]{32})['\"]",
|
|
15
|
+
re.I,
|
|
16
|
+
)
|
|
10
17
|
|
|
11
18
|
|
|
12
19
|
def extract_viewer_user_id(html: str) -> Optional[str]:
|
|
13
20
|
soup = BeautifulSoup(html or "", "html.parser")
|
|
14
21
|
|
|
22
|
+
# Some templates include the user id for support tooling (HelpScout Beacon).
|
|
23
|
+
beacon_match = BEACON_USER_ID_RE.search(html or "")
|
|
24
|
+
if beacon_match:
|
|
25
|
+
return beacon_match.group(1)
|
|
26
|
+
|
|
27
|
+
# Music League often embeds the viewer id in the header profile image URL, e.g.
|
|
28
|
+
# https://musicleague-user-assets.../users/<id>/images/profile?...
|
|
29
|
+
profile_img = soup.find(
|
|
30
|
+
"img",
|
|
31
|
+
src=lambda s: bool(s and USER_ASSET_PROFILE_RE.search(str(s))),
|
|
32
|
+
alt=lambda a: bool(a and "profile" in str(a).lower()),
|
|
33
|
+
)
|
|
34
|
+
if profile_img and profile_img.get("src"):
|
|
35
|
+
match = USER_ASSET_PROFILE_RE.search(str(profile_img.get("src") or ""))
|
|
36
|
+
if match:
|
|
37
|
+
return match.group(1)
|
|
38
|
+
|
|
15
39
|
a_tag = soup.find(
|
|
16
40
|
"a",
|
|
17
41
|
href=lambda h: bool(h and USER_HREF_RE.match(h)),
|
|
@@ -27,9 +51,31 @@ def extract_viewer_user_id(html: str) -> Optional[str]:
|
|
|
27
51
|
if match:
|
|
28
52
|
return match.group(1)
|
|
29
53
|
|
|
54
|
+
# Fallback: if we see exactly one profile-image-style asset URL, treat it as the viewer.
|
|
55
|
+
asset_ids: set[str] = set()
|
|
56
|
+
for img in soup.select("img[src]"):
|
|
57
|
+
src = str(img.get("src") or "")
|
|
58
|
+
match = USER_ASSET_PROFILE_RE.search(src)
|
|
59
|
+
if match:
|
|
60
|
+
asset_ids.add(match.group(1))
|
|
61
|
+
if len(asset_ids) == 1:
|
|
62
|
+
return next(iter(asset_ids))
|
|
63
|
+
|
|
30
64
|
return None
|
|
31
65
|
|
|
32
66
|
|
|
67
|
+
def resolve_viewer_user_id(
|
|
68
|
+
*,
|
|
69
|
+
html_candidates: list[str] | None,
|
|
70
|
+
storage_state: dict[str, Any] | None,
|
|
71
|
+
) -> Optional[str]:
|
|
72
|
+
for html in html_candidates or []:
|
|
73
|
+
viewer_user_id = extract_viewer_user_id(html)
|
|
74
|
+
if viewer_user_id:
|
|
75
|
+
return viewer_user_id
|
|
76
|
+
return extract_viewer_user_id_from_storage_state(storage_state)
|
|
77
|
+
|
|
78
|
+
|
|
33
79
|
def extract_viewer_avatar_url(html: str) -> Optional[str]:
|
|
34
80
|
soup = BeautifulSoup(html or "", "html.parser")
|
|
35
81
|
viewer_id = extract_viewer_user_id(html)
|
|
@@ -56,3 +102,94 @@ def extract_viewer_avatar_url(html: str) -> Optional[str]:
|
|
|
56
102
|
return src
|
|
57
103
|
|
|
58
104
|
return None
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def extract_viewer_user_id_from_storage_state(
|
|
108
|
+
storage_state: dict[str, Any] | None,
|
|
109
|
+
) -> Optional[str]:
|
|
110
|
+
if not storage_state:
|
|
111
|
+
return None
|
|
112
|
+
|
|
113
|
+
def first_hex(text: str | None) -> Optional[str]:
|
|
114
|
+
if not text:
|
|
115
|
+
return None
|
|
116
|
+
match = HEX_32_RE.search(text)
|
|
117
|
+
return match.group(1) if match else None
|
|
118
|
+
|
|
119
|
+
# Prefer localStorage keys that look explicitly user-related.
|
|
120
|
+
origins = storage_state.get("origins") or []
|
|
121
|
+
if isinstance(origins, list):
|
|
122
|
+
for origin in origins:
|
|
123
|
+
if not isinstance(origin, dict):
|
|
124
|
+
continue
|
|
125
|
+
local_storage = origin.get("localStorage") or []
|
|
126
|
+
if not isinstance(local_storage, list):
|
|
127
|
+
continue
|
|
128
|
+
for entry in local_storage:
|
|
129
|
+
if not isinstance(entry, dict):
|
|
130
|
+
continue
|
|
131
|
+
name = str(entry.get("name") or "")
|
|
132
|
+
value = entry.get("value")
|
|
133
|
+
if value is None:
|
|
134
|
+
continue
|
|
135
|
+
value_str = value if isinstance(value, str) else str(value)
|
|
136
|
+
if re.search(r"(user|viewer|account|profile)", name, re.I):
|
|
137
|
+
candidate = first_hex(value_str)
|
|
138
|
+
if candidate:
|
|
139
|
+
return candidate
|
|
140
|
+
|
|
141
|
+
# Try JSON blobs stored as strings.
|
|
142
|
+
if value_str and value_str.lstrip().startswith(("{", "[")):
|
|
143
|
+
candidate = _extract_hex_from_json_like(value_str)
|
|
144
|
+
if candidate:
|
|
145
|
+
return candidate
|
|
146
|
+
|
|
147
|
+
# Next: cookies with user-ish names that directly store a 32-hex id.
|
|
148
|
+
cookies = storage_state.get("cookies") or []
|
|
149
|
+
if isinstance(cookies, list):
|
|
150
|
+
for cookie in cookies:
|
|
151
|
+
if not isinstance(cookie, dict):
|
|
152
|
+
continue
|
|
153
|
+
name = str(cookie.get("name") or "")
|
|
154
|
+
value = str(cookie.get("value") or "")
|
|
155
|
+
if re.search(r"(user|uid)", name, re.I):
|
|
156
|
+
candidate = first_hex(value)
|
|
157
|
+
if candidate:
|
|
158
|
+
return candidate
|
|
159
|
+
|
|
160
|
+
# Last: if there's exactly one unique 32-hex anywhere in localStorage values, use it.
|
|
161
|
+
candidates: set[str] = set()
|
|
162
|
+
if isinstance(origins, list):
|
|
163
|
+
for origin in origins:
|
|
164
|
+
if not isinstance(origin, dict):
|
|
165
|
+
continue
|
|
166
|
+
local_storage = origin.get("localStorage") or []
|
|
167
|
+
if not isinstance(local_storage, list):
|
|
168
|
+
continue
|
|
169
|
+
for entry in local_storage:
|
|
170
|
+
if not isinstance(entry, dict):
|
|
171
|
+
continue
|
|
172
|
+
value = entry.get("value")
|
|
173
|
+
if value is None:
|
|
174
|
+
continue
|
|
175
|
+
value_str = value if isinstance(value, str) else str(value)
|
|
176
|
+
for match in HEX_32_RE.finditer(value_str):
|
|
177
|
+
candidates.add(match.group(1))
|
|
178
|
+
if len(candidates) == 1:
|
|
179
|
+
return next(iter(candidates))
|
|
180
|
+
|
|
181
|
+
return None
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
def _extract_hex_from_json_like(value: str) -> Optional[str]:
|
|
185
|
+
# Keep it simple: avoid importing json here; we only need a heuristic.
|
|
186
|
+
# Common patterns: {"user_id":"..."} or {"userId":"..."} nested inside a blob.
|
|
187
|
+
for pattern in (
|
|
188
|
+
r"['\"]user_id['\"]\s*:\s*['\"]([0-9a-f]{32})['\"]",
|
|
189
|
+
r"['\"]userId['\"]\s*:\s*['\"]([0-9a-f]{32})['\"]",
|
|
190
|
+
r"['\"]viewer_user_id['\"]\s*:\s*['\"]([0-9a-f]{32})['\"]",
|
|
191
|
+
):
|
|
192
|
+
match = re.search(pattern, value, re.I)
|
|
193
|
+
if match:
|
|
194
|
+
return match.group(1)
|
|
195
|
+
return None
|