@music-league-eras/local-runner 0.1.2 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@music-league-eras/local-runner",
3
- "version": "0.1.2",
3
+ "version": "0.1.4",
4
4
  "description": "Music League Eras local runner (npx wrapper around the Python scraper runner).",
5
5
  "type": "module",
6
6
  "license": "UNLICENSED",
@@ -9,7 +9,10 @@ from urllib.parse import urljoin, urlparse
9
9
 
10
10
  from playwright.async_api import async_playwright
11
11
 
12
- from ..services.viewer import extract_viewer_avatar_url, extract_viewer_user_id
12
+ from ..services.viewer import (
13
+ extract_viewer_avatar_url,
14
+ resolve_viewer_user_id,
15
+ )
13
16
 
14
17
 
15
18
  @dataclass(frozen=True)
@@ -233,9 +236,19 @@ async def capture_storage_state(
233
236
  viewer_user_id = None
234
237
  viewer_avatar_url = None
235
238
  try:
236
- html = await page.content()
237
- viewer_user_id = extract_viewer_user_id(html)
238
- viewer_avatar_url = extract_viewer_avatar_url(html)
239
+ html_candidates: list[str] = [await page.content()]
240
+ try:
241
+ home_url = urljoin(base_url.rstrip("/") + "/", "home/")
242
+ resp = await context.request.get(home_url)
243
+ if resp.ok:
244
+ html_candidates.append(await resp.text())
245
+ except Exception:
246
+ pass
247
+ viewer_user_id = resolve_viewer_user_id(
248
+ html_candidates=html_candidates,
249
+ storage_state=storage_state,
250
+ )
251
+ viewer_avatar_url = extract_viewer_avatar_url(html_candidates[0])
239
252
  except Exception:
240
253
  viewer_user_id = None
241
254
  viewer_avatar_url = None
@@ -8,13 +8,13 @@ import time
8
8
  import urllib.request
9
9
  from datetime import datetime, timezone
10
10
  from pathlib import Path
11
- from typing import Any, Optional
11
+ from typing import Any, Optional, cast
12
12
  from urllib.parse import urljoin, urlparse
13
13
 
14
14
  from bs4 import BeautifulSoup
15
15
  from playwright.async_api import async_playwright
16
16
 
17
- from .viewer import extract_viewer_user_id
17
+ from .viewer import resolve_viewer_user_id
18
18
 
19
19
  COMPLETED_LEAGUES_XHR = "/completed/-/completedLeagues"
20
20
  DEFAULT_USER_AGENT = (
@@ -559,6 +559,7 @@ async def build_manifest(
559
559
 
560
560
  if not storage_state_arg:
561
561
  raise RuntimeError("Missing storage state for session scrape.")
562
+ assert storage_state_payload is not None
562
563
 
563
564
  host = urlparse(base_url).hostname
564
565
  if not _has_host_cookie(storage_state_payload, host):
@@ -578,7 +579,7 @@ async def build_manifest(
578
579
 
579
580
  async with async_playwright() as playwright:
580
581
  browser = await playwright.chromium.launch(headless=headless)
581
- context = await browser.new_context(storage_state=storage_state_arg)
582
+ context = await browser.new_context(storage_state=cast(Any, storage_state_arg))
582
583
  page = await context.new_page()
583
584
  try:
584
585
  completed_page_url = _abs_url(base_url, "/completed/")
@@ -586,15 +587,54 @@ async def build_manifest(
586
587
  completed_final_url = page.url
587
588
  completed_page_html = await page.content()
588
589
  _write_text(out_dir / "completed.page.html", completed_page_html)
590
+
591
+ # The completed leagues list is loaded via htmx; give it a moment to hydrate so our
592
+ # fallback parser can still find league links even if the XHR helpers fail.
593
+ try:
594
+ await page.wait_for_selector(".league-tile, a[href^=\"/l/\"]", timeout=6_000)
595
+ completed_page_html = await page.content()
596
+ _write_text(out_dir / "completed.page.after.html", completed_page_html)
597
+ except Exception:
598
+ pass
589
599
  manifest["debug"] = {
590
600
  "completed_url": completed_page_url,
591
601
  "completed_final_url": completed_final_url,
592
602
  "completed_status": response.status if response else None,
593
603
  }
594
604
 
595
- cookie_header = await _cookie_header_from_context(context, base_url) or initial_cookie_header
596
-
597
- viewer_id = extract_viewer_user_id(completed_page_html)
605
+ cookie_header = (
606
+ await _cookie_header_from_context(context, base_url)
607
+ ) or initial_cookie_header
608
+
609
+ # Music League occasionally serves a shell DOM to Playwright's `page.content()` that
610
+ # does not include the `/user/<id>` link. Fall back to server-rendered HTML fetched
611
+ # via the authenticated browser context.
612
+ html_candidates: list[str] = [completed_page_html]
613
+ try:
614
+ resp_completed = await context.request.get(completed_page_url)
615
+ if resp_completed.ok:
616
+ html_candidates.append(await resp_completed.text())
617
+ except Exception:
618
+ pass
619
+ try:
620
+ home_url = _abs_url(base_url, "/home/")
621
+ resp_home = await context.request.get(home_url)
622
+ if resp_home.ok:
623
+ html_candidates.append(await resp_home.text())
624
+ except Exception:
625
+ pass
626
+ try:
627
+ settings_url = _abs_url(base_url, "/settings/")
628
+ resp_settings = await context.request.get(settings_url)
629
+ if resp_settings.ok:
630
+ html_candidates.append(await resp_settings.text())
631
+ except Exception:
632
+ pass
633
+
634
+ viewer_id = resolve_viewer_user_id(
635
+ html_candidates=html_candidates,
636
+ storage_state=storage_state_payload,
637
+ )
598
638
  manifest["viewer"] = {"user_id": viewer_id} if viewer_id else None
599
639
 
600
640
  completed_url = _abs_url(base_url, COMPLETED_LEAGUES_XHR)
@@ -738,11 +778,11 @@ async def build_manifest(
738
778
  json.dumps({"round_id": round_id, "submissions": submissions}, indent=2),
739
779
  )
740
780
 
741
- dates = [
742
- rd.get("completed_date_utc")
743
- for rd in rounds
744
- if rd.get("completed_date_utc")
745
- ]
781
+ dates: list[str] = []
782
+ for rd in rounds:
783
+ completed = rd.get("completed_date_utc")
784
+ if isinstance(completed, str) and completed:
785
+ dates.append(completed)
746
786
  league_completed_date_utc = max(dates) if dates else None
747
787
 
748
788
  league_obj = {
@@ -1,17 +1,41 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import re
4
- from typing import Optional
4
+ from typing import Any, Optional
5
5
 
6
6
  from bs4 import BeautifulSoup
7
7
 
8
8
  USER_HREF_RE = re.compile(r"^/user/([0-9a-f]{32})/?$", re.I)
9
9
  VIEW_PROFILE_LABEL = "view profile"
10
+ HEX_32_RE = re.compile(r"\b([0-9a-f]{32})\b", re.I)
11
+ USER_ASSET_PROFILE_RE = re.compile(r"/users/([0-9a-f]{32})/images/profile\b", re.I)
12
+ BEACON_USER_ID_RE = re.compile(
13
+ r"Beacon\(\s*['\"]session-data['\"]\s*,\s*\{[^}]*"
14
+ r"['\"]User ID['\"]\s*:\s*['\"]([0-9a-f]{32})['\"]",
15
+ re.I,
16
+ )
10
17
 
11
18
 
12
19
  def extract_viewer_user_id(html: str) -> Optional[str]:
13
20
  soup = BeautifulSoup(html or "", "html.parser")
14
21
 
22
+ # Some templates include the user id for support tooling (HelpScout Beacon).
23
+ beacon_match = BEACON_USER_ID_RE.search(html or "")
24
+ if beacon_match:
25
+ return beacon_match.group(1)
26
+
27
+ # Music League often embeds the viewer id in the header profile image URL, e.g.
28
+ # https://musicleague-user-assets.../users/<id>/images/profile?...
29
+ profile_img = soup.find(
30
+ "img",
31
+ src=lambda s: bool(s and USER_ASSET_PROFILE_RE.search(str(s))),
32
+ alt=lambda a: bool(a and "profile" in str(a).lower()),
33
+ )
34
+ if profile_img and profile_img.get("src"):
35
+ match = USER_ASSET_PROFILE_RE.search(str(profile_img.get("src") or ""))
36
+ if match:
37
+ return match.group(1)
38
+
15
39
  a_tag = soup.find(
16
40
  "a",
17
41
  href=lambda h: bool(h and USER_HREF_RE.match(h)),
@@ -27,9 +51,31 @@ def extract_viewer_user_id(html: str) -> Optional[str]:
27
51
  if match:
28
52
  return match.group(1)
29
53
 
54
+ # Fallback: if we see exactly one profile-image-style asset URL, treat it as the viewer.
55
+ asset_ids: set[str] = set()
56
+ for img in soup.select("img[src]"):
57
+ src = str(img.get("src") or "")
58
+ match = USER_ASSET_PROFILE_RE.search(src)
59
+ if match:
60
+ asset_ids.add(match.group(1))
61
+ if len(asset_ids) == 1:
62
+ return next(iter(asset_ids))
63
+
30
64
  return None
31
65
 
32
66
 
67
def resolve_viewer_user_id(
    *,
    html_candidates: list[str] | None,
    storage_state: dict[str, Any] | None,
) -> Optional[str]:
    """Return the viewer's user id from the first source that yields one.

    Each HTML document in ``html_candidates`` is passed, in order, to
    ``extract_viewer_user_id``; the first truthy result is returned. When no
    document produces an id (or ``html_candidates`` is ``None``/empty), the
    result of ``extract_viewer_user_id_from_storage_state(storage_state)`` is
    returned instead, which may itself be ``None``.
    """
    # map/filter are lazy, so parsing stops at the first document with an id.
    parsed_ids = map(extract_viewer_user_id, html_candidates or [])
    from_html = next(filter(None, parsed_ids), None)
    if from_html:
        return from_html
    return extract_viewer_user_id_from_storage_state(storage_state)
77
+
78
+
33
79
  def extract_viewer_avatar_url(html: str) -> Optional[str]:
34
80
  soup = BeautifulSoup(html or "", "html.parser")
35
81
  viewer_id = extract_viewer_user_id(html)
@@ -56,3 +102,94 @@ def extract_viewer_avatar_url(html: str) -> Optional[str]:
56
102
  return src
57
103
 
58
104
  return None
105
+
106
+
107
def extract_viewer_user_id_from_storage_state(
    storage_state: dict[str, Any] | None,
) -> Optional[str]:
    """Heuristically pull a 32-hex viewer id out of a storage-state mapping.

    The mapping is read through its ``"origins"`` (each with a
    ``"localStorage"`` list of ``{"name", "value"}`` entries) and ``"cookies"``
    keys. Malformed containers/entries (wrong type, ``None`` value) are
    skipped. Sources are tried in this order, returning the first hit:

    1. localStorage entries, in order: an entry whose name contains
       ``user``/``viewer``/``account``/``profile`` and whose value contains a
       32-hex token; otherwise, if the value looks like a JSON blob, whatever
       ``_extract_hex_from_json_like`` finds in it.
    2. Cookies whose name contains ``user``/``uid`` and whose value contains a
       32-hex token.
    3. The single 32-hex token found across all localStorage values, but only
       when exactly one distinct token exists.

    Returns ``None`` when ``storage_state`` is falsy or nothing matches.
    """
    if not storage_state:
        return None

    def find_hex(text: str | None) -> Optional[str]:
        # First 32-hex token in ``text``, or None for empty/no-match input.
        found = HEX_32_RE.search(text) if text else None
        return found.group(1) if found else None

    raw_origins = storage_state.get("origins") or []

    def stored_values():
        # Yield (entry, value-as-str) for every well-formed localStorage entry
        # that carries a non-None value, walking origins in order.
        if not isinstance(raw_origins, list):
            return
        for origin in raw_origins:
            if not isinstance(origin, dict):
                continue
            items = origin.get("localStorage") or []
            if not isinstance(items, list):
                continue
            for item in items:
                if not isinstance(item, dict):
                    continue
                raw = item.get("value")
                if raw is None:
                    continue
                yield item, (raw if isinstance(raw, str) else str(raw))

    # Pass 1: user-ish localStorage keys, then JSON-looking blobs, per entry.
    for item, text in stored_values():
        key = str(item.get("name") or "")
        if re.search(r"(user|viewer|account|profile)", key, re.I):
            hit = find_hex(text)
            if hit:
                return hit

        if text and text.lstrip().startswith(("{", "[")):
            hit = _extract_hex_from_json_like(text)
            if hit:
                return hit

    # Pass 2: cookies with user-ish names that directly store a 32-hex id.
    raw_cookies = storage_state.get("cookies") or []
    if isinstance(raw_cookies, list):
        for cookie in raw_cookies:
            if not isinstance(cookie, dict):
                continue
            cookie_name = str(cookie.get("name") or "")
            cookie_value = str(cookie.get("value") or "")
            if re.search(r"(user|uid)", cookie_name, re.I):
                hit = find_hex(cookie_value)
                if hit:
                    return hit

    # Pass 3: accept a lone 32-hex token only if it is the sole distinct one
    # anywhere in localStorage values; more than one is ambiguous.
    unique_ids = {
        found.group(1)
        for _, text in stored_values()
        for found in HEX_32_RE.finditer(text)
    }
    if len(unique_ids) == 1:
        return unique_ids.pop()

    return None
182
+
183
+
184
+ def _extract_hex_from_json_like(value: str) -> Optional[str]:
185
+ # Keep it simple: avoid importing json here; we only need a heuristic.
186
+ # Common patterns: {"user_id":"..."} or {"userId":"..."} nested inside a blob.
187
+ for pattern in (
188
+ r"['\"]user_id['\"]\s*:\s*['\"]([0-9a-f]{32})['\"]",
189
+ r"['\"]userId['\"]\s*:\s*['\"]([0-9a-f]{32})['\"]",
190
+ r"['\"]viewer_user_id['\"]\s*:\s*['\"]([0-9a-f]{32})['\"]",
191
+ ):
192
+ match = re.search(pattern, value, re.I)
193
+ if match:
194
+ return match.group(1)
195
+ return None