npm - @music-league-eras/local-runner - Versions diffs - 0.1.2 → 0.1.4 - Mend

@music-league-eras/local-runner 0.1.2 → 0.1.4

This diff shows the differences between two publicly available versions of a package, as released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the changes between those package versions as they appear in that public registry.

Files changed (4) hide show

package/package.json +1 -1
package/vendor/python/app/browser/session.py +17 -4
package/vendor/python/app/services/scrape_manifest.py +51 -11
package/vendor/python/app/services/viewer.py +138 -1

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@music-league-eras/local-runner",
-  "version": "0.1.2",
+  "version": "0.1.4",
   "description": "Music League Eras local runner (npx wrapper around the Python scraper runner).",
   "type": "module",
   "license": "UNLICENSED",

package/vendor/python/app/browser/session.py CHANGED Viewed

@@ -9,7 +9,10 @@ from urllib.parse import urljoin, urlparse
 from playwright.async_api import async_playwright
-from ..services.viewer import extract_viewer_avatar_url, extract_viewer_user_id
+from ..services.viewer import (
+    extract_viewer_avatar_url,
+    resolve_viewer_user_id,
+)
 @dataclass(frozen=True)
@@ -233,9 +236,19 @@ async def capture_storage_state(
                         viewer_user_id = None
                         viewer_avatar_url = None
                         try:
-                            html = await page.content()
-                            viewer_user_id = extract_viewer_user_id(html)
-                            viewer_avatar_url = extract_viewer_avatar_url(html)
+                            html_candidates: list[str] = [await page.content()]
+                            try:
+                                home_url = urljoin(base_url.rstrip("/") + "/", "home/")
+                                resp = await context.request.get(home_url)
+                                if resp.ok:
+                                    html_candidates.append(await resp.text())
+                            except Exception:
+                                pass
+                            viewer_user_id = resolve_viewer_user_id(
+                                html_candidates=html_candidates,
+                                storage_state=storage_state,
+                            )
+                            viewer_avatar_url = extract_viewer_avatar_url(html_candidates[0])
                         except Exception:
                             viewer_user_id = None
                             viewer_avatar_url = None

package/vendor/python/app/services/scrape_manifest.py CHANGED Viewed

@@ -8,13 +8,13 @@ import time
 import urllib.request
 from datetime import datetime, timezone
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Optional, cast
 from urllib.parse import urljoin, urlparse
 from bs4 import BeautifulSoup
 from playwright.async_api import async_playwright
-from .viewer import extract_viewer_user_id
+from .viewer import resolve_viewer_user_id
 COMPLETED_LEAGUES_XHR = "/completed/-/completedLeagues"
 DEFAULT_USER_AGENT = (
@@ -559,6 +559,7 @@ async def build_manifest(
     if not storage_state_arg:
         raise RuntimeError("Missing storage state for session scrape.")
+    assert storage_state_payload is not None
     host = urlparse(base_url).hostname
     if not _has_host_cookie(storage_state_payload, host):
@@ -578,7 +579,7 @@ async def build_manifest(
     async with async_playwright() as playwright:
         browser = await playwright.chromium.launch(headless=headless)
-        context = await browser.new_context(storage_state=storage_state_arg)
+        context = await browser.new_context(storage_state=cast(Any, storage_state_arg))
         page = await context.new_page()
         try:
             completed_page_url = _abs_url(base_url, "/completed/")
@@ -586,15 +587,54 @@ async def build_manifest(
             completed_final_url = page.url
             completed_page_html = await page.content()
             _write_text(out_dir / "completed.page.html", completed_page_html)
+            # The completed leagues list is loaded via htmx; give it a moment to hydrate so our
+            # fallback parser can still find league links even if the XHR helpers fail.
+            try:
+                await page.wait_for_selector(".league-tile, a[href^=\"/l/\"]", timeout=6_000)
+                completed_page_html = await page.content()
+                _write_text(out_dir / "completed.page.after.html", completed_page_html)
+            except Exception:
+                pass
             manifest["debug"] = {
                 "completed_url": completed_page_url,
                 "completed_final_url": completed_final_url,
                 "completed_status": response.status if response else None,
             }
-            cookie_header = await _cookie_header_from_context(context, base_url) or initial_cookie_header
-            viewer_id = extract_viewer_user_id(completed_page_html)
+            cookie_header = (
+                await _cookie_header_from_context(context, base_url)
+            ) or initial_cookie_header
+            # Music League occasionally serves a shell DOM to Playwright's `page.content()` that
+            # does not include the `/user/<id>` link. Fall back to server-rendered HTML fetched
+            # via the authenticated browser context.
+            html_candidates: list[str] = [completed_page_html]
+            try:
+                resp_completed = await context.request.get(completed_page_url)
+                if resp_completed.ok:
+                    html_candidates.append(await resp_completed.text())
+            except Exception:
+                pass
+            try:
+                home_url = _abs_url(base_url, "/home/")
+                resp_home = await context.request.get(home_url)
+                if resp_home.ok:
+                    html_candidates.append(await resp_home.text())
+            except Exception:
+                pass
+            try:
+                settings_url = _abs_url(base_url, "/settings/")
+                resp_settings = await context.request.get(settings_url)
+                if resp_settings.ok:
+                    html_candidates.append(await resp_settings.text())
+            except Exception:
+                pass
+            viewer_id = resolve_viewer_user_id(
+                html_candidates=html_candidates,
+                storage_state=storage_state_payload,
+            )
             manifest["viewer"] = {"user_id": viewer_id} if viewer_id else None
             completed_url = _abs_url(base_url, COMPLETED_LEAGUES_XHR)
@@ -738,11 +778,11 @@ async def build_manifest(
                             json.dumps({"round_id": round_id, "submissions": submissions}, indent=2),
                         )
-                    dates = [
-                        rd.get("completed_date_utc")
-                        for rd in rounds
-                        if rd.get("completed_date_utc")
-                    ]
+                    dates: list[str] = []
+                    for rd in rounds:
+                        completed = rd.get("completed_date_utc")
+                        if isinstance(completed, str) and completed:
+                            dates.append(completed)
                     league_completed_date_utc = max(dates) if dates else None
                     league_obj = {

package/vendor/python/app/services/viewer.py CHANGED Viewed

@@ -1,17 +1,41 @@
 from __future__ import annotations
 import re
-from typing import Optional
+from typing import Any, Optional
 from bs4 import BeautifulSoup
 USER_HREF_RE = re.compile(r"^/user/([0-9a-f]{32})/?$", re.I)
 VIEW_PROFILE_LABEL = "view profile"
+HEX_32_RE = re.compile(r"\b([0-9a-f]{32})\b", re.I)
+USER_ASSET_PROFILE_RE = re.compile(r"/users/([0-9a-f]{32})/images/profile\b", re.I)
+BEACON_USER_ID_RE = re.compile(
+    r"Beacon\(\s*['\"]session-data['\"]\s*,\s*\{[^}]*"
+    r"['\"]User ID['\"]\s*:\s*['\"]([0-9a-f]{32})['\"]",
+    re.I,
+)
 def extract_viewer_user_id(html: str) -> Optional[str]:
     soup = BeautifulSoup(html or "", "html.parser")
+    # Some templates include the user id for support tooling (HelpScout Beacon).
+    beacon_match = BEACON_USER_ID_RE.search(html or "")
+    if beacon_match:
+        return beacon_match.group(1)
+    # Music League often embeds the viewer id in the header profile image URL, e.g.
+    # https://musicleague-user-assets.../users/<id>/images/profile?...
+    profile_img = soup.find(
+        "img",
+        src=lambda s: bool(s and USER_ASSET_PROFILE_RE.search(str(s))),
+        alt=lambda a: bool(a and "profile" in str(a).lower()),
+    )
+    if profile_img and profile_img.get("src"):
+        match = USER_ASSET_PROFILE_RE.search(str(profile_img.get("src") or ""))
+        if match:
+            return match.group(1)
     a_tag = soup.find(
         "a",
         href=lambda h: bool(h and USER_HREF_RE.match(h)),
@@ -27,9 +51,31 @@ def extract_viewer_user_id(html: str) -> Optional[str]:
         if match:
             return match.group(1)
+    # Fallback: if we see exactly one profile-image-style asset URL, treat it as the viewer.
+    asset_ids: set[str] = set()
+    for img in soup.select("img[src]"):
+        src = str(img.get("src") or "")
+        match = USER_ASSET_PROFILE_RE.search(src)
+        if match:
+            asset_ids.add(match.group(1))
+    if len(asset_ids) == 1:
+        return next(iter(asset_ids))
     return None
+def resolve_viewer_user_id(
+    *,
+    html_candidates: list[str] | None,
+    storage_state: dict[str, Any] | None,
+) -> Optional[str]:
+    for html in html_candidates or []:
+        viewer_user_id = extract_viewer_user_id(html)
+        if viewer_user_id:
+            return viewer_user_id
+    return extract_viewer_user_id_from_storage_state(storage_state)
 def extract_viewer_avatar_url(html: str) -> Optional[str]:
     soup = BeautifulSoup(html or "", "html.parser")
     viewer_id = extract_viewer_user_id(html)
@@ -56,3 +102,94 @@ def extract_viewer_avatar_url(html: str) -> Optional[str]:
             return src
     return None
+def extract_viewer_user_id_from_storage_state(
+    storage_state: dict[str, Any] | None,
+) -> Optional[str]:
+    if not storage_state:
+        return None
+    def first_hex(text: str | None) -> Optional[str]:
+        if not text:
+            return None
+        match = HEX_32_RE.search(text)
+        return match.group(1) if match else None
+    # Prefer localStorage keys that look explicitly user-related.
+    origins = storage_state.get("origins") or []
+    if isinstance(origins, list):
+        for origin in origins:
+            if not isinstance(origin, dict):
+                continue
+            local_storage = origin.get("localStorage") or []
+            if not isinstance(local_storage, list):
+                continue
+            for entry in local_storage:
+                if not isinstance(entry, dict):
+                    continue
+                name = str(entry.get("name") or "")
+                value = entry.get("value")
+                if value is None:
+                    continue
+                value_str = value if isinstance(value, str) else str(value)
+                if re.search(r"(user|viewer|account|profile)", name, re.I):
+                    candidate = first_hex(value_str)
+                    if candidate:
+                        return candidate
+                # Try JSON blobs stored as strings.
+                if value_str and value_str.lstrip().startswith(("{", "[")):
+                    candidate = _extract_hex_from_json_like(value_str)
+                    if candidate:
+                        return candidate
+    # Next: cookies with user-ish names that directly store a 32-hex id.
+    cookies = storage_state.get("cookies") or []
+    if isinstance(cookies, list):
+        for cookie in cookies:
+            if not isinstance(cookie, dict):
+                continue
+            name = str(cookie.get("name") or "")
+            value = str(cookie.get("value") or "")
+            if re.search(r"(user|uid)", name, re.I):
+                candidate = first_hex(value)
+                if candidate:
+                    return candidate
+    # Last: if there's exactly one unique 32-hex anywhere in localStorage values, use it.
+    candidates: set[str] = set()
+    if isinstance(origins, list):
+        for origin in origins:
+            if not isinstance(origin, dict):
+                continue
+            local_storage = origin.get("localStorage") or []
+            if not isinstance(local_storage, list):
+                continue
+            for entry in local_storage:
+                if not isinstance(entry, dict):
+                    continue
+                value = entry.get("value")
+                if value is None:
+                    continue
+                value_str = value if isinstance(value, str) else str(value)
+                for match in HEX_32_RE.finditer(value_str):
+                    candidates.add(match.group(1))
+    if len(candidates) == 1:
+        return next(iter(candidates))
+    return None
+def _extract_hex_from_json_like(value: str) -> Optional[str]:
+    # Keep it simple: avoid importing json here; we only need a heuristic.
+    # Common patterns: {"user_id":"..."} or {"userId":"..."} nested inside a blob.
+    for pattern in (
+        r"['\"]user_id['\"]\s*:\s*['\"]([0-9a-f]{32})['\"]",
+        r"['\"]userId['\"]\s*:\s*['\"]([0-9a-f]{32})['\"]",
+        r"['\"]viewer_user_id['\"]\s*:\s*['\"]([0-9a-f]{32})['\"]",
+    ):
+        match = re.search(pattern, value, re.I)
+        if match:
+            return match.group(1)
+    return None