@music-league-eras/local-runner 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@music-league-eras/local-runner",
3
- "version": "0.1.2",
3
+ "version": "0.1.3",
4
4
  "description": "Music League Eras local runner (npx wrapper around the Python scraper runner).",
5
5
  "type": "module",
6
6
  "license": "UNLICENSED",
@@ -9,7 +9,11 @@ from urllib.parse import urljoin, urlparse
9
9
 
10
10
  from playwright.async_api import async_playwright
11
11
 
12
- from ..services.viewer import extract_viewer_avatar_url, extract_viewer_user_id
12
+ from ..services.viewer import (
13
+ extract_viewer_avatar_url,
14
+ extract_viewer_user_id,
15
+ extract_viewer_user_id_from_storage_state,
16
+ )
13
17
 
14
18
 
15
19
  @dataclass(frozen=True)
@@ -235,6 +239,8 @@ async def capture_storage_state(
235
239
  try:
236
240
  html = await page.content()
237
241
  viewer_user_id = extract_viewer_user_id(html)
242
+ if not viewer_user_id:
243
+ viewer_user_id = extract_viewer_user_id_from_storage_state(storage_state)
238
244
  viewer_avatar_url = extract_viewer_avatar_url(html)
239
245
  except Exception:
240
246
  viewer_user_id = None
@@ -14,7 +14,7 @@ from urllib.parse import urljoin, urlparse
14
14
  from bs4 import BeautifulSoup
15
15
  from playwright.async_api import async_playwright
16
16
 
17
- from .viewer import extract_viewer_user_id
17
+ from .viewer import extract_viewer_user_id, extract_viewer_user_id_from_storage_state
18
18
 
19
19
  COMPLETED_LEAGUES_XHR = "/completed/-/completedLeagues"
20
20
  DEFAULT_USER_AGENT = (
@@ -586,6 +586,15 @@ async def build_manifest(
586
586
  completed_final_url = page.url
587
587
  completed_page_html = await page.content()
588
588
  _write_text(out_dir / "completed.page.html", completed_page_html)
589
+
590
+ # The completed leagues list is loaded via htmx; give it a moment to hydrate so our
591
+ # fallback parser can still find league links even if the XHR helpers fail.
592
+ try:
593
+ await page.wait_for_selector(".league-tile, a[href^=\"/l/\"]", timeout=6_000)
594
+ completed_page_html = await page.content()
595
+ _write_text(out_dir / "completed.page.after.html", completed_page_html)
596
+ except Exception:
597
+ pass
589
598
  manifest["debug"] = {
590
599
  "completed_url": completed_page_url,
591
600
  "completed_final_url": completed_final_url,
@@ -595,6 +604,8 @@ async def build_manifest(
595
604
  cookie_header = await _cookie_header_from_context(context, base_url) or initial_cookie_header
596
605
 
597
606
  viewer_id = extract_viewer_user_id(completed_page_html)
607
+ if not viewer_id:
608
+ viewer_id = extract_viewer_user_id_from_storage_state(storage_state_payload)
598
609
  manifest["viewer"] = {"user_id": viewer_id} if viewer_id else None
599
610
 
600
611
  completed_url = _abs_url(base_url, COMPLETED_LEAGUES_XHR)
@@ -1,17 +1,28 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import re
4
+ from typing import Any
4
5
  from typing import Optional
5
6
 
6
7
  from bs4 import BeautifulSoup
7
8
 
8
9
  USER_HREF_RE = re.compile(r"^/user/([0-9a-f]{32})/?$", re.I)
9
10
  VIEW_PROFILE_LABEL = "view profile"
11
+ HEX_32_RE = re.compile(r"\b([0-9a-f]{32})\b", re.I)
12
+ BEACON_USER_ID_RE = re.compile(
13
+ r"Beacon\(\s*['\"]session-data['\"]\s*,\s*\{[^}]*['\"]User ID['\"]\s*:\s*['\"]([0-9a-f]{32})['\"]",
14
+ re.I,
15
+ )
10
16
 
11
17
 
12
18
  def extract_viewer_user_id(html: str) -> Optional[str]:
13
19
  soup = BeautifulSoup(html or "", "html.parser")
14
20
 
21
+ # Some templates include the user id for support tooling (HelpScout Beacon).
22
+ beacon_match = BEACON_USER_ID_RE.search(html or "")
23
+ if beacon_match:
24
+ return beacon_match.group(1)
25
+
15
26
  a_tag = soup.find(
16
27
  "a",
17
28
  href=lambda h: bool(h and USER_HREF_RE.match(h)),
@@ -56,3 +67,92 @@ def extract_viewer_avatar_url(html: str) -> Optional[str]:
56
67
  return src
57
68
 
58
69
  return None
70
+
71
+
72
# Name heuristics, compiled once instead of on every loop iteration.
_USERISH_STORAGE_KEY_RE = re.compile(r"(user|viewer|account|profile)", re.I)
_USERISH_COOKIE_NAME_RE = re.compile(r"(user|uid)", re.I)


def _local_storage_items(storage_state: dict[str, Any]) -> list[tuple[str, str]]:
    """Flatten all localStorage entries of *storage_state* into ``(name, value)`` pairs.

    Origins and entries of an unexpected type, and entries whose value is ``None``,
    are skipped. Non-string values are stringified with ``str()``. The original
    ordering (origin by origin, entry by entry) is preserved.
    """
    origins = storage_state.get("origins") or []
    if not isinstance(origins, list):
        return []

    items: list[tuple[str, str]] = []
    for origin in origins:
        if not isinstance(origin, dict):
            continue
        local_storage = origin.get("localStorage") or []
        if not isinstance(local_storage, list):
            continue
        for entry in local_storage:
            if not isinstance(entry, dict):
                continue
            value = entry.get("value")
            if value is None:
                continue
            name = str(entry.get("name") or "")
            items.append((name, value if isinstance(value, str) else str(value)))
    return items


def extract_viewer_user_id_from_storage_state(storage_state: dict[str, Any] | None) -> Optional[str]:
    """Heuristically recover the viewer's 32-hex user id from a storage-state payload.

    The payload is read as a mapping with an ``"origins"`` list (each origin holding a
    ``"localStorage"`` list of ``{"name", "value"}`` entries) and a ``"cookies"`` list
    of ``{"name", "value"}`` entries -- presumably the structure produced by
    Playwright's ``storage_state()``; confirm against the callers.

    Lookup order:
      1. localStorage, entry by entry: a user-ish key name containing a 32-hex token,
         or a JSON-looking value containing an explicit user-id field.
      2. Cookies with a user-ish name whose value contains a 32-hex token.
      3. If localStorage values contain exactly one distinct 32-hex token, that token.

    Args:
        storage_state: The storage-state mapping, or ``None``.

    Returns:
        The id exactly as it appears in storage, or ``None`` when nothing qualifies
        (including when *storage_state* is empty or not a mapping).
    """
    # A truthy non-dict payload would crash on `.get`; treat it as "no data".
    if not storage_state or not isinstance(storage_state, dict):
        return None

    def first_hex(text: str) -> Optional[str]:
        match = HEX_32_RE.search(text)
        return match.group(1) if match else None

    items = _local_storage_items(storage_state)

    # Step 1: localStorage keys that look user-related, or JSON blobs stored as strings.
    for name, value in items:
        if _USERISH_STORAGE_KEY_RE.search(name):
            candidate = first_hex(value)
            if candidate:
                return candidate

        if value.lstrip().startswith(("{", "[")):
            candidate = _extract_hex_from_json_like(value)
            if candidate:
                return candidate

    # Step 2: cookies with user-ish names that directly store a 32-hex id.
    cookies = storage_state.get("cookies") or []
    if isinstance(cookies, list):
        for cookie in cookies:
            if not isinstance(cookie, dict):
                continue
            if not _USERISH_COOKIE_NAME_RE.search(str(cookie.get("name") or "")):
                continue
            candidate = first_hex(str(cookie.get("value") or ""))
            if candidate:
                return candidate

    # Step 3: a single unambiguous 32-hex token anywhere in localStorage values.
    # HEX_32_RE matches case-insensitively, so dedupe case-insensitively as well:
    # the same id stored in two casings must not count as two different candidates.
    # The first-seen spelling is what gets returned.
    first_seen: dict[str, str] = {}
    for _name, value in items:
        for match in HEX_32_RE.finditer(value):
            token = match.group(1)
            first_seen.setdefault(token.lower(), token)
    if len(first_seen) == 1:
        return next(iter(first_seen.values()))

    return None
145
+
146
+
147
+ def _extract_hex_from_json_like(value: str) -> Optional[str]:
148
+ # Keep it simple: avoid importing json here; we only need a heuristic.
149
+ # Common patterns: {"user_id":"..."} or {"userId":"..."} nested inside a blob.
150
+ for pattern in (
151
+ r"['\"]user_id['\"]\s*:\s*['\"]([0-9a-f]{32})['\"]",
152
+ r"['\"]userId['\"]\s*:\s*['\"]([0-9a-f]{32})['\"]",
153
+ r"['\"]viewer_user_id['\"]\s*:\s*['\"]([0-9a-f]{32})['\"]",
154
+ ):
155
+ match = re.search(pattern, value, re.I)
156
+ if match:
157
+ return match.group(1)
158
+ return None