cli-web-hackernews 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli_web/hackernews/README.md +91 -0
- cli_web/hackernews/__init__.py +0 -0
- cli_web/hackernews/__main__.py +6 -0
- cli_web/hackernews/commands/__init__.py +0 -0
- cli_web/hackernews/commands/actions.py +105 -0
- cli_web/hackernews/commands/auth.py +80 -0
- cli_web/hackernews/commands/search.py +69 -0
- cli_web/hackernews/commands/stories.py +160 -0
- cli_web/hackernews/commands/user.py +112 -0
- cli_web/hackernews/core/__init__.py +0 -0
- cli_web/hackernews/core/auth.py +290 -0
- cli_web/hackernews/core/client.py +517 -0
- cli_web/hackernews/core/exceptions.py +63 -0
- cli_web/hackernews/core/models.py +144 -0
- cli_web/hackernews/hackernews_cli.py +171 -0
- cli_web/hackernews/tests/TEST.md +143 -0
- cli_web/hackernews/tests/__init__.py +0 -0
- cli_web/hackernews/tests/test_core.py +365 -0
- cli_web/hackernews/tests/test_e2e.py +267 -0
- cli_web/hackernews/utils/__init__.py +0 -0
- cli_web/hackernews/utils/doctor.py +188 -0
- cli_web/hackernews/utils/helpers.py +73 -0
- cli_web/hackernews/utils/mcp_server.py +290 -0
- cli_web/hackernews/utils/output.py +136 -0
- cli_web/hackernews/utils/repl_skin.py +486 -0
- cli_web_hackernews-0.1.0.dist-info/METADATA +12 -0
- cli_web_hackernews-0.1.0.dist-info/RECORD +30 -0
- cli_web_hackernews-0.1.0.dist-info/WHEEL +5 -0
- cli_web_hackernews-0.1.0.dist-info/entry_points.txt +2 -0
- cli_web_hackernews-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,517 @@
|
|
|
1
|
+
"""HTTP client for Hacker News — Firebase API + Algolia search + authenticated actions."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import re
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
import httpx
|
|
10
|
+
|
|
11
|
+
from .auth import load_auth, refresh_auth
|
|
12
|
+
from .exceptions import AuthError, NetworkError, NotFoundError, RateLimitError, ServerError
|
|
13
|
+
from .models import Comment, SearchResult, Story, User
|
|
14
|
+
|
|
15
|
+
# Base URLs of the three services this client talks to.
FIREBASE_BASE = "https://hacker-news.firebaseio.com/v0"
ALGOLIA_BASE = "https://hn.algolia.com/api/v1"
HN_BASE = "https://news.ycombinator.com"

# Desktop-Chrome User-Agent string shared by both header sets below.
_BROWSER_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/122.0.0.0 Safari/537.36"
)

# Headers for the JSON endpoints (Firebase and Algolia).
DEFAULT_HEADERS = {
    "User-Agent": _BROWSER_USER_AGENT,
    "Accept": "application/json",
}

# Headers for the HTML pages served from HN_BASE.
WEB_HEADERS = {
    "User-Agent": _BROWSER_USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

# Feed name → Firebase endpoint
FEED_ENDPOINTS = dict(
    top="topstories",
    new="newstories",
    best="beststories",
    ask="askstories",
    show="showstories",
    job="jobstories",
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class HackerNewsClient:
|
|
50
|
+
"""HTTP client wrapping HN Firebase API and Algolia search."""
|
|
51
|
+
|
|
52
|
+
def __init__(self, timeout: float = 30.0, user_cookie: str | None = None):
    """Create the two underlying httpx clients.

    Args:
        timeout: Timeout handed to both httpx clients; it is also stored
            for the temporary async client used by the parallel fetch.
        user_cookie: Value sent as the ``user`` cookie on authenticated
            web requests, or None when no login is available.
    """
    shared_options = {"follow_redirects": True, "timeout": timeout}
    self._timeout = timeout
    self._user_cookie = user_cookie
    # JSON endpoints and HTML pages use separate clients because they send
    # different Accept headers.
    self._client = httpx.Client(headers=DEFAULT_HEADERS, **shared_options)
    self._web_client = httpx.Client(headers=WEB_HEADERS, **shared_options)
|
|
65
|
+
|
|
66
|
+
def close(self):
    """Close both underlying HTTP clients (JSON client first, then web client)."""
    for http_client in (self._client, self._web_client):
        http_client.close()
|
|
70
|
+
|
|
71
|
+
def __enter__(self):
    """Enter a ``with`` block; the client itself is the context value."""
    return self
|
|
73
|
+
|
|
74
|
+
def __exit__(self, *args):
    """Leave a ``with`` block by closing the client.

    The exception details passed in ``args`` are ignored and nothing is
    returned, so an exception raised inside the block keeps propagating.
    """
    self.close()
|
|
76
|
+
|
|
77
|
+
def _get_json(self, url: str, params: dict[str, Any] | None = None) -> Any:
|
|
78
|
+
"""Fetch a URL and return parsed JSON."""
|
|
79
|
+
try:
|
|
80
|
+
response = self._client.get(url, params=params)
|
|
81
|
+
except httpx.TimeoutException as exc:
|
|
82
|
+
raise NetworkError(f"Request timed out: {url}") from exc
|
|
83
|
+
except httpx.RequestError as exc:
|
|
84
|
+
raise NetworkError(f"Network error: {exc}") from exc
|
|
85
|
+
|
|
86
|
+
if response.status_code == 404:
|
|
87
|
+
raise NotFoundError(url)
|
|
88
|
+
if response.status_code == 429:
|
|
89
|
+
retry_after = int(response.headers.get("retry-after", "60"))
|
|
90
|
+
raise RateLimitError(retry_after)
|
|
91
|
+
if response.status_code >= 500:
|
|
92
|
+
raise ServerError(response.status_code)
|
|
93
|
+
if response.status_code != 200:
|
|
94
|
+
raise NetworkError(f"Unexpected status {response.status_code}: {url}")
|
|
95
|
+
|
|
96
|
+
return response.json()
|
|
97
|
+
|
|
98
|
+
# ------------------------------------------------------------------ feeds
|
|
99
|
+
|
|
100
|
+
def get_story_ids(self, feed: str = "top") -> list[int]:
    """Return the item IDs listed by a feed (top, new, best, ask, show, job).

    A feed name that is not a key of FEED_ENDPOINTS falls back to the
    "topstories" endpoint.
    """
    feed_path = FEED_ENDPOINTS[feed] if feed in FEED_ENDPOINTS else "topstories"
    listing_url = f"{FIREBASE_BASE}/{feed_path}.json"
    return self._get_json(listing_url)
|
|
104
|
+
|
|
105
|
+
def get_stories(self, feed: str = "top", limit: int = 30) -> list[Story]:
    """Return Story objects for the first ``limit`` IDs of a feed."""
    feed_ids = self.get_story_ids(feed)
    wanted_ids = feed_ids[:limit]
    return self._fetch_items_parallel(wanted_ids, Story)
|
|
109
|
+
|
|
110
|
+
# ------------------------------------------------------------------ items
|
|
111
|
+
|
|
112
|
+
def get_item(self, item_id: int) -> dict:
    """Return the raw payload of one item (story, comment, job, poll).

    Raises:
        NotFoundError: When the response body decodes to None.
    """
    payload = self._get_json(f"{FIREBASE_BASE}/item/{item_id}.json")
    if payload is not None:
        return payload
    raise NotFoundError(f"Item {item_id}")
|
|
118
|
+
|
|
119
|
+
def get_story(self, story_id: int) -> Story:
    """Fetch one item by ID and wrap it in a Story.

    Fields missing from the payload take the fallback values listed below.
    """
    raw = self.get_item(story_id)
    # Field name -> value used when the payload lacks that field.
    fallbacks = {
        "id": story_id,
        "title": "",
        "url": None,
        "score": 0,
        "by": "",
        "time": 0,
        "descendants": 0,
        "type": "story",
    }
    return Story(**{name: raw.get(name, default) for name, default in fallbacks.items()})
|
|
132
|
+
|
|
133
|
+
def get_comments(self, story_id: int, limit: int = 30) -> list[Comment]:
    """Return up to ``limit`` direct children ("kids") of an item as Comments."""
    parent = self.get_item(story_id)
    child_ids = parent.get("kids", [])[:limit]
    # No children (or limit of 0): answer without issuing any item requests.
    return self._fetch_items_parallel(child_ids, Comment) if child_ids else []
|
|
140
|
+
|
|
141
|
+
# ------------------------------------------------------------------ users
|
|
142
|
+
|
|
143
|
+
def get_user(self, username: str) -> User:
    """Fetch a user profile from the Firebase API.

    Raises:
        NotFoundError: When the response body decodes to None.
    """
    profile = self._get_json(f"{FIREBASE_BASE}/user/{username}.json")
    if profile is None:
        raise NotFoundError(f"User '{username}'")
    # Field name -> value used when the profile lacks that field.
    fallbacks = {
        "id": username,
        "karma": 0,
        "created": 0,
        "about": "",
        "submitted": [],
    }
    return User(**{name: profile.get(name, default) for name, default in fallbacks.items()})
|
|
155
|
+
|
|
156
|
+
# ------------------------------------------------------------------ search
|
|
157
|
+
|
|
158
|
+
def search(
    self,
    query: str,
    tags: str = "story",
    sort_by_date: bool = False,
    hits_per_page: int = 20,
    page: int = 0,
) -> list[SearchResult]:
    """Search HN via the Algolia API.

    Args:
        query: Search text.
        tags: Value of the ``tags`` filter; omitted from the request when
            empty.
        sort_by_date: Use the ``search_by_date`` endpoint instead of
            ``search``.
        hits_per_page: Sent as ``hitsPerPage``.
        page: Sent as ``page``.

    Returns:
        One SearchResult per entry of the response's ``hits`` list.
        ``story_id`` is the integer form of ``objectID`` when that value
        consists only of ASCII digits, otherwise None.
    """
    endpoint = "search_by_date" if sort_by_date else "search"
    params: dict[str, Any] = {
        "query": query,
        "hitsPerPage": hits_per_page,
        "page": page,
    }
    if tags:
        params["tags"] = tags

    data = self._get_json(f"{ALGOLIA_BASE}/{endpoint}", params=params)
    results = []
    for hit in data.get("hits", []):
        object_id = hit.get("objectID", "")
        # str.isdigit() alone accepts characters such as "²" that int()
        # rejects, and a non-string objectID has no isdigit() at all, so
        # normalise to text and convert only plain ASCII digits.
        id_text = "" if object_id is None else str(object_id)
        story_id = int(id_text) if id_text.isascii() and id_text.isdigit() else None
        results.append(
            SearchResult(
                objectID=object_id,
                title=hit.get("title", ""),
                url=hit.get("url"),
                author=hit.get("author", ""),
                points=hit.get("points"),
                num_comments=hit.get("num_comments"),
                created_at=hit.get("created_at", ""),
                story_id=story_id,
            )
        )
    return results
|
|
194
|
+
|
|
195
|
+
# -------------------------------------------------------- authenticated web requests
|
|
196
|
+
|
|
197
|
+
def _require_auth(self) -> str:
|
|
198
|
+
"""Return user cookie or raise AuthError."""
|
|
199
|
+
if not self._user_cookie:
|
|
200
|
+
raise AuthError()
|
|
201
|
+
return self._user_cookie
|
|
202
|
+
|
|
203
|
+
def _web_request(self, method: str, url: str, *, _attempt: int = 0, **kwargs) -> httpx.Response:
|
|
204
|
+
"""Execute an authenticated web request with standard error handling.
|
|
205
|
+
|
|
206
|
+
Auth retry flow (CONVENTIONS.md §Auth Rules — never more than 3 attempts):
|
|
207
|
+
attempt 0 → 401/403 → reload auth.json from disk
|
|
208
|
+
attempt 1 → 401/403 → headless browser refresh via refresh_auth()
|
|
209
|
+
attempt 2 → 401/403 → raise AuthError
|
|
210
|
+
"""
|
|
211
|
+
cookie = self._require_auth()
|
|
212
|
+
kwargs.setdefault("cookies", {"user": cookie})
|
|
213
|
+
try:
|
|
214
|
+
response = self._web_client.request(method, url, **kwargs)
|
|
215
|
+
except httpx.TimeoutException as exc:
|
|
216
|
+
raise NetworkError(f"Request timed out: {url}") from exc
|
|
217
|
+
except httpx.RequestError as exc:
|
|
218
|
+
raise NetworkError(f"Network error: {exc}") from exc
|
|
219
|
+
|
|
220
|
+
if response.status_code in (401, 403):
|
|
221
|
+
if _attempt >= 2:
|
|
222
|
+
raise AuthError(
|
|
223
|
+
"Auth cookie expired. Run: cli-web-hackernews auth login",
|
|
224
|
+
recoverable=False,
|
|
225
|
+
)
|
|
226
|
+
if _attempt == 0:
|
|
227
|
+
self._reload_cookie_from_disk()
|
|
228
|
+
else:
|
|
229
|
+
self._user_cookie = refresh_auth()["user_cookie"]
|
|
230
|
+
kwargs.pop("cookies", None)
|
|
231
|
+
return self._web_request(method, url, _attempt=_attempt + 1, **kwargs)
|
|
232
|
+
if response.status_code >= 500:
|
|
233
|
+
raise ServerError(response.status_code)
|
|
234
|
+
return response
|
|
235
|
+
|
|
236
|
+
def _reload_cookie_from_disk(self) -> None:
    """Re-read the user cookie from auth.json (another process may have refreshed it).

    An AuthError while loading is ignored on purpose: the cookie stays as
    it is and the next retry attempt falls through to the headless refresh.
    """
    try:
        stored_auth = load_auth()
    except AuthError:
        return
    self._user_cookie = stored_auth["user_cookie"]
|
|
242
|
+
|
|
243
|
+
def _get_html(
|
|
244
|
+
self,
|
|
245
|
+
url: str,
|
|
246
|
+
params: dict[str, str] | None = None,
|
|
247
|
+
) -> str:
|
|
248
|
+
"""Fetch a URL with auth cookie and return HTML body."""
|
|
249
|
+
response = self._web_request("GET", url, params=params)
|
|
250
|
+
if response.status_code != 200:
|
|
251
|
+
raise NetworkError(f"Unexpected status {response.status_code}: {url}")
|
|
252
|
+
return response.text
|
|
253
|
+
|
|
254
|
+
def _post_form(
|
|
255
|
+
self,
|
|
256
|
+
url: str,
|
|
257
|
+
data: dict[str, str],
|
|
258
|
+
) -> str:
|
|
259
|
+
"""POST form data with auth cookie, return response text."""
|
|
260
|
+
response = self._web_request(
|
|
261
|
+
"POST",
|
|
262
|
+
url,
|
|
263
|
+
data=data,
|
|
264
|
+
headers={"Content-Type": "application/x-www-form-urlencoded"},
|
|
265
|
+
)
|
|
266
|
+
return response.text
|
|
267
|
+
|
|
268
|
+
def _extract_auth_token(self, html: str, item_id: int) -> str:
|
|
269
|
+
"""Extract the auth token for an item from HN page HTML.
|
|
270
|
+
|
|
271
|
+
HN embeds auth tokens in links like: vote?id=X&how=up&auth=HEXTOKEN
|
|
272
|
+
"""
|
|
273
|
+
pattern = rf"auth=([a-f0-9]+).*?(?:id={item_id}|{item_id})"
|
|
274
|
+
match = re.search(pattern, html)
|
|
275
|
+
if match:
|
|
276
|
+
return match.group(1)
|
|
277
|
+
# Try reverse order (auth after id)
|
|
278
|
+
pattern2 = rf'id={item_id}[^"]*auth=([a-f0-9]+)'
|
|
279
|
+
match2 = re.search(pattern2, html)
|
|
280
|
+
if match2:
|
|
281
|
+
return match2.group(1)
|
|
282
|
+
raise AuthError("Could not extract auth token — page format may have changed")
|
|
283
|
+
|
|
284
|
+
# ------------------------------------------------------------------ upvote
|
|
285
|
+
|
|
286
|
+
def upvote(self, item_id: int) -> dict:
    """Upvote a story or comment and return a success record.

    The item page is loaded first because the vote link needs the auth
    token embedded in that page; the vote itself is a GET request.
    """
    item_page = self._get_html(f"{HN_BASE}/item?id={item_id}")
    token = self._extract_auth_token(item_page, item_id)

    vote_query = {"id": str(item_id), "how": "up", "auth": token}
    self._get_html(f"{HN_BASE}/vote", params=vote_query)

    return {"success": True, "item_id": item_id, "action": "upvoted"}
|
|
299
|
+
|
|
300
|
+
# ------------------------------------------------------------------ submit
|
|
301
|
+
|
|
302
|
+
def submit_story(self, title: str, url: str | None = None, text: str | None = None) -> dict:
    """Submit a new story to HN and return a success record.

    The submit page is loaded first to read the hidden ``fnid`` form token,
    which is then posted together with the story fields.

    Raises:
        AuthError: When the submit page has no ``fnid`` field, or HN
            answers "unknown or expired link".
        RateLimitError: When HN answers "please slow down".
    """
    submit_page = self._get_html(f"{HN_BASE}/submit")
    token = re.search(r'name="fnid"\s+value="([^"]+)"', submit_page)
    if token is None:
        raise AuthError("Could not get submission token — login may have expired")

    form_data: dict[str, str] = dict(
        fnid=token.group(1),
        fnop="submit-page",
        title=title,
        url=url or "",
        text=text or "",
    )
    outcome = self._post_form(f"{HN_BASE}/r", form_data).lower()

    # HN reports these failures inside the returned page text.
    if "unknown or expired link" in outcome:
        raise AuthError("Submission token expired — please try again")
    if "please slow down" in outcome:
        raise RateLimitError(retry_after=120)

    story_kind = "url" if url else "ask"
    return {"success": True, "title": title, "type": story_kind}
|
|
327
|
+
|
|
328
|
+
# ------------------------------------------------------------------ comment
|
|
329
|
+
|
|
330
|
+
def post_comment(self, parent_id: int, text: str) -> dict:
    """Post a comment on a story, or a reply to another comment.

    The parent's item page is loaded first to read the hidden ``hmac`` form
    token required by the comment form.

    Raises:
        AuthError: When the page has no ``hmac`` field, or HN answers
            "unknown or expired link".
    """
    parent_ref = f"item?id={parent_id}"
    parent_page = self._get_html(f"{HN_BASE}/{parent_ref}")
    token = re.search(r'name="hmac"\s+value="([^"]+)"', parent_page)
    if token is None:
        raise AuthError("Could not get comment token — login may have expired")

    form_data = dict(
        parent=str(parent_id),
        goto=parent_ref,
        hmac=token.group(1),
        text=text,
    )
    outcome = self._post_form(f"{HN_BASE}/comment", form_data)

    if "unknown or expired link" in outcome.lower():
        raise AuthError("Comment token expired — please try again")

    return {"success": True, "parent_id": parent_id}
|
|
351
|
+
|
|
352
|
+
# ------------------------------------------------------------------ favorite
|
|
353
|
+
|
|
354
|
+
def favorite(self, item_id: int) -> dict:
    """Favorite (save) a story and return a success record.

    The item page is loaded first to obtain the auth token that the
    ``fave`` link requires.
    """
    item_page = self._get_html(f"{HN_BASE}/item?id={item_id}")
    token = self._extract_auth_token(item_page, item_id)

    fave_query = {"id": str(item_id), "auth": token}
    self._get_html(f"{HN_BASE}/fave", params=fave_query)

    return {"success": True, "item_id": item_id, "action": "favorited"}
|
|
365
|
+
|
|
366
|
+
# ------------------------------------------------------------------ hide
|
|
367
|
+
|
|
368
|
+
def hide(self, item_id: int) -> dict:
    """Hide a story from the feed and return a success record.

    The item page is loaded first to obtain the auth token that the
    ``hide`` link requires.
    """
    item_page = self._get_html(f"{HN_BASE}/item?id={item_id}")
    token = self._extract_auth_token(item_page, item_id)

    hide_query = {"id": str(item_id), "auth": token}
    self._get_html(f"{HN_BASE}/hide", params=hide_query)

    return {"success": True, "item_id": item_id, "action": "hidden"}
|
|
379
|
+
|
|
380
|
+
# ------------------------------------------------------------ favorites page
|
|
381
|
+
|
|
382
|
+
def get_favorites(self, username: str, limit: int = 30) -> list[Story]:
    """Return a user's favorite stories, read from the HTML favorites page."""
    page_query = {"id": username}
    favorites_page = self._get_html(f"{HN_BASE}/favorites", params=page_query)
    return self._parse_stories_from_html(favorites_page, limit)
|
|
386
|
+
|
|
387
|
+
# ----------------------------------------------------------- submissions page
|
|
388
|
+
|
|
389
|
+
def get_submissions(self, username: str, limit: int = 30) -> list[Story]:
    """Return a user's submitted stories, read from the HTML submitted page."""
    page_query = {"id": username}
    submitted_page = self._get_html(f"{HN_BASE}/submitted", params=page_query)
    return self._parse_stories_from_html(submitted_page, limit)
|
|
393
|
+
|
|
394
|
+
def _parse_stories_from_html(self, html: str, limit: int = 30) -> list[Story]:
|
|
395
|
+
"""Parse story items from HN HTML pages (favorites, submitted, etc.)."""
|
|
396
|
+
# Find all story IDs from the HTML
|
|
397
|
+
id_matches = re.findall(r'class="athing[^"]*"\s+id="(\d+)"', html)
|
|
398
|
+
if not id_matches:
|
|
399
|
+
return []
|
|
400
|
+
item_ids = [int(m) for m in id_matches[:limit]]
|
|
401
|
+
return self._fetch_items_parallel(item_ids, Story)
|
|
402
|
+
|
|
403
|
+
# --------------------------------------------------------------- threads page
|
|
404
|
+
|
|
405
|
+
def get_threads(self, username: str, limit: int = 20) -> list[Comment]:
    """Return the comments listed on a user's HTML threads page."""
    page_query = {"id": username}
    threads_page = self._get_html(f"{HN_BASE}/threads", params=page_query)
    return self._parse_comments_from_html(threads_page, limit)
|
|
409
|
+
|
|
410
|
+
def _parse_comments_from_html(self, html: str, limit: int = 20) -> list[Comment]:
|
|
411
|
+
"""Parse comment items from HN threads HTML page."""
|
|
412
|
+
id_matches = re.findall(r'class="athing[^"]*"\s+id="(\d+)"', html)
|
|
413
|
+
if not id_matches:
|
|
414
|
+
return []
|
|
415
|
+
item_ids = [int(m) for m in id_matches[:limit]]
|
|
416
|
+
return self._fetch_items_parallel(item_ids, Comment)
|
|
417
|
+
|
|
418
|
+
# -------------------------------------------------------------- parallel fetch
|
|
419
|
+
|
|
420
|
+
def _fetch_items_parallel(self, ids: list[int], model_cls: type) -> list:
|
|
421
|
+
"""Fetch multiple items in parallel using asyncio + httpx."""
|
|
422
|
+
|
|
423
|
+
async def _fetch_all():
|
|
424
|
+
async with httpx.AsyncClient(
|
|
425
|
+
headers=DEFAULT_HEADERS,
|
|
426
|
+
follow_redirects=True,
|
|
427
|
+
timeout=self._timeout,
|
|
428
|
+
) as client:
|
|
429
|
+
tasks = [client.get(f"{FIREBASE_BASE}/item/{item_id}.json") for item_id in ids]
|
|
430
|
+
responses = await asyncio.gather(*tasks, return_exceptions=True)
|
|
431
|
+
|
|
432
|
+
items = []
|
|
433
|
+
for resp in responses:
|
|
434
|
+
if isinstance(resp, Exception):
|
|
435
|
+
continue
|
|
436
|
+
if resp.status_code != 200:
|
|
437
|
+
continue
|
|
438
|
+
data = resp.json()
|
|
439
|
+
if data is None or data.get("deleted"):
|
|
440
|
+
continue
|
|
441
|
+
try:
|
|
442
|
+
if model_cls == Story:
|
|
443
|
+
items.append(
|
|
444
|
+
Story(
|
|
445
|
+
id=data.get("id", 0),
|
|
446
|
+
title=data.get("title", ""),
|
|
447
|
+
url=data.get("url"),
|
|
448
|
+
score=data.get("score", 0),
|
|
449
|
+
by=data.get("by", ""),
|
|
450
|
+
time=data.get("time", 0),
|
|
451
|
+
descendants=data.get("descendants", 0),
|
|
452
|
+
type=data.get("type", "story"),
|
|
453
|
+
)
|
|
454
|
+
)
|
|
455
|
+
elif model_cls == Comment:
|
|
456
|
+
items.append(
|
|
457
|
+
Comment(
|
|
458
|
+
id=data.get("id", 0),
|
|
459
|
+
by=data.get("by", ""),
|
|
460
|
+
text=data.get("text", ""),
|
|
461
|
+
time=data.get("time", 0),
|
|
462
|
+
parent=data.get("parent", 0),
|
|
463
|
+
kids=data.get("kids", []),
|
|
464
|
+
dead=data.get("dead", False),
|
|
465
|
+
deleted=data.get("deleted", False),
|
|
466
|
+
type=data.get("type", "comment"),
|
|
467
|
+
)
|
|
468
|
+
)
|
|
469
|
+
except (KeyError, TypeError):
|
|
470
|
+
continue
|
|
471
|
+
return items
|
|
472
|
+
|
|
473
|
+
try:
|
|
474
|
+
loop = asyncio.get_running_loop()
|
|
475
|
+
except RuntimeError:
|
|
476
|
+
loop = None
|
|
477
|
+
|
|
478
|
+
if loop and loop.is_running():
|
|
479
|
+
# Already in async context — run synchronously as fallback
|
|
480
|
+
items = []
|
|
481
|
+
for item_id in ids:
|
|
482
|
+
try:
|
|
483
|
+
data = self.get_item(item_id)
|
|
484
|
+
if data.get("deleted"):
|
|
485
|
+
continue
|
|
486
|
+
if model_cls == Story:
|
|
487
|
+
items.append(
|
|
488
|
+
Story(
|
|
489
|
+
id=data.get("id", 0),
|
|
490
|
+
title=data.get("title", ""),
|
|
491
|
+
url=data.get("url"),
|
|
492
|
+
score=data.get("score", 0),
|
|
493
|
+
by=data.get("by", ""),
|
|
494
|
+
time=data.get("time", 0),
|
|
495
|
+
descendants=data.get("descendants", 0),
|
|
496
|
+
type=data.get("type", "story"),
|
|
497
|
+
)
|
|
498
|
+
)
|
|
499
|
+
elif model_cls == Comment:
|
|
500
|
+
items.append(
|
|
501
|
+
Comment(
|
|
502
|
+
id=data.get("id", 0),
|
|
503
|
+
by=data.get("by", ""),
|
|
504
|
+
text=data.get("text", ""),
|
|
505
|
+
time=data.get("time", 0),
|
|
506
|
+
parent=data.get("parent", 0),
|
|
507
|
+
kids=data.get("kids", []),
|
|
508
|
+
dead=data.get("dead", False),
|
|
509
|
+
deleted=data.get("deleted", False),
|
|
510
|
+
type=data.get("type", "comment"),
|
|
511
|
+
)
|
|
512
|
+
)
|
|
513
|
+
except Exception:
|
|
514
|
+
continue
|
|
515
|
+
return items
|
|
516
|
+
else:
|
|
517
|
+
return asyncio.run(_fetch_all())
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""Domain exception hierarchy for cli-web-hackernews."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class AppError(Exception):
    """Root of the cli-web-hackernews exception hierarchy."""

    def __init__(self, message: str, code: str = "APP_ERROR"):
        super().__init__(message)
        self.message = message  # human-readable text, also used as str(exc)
        self.code = code  # short identifier reported by to_dict()

    def to_dict(self) -> dict:
        """Return the error as a dict with keys ``error``, ``code`` and ``message``."""
        return dict(error=True, code=self.code, message=self.message)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RateLimitError(AppError):
    """API rate limit hit; carries the suggested wait in ``retry_after``."""

    def __init__(self, retry_after: int = 60):
        super().__init__(
            f"Rate limited. Retry after {retry_after}s.",
            "RATE_LIMITED",
        )
        self.retry_after = retry_after  # value interpolated into the message

    def to_dict(self) -> dict:
        """Return the base error dict extended with a ``retry_after`` key."""
        return {**super().to_dict(), "retry_after": self.retry_after}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class NetworkError(AppError):
    """Network or connectivity error; always reported with code NETWORK_ERROR."""

    def __init__(self, message: str):
        super().__init__(message, code="NETWORK_ERROR")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class ServerError(AppError):
    """Remote server returned a 5xx error; the status is kept in ``status_code``."""

    def __init__(self, status: int):
        super().__init__(f"Server error: HTTP {status}", code="SERVER_ERROR")
        self.status_code = status
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class AuthError(AppError):
    """Authentication failed or credentials missing.

    Every instance is reported with code AUTH_EXPIRED, whatever the message.
    """

    def __init__(
        self,
        message: str = "Authentication required. Run: cli-web-hackernews auth login",
        recoverable: bool = False,
    ):
        super().__init__(message, code="AUTH_EXPIRED")
        self.recoverable = recoverable  # stored as given; not part of to_dict()
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class NotFoundError(AppError):
    """Requested resource not found; the message reads "<resource> not found"."""

    def __init__(self, resource: str = "resource"):
        description = f"{resource} not found"
        super().__init__(description, code="NOT_FOUND")
|