cli-web-hackernews 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,517 @@
1
+ """HTTP client for Hacker News — Firebase API + Algolia search + authenticated actions."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import re
7
+ from typing import Any
8
+
9
+ import httpx
10
+
11
+ from .auth import load_auth, refresh_auth
12
+ from .exceptions import AuthError, NetworkError, NotFoundError, RateLimitError, ServerError
13
+ from .models import Comment, SearchResult, Story, User
14
+
15
# Base URLs of the three services this module talks to.
FIREBASE_BASE = "https://hacker-news.firebaseio.com/v0"
ALGOLIA_BASE = "https://hn.algolia.com/api/v1"
HN_BASE = "https://news.ycombinator.com"

# Browser-like User-Agent string shared by both header sets below.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/122.0.0.0 Safari/537.36"
)

# Headers for requests that expect a JSON body.
DEFAULT_HEADERS = {
    "User-Agent": _USER_AGENT,
    "Accept": "application/json",
}

# Headers for requests that expect an HTML page.
WEB_HEADERS = {
    "User-Agent": _USER_AGENT,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
}

# Feed name → Firebase endpoint (every endpoint is "<feed>stories").
FEED_ENDPOINTS = {
    feed_name: f"{feed_name}stories"
    for feed_name in ("top", "new", "best", "ask", "show", "job")
}
47
+
48
+
49
+ class HackerNewsClient:
50
+ """HTTP client wrapping HN Firebase API and Algolia search."""
51
+
52
def __init__(self, timeout: float = 30.0, user_cookie: str | None = None):
    """Store the settings and create the two synchronous HTTP clients.

    Args:
        timeout: Timeout handed to every httpx client this instance creates.
        user_cookie: Value sent as the ``user`` cookie on authenticated web
            requests; leave as ``None`` for read-only API use.
    """
    self._timeout = timeout
    self._user_cookie = user_cookie

    # Both clients differ only in the default headers they send.
    shared_options = {"follow_redirects": True, "timeout": timeout}
    self._client = httpx.Client(headers=DEFAULT_HEADERS, **shared_options)
    self._web_client = httpx.Client(headers=WEB_HEADERS, **shared_options)
65
+
66
def close(self):
    """Close both underlying HTTP clients.

    The web client is closed even when closing the JSON client raises, so a
    failure in the first ``close()`` cannot leave the second client open.
    Any exception raised while closing still propagates to the caller.
    """
    try:
        self._client.close()
    finally:
        self._web_client.close()
70
+
71
+ def __enter__(self):
72
+ return self
73
+
74
+ def __exit__(self, *args):
75
+ self.close()
76
+
77
+ def _get_json(self, url: str, params: dict[str, Any] | None = None) -> Any:
78
+ """Fetch a URL and return parsed JSON."""
79
+ try:
80
+ response = self._client.get(url, params=params)
81
+ except httpx.TimeoutException as exc:
82
+ raise NetworkError(f"Request timed out: {url}") from exc
83
+ except httpx.RequestError as exc:
84
+ raise NetworkError(f"Network error: {exc}") from exc
85
+
86
+ if response.status_code == 404:
87
+ raise NotFoundError(url)
88
+ if response.status_code == 429:
89
+ retry_after = int(response.headers.get("retry-after", "60"))
90
+ raise RateLimitError(retry_after)
91
+ if response.status_code >= 500:
92
+ raise ServerError(response.status_code)
93
+ if response.status_code != 200:
94
+ raise NetworkError(f"Unexpected status {response.status_code}: {url}")
95
+
96
+ return response.json()
97
+
98
+ # ------------------------------------------------------------------ feeds
99
+
100
def get_story_ids(self, feed: str = "top") -> list[int]:
    """Return the item IDs listed by a feed.

    Args:
        feed: One of the keys of ``FEED_ENDPOINTS``; any other name is
            served from the "topstories" endpoint.
    """
    endpoint = FEED_ENDPOINTS[feed] if feed in FEED_ENDPOINTS else "topstories"
    feed_url = f"{FIREBASE_BASE}/{endpoint}.json"
    return self._get_json(feed_url)
104
+
105
def get_stories(self, feed: str = "top", limit: int = 30) -> list[Story]:
    """Return story objects for the first ``limit`` IDs of a feed."""
    feed_ids = self.get_story_ids(feed)
    wanted_ids = feed_ids[:limit]
    return self._fetch_items_parallel(wanted_ids, Story)
109
+
110
+ # ------------------------------------------------------------------ items
111
+
112
def get_item(self, item_id: int) -> dict:
    """Return the raw Firebase payload for one item.

    Raises:
        NotFoundError: When the endpoint answers with JSON ``null``.
    """
    payload = self._get_json(f"{FIREBASE_BASE}/item/{item_id}.json")
    if payload is not None:
        return payload
    raise NotFoundError(f"Item {item_id}")
118
+
119
def get_story(self, story_id: int) -> Story:
    """Fetch one item and wrap it in a ``Story``.

    Fields missing from the payload get defaults; a missing ``id`` falls
    back to the requested ``story_id``.
    """
    raw = self.get_item(story_id)
    story_fields = {
        "id": raw.get("id", story_id),
        "title": raw.get("title", ""),
        "url": raw.get("url"),
        "score": raw.get("score", 0),
        "by": raw.get("by", ""),
        "time": raw.get("time", 0),
        "descendants": raw.get("descendants", 0),
        "type": raw.get("type", "story"),
    }
    return Story(**story_fields)
132
+
133
def get_comments(self, story_id: int, limit: int = 30) -> list[Comment]:
    """Return up to ``limit`` direct children of an item as comments.

    An item without ``kids`` yields an empty list and triggers no further
    requests.
    """
    parent = self.get_item(story_id)
    child_ids = parent.get("kids", [])[:limit]
    return self._fetch_items_parallel(child_ids, Comment) if child_ids else []
140
+
141
+ # ------------------------------------------------------------------ users
142
+
143
def get_user(self, username: str) -> User:
    """Fetch a user profile from the Firebase API.

    Args:
        username: HN account name. It is percent-encoded before being
            placed in the URL path.

    Raises:
        NotFoundError: When the endpoint answers with JSON ``null``.
    """
    from urllib.parse import quote

    # Encode the name so characters such as "/" or "?" cannot change which
    # path is requested; ordinary alphanumeric names are left untouched.
    path_name = quote(username, safe="")
    data = self._get_json(f"{FIREBASE_BASE}/user/{path_name}.json")
    if data is None:
        raise NotFoundError(f"User '{username}'")
    return User(
        id=data.get("id", username),
        karma=data.get("karma", 0),
        created=data.get("created", 0),
        about=data.get("about", ""),
        submitted=data.get("submitted", []),
    )
155
+
156
+ # ------------------------------------------------------------------ search
157
+
158
def search(
    self,
    query: str,
    tags: str = "story",
    sort_by_date: bool = False,
    hits_per_page: int = 20,
    page: int = 0,
) -> list[SearchResult]:
    """Search HN through the Algolia API.

    Args:
        query: Free-text query.
        tags: Algolia tag filter; an empty value omits the filter.
        sort_by_date: Use the ``search_by_date`` endpoint instead of
            ``search``.
        hits_per_page: Sent as ``hitsPerPage``.
        page: Zero-based page number sent as ``page``.

    Returns:
        One ``SearchResult`` per entry of the response's ``hits`` list.
        ``story_id`` is the integer form of ``objectID`` when that value
        consists only of decimal digits, otherwise ``None``.
    """
    endpoint = "search_by_date" if sort_by_date else "search"
    params: dict[str, Any] = {
        "query": query,
        "hitsPerPage": hits_per_page,
        "page": page,
    }
    if tags:
        params["tags"] = tags

    data = self._get_json(f"{ALGOLIA_BASE}/{endpoint}", params=params)
    results = []
    for hit in data.get("hits", []):
        object_id = hit.get("objectID", "")
        # str() keeps a non-string objectID from raising AttributeError, and
        # isdecimal() (unlike isdigit()) only accepts text int() can parse.
        id_text = str(object_id)
        story_id = int(id_text) if id_text.isdecimal() else None
        results.append(
            SearchResult(
                objectID=object_id,
                title=hit.get("title", ""),
                url=hit.get("url"),
                author=hit.get("author", ""),
                points=hit.get("points"),
                num_comments=hit.get("num_comments"),
                created_at=hit.get("created_at", ""),
                story_id=story_id,
            )
        )
    return results
194
+
195
+ # -------------------------------------------------------- authenticated web requests
196
+
197
+ def _require_auth(self) -> str:
198
+ """Return user cookie or raise AuthError."""
199
+ if not self._user_cookie:
200
+ raise AuthError()
201
+ return self._user_cookie
202
+
203
+ def _web_request(self, method: str, url: str, *, _attempt: int = 0, **kwargs) -> httpx.Response:
204
+ """Execute an authenticated web request with standard error handling.
205
+
206
+ Auth retry flow (CONVENTIONS.md §Auth Rules — never more than 3 attempts):
207
+ attempt 0 → 401/403 → reload auth.json from disk
208
+ attempt 1 → 401/403 → headless browser refresh via refresh_auth()
209
+ attempt 2 → 401/403 → raise AuthError
210
+ """
211
+ cookie = self._require_auth()
212
+ kwargs.setdefault("cookies", {"user": cookie})
213
+ try:
214
+ response = self._web_client.request(method, url, **kwargs)
215
+ except httpx.TimeoutException as exc:
216
+ raise NetworkError(f"Request timed out: {url}") from exc
217
+ except httpx.RequestError as exc:
218
+ raise NetworkError(f"Network error: {exc}") from exc
219
+
220
+ if response.status_code in (401, 403):
221
+ if _attempt >= 2:
222
+ raise AuthError(
223
+ "Auth cookie expired. Run: cli-web-hackernews auth login",
224
+ recoverable=False,
225
+ )
226
+ if _attempt == 0:
227
+ self._reload_cookie_from_disk()
228
+ else:
229
+ self._user_cookie = refresh_auth()["user_cookie"]
230
+ kwargs.pop("cookies", None)
231
+ return self._web_request(method, url, _attempt=_attempt + 1, **kwargs)
232
+ if response.status_code >= 500:
233
+ raise ServerError(response.status_code)
234
+ return response
235
+
236
def _reload_cookie_from_disk(self) -> None:
    """Replace the in-memory cookie with the one stored in auth.json.

    Another process may have refreshed the file. If ``load_auth()`` raises
    AuthError the current cookie is kept, leaving the next retry attempt to
    fall through to the headless refresh.
    """
    try:
        stored = load_auth()
    except AuthError:
        return
    self._user_cookie = stored["user_cookie"]
242
+
243
+ def _get_html(
244
+ self,
245
+ url: str,
246
+ params: dict[str, str] | None = None,
247
+ ) -> str:
248
+ """Fetch a URL with auth cookie and return HTML body."""
249
+ response = self._web_request("GET", url, params=params)
250
+ if response.status_code != 200:
251
+ raise NetworkError(f"Unexpected status {response.status_code}: {url}")
252
+ return response.text
253
+
254
+ def _post_form(
255
+ self,
256
+ url: str,
257
+ data: dict[str, str],
258
+ ) -> str:
259
+ """POST form data with auth cookie, return response text."""
260
+ response = self._web_request(
261
+ "POST",
262
+ url,
263
+ data=data,
264
+ headers={"Content-Type": "application/x-www-form-urlencoded"},
265
+ )
266
+ return response.text
267
+
268
+ def _extract_auth_token(self, html: str, item_id: int) -> str:
269
+ """Extract the auth token for an item from HN page HTML.
270
+
271
+ HN embeds auth tokens in links like: vote?id=X&how=up&auth=HEXTOKEN
272
+ """
273
+ pattern = rf"auth=([a-f0-9]+).*?(?:id={item_id}|{item_id})"
274
+ match = re.search(pattern, html)
275
+ if match:
276
+ return match.group(1)
277
+ # Try reverse order (auth after id)
278
+ pattern2 = rf'id={item_id}[^"]*auth=([a-f0-9]+)'
279
+ match2 = re.search(pattern2, html)
280
+ if match2:
281
+ return match2.group(1)
282
+ raise AuthError("Could not extract auth token — page format may have changed")
283
+
284
+ # ------------------------------------------------------------------ upvote
285
+
286
def upvote(self, item_id: int) -> dict:
    """Upvote a story or comment.

    Loads the item page to obtain its auth token, then issues the vote as a
    GET request. Returns a dict describing the action.
    """
    item_page = self._get_html(f"{HN_BASE}/item?id={item_id}")
    token = self._extract_auth_token(item_page, item_id)

    vote_params = {"id": str(item_id), "how": "up", "auth": token}
    self._get_html(f"{HN_BASE}/vote", params=vote_params)

    return dict(success=True, item_id=item_id, action="upvoted")
299
+
300
+ # ------------------------------------------------------------------ submit
301
+
302
def submit_story(self, title: str, url: str | None = None, text: str | None = None) -> dict:
    """Submit a new story to HN.

    Reads the ``fnid`` token from the submit page, posts the form, then
    scans the result page for known error phrases.

    Returns:
        A dict with ``success``, ``title`` and ``type`` ("url" when a URL
        was given, otherwise "ask").

    Raises:
        AuthError: No ``fnid`` on the submit page, or the token was rejected.
        RateLimitError: The result page asks the user to slow down.
    """
    submit_page = self._get_html(f"{HN_BASE}/submit")
    token = re.search(r'name="fnid"\s+value="([^"]+)"', submit_page)
    if token is None:
        raise AuthError("Could not get submission token — login may have expired")

    form_data: dict[str, str] = dict(
        fnid=token.group(1),
        fnop="submit-page",
        title=title,
        url=url or "",
        text=text or "",
    )

    # Lower-case once; both error checks are case-insensitive.
    outcome = self._post_form(f"{HN_BASE}/r", form_data).lower()
    if "unknown or expired link" in outcome:
        raise AuthError("Submission token expired — please try again")
    if "please slow down" in outcome:
        raise RateLimitError(retry_after=120)

    submission_type = "url" if url else "ask"
    return {"success": True, "title": title, "type": submission_type}
327
+
328
+ # ------------------------------------------------------------------ comment
329
+
330
def post_comment(self, parent_id: int, text: str) -> dict:
    """Post a comment on a story, or a reply to another comment.

    Reads the ``hmac`` token from the parent's item page and posts the
    comment form.

    Raises:
        AuthError: No ``hmac`` on the item page, or the token was rejected.
    """
    parent_page = self._get_html(f"{HN_BASE}/item?id={parent_id}")
    token = re.search(r'name="hmac"\s+value="([^"]+)"', parent_page)
    if token is None:
        raise AuthError("Could not get comment token — login may have expired")

    payload = dict(
        parent=str(parent_id),
        goto=f"item?id={parent_id}",
        hmac=token.group(1),
        text=text,
    )
    reply_page = self._post_form(f"{HN_BASE}/comment", payload)

    if "unknown or expired link" in reply_page.lower():
        raise AuthError("Comment token expired — please try again")

    return dict(success=True, parent_id=parent_id)
351
+
352
+ # ------------------------------------------------------------------ favorite
353
+
354
def favorite(self, item_id: int) -> dict:
    """Favorite (save) a story.

    Loads the item page for its auth token, then requests the ``fave``
    endpoint. Returns a dict describing the action.
    """
    item_page = self._get_html(f"{HN_BASE}/item?id={item_id}")
    token = self._extract_auth_token(item_page, item_id)

    fave_params = {"id": str(item_id), "auth": token}
    self._get_html(f"{HN_BASE}/fave", params=fave_params)

    return dict(success=True, item_id=item_id, action="favorited")
365
+
366
+ # ------------------------------------------------------------------ hide
367
+
368
def hide(self, item_id: int) -> dict:
    """Hide a story from the feed.

    Loads the item page for its auth token, then requests the ``hide``
    endpoint. Returns a dict describing the action.
    """
    item_page = self._get_html(f"{HN_BASE}/item?id={item_id}")
    token = self._extract_auth_token(item_page, item_id)

    hide_params = {"id": str(item_id), "auth": token}
    self._get_html(f"{HN_BASE}/hide", params=hide_params)

    return dict(success=True, item_id=item_id, action="hidden")
379
+
380
+ # ------------------------------------------------------------ favorites page
381
+
382
def get_favorites(self, username: str, limit: int = 30) -> list[Story]:
    """Return a user's favorite stories, scraped from the favorites page."""
    page_url = f"{HN_BASE}/favorites"
    favorites_page = self._get_html(page_url, params={"id": username})
    return self._parse_stories_from_html(favorites_page, limit)
386
+
387
+ # ----------------------------------------------------------- submissions page
388
+
389
def get_submissions(self, username: str, limit: int = 30) -> list[Story]:
    """Return a user's submitted stories, scraped from the submitted page."""
    page_url = f"{HN_BASE}/submitted"
    submitted_page = self._get_html(page_url, params={"id": username})
    return self._parse_stories_from_html(submitted_page, limit)
393
+
394
+ def _parse_stories_from_html(self, html: str, limit: int = 30) -> list[Story]:
395
+ """Parse story items from HN HTML pages (favorites, submitted, etc.)."""
396
+ # Find all story IDs from the HTML
397
+ id_matches = re.findall(r'class="athing[^"]*"\s+id="(\d+)"', html)
398
+ if not id_matches:
399
+ return []
400
+ item_ids = [int(m) for m in id_matches[:limit]]
401
+ return self._fetch_items_parallel(item_ids, Story)
402
+
403
+ # --------------------------------------------------------------- threads page
404
+
405
def get_threads(self, username: str, limit: int = 20) -> list[Comment]:
    """Return comments scraped from a user's threads page."""
    page_url = f"{HN_BASE}/threads"
    threads_page = self._get_html(page_url, params={"id": username})
    return self._parse_comments_from_html(threads_page, limit)
409
+
410
+ def _parse_comments_from_html(self, html: str, limit: int = 20) -> list[Comment]:
411
+ """Parse comment items from HN threads HTML page."""
412
+ id_matches = re.findall(r'class="athing[^"]*"\s+id="(\d+)"', html)
413
+ if not id_matches:
414
+ return []
415
+ item_ids = [int(m) for m in id_matches[:limit]]
416
+ return self._fetch_items_parallel(item_ids, Comment)
417
+
418
+ # -------------------------------------------------------------- parallel fetch
419
+
420
+ def _fetch_items_parallel(self, ids: list[int], model_cls: type) -> list:
421
+ """Fetch multiple items in parallel using asyncio + httpx."""
422
+
423
+ async def _fetch_all():
424
+ async with httpx.AsyncClient(
425
+ headers=DEFAULT_HEADERS,
426
+ follow_redirects=True,
427
+ timeout=self._timeout,
428
+ ) as client:
429
+ tasks = [client.get(f"{FIREBASE_BASE}/item/{item_id}.json") for item_id in ids]
430
+ responses = await asyncio.gather(*tasks, return_exceptions=True)
431
+
432
+ items = []
433
+ for resp in responses:
434
+ if isinstance(resp, Exception):
435
+ continue
436
+ if resp.status_code != 200:
437
+ continue
438
+ data = resp.json()
439
+ if data is None or data.get("deleted"):
440
+ continue
441
+ try:
442
+ if model_cls == Story:
443
+ items.append(
444
+ Story(
445
+ id=data.get("id", 0),
446
+ title=data.get("title", ""),
447
+ url=data.get("url"),
448
+ score=data.get("score", 0),
449
+ by=data.get("by", ""),
450
+ time=data.get("time", 0),
451
+ descendants=data.get("descendants", 0),
452
+ type=data.get("type", "story"),
453
+ )
454
+ )
455
+ elif model_cls == Comment:
456
+ items.append(
457
+ Comment(
458
+ id=data.get("id", 0),
459
+ by=data.get("by", ""),
460
+ text=data.get("text", ""),
461
+ time=data.get("time", 0),
462
+ parent=data.get("parent", 0),
463
+ kids=data.get("kids", []),
464
+ dead=data.get("dead", False),
465
+ deleted=data.get("deleted", False),
466
+ type=data.get("type", "comment"),
467
+ )
468
+ )
469
+ except (KeyError, TypeError):
470
+ continue
471
+ return items
472
+
473
+ try:
474
+ loop = asyncio.get_running_loop()
475
+ except RuntimeError:
476
+ loop = None
477
+
478
+ if loop and loop.is_running():
479
+ # Already in async context — run synchronously as fallback
480
+ items = []
481
+ for item_id in ids:
482
+ try:
483
+ data = self.get_item(item_id)
484
+ if data.get("deleted"):
485
+ continue
486
+ if model_cls == Story:
487
+ items.append(
488
+ Story(
489
+ id=data.get("id", 0),
490
+ title=data.get("title", ""),
491
+ url=data.get("url"),
492
+ score=data.get("score", 0),
493
+ by=data.get("by", ""),
494
+ time=data.get("time", 0),
495
+ descendants=data.get("descendants", 0),
496
+ type=data.get("type", "story"),
497
+ )
498
+ )
499
+ elif model_cls == Comment:
500
+ items.append(
501
+ Comment(
502
+ id=data.get("id", 0),
503
+ by=data.get("by", ""),
504
+ text=data.get("text", ""),
505
+ time=data.get("time", 0),
506
+ parent=data.get("parent", 0),
507
+ kids=data.get("kids", []),
508
+ dead=data.get("dead", False),
509
+ deleted=data.get("deleted", False),
510
+ type=data.get("type", "comment"),
511
+ )
512
+ )
513
+ except Exception:
514
+ continue
515
+ return items
516
+ else:
517
+ return asyncio.run(_fetch_all())
@@ -0,0 +1,63 @@
1
+ """Domain exception hierarchy for cli-web-hackernews."""
2
+
3
+
4
class AppError(Exception):
    """Root of the cli-web-hackernews exception hierarchy.

    Every instance carries a human-readable ``message`` and a short
    machine-readable ``code``.
    """

    def __init__(self, message: str, code: str = "APP_ERROR"):
        super().__init__(message)
        self.message = message
        self.code = code

    def to_dict(self) -> dict:
        """Return the error as a plain dict with keys error, code, message."""
        return dict(error=True, code=self.code, message=self.message)
14
+
15
+
16
class RateLimitError(AppError):
    """Raised when a rate limit is hit; carries the retry delay in seconds."""

    def __init__(self, retry_after: int = 60):
        super().__init__(
            f"Rate limited. Retry after {retry_after}s.",
            code="RATE_LIMITED",
        )
        self.retry_after = retry_after

    def to_dict(self) -> dict:
        """Return the base error dict extended with ``retry_after``."""
        return {**super().to_dict(), "retry_after": self.retry_after}
30
+
31
+
32
class NetworkError(AppError):
    """Raised for network or connectivity problems (code NETWORK_ERROR)."""

    def __init__(self, message: str):
        super().__init__(message, code="NETWORK_ERROR")
37
+
38
+
39
class ServerError(AppError):
    """Raised when the remote server answers with a 5xx status.

    The status is kept on ``status_code``.
    """

    def __init__(self, status: int):
        super().__init__(f"Server error: HTTP {status}", code="SERVER_ERROR")
        self.status_code = status
45
+
46
+
47
class AuthError(AppError):
    """Raised when authentication fails or credentials are missing.

    ``recoverable`` is stored on the instance as given; the error code is
    always AUTH_EXPIRED.
    """

    def __init__(
        self,
        message: str = "Authentication required. Run: cli-web-hackernews auth login",
        recoverable: bool = False,
    ):
        super().__init__(message, code="AUTH_EXPIRED")
        self.recoverable = recoverable
57
+
58
+
59
class NotFoundError(AppError):
    """Raised when a requested resource does not exist (code NOT_FOUND)."""

    def __init__(self, resource: str = "resource"):
        description = f"{resource} not found"
        super().__init__(description, code="NOT_FOUND")