python-library-ff14-news 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,129 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ from crawl4weibo import WeiboClient
6
+ from crawl4weibo.exceptions.base import CrawlError, NetworkError, RateLimitError
7
+ from crawl4weibo.models.post import Post
8
+
9
+ from ff14_news.channels.cn_weibo.browser_cookies import fetch_mobile_cookies
10
+ from ff14_news.channels.cn_weibo.exceptions import WeiboAccessError
11
+ from ff14_news.channels.cn_weibo.proxy_url import normalize_proxy_url
12
+
13
# User-facing hint (in Chinese) telling the user to install Playwright's
# Chromium first; it is appended to the error built by ``_access_error`` when
# the failure message mentions Playwright/Chromium.
_PLAYWRIGHT_INSTALL_HINT = (
    "请先安装 Playwright Chromium:\n"
    "  python -m example.ensure_browser --proxy 127.0.0.1:7897"
)
17
+
18
+
19
class WeiboCrawlBackend:
    """Weibo HTTP backend built on crawl4weibo (Playwright cookies + mobile API)."""

    def __init__(
        self,
        *,
        cookie: str | None = None,
        cookie_storage_path: Path | None = None,
        browser_headless: bool = True,
        proxy_url: str | None = None,
    ) -> None:
        """Create the backend.

        Args:
            cookie: Full browser cookie string; when given, no browser is
                launched to obtain cookies.
            cookie_storage_path: Playwright session cache path so cookies can
                be reused.
            browser_headless: Whether Chromium runs headless when cookies are
                fetched automatically.
            proxy_url: HTTP proxy, e.g. ``127.0.0.1:7897`` or
                ``http://127.0.0.1:7897``.

        Raises:
            WeiboAccessError: If cookies cannot be obtained or the client
                cannot be constructed.
        """
        proxy = normalize_proxy_url(proxy_url)
        self._proxy = proxy
        if cookie:
            credentials: str | dict[str, str] = cookie
        else:
            credentials = self._cookies_from_browser(
                headless=browser_headless,
                storage_path=cookie_storage_path,
            )
        try:
            client = WeiboClient(
                cookies=credentials,
                auto_fetch_cookies=False,
                use_browser_cookies=False,
                log_level="WARNING",
            )
        except Exception as err:
            # Any construction failure is surfaced as a WeiboAccessError.
            raise _access_error(err) from err
        self._client = client
        if proxy:
            client.add_proxy(proxy)

    def _cookies_from_browser(
        self,
        *,
        headless: bool,
        storage_path: Path | None,
    ) -> str | dict[str, str]:
        """Fetch cookies through Playwright, failing if none are returned."""
        try:
            harvested = fetch_mobile_cookies(
                proxy_url=self._proxy,
                headless=headless,
                storage_state_path=storage_path,
            )
        except Exception as err:
            raise _access_error(err) from err
        if harvested:
            return harvested
        raise WeiboAccessError(
            "Playwright 未获取到微博 Cookie,请检查代理或手动配置 Cookie。"
        )

    def resolve_screen_name(self, screen_name: str) -> str:
        """Resolve a screen name to its numeric uid (returned as a string).

        The match is a case-insensitive exact comparison against the first
        20 user search results.

        Raises:
            ValueError: If the name is blank or no result matches.
            WeiboAccessError: If the search request fails.
        """
        wanted = screen_name.strip()
        if not wanted:
            raise ValueError("screen_name must not be empty")
        try:
            candidates = self._client.search_users(wanted, count=20)
        except (CrawlError, NetworkError, RateLimitError) as err:
            raise _access_error(err) from err
        target = wanted.lower()
        hit = next(
            (entry for entry in candidates if entry.screen_name.lower() == target),
            None,
        )
        if hit is None:
            raise ValueError(f"cannot resolve uid for screen_name={wanted!r}")
        return str(hit.id)

    def fetch_timeline_posts(self, uid: str, *, page: int) -> list[Post]:
        """Fetch one page of a user's timeline (posts are not expanded).

        Raises:
            ValueError: If ``page`` is less than 1.
            WeiboAccessError: If the request fails.
        """
        if page < 1:
            raise ValueError("page must be >= 1")
        try:
            return self._client.get_user_posts(uid, page=page, expand=False)
        except (CrawlError, NetworkError, RateLimitError) as err:
            raise _access_error(err) from err

    def fetch_post(self, article_id: str) -> Post:
        """Fetch a single post with list-level fields (long text not expanded)."""
        return self._post_by_bid(article_id, expand=False)

    def fetch_post_detail(self, article_id: str) -> Post:
        """Fetch a single post with its long text expanded."""
        return self._post_by_bid(article_id, expand=True)

    def _post_by_bid(self, article_id: str, *, expand: bool) -> Post:
        """Validate the id and look the post up, mapping crawl errors."""
        bid = str(article_id).strip()
        if not bid:
            raise ValueError("article_id must not be empty")
        try:
            return self._client.get_post_by_bid(bid, expand=expand)
        except (CrawlError, NetworkError, RateLimitError) as err:
            raise _access_error(err) from err
113
+
114
+
115
def _access_error(exc: Exception) -> WeiboAccessError:
    """Translate a low-level failure into a user-facing ``WeiboAccessError``.

    The exception text is inspected in order: Playwright/Chromium problems,
    an HTTP 432 response, rate limiting, then a generic fallback. The error
    is returned (not raised) so callers can ``raise ... from exc``.
    """
    detail = str(exc).strip()
    folded = detail.lower()
    browser_markers = ("playwright", "executable", "chromium")
    if any(marker in folded for marker in browser_markers):
        text = f"微博需要 Playwright Chromium。{_PLAYWRIGHT_INSTALL_HINT}"
    elif "432" in detail:
        text = (
            "微博返回 HTTP 432。"
            "请确认代理可用(如 127.0.0.1:7897),或配置 example/weibo_cookie.txt。"
        )
    elif isinstance(exc, RateLimitError):
        text = f"微博限流:{detail}"
    else:
        text = detail or "微博请求失败"
    return WeiboAccessError(text)
@@ -0,0 +1,2 @@
1
class WeiboAccessError(ValueError):
    """Raised when the Weibo API refuses access or session initialisation fails."""
@@ -0,0 +1,161 @@
1
+ import html
2
+ import re
3
+ from datetime import datetime
4
+
5
+ from ff14_news.channels.cn_weibo.constants import (
6
+ DETAIL_URL_TEMPLATE,
7
+ MOBILE_ORIGIN,
8
+ PERMALINK_TEMPLATE,
9
+ )
10
+ from ff14_news.common.html_blocks import html_to_blocks
11
+ from ff14_news.models import NewsArticle, NewsBlockType, NewsContentBlock, NewsListItem
12
+
13
# Matches a single HTML-style tag so it can be stripped from text.
_TAG_RE = re.compile(r"<[^>]+>")
# Matches a trailing "全文" (full text) marker preceded by an ellipsis
# ("..", "..." or "…"), with optional whitespace before the end of the text.
_FULL_TEXT_RE = re.compile(r"…?\.\.\.?全文\s*$|…全文\s*$")
# Maximum title / summary length in characters, including the trailing "…"
# that is appended when the text is cut.
_TITLE_MAX_LEN = 80
_SUMMARY_MAX_LEN = 200
17
+
18
+
19
def effective_mblog(mblog: dict) -> dict:
    """Return the inner original post for a repost, otherwise the mblog itself."""
    original = mblog.get("retweeted_status")
    return original if isinstance(original, dict) else mblog
25
+
26
+
27
def needs_detail_fetch(mblog: dict) -> bool:
    """Tell whether the full post must be fetched separately.

    True when the mblog is flagged ``isLongText``, or when its text contains
    a "全文" marker together with either a "status" reference in the markup
    or a trailing ellipsis-plus-"全文" in the plain text.
    """
    if mblog.get("isLongText"):
        return True
    markup = str(mblog.get("text") or "")
    stripped = _plain_text(markup)
    if "全文" not in markup:
        return False
    return bool("status" in markup or _FULL_TEXT_RE.search(stripped))
36
+
37
+
38
def mblog_id(mblog: dict) -> str:
    """Return the first non-empty of ``id``, ``mid``, ``bid`` as a string.

    Raises:
        ValueError: If none of the three keys holds a truthy value.
    """
    for key in ("id", "mid", "bid"):
        candidate = mblog.get(key)
        if candidate:
            return str(candidate)
    raise ValueError("mblog missing id")
43
+
44
+
45
def parse_created_at(raw: str) -> datetime:
    """Parse a ``created_at`` string such as ``Mon Jan 01 12:30:00 +0800 2024``.

    Args:
        raw: Timestamp text in ``%a %b %d %H:%M:%S %z %Y`` form; may be empty
            or ``None``.

    Returns:
        A timezone-aware datetime. Blank or unparseable input yields the Unix
        epoch in UTC, so every value returned by this function is aware and
        can be compared or sorted with the others (a naive fallback would
        raise ``TypeError`` when compared with a successfully parsed value).
    """
    # Local import: the module header only imports ``datetime`` itself.
    from datetime import timezone

    epoch = datetime.fromtimestamp(0, tz=timezone.utc)
    text = (raw or "").strip()
    if not text:
        return epoch
    try:
        return datetime.strptime(text, "%a %b %d %H:%M:%S %z %Y")
    except ValueError:
        return epoch
53
+
54
+
55
def mblog_to_list_item(mblog: dict, *, channel_id: str) -> NewsListItem:
    """Convert a raw mblog dict into a list-level ``NewsListItem``.

    The id and publish date come from the given mblog; title, summary and
    cover image come from the effective post (the inner original when the
    mblog is a repost).
    """
    identifier = mblog_id(mblog)
    content = effective_mblog(mblog)
    published = parse_created_at(str(mblog.get("created_at") or ""))
    return NewsListItem(
        channel_id=channel_id,
        id=identifier,
        title=_title_from_text(content),
        publish_date=published,
        summary=_summary_from_text(content),
        cover_image_url=_first_pic_url(content),
        source_page_url=DETAIL_URL_TEMPLATE.format(article_id=identifier),
    )
70
+
71
+
72
def mblog_to_article(
    mblog: dict,
    *,
    channel_id: str,
) -> NewsArticle:
    """Convert a raw mblog dict into a ``NewsArticle`` with content blocks.

    The summary prefers the first non-blank text block and falls back to the
    plain-text summary of the effective post. ``category_code`` is always
    ``None``.
    """
    identifier = mblog_id(mblog)
    content = effective_mblog(mblog)
    body_blocks = blocks_from_mblog(mblog)
    synopsis = _summary_from_blocks(body_blocks)
    if not synopsis:
        synopsis = _summary_from_text(content)
    return NewsArticle(
        channel_id=channel_id,
        id=identifier,
        title=_title_from_text(content),
        publish_date=parse_created_at(str(mblog.get("created_at") or "")),
        summary=synopsis,
        category_code=None,
        cover_image_url=_first_pic_url(content),
        source_page_url=PERMALINK_TEMPLATE.format(article_id=identifier),
        blocks=body_blocks,
    )
94
+
95
+
96
def blocks_from_mblog(mblog: dict) -> list[NewsContentBlock]:
    """Build content blocks from the effective post's HTML text and pictures.

    The text is wrapped in a ``<div>`` and handed to ``html_to_blocks``; one
    IMAGE block is then appended for every picture with a usable URL.
    """
    source = effective_mblog(mblog)
    markup = str(source.get("text") or "")
    if markup:
        markup = f"<div>{markup}</div>"
    result = html_to_blocks(markup, base_url=MOBILE_ORIGIN)
    picture_urls = (_pic_url(entry) for entry in source.get("pics") or [])
    result.extend(
        NewsContentBlock(type=NewsBlockType.IMAGE, url=link)
        for link in picture_urls
        if link
    )
    return result
108
+
109
+
110
def _plain_text(text_html: str) -> str:
    """Reduce an HTML fragment to plain text.

    ``<br>`` tags become newlines before the remaining tags are stripped, so
    callers that split on ``"\\n"`` (e.g. to take the first line as a title)
    see the line breaks that exist in the markup. Entities are then
    unescaped, non-breaking spaces become regular spaces, and the result is
    trimmed.
    """
    # Without this step "a<br />b" would collapse to "ab" and lose the break.
    with_breaks = re.sub(r"<br\s*/?>", "\n", text_html, flags=re.IGNORECASE)
    text = _TAG_RE.sub("", with_breaks)
    return html.unescape(text).replace("\xa0", " ").strip()
113
+
114
+
115
def _title_from_text(mblog: dict) -> str:
    """Derive a title from the first line of the post's plain text.

    Falls back to "微博" when that line is blank, and cuts overly long lines
    to ``_TITLE_MAX_LEN`` characters ending in "…".
    """
    headline, _, _ = _plain_text(str(mblog.get("text") or "")).partition("\n")
    headline = headline.strip()
    if not headline:
        return "微博"
    if len(headline) > _TITLE_MAX_LEN:
        return headline[: _TITLE_MAX_LEN - 1] + "…"
    return headline
123
+
124
+
125
def _summary_from_text(mblog: dict) -> str:
    """Build a summary from the post's plain text.

    A trailing "全文" marker is removed first; text longer than
    ``_SUMMARY_MAX_LEN`` is cut to that length ending in "…".
    """
    body = _plain_text(str(mblog.get("text") or ""))
    body = _FULL_TEXT_RE.sub("", body).strip()
    too_long = len(body) > _SUMMARY_MAX_LEN
    return body[: _SUMMARY_MAX_LEN - 1] + "…" if too_long else body
131
+
132
+
133
+ def _summary_from_blocks(blocks: list[NewsContentBlock]) -> str:
134
+ for block in blocks:
135
+ if block.text and block.text.strip():
136
+ text = block.text.strip()
137
+ if len(text) <= _SUMMARY_MAX_LEN:
138
+ return text
139
+ return text[: _SUMMARY_MAX_LEN - 1] + "…"
140
+ return ""
141
+
142
+
143
+ def _first_pic_url(mblog: dict) -> str | None:
144
+ pics = mblog.get("pics") or []
145
+ if not pics:
146
+ return None
147
+ return _pic_url(pics[0])
148
+
149
+
150
+ def _pic_url(pic: object) -> str | None:
151
+ if not isinstance(pic, dict):
152
+ return None
153
+ large = pic.get("large")
154
+ if isinstance(large, dict):
155
+ url = large.get("url")
156
+ if url:
157
+ return str(url).strip() or None
158
+ url = pic.get("url")
159
+ if url:
160
+ return str(url).strip() or None
161
+ return None
@@ -0,0 +1,105 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime, timezone
4
+
5
+ from crawl4weibo.models.post import Post
6
+
7
+ from ff14_news.channels.cn_weibo.constants import (
8
+ DETAIL_URL_TEMPLATE,
9
+ PERMALINK_TEMPLATE,
10
+ )
11
+ from ff14_news.channels.cn_weibo.mblog_parser import (
12
+ _SUMMARY_MAX_LEN,
13
+ _TITLE_MAX_LEN,
14
+ _FULL_TEXT_RE,
15
+ )
16
+ from ff14_news.models import NewsArticle, NewsBlockType, NewsContentBlock, NewsListItem
17
+
18
+
19
def post_to_list_item(post: Post, *, channel_id: str) -> NewsListItem:
    """Convert a crawl4weibo ``Post`` into a list-level ``NewsListItem``.

    The id and publish date come from ``post``; title, summary and cover come
    from the effective post (the reposted original when present).
    """
    source = _effective_post(post)
    identifier = str(post.id).strip()
    gallery = source.pic_urls
    return NewsListItem(
        channel_id=channel_id,
        id=identifier,
        title=_title_from_plain(source.text),
        publish_date=_publish_date(post.created_at),
        summary=_summary_from_plain(source.text),
        cover_image_url=gallery[0] if gallery else None,
        source_page_url=DETAIL_URL_TEMPLATE.format(article_id=identifier),
    )
35
+
36
+
37
def post_to_article(post: Post, *, channel_id: str) -> NewsArticle:
    """Convert a crawl4weibo ``Post`` into a ``NewsArticle`` with content blocks.

    The summary prefers the first non-blank text block and falls back to the
    plain-text summary. ``category_code`` is always ``None``.
    """
    source = _effective_post(post)
    identifier = str(post.id).strip()
    body_blocks = _blocks_from_post(source)
    synopsis = _summary_from_blocks(body_blocks)
    if not synopsis:
        synopsis = _summary_from_plain(source.text)
    gallery = source.pic_urls
    return NewsArticle(
        channel_id=channel_id,
        id=identifier,
        title=_title_from_plain(source.text),
        publish_date=_publish_date(post.created_at),
        summary=synopsis,
        category_code=None,
        cover_image_url=gallery[0] if gallery else None,
        source_page_url=PERMALINK_TEMPLATE.format(article_id=identifier),
        blocks=body_blocks,
    )
56
+
57
+
58
+ def _effective_post(post: Post) -> Post:
59
+ if post.retweeted_status is not None:
60
+ return post.retweeted_status
61
+ return post
62
+
63
+
64
+ def _publish_date(value: datetime | None) -> datetime:
65
+ if value is None:
66
+ return datetime.fromtimestamp(0, tz=timezone.utc)
67
+ if value.tzinfo is None:
68
+ return value.replace(tzinfo=timezone.utc)
69
+ return value
70
+
71
+
72
+ def _title_from_plain(text: str) -> str:
73
+ first_line = text.strip().split("\n", 1)[0].strip()
74
+ if not first_line:
75
+ return "微博"
76
+ if len(first_line) <= _TITLE_MAX_LEN:
77
+ return first_line
78
+ return first_line[: _TITLE_MAX_LEN - 1] + "…"
79
+
80
+
81
def _summary_from_plain(text: str) -> str:
    """Build a summary from plain text.

    A trailing "全文" marker is removed first; text longer than
    ``_SUMMARY_MAX_LEN`` is cut to that length ending in "…".
    """
    body = text.strip()
    body = _FULL_TEXT_RE.sub("", body).strip()
    too_long = len(body) > _SUMMARY_MAX_LEN
    return body[: _SUMMARY_MAX_LEN - 1] + "…" if too_long else body
86
+
87
+
88
+ def _summary_from_blocks(blocks: list[NewsContentBlock]) -> str:
89
+ for block in blocks:
90
+ if block.text and block.text.strip():
91
+ snippet = block.text.strip()
92
+ if len(snippet) <= _SUMMARY_MAX_LEN:
93
+ return snippet
94
+ return snippet[: _SUMMARY_MAX_LEN - 1] + "…"
95
+ return ""
96
+
97
+
98
+ def _blocks_from_post(post: Post) -> list[NewsContentBlock]:
99
+ blocks: list[NewsContentBlock] = []
100
+ text = post.text.strip()
101
+ if text:
102
+ blocks.append(NewsContentBlock(type=NewsBlockType.TEXT, text=text))
103
+ for url in post.pic_urls:
104
+ blocks.append(NewsContentBlock(type=NewsBlockType.IMAGE, url=url))
105
+ return blocks
@@ -0,0 +1,10 @@
1
def weibo_timeline_container_id(uid: str) -> str:
    """Build the timeline ``containerid`` for a user (fixed ``107603`` prefix).

    Args:
        uid: Weibo numeric uid.

    Raises:
        ValueError: If ``uid`` is blank.
    """
    cleaned = str(uid).strip()
    if cleaned:
        return "107603" + cleaned
    raise ValueError("uid must not be empty")
@@ -0,0 +1,14 @@
1
def normalize_proxy_url(value: str | None) -> str | None:
    """Normalise ``host:port`` or a full URL into a proxy URL.

    Args:
        value: Proxy address; an empty string or ``None`` means no proxy.

    Returns:
        ``None`` when no proxy is configured; the trimmed value when it
        already contains a scheme; otherwise the value prefixed with
        ``http://``.
    """
    cleaned = "" if value is None else value.strip()
    if not cleaned:
        return None
    return cleaned if "://" in cleaned else f"http://{cleaned}"
@@ -0,0 +1,3 @@
1
+ from ff14_news.channels.jp_official.channel import JpOfficialChannel
2
+
3
+ __all__ = ["JpOfficialChannel"]
@@ -0,0 +1,110 @@
1
+ from datetime import datetime, timezone
2
+
3
+ from ff14_news.channels.jp_official.constants import (
4
+ CHANNEL_ID,
5
+ DETAIL_URL_TEMPLATE,
6
+ DISPLAY_NAME,
7
+ TOPICS_LIST_URL,
8
+ )
9
+ from ff14_news.channels.jp_official.detail_parser import (
10
+ parse_detail_metadata,
11
+ parse_detail_page,
12
+ )
13
+ from ff14_news.channels.jp_official.http_client import fetch_html
14
+ from ff14_news.channels.jp_official.list_parser import (
15
+ list_row_to_item,
16
+ parse_topics_list_page,
17
+ topics_list_url,
18
+ )
19
+ from ff14_news.common.list_feed import article_from_list_item
20
+ from ff14_news.models import NewsArticle, NewsFeed, NewsListItem
21
+
22
# Bounds for the list scan in ``fetch_articles_by_ids``: up to
# ``_LIST_SCAN_MAX_PAGES`` list pages are requested, each with a row limit of
# ``_LIST_SCAN_PAGE_SIZE``.
_LIST_SCAN_PAGE_SIZE = 30
_LIST_SCAN_MAX_PAGES = 20
24
+
25
+
26
class JpOfficialChannel:
    """FF14 Japanese official site (Lodestone) Topics channel.

    List-level fields (including the ``news__list--banner`` summary) are
    fetched by default; content blocks require ``fetch_article_detail``.
    """

    channel_id = CHANNEL_ID
    display_name = DISPLAY_NAME

    def __init__(self, *, timeout_seconds: float = 120.0) -> None:
        """Store the timeout (in seconds) applied to every HTTP request."""
        self._timeout = timeout_seconds

    def list_items(
        self,
        *,
        limit: int = 10,
        page_index: int = 0,
    ) -> list[NewsListItem]:
        """Fetch one Topics list page and return up to ``limit`` items.

        Raises:
            ValueError: If ``limit`` is less than 1.
        """
        if limit < 1:
            raise ValueError("limit must be >= 1")
        url = topics_list_url(page_index)
        html = fetch_html(url, timeout_seconds=self._timeout)
        rows = parse_topics_list_page(html, limit=limit)
        return [list_row_to_item(row, channel_id=self.channel_id) for row in rows]

    def fetch_article_detail(self, article_id: str) -> NewsArticle:
        """Fetch a detail page and parse it into a full article with blocks.

        Raises:
            ValueError: If ``article_id`` is blank.
        """
        article_id = str(article_id).strip()
        if not article_id:
            raise ValueError("article_id must not be empty")
        url = DETAIL_URL_TEMPLATE.format(article_id=article_id)
        html = fetch_html(url, timeout_seconds=self._timeout)
        return parse_detail_page(html, article_id, channel_id=self.channel_id)

    def fetch_article(self, article_id: str) -> NewsArticle:
        """Alias of ``fetch_article_detail``."""
        return self.fetch_article_detail(article_id)

    def fetch_articles(
        self,
        *,
        limit: int = 10,
        page_index: int = 0,
    ) -> NewsFeed:
        """Return a feed built from one list page (list-level fields only)."""
        items = self.list_items(limit=limit, page_index=page_index)
        articles = [article_from_list_item(item) for item in items]
        return self._build_feed(articles)

    def fetch_articles_by_ids(self, article_ids: list[str]) -> NewsFeed:
        """Return a feed for the given ids, in the order they were requested.

        List pages are scanned first (bounded by ``_LIST_SCAN_MAX_PAGES``);
        ids not found there are resolved from their detail page metadata.

        Raises:
            ValueError: If ``article_ids`` is empty or contains a blank id.
        """
        if not article_ids:
            raise ValueError("article_ids must not be empty")
        keys = [str(aid).strip() for aid in article_ids]
        # A blank id can never match a list row: it would force a full scan
        # and then a request for a detail URL without an id. Fail fast with
        # the same error ``fetch_article_detail`` raises.
        if not all(keys):
            raise ValueError("article_id must not be empty")
        wanted = set(keys)
        found: dict[str, NewsArticle] = {}
        for page_index in range(_LIST_SCAN_MAX_PAGES):
            if wanted.issubset(found):
                break
            items = self.list_items(
                limit=_LIST_SCAN_PAGE_SIZE,
                page_index=page_index,
            )
            if not items:
                break
            for item in items:
                if item.id in wanted and item.id not in found:
                    found[item.id] = article_from_list_item(item)
        articles = [
            found[key] if key in found else self._metadata_from_detail(key)
            for key in keys
        ]
        return self._build_feed(articles)

    def _metadata_from_detail(self, article_id: str) -> NewsArticle:
        """Fetch a detail page and parse only its metadata (no blocks)."""
        url = DETAIL_URL_TEMPLATE.format(article_id=article_id)
        html = fetch_html(url, timeout_seconds=self._timeout)
        return parse_detail_metadata(html, article_id, channel_id=self.channel_id)

    def _build_feed(self, articles: list[NewsArticle]) -> NewsFeed:
        """Wrap articles in a ``NewsFeed`` stamped with the current UTC time."""
        return NewsFeed(
            channel_id=self.channel_id,
            source_list_url=TOPICS_LIST_URL,
            category_code=None,
            fetched_at=datetime.now(timezone.utc),
            articles=articles,
        )
@@ -0,0 +1,6 @@
1
# Identifier and human-readable name of this channel.
CHANNEL_ID = "jp_official"
DISPLAY_NAME = "FF14 日文官网(Lodestone)"

# Site origin and the URLs derived from it.
SITE_ORIGIN = "https://jp.finalfantasyxiv.com"
TOPICS_LIST_URL = f"{SITE_ORIGIN}/lodestone/topics/"
# The doubled braces leave a literal ``{article_id}`` placeholder in the
# string, to be filled in later with ``str.format``.
DETAIL_URL_TEMPLATE = f"{SITE_ORIGIN}/lodestone/topics/detail/{{article_id}}/"
@@ -0,0 +1,128 @@
1
+ import html
2
+ import re
3
+ from datetime import datetime, timezone
4
+
5
+ from ff14_news.channels.jp_official.constants import DETAIL_URL_TEMPLATE, SITE_ORIGIN
6
+ from ff14_news.common.html_blocks import html_to_blocks
7
+ from ff14_news.models import NewsArticle, NewsContentBlock
8
+
9
# Captures the markup between the opening ``news__detail__wrapper`` div and
# the closing ``</div>`` that directly precedes the ``news__detail__social`` div.
_WRAPPER_RE = re.compile(
    r'<div class="news__detail__wrapper">(.*?)</div>\s*<div class="news__detail__social">',
    re.DOTALL,
)
# Captures the text of the first tag-free <h1> after the ``news__detail`` article tag.
_TITLE_RE = re.compile(
    r'<article class="news__detail">.*?<h1>([^<]+)</h1>',
    re.DOTALL,
)
# Captures the integer first argument of the first ``ldst_strftime(`` call
# after the article tag; the parsers treat it as a Unix timestamp in seconds.
_TIMESTAMP_RE = re.compile(
    r'<article class="news__detail">.*?ldst_strftime\((\d+),',
    re.DOTALL,
)
# Captures the double-quoted ``src`` attribute of the first <img> tag.
_FIRST_IMG_RE = re.compile(r'<img[^>]+src="([^"]+)"')
22
+
23
+
24
def parse_detail_metadata(
    page_html: str,
    article_id: str,
    *,
    channel_id: str,
) -> NewsArticle:
    """Parse detail-page metadata: title, time, cover image and summary.

    Content blocks are not parsed; the returned article has ``blocks=[]``.
    Missing pieces degrade gracefully: an empty title, the Unix epoch (UTC)
    as publish date, no cover and an empty summary.
    """
    heading = _TITLE_RE.search(page_html)
    stamp = _TIMESTAMP_RE.search(page_html)
    body = _WRAPPER_RE.search(page_html)

    if heading is None:
        title = ""
    else:
        title = html.unescape(heading.group(1).strip())
    epoch_seconds = int(stamp.group(1)) if stamp else 0
    body_html = body.group(1) if body else ""

    return NewsArticle(
        channel_id=channel_id,
        id=article_id,
        title=title,
        publish_date=datetime.fromtimestamp(epoch_seconds, tz=timezone.utc),
        summary=_plain_summary_from_html(body_html),
        category_code=None,
        cover_image_url=_first_image_url(body_html),
        source_page_url=DETAIL_URL_TEMPLATE.format(article_id=article_id),
        blocks=[],
    )
59
+
60
+
61
def parse_detail_page(
    page_html: str,
    article_id: str,
    *,
    channel_id: str,
) -> NewsArticle:
    """Parse a detail page into a full article including content blocks.

    The summary is the first non-blank text block. Missing pieces degrade
    gracefully: an empty title, the Unix epoch (UTC) as publish date, no
    cover, and whatever ``html_to_blocks`` yields for an empty body.
    """
    heading = _TITLE_RE.search(page_html)
    stamp = _TIMESTAMP_RE.search(page_html)
    body = _WRAPPER_RE.search(page_html)

    if heading is None:
        title = ""
    else:
        title = html.unescape(heading.group(1).strip())
    epoch_seconds = int(stamp.group(1)) if stamp else 0
    body_html = body.group(1) if body else ""
    body_blocks = html_to_blocks(body_html, base_url=SITE_ORIGIN)

    return NewsArticle(
        channel_id=channel_id,
        id=article_id,
        title=title,
        publish_date=datetime.fromtimestamp(epoch_seconds, tz=timezone.utc),
        summary=_summary_from_blocks(body_blocks),
        category_code=None,
        cover_image_url=_first_image_url(body_html),
        source_page_url=DETAIL_URL_TEMPLATE.format(article_id=article_id),
        blocks=body_blocks,
    )
97
+
98
+
99
def _first_image_url(wrapper_html: str) -> str | None:
    """Return the trimmed ``src`` of the first ``<img>``, or ``None`` if absent or blank."""
    found = _FIRST_IMG_RE.search(wrapper_html)
    if found is None:
        return None
    return found.group(1).strip() or None
105
+
106
+
107
+ _TAG_RE = re.compile(r"<[^>]+>")
108
+
109
+
110
+ def _plain_summary_from_html(wrapper_html: str, max_len: int = 200) -> str:
111
+ text = _TAG_RE.sub(" ", wrapper_html)
112
+ text = html.unescape(text)
113
+ text = re.sub(r"\s+", " ", text).strip()
114
+ if not text:
115
+ return ""
116
+ if len(text) <= max_len:
117
+ return text
118
+ return text[: max_len - 1] + "…"
119
+
120
+
121
+ def _summary_from_blocks(blocks: list[NewsContentBlock], max_len: int = 200) -> str:
122
+ for block in blocks:
123
+ if block.text and block.text.strip():
124
+ text = block.text.strip()
125
+ if len(text) <= max_len:
126
+ return text
127
+ return text[: max_len - 1] + "…"
128
+ return ""
@@ -0,0 +1,16 @@
1
+ import urllib.error
2
+ import urllib.request
3
+
4
# User-Agent header sent with every request made by ``fetch_html``.
_USER_AGENT = "Mozilla/5.0 (compatible; python-library-ff14-news/0.1)"


def fetch_html(url: str, *, timeout_seconds: float) -> str:
    """Download ``url`` and return its body decoded as UTF-8.

    Args:
        url: Address to fetch.
        timeout_seconds: Timeout handed to ``urllib.request.urlopen``.

    Returns:
        The response body as text; undecodable bytes are replaced rather
        than raising.

    Raises:
        ValueError: If the server answers with an HTTP error status. Other
            urllib failures (for example ``URLError``) propagate unchanged.
    """
    req = urllib.request.Request(
        url,
        headers={"User-Agent": _USER_AGENT, "Accept": "text/html"},
    )
    try:
        # The context manager closes the response (and its connection) even
        # when reading fails, instead of leaving that to garbage collection.
        with urllib.request.urlopen(req, timeout=timeout_seconds) as response:
            raw = response.read()
    except urllib.error.HTTPError as exc:
        raise ValueError(f"HTTP {exc.code} for {url}") from exc
    return raw.decode("utf-8", "replace")