python-library-ff14-news 0.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ff14_news/__init__.py +25 -0
- ff14_news/channel_protocol.py +41 -0
- ff14_news/channels/__init__.py +1 -0
- ff14_news/channels/cn_official/__init__.py +3 -0
- ff14_news/channels/cn_official/channel.py +112 -0
- ff14_news/channels/cn_official/constants.py +13 -0
- ff14_news/channels/cn_official/cqnews_client.py +112 -0
- ff14_news/channels/cn_official/html_content.py +11 -0
- ff14_news/channels/cn_weibo/__init__.py +3 -0
- ff14_news/channels/cn_weibo/browser_cookies.py +93 -0
- ff14_news/channels/cn_weibo/channel.py +141 -0
- ff14_news/channels/cn_weibo/constants.py +10 -0
- ff14_news/channels/cn_weibo/crawl_backend.py +129 -0
- ff14_news/channels/cn_weibo/exceptions.py +2 -0
- ff14_news/channels/cn_weibo/mblog_parser.py +161 -0
- ff14_news/channels/cn_weibo/post_adapter.py +105 -0
- ff14_news/channels/cn_weibo/profile.py +10 -0
- ff14_news/channels/cn_weibo/proxy_url.py +14 -0
- ff14_news/channels/jp_official/__init__.py +3 -0
- ff14_news/channels/jp_official/channel.py +110 -0
- ff14_news/channels/jp_official/constants.py +6 -0
- ff14_news/channels/jp_official/detail_parser.py +128 -0
- ff14_news/channels/jp_official/http_client.py +16 -0
- ff14_news/channels/jp_official/list_parser.py +112 -0
- ff14_news/common/html_blocks.py +183 -0
- ff14_news/common/list_feed.py +20 -0
- ff14_news/ff14_news.py +64 -0
- ff14_news/models.py +74 -0
- python_library_ff14_news-0.0.0.dist-info/METADATA +8 -0
- python_library_ff14_news-0.0.0.dist-info/RECORD +31 -0
- python_library_ff14_news-0.0.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from crawl4weibo import WeiboClient
|
|
6
|
+
from crawl4weibo.exceptions.base import CrawlError, NetworkError, RateLimitError
|
|
7
|
+
from crawl4weibo.models.post import Post
|
|
8
|
+
|
|
9
|
+
from ff14_news.channels.cn_weibo.browser_cookies import fetch_mobile_cookies
|
|
10
|
+
from ff14_news.channels.cn_weibo.exceptions import WeiboAccessError
|
|
11
|
+
from ff14_news.channels.cn_weibo.proxy_url import normalize_proxy_url
|
|
12
|
+
|
|
13
|
+
_PLAYWRIGHT_INSTALL_HINT = (
|
|
14
|
+
"请先安装 Playwright Chromium:\n"
|
|
15
|
+
" python -m example.ensure_browser --proxy 127.0.0.1:7897"
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class WeiboCrawlBackend:
    """Weibo HTTP backend built on crawl4weibo (Playwright-obtained cookies + mobile API).

    Failures raised by crawl4weibo (``CrawlError``, ``NetworkError``,
    ``RateLimitError``) are re-raised as ``WeiboAccessError`` through
    ``_access_error``; invalid arguments raise ``ValueError``.
    """

    def __init__(
        self,
        *,
        cookie: str | None = None,
        cookie_storage_path: Path | None = None,
        browser_headless: bool = True,
        proxy_url: str | None = None,
    ) -> None:
        """Create the backend.

        Args:
            cookie: Full browser cookie string; when provided, no browser is
                launched to obtain cookies.
            cookie_storage_path: Playwright session cache path, used to reuse
                cookies.
            browser_headless: Whether Chromium runs headless when cookies are
                fetched automatically.
            proxy_url: HTTP proxy, e.g. ``127.0.0.1:7897`` or
                ``http://127.0.0.1:7897``.

        Raises:
            WeiboAccessError: If cookies cannot be obtained or the
                ``WeiboClient`` cannot be constructed.
        """
        self._proxy = normalize_proxy_url(proxy_url)
        if cookie:
            cookie_value: str | dict[str, str] = cookie
        else:
            # Any failure while driving the browser is reported uniformly as
            # an access error so callers only need to handle one type.
            try:
                cookie_value = fetch_mobile_cookies(
                    proxy_url=self._proxy,
                    headless=browser_headless,
                    storage_state_path=cookie_storage_path,
                )
            except Exception as exc:
                raise _access_error(exc) from exc
            if not cookie_value:
                raise WeiboAccessError(
                    "Playwright 未获取到微博 Cookie,请检查代理或手动配置 Cookie。"
                )
        try:
            # Cookies are supplied explicitly, so the client's own cookie
            # discovery is switched off.
            self._client = WeiboClient(
                cookies=cookie_value,
                auto_fetch_cookies=False,
                use_browser_cookies=False,
                log_level="WARNING",
            )
        except Exception as exc:
            raise _access_error(exc) from exc
        if self._proxy:
            self._client.add_proxy(self._proxy)

    def resolve_screen_name(self, screen_name: str) -> str:
        """Resolve a screen name to its numeric uid (returned as a string).

        The first 20 user-search results are scanned for a case-insensitive
        exact match of the stripped name.

        Raises:
            ValueError: If the name is blank or no result matches exactly.
            WeiboAccessError: If the search request fails.
        """
        name = screen_name.strip()
        if not name:
            raise ValueError("screen_name must not be empty")
        try:
            users = self._client.search_users(name, count=20)
        except (CrawlError, NetworkError, RateLimitError) as exc:
            raise _access_error(exc) from exc
        lowered = name.lower()
        for user in users:
            if user.screen_name.lower() == lowered:
                return str(user.id)
        raise ValueError(f"cannot resolve uid for screen_name={name!r}")

    def fetch_timeline_posts(self, uid: str, *, page: int) -> list[Post]:
        """Fetch one page of a user's timeline (long posts are not expanded).

        Raises:
            ValueError: If ``page`` is smaller than 1.
            WeiboAccessError: If the request fails.
        """
        if page < 1:
            raise ValueError("page must be >= 1")
        try:
            return self._client.get_user_posts(
                uid,
                page=page,
                expand=False,
            )
        except (CrawlError, NetworkError, RateLimitError) as exc:
            raise _access_error(exc) from exc

    def fetch_post(self, article_id: str) -> Post:
        """Fetch a single post with list-level fields (long text not expanded).

        Raises:
            ValueError: If ``article_id`` is blank.
            WeiboAccessError: If the request fails.
        """
        return self._fetch_by_bid(article_id, expand=False)

    def fetch_post_detail(self, article_id: str) -> Post:
        """Fetch a single post with its long text expanded.

        Raises:
            ValueError: If ``article_id`` is blank.
            WeiboAccessError: If the request fails.
        """
        return self._fetch_by_bid(article_id, expand=True)

    def _fetch_by_bid(self, article_id: str, *, expand: bool) -> Post:
        """Validate the id, fetch the post, and translate crawl errors."""
        article_id = str(article_id).strip()
        if not article_id:
            raise ValueError("article_id must not be empty")
        try:
            return self._client.get_post_by_bid(article_id, expand=expand)
        except (CrawlError, NetworkError, RateLimitError) as exc:
            raise _access_error(exc) from exc
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _access_error(exc: Exception) -> WeiboAccessError:
    """Build a ``WeiboAccessError`` with an actionable message for ``exc``.

    The message of ``exc`` is inspected in this order:

    1. mentions of Playwright / an executable / Chromium -> install hint;
    2. the number 432 -> proxy / cookie hint;
    3. ``RateLimitError`` instances -> rate-limit message;
    4. anything else -> the original message, or a generic fallback when the
       message is empty.

    The error is returned, not raised; callers use ``raise ... from exc``.
    """
    import re  # local import: this module has no file-level ``re`` import

    message = str(exc).strip()
    lowered = message.lower()
    if any(marker in lowered for marker in ("playwright", "executable", "chromium")):
        return WeiboAccessError(
            f"微博需要 Playwright Chromium。{_PLAYWRIGHT_INSTALL_HINT}"
        )
    # Match 432 only as a standalone number. A plain substring test would also
    # fire on unrelated digits such as a port (":14320") or a numeric id.
    # Digit lookarounds are used instead of \b because CJK characters count as
    # word characters and would defeat a word-boundary match.
    if re.search(r"(?<!\d)432(?!\d)", message):
        return WeiboAccessError(
            "微博返回 HTTP 432。"
            "请确认代理可用(如 127.0.0.1:7897),或配置 example/weibo_cookie.txt。"
        )
    if isinstance(exc, RateLimitError):
        return WeiboAccessError(f"微博限流:{message}")
    return WeiboAccessError(message or "微博请求失败")
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
import html
|
|
2
|
+
import re
|
|
3
|
+
from datetime import datetime
|
|
4
|
+
|
|
5
|
+
from ff14_news.channels.cn_weibo.constants import (
|
|
6
|
+
DETAIL_URL_TEMPLATE,
|
|
7
|
+
MOBILE_ORIGIN,
|
|
8
|
+
PERMALINK_TEMPLATE,
|
|
9
|
+
)
|
|
10
|
+
from ff14_news.common.html_blocks import html_to_blocks
|
|
11
|
+
from ff14_news.models import NewsArticle, NewsBlockType, NewsContentBlock, NewsListItem
|
|
12
|
+
|
|
13
|
+
_TAG_RE = re.compile(r"<[^>]+>")
|
|
14
|
+
_FULL_TEXT_RE = re.compile(r"…?\.\.\.?全文\s*$|…全文\s*$")
|
|
15
|
+
_TITLE_MAX_LEN = 80
|
|
16
|
+
_SUMMARY_MAX_LEN = 200
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def effective_mblog(mblog: dict) -> dict:
    """Return the post whose content should be used.

    For a repost this is the nested ``retweeted_status`` dict; for an ordinary
    post (or when that key is not a dict) it is ``mblog`` itself.
    """
    original = mblog.get("retweeted_status")
    return original if isinstance(original, dict) else mblog
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def needs_detail_fetch(mblog: dict) -> bool:
    """Tell whether the full post must be fetched (statuses/show).

    True when the post is flagged ``isLongText``, or when its text contains
    the "全文" marker together with either a "status" reference in the raw
    markup or a trailing "...全文" pattern in the plain text.
    """
    if mblog.get("isLongText"):
        return True
    raw_html = str(mblog.get("text") or "")
    stripped = _plain_text(raw_html)
    if "全文" not in raw_html:
        return False
    if "status" in raw_html:
        return True
    return bool(_FULL_TEXT_RE.search(stripped))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def mblog_id(mblog: dict) -> str:
    """Return the post identifier as a string.

    The first truthy value among the ``id``, ``mid`` and ``bid`` keys wins.

    Raises:
        ValueError: If none of the three keys holds a truthy value.
    """
    for key in ("id", "mid", "bid"):
        candidate = mblog.get(key)
        if candidate:
            return str(candidate)
    raise ValueError("mblog missing id")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def parse_created_at(raw: str) -> datetime:
    """Parse a ``created_at`` string into a timezone-aware datetime.

    The expected layout is ``"%a %b %d %H:%M:%S %z %Y"``, for example
    ``"Mon Jan 01 12:00:00 +0800 2024"``.

    Args:
        raw: The timestamp text; ``None`` or blank is tolerated.

    Returns:
        The parsed aware datetime, or the Unix epoch in UTC when ``raw`` is
        empty or does not match the layout.
    """
    from datetime import timezone  # local import: the module only imports ``datetime``

    # The fallback must be timezone-aware like the parsed value (which carries
    # the %z offset). A naive epoch cannot be compared with aware datetimes,
    # so sorting a mixed list would raise TypeError.
    epoch = datetime.fromtimestamp(0, tz=timezone.utc)
    text = (raw or "").strip()
    if not text:
        return epoch
    try:
        # NOTE: %a/%b are locale-dependent; assumes English day/month names
        # parse under the active locale.
        return datetime.strptime(text, "%a %b %d %H:%M:%S %z %Y")
    except ValueError:
        return epoch
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def mblog_to_list_item(mblog: dict, *, channel_id: str) -> NewsListItem:
    """Convert a raw mblog dict into a list-level ``NewsListItem``.

    The id and ``created_at`` come from ``mblog`` itself; title, summary and
    cover image come from the effective post (the reposted original when
    there is one).
    """
    item_id = mblog_id(mblog)
    content = effective_mblog(mblog)
    created_raw = str(mblog.get("created_at") or "")
    return NewsListItem(
        channel_id=channel_id,
        id=item_id,
        title=_title_from_text(content),
        publish_date=parse_created_at(created_raw),
        summary=_summary_from_text(content),
        cover_image_url=_first_pic_url(content),
        source_page_url=DETAIL_URL_TEMPLATE.format(article_id=item_id),
    )
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def mblog_to_article(
    mblog: dict,
    *,
    channel_id: str,
) -> NewsArticle:
    """Convert a raw mblog dict into a ``NewsArticle`` with content blocks.

    The summary is taken from the first non-blank text block and falls back
    to the plain text of the effective post when no block carries text.
    """
    item_id = mblog_id(mblog)
    content = effective_mblog(mblog)
    body_blocks = blocks_from_mblog(mblog)
    headline = _title_from_text(content)
    abstract = _summary_from_blocks(body_blocks)
    if not abstract:
        abstract = _summary_from_text(content)
    created_raw = str(mblog.get("created_at") or "")
    return NewsArticle(
        channel_id=channel_id,
        id=item_id,
        title=headline,
        publish_date=parse_created_at(created_raw),
        summary=abstract,
        category_code=None,
        cover_image_url=_first_pic_url(content),
        source_page_url=PERMALINK_TEMPLATE.format(article_id=item_id),
        blocks=body_blocks,
    )
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def blocks_from_mblog(mblog: dict) -> list[NewsContentBlock]:
    """Build content blocks for the effective post.

    The post's ``text`` markup is wrapped in a ``<div>`` and converted with
    ``html_to_blocks``; one IMAGE block is then appended for every entry of
    ``pics`` that yields a URL.
    """
    content = effective_mblog(mblog)
    body_html = str(content.get("text") or "")
    markup = f"<div>{body_html}</div>" if body_html else ""
    result = html_to_blocks(markup, base_url=MOBILE_ORIGIN)
    image_urls = (_pic_url(pic) for pic in content.get("pics") or [])
    result.extend(
        NewsContentBlock(type=NewsBlockType.IMAGE, url=image_url)
        for image_url in image_urls
        if image_url
    )
    return result
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _plain_text(text_html: str) -> str:
    """Reduce an HTML fragment to plain text.

    ``<br>`` variants become newlines, every other tag is removed, HTML
    entities are unescaped, non-breaking spaces become ordinary spaces, and
    the result is stripped.
    """
    # Convert line breaks before dropping markup. Deleting <br> outright glues
    # adjacent lines together, and callers that take the first line by
    # splitting on "\n" would then see the whole body as one line.
    with_breaks = re.sub(r"<br\s*/?>", "\n", text_html, flags=re.IGNORECASE)
    text = _TAG_RE.sub("", with_breaks)
    return html.unescape(text).replace("\xa0", " ").strip()
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _title_from_text(mblog: dict) -> str:
    """Derive a title from the first line of the post's plain text.

    Returns "微博" when that line is blank, and shortens lines longer than
    ``_TITLE_MAX_LEN`` so the result, including the trailing ellipsis, is
    exactly that long.
    """
    body = _plain_text(str(mblog.get("text") or ""))
    headline = body.partition("\n")[0].strip()
    if not headline:
        return "微博"
    if len(headline) > _TITLE_MAX_LEN:
        return headline[: _TITLE_MAX_LEN - 1] + "…"
    return headline
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _summary_from_text(mblog: dict) -> str:
    """Derive a summary from the post's plain text.

    A trailing "...全文" marker is removed, and text longer than
    ``_SUMMARY_MAX_LEN`` is cut so the result ends with an ellipsis and is
    exactly that long.
    """
    body = _plain_text(str(mblog.get("text") or ""))
    trimmed = _FULL_TEXT_RE.sub("", body).strip()
    if len(trimmed) > _SUMMARY_MAX_LEN:
        return trimmed[: _SUMMARY_MAX_LEN - 1] + "…"
    return trimmed
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _summary_from_blocks(blocks: list[NewsContentBlock]) -> str:
|
|
134
|
+
for block in blocks:
|
|
135
|
+
if block.text and block.text.strip():
|
|
136
|
+
text = block.text.strip()
|
|
137
|
+
if len(text) <= _SUMMARY_MAX_LEN:
|
|
138
|
+
return text
|
|
139
|
+
return text[: _SUMMARY_MAX_LEN - 1] + "…"
|
|
140
|
+
return ""
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _first_pic_url(mblog: dict) -> str | None:
|
|
144
|
+
pics = mblog.get("pics") or []
|
|
145
|
+
if not pics:
|
|
146
|
+
return None
|
|
147
|
+
return _pic_url(pics[0])
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _pic_url(pic: object) -> str | None:
|
|
151
|
+
if not isinstance(pic, dict):
|
|
152
|
+
return None
|
|
153
|
+
large = pic.get("large")
|
|
154
|
+
if isinstance(large, dict):
|
|
155
|
+
url = large.get("url")
|
|
156
|
+
if url:
|
|
157
|
+
return str(url).strip() or None
|
|
158
|
+
url = pic.get("url")
|
|
159
|
+
if url:
|
|
160
|
+
return str(url).strip() or None
|
|
161
|
+
return None
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
|
|
5
|
+
from crawl4weibo.models.post import Post
|
|
6
|
+
|
|
7
|
+
from ff14_news.channels.cn_weibo.constants import (
|
|
8
|
+
DETAIL_URL_TEMPLATE,
|
|
9
|
+
PERMALINK_TEMPLATE,
|
|
10
|
+
)
|
|
11
|
+
from ff14_news.channels.cn_weibo.mblog_parser import (
|
|
12
|
+
_SUMMARY_MAX_LEN,
|
|
13
|
+
_TITLE_MAX_LEN,
|
|
14
|
+
_FULL_TEXT_RE,
|
|
15
|
+
)
|
|
16
|
+
from ff14_news.models import NewsArticle, NewsBlockType, NewsContentBlock, NewsListItem
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def post_to_list_item(post: Post, *, channel_id: str) -> NewsListItem:
    """Convert a crawl4weibo ``Post`` into a list-level ``NewsListItem``.

    The id and creation time come from ``post``; title, summary and cover
    image come from the effective post (the reposted original when present).
    """
    content = _effective_post(post)
    item_id = str(post.id).strip()
    pictures = content.pic_urls
    return NewsListItem(
        channel_id=channel_id,
        id=item_id,
        title=_title_from_plain(content.text),
        publish_date=_publish_date(post.created_at),
        summary=_summary_from_plain(content.text),
        cover_image_url=pictures[0] if pictures else None,
        source_page_url=DETAIL_URL_TEMPLATE.format(article_id=item_id),
    )
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def post_to_article(post: Post, *, channel_id: str) -> NewsArticle:
    """Convert a crawl4weibo ``Post`` into a ``NewsArticle`` with content blocks.

    The summary comes from the first non-blank text block and falls back to
    the effective post's text when no block carries text.
    """
    content = _effective_post(post)
    item_id = str(post.id).strip()
    body_blocks = _blocks_from_post(content)
    headline = _title_from_plain(content.text)
    abstract = _summary_from_blocks(body_blocks)
    if not abstract:
        abstract = _summary_from_plain(content.text)
    pictures = content.pic_urls
    return NewsArticle(
        channel_id=channel_id,
        id=item_id,
        title=headline,
        publish_date=_publish_date(post.created_at),
        summary=abstract,
        category_code=None,
        cover_image_url=pictures[0] if pictures else None,
        source_page_url=PERMALINK_TEMPLATE.format(article_id=item_id),
        blocks=body_blocks,
    )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _effective_post(post: Post) -> Post:
|
|
59
|
+
if post.retweeted_status is not None:
|
|
60
|
+
return post.retweeted_status
|
|
61
|
+
return post
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _publish_date(value: datetime | None) -> datetime:
|
|
65
|
+
if value is None:
|
|
66
|
+
return datetime.fromtimestamp(0, tz=timezone.utc)
|
|
67
|
+
if value.tzinfo is None:
|
|
68
|
+
return value.replace(tzinfo=timezone.utc)
|
|
69
|
+
return value
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _title_from_plain(text: str) -> str:
|
|
73
|
+
first_line = text.strip().split("\n", 1)[0].strip()
|
|
74
|
+
if not first_line:
|
|
75
|
+
return "微博"
|
|
76
|
+
if len(first_line) <= _TITLE_MAX_LEN:
|
|
77
|
+
return first_line
|
|
78
|
+
return first_line[: _TITLE_MAX_LEN - 1] + "…"
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _summary_from_plain(text: str) -> str:
    """Derive a summary from plain text.

    A trailing "...全文" marker is removed, and text longer than
    ``_SUMMARY_MAX_LEN`` is cut so the result ends with an ellipsis and is
    exactly that long.
    """
    trimmed = _FULL_TEXT_RE.sub("", text.strip()).strip()
    if len(trimmed) > _SUMMARY_MAX_LEN:
        return trimmed[: _SUMMARY_MAX_LEN - 1] + "…"
    return trimmed
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _summary_from_blocks(blocks: list[NewsContentBlock]) -> str:
|
|
89
|
+
for block in blocks:
|
|
90
|
+
if block.text and block.text.strip():
|
|
91
|
+
snippet = block.text.strip()
|
|
92
|
+
if len(snippet) <= _SUMMARY_MAX_LEN:
|
|
93
|
+
return snippet
|
|
94
|
+
return snippet[: _SUMMARY_MAX_LEN - 1] + "…"
|
|
95
|
+
return ""
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _blocks_from_post(post: Post) -> list[NewsContentBlock]:
|
|
99
|
+
blocks: list[NewsContentBlock] = []
|
|
100
|
+
text = post.text.strip()
|
|
101
|
+
if text:
|
|
102
|
+
blocks.append(NewsContentBlock(type=NewsBlockType.TEXT, text=text))
|
|
103
|
+
for url in post.pic_urls:
|
|
104
|
+
blocks.append(NewsContentBlock(type=NewsBlockType.IMAGE, url=url))
|
|
105
|
+
return blocks
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
def normalize_proxy_url(value: str | None) -> str | None:
    """Normalise a proxy address given as ``host:port`` or as a full URL.

    Args:
        value: Proxy address; ``None`` or a blank string means "no proxy".

    Returns:
        ``None`` when no proxy is configured; the stripped value prefixed
        with ``http://`` when it has no scheme; otherwise the stripped value
        unchanged.
    """
    candidate = None if value is None else value.strip()
    if not candidate:
        return None
    has_scheme = "://" in candidate
    return candidate if has_scheme else f"http://{candidate}"
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
from datetime import datetime, timezone
|
|
2
|
+
|
|
3
|
+
from ff14_news.channels.jp_official.constants import (
|
|
4
|
+
CHANNEL_ID,
|
|
5
|
+
DETAIL_URL_TEMPLATE,
|
|
6
|
+
DISPLAY_NAME,
|
|
7
|
+
TOPICS_LIST_URL,
|
|
8
|
+
)
|
|
9
|
+
from ff14_news.channels.jp_official.detail_parser import (
|
|
10
|
+
parse_detail_metadata,
|
|
11
|
+
parse_detail_page,
|
|
12
|
+
)
|
|
13
|
+
from ff14_news.channels.jp_official.http_client import fetch_html
|
|
14
|
+
from ff14_news.channels.jp_official.list_parser import (
|
|
15
|
+
list_row_to_item,
|
|
16
|
+
parse_topics_list_page,
|
|
17
|
+
topics_list_url,
|
|
18
|
+
)
|
|
19
|
+
from ff14_news.common.list_feed import article_from_list_item
|
|
20
|
+
from ff14_news.models import NewsArticle, NewsFeed, NewsListItem
|
|
21
|
+
|
|
22
|
+
_LIST_SCAN_PAGE_SIZE = 30
|
|
23
|
+
_LIST_SCAN_MAX_PAGES = 20
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class JpOfficialChannel:
    """FF14 Japanese official site (Lodestone) topics channel.

    Listing calls return list-level fields only (including the
    news__list--banner summary); content blocks require
    ``fetch_article_detail``.
    """

    channel_id = CHANNEL_ID
    display_name = DISPLAY_NAME

    def __init__(self, *, timeout_seconds: float = 120.0) -> None:
        """Store the timeout passed to every ``fetch_html`` call."""
        self._timeout = timeout_seconds

    def list_items(
        self,
        *,
        limit: int = 10,
        page_index: int = 0,
    ) -> list[NewsListItem]:
        """Fetch one topics list page and return at most ``limit`` items.

        Raises:
            ValueError: If ``limit`` is smaller than 1.
        """
        if limit < 1:
            raise ValueError("limit must be >= 1")
        page_html = fetch_html(
            topics_list_url(page_index),
            timeout_seconds=self._timeout,
        )
        items: list[NewsListItem] = []
        for row in parse_topics_list_page(page_html, limit=limit):
            items.append(list_row_to_item(row, channel_id=self.channel_id))
        return items

    def fetch_article_detail(self, article_id: str) -> NewsArticle:
        """Fetch and fully parse one article's detail page.

        Raises:
            ValueError: If ``article_id`` is blank.
        """
        article_id = str(article_id).strip()
        if not article_id:
            raise ValueError("article_id must not be empty")
        page_html = fetch_html(
            DETAIL_URL_TEMPLATE.format(article_id=article_id),
            timeout_seconds=self._timeout,
        )
        return parse_detail_page(page_html, article_id, channel_id=self.channel_id)

    def fetch_article(self, article_id: str) -> NewsArticle:
        """Alias of ``fetch_article_detail``."""
        return self.fetch_article_detail(article_id)

    def fetch_articles(
        self,
        *,
        limit: int = 10,
        page_index: int = 0,
    ) -> NewsFeed:
        """Return a feed of list-level articles for one list page."""
        listed = self.list_items(limit=limit, page_index=page_index)
        return self._build_feed([article_from_list_item(entry) for entry in listed])

    def fetch_articles_by_ids(self, article_ids: list[str]) -> NewsFeed:
        """Return a feed with one article per requested id, in request order.

        List pages are scanned (up to ``_LIST_SCAN_MAX_PAGES`` pages of
        ``_LIST_SCAN_PAGE_SIZE`` items) until every id is found or a page
        comes back empty; ids still missing are filled from their detail
        page metadata.

        Raises:
            ValueError: If ``article_ids`` is empty.
        """
        if not article_ids:
            raise ValueError("article_ids must not be empty")
        wanted = {str(aid).strip() for aid in article_ids}
        found: dict[str, NewsArticle] = {}
        page_index = 0
        while page_index < _LIST_SCAN_MAX_PAGES and not wanted.issubset(found):
            listed = self.list_items(
                limit=_LIST_SCAN_PAGE_SIZE,
                page_index=page_index,
            )
            if not listed:
                break
            for entry in listed:
                # Keep the first occurrence of each wanted id.
                if entry.id in wanted and entry.id not in found:
                    found[entry.id] = article_from_list_item(entry)
            page_index += 1
        requested_keys = (str(aid).strip() for aid in article_ids)
        articles: list[NewsArticle] = [
            found[key] if key in found else self._metadata_from_detail(key)
            for key in requested_keys
        ]
        return self._build_feed(articles)

    def _metadata_from_detail(self, article_id: str) -> NewsArticle:
        """Fetch a detail page and parse only its metadata (no content blocks)."""
        page_html = fetch_html(
            DETAIL_URL_TEMPLATE.format(article_id=article_id),
            timeout_seconds=self._timeout,
        )
        return parse_detail_metadata(page_html, article_id, channel_id=self.channel_id)

    def _build_feed(self, articles: list[NewsArticle]) -> NewsFeed:
        """Wrap articles in a ``NewsFeed`` stamped with the current UTC time."""
        return NewsFeed(
            channel_id=self.channel_id,
            source_list_url=TOPICS_LIST_URL,
            category_code=None,
            fetched_at=datetime.now(timezone.utc),
            articles=articles,
        )
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
import html
|
|
2
|
+
import re
|
|
3
|
+
from datetime import datetime, timezone
|
|
4
|
+
|
|
5
|
+
from ff14_news.channels.jp_official.constants import DETAIL_URL_TEMPLATE, SITE_ORIGIN
|
|
6
|
+
from ff14_news.common.html_blocks import html_to_blocks
|
|
7
|
+
from ff14_news.models import NewsArticle, NewsContentBlock
|
|
8
|
+
|
|
9
|
+
_WRAPPER_RE = re.compile(
|
|
10
|
+
r'<div class="news__detail__wrapper">(.*?)</div>\s*<div class="news__detail__social">',
|
|
11
|
+
re.DOTALL,
|
|
12
|
+
)
|
|
13
|
+
_TITLE_RE = re.compile(
|
|
14
|
+
r'<article class="news__detail">.*?<h1>([^<]+)</h1>',
|
|
15
|
+
re.DOTALL,
|
|
16
|
+
)
|
|
17
|
+
_TIMESTAMP_RE = re.compile(
|
|
18
|
+
r'<article class="news__detail">.*?ldst_strftime\((\d+),',
|
|
19
|
+
re.DOTALL,
|
|
20
|
+
)
|
|
21
|
+
_FIRST_IMG_RE = re.compile(r'<img[^>]+src="([^"]+)"')
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _parse_detail_header(page_html: str) -> tuple[str, datetime, str]:
    """Extract the title, publish time and wrapper HTML shared by both parsers.

    Returns:
        ``(title, publish_date, wrapper_html)``. The title and wrapper default
        to "" and the publish time to the Unix epoch in UTC when the matching
        pattern is not found in ``page_html``.
    """
    title_match = _TITLE_RE.search(page_html)
    title = html.unescape(title_match.group(1).strip()) if title_match else ""

    ts_match = _TIMESTAMP_RE.search(page_html)
    epoch_seconds = int(ts_match.group(1)) if ts_match else 0
    publish_date = datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)

    wrapper_match = _WRAPPER_RE.search(page_html)
    wrapper_html = wrapper_match.group(1) if wrapper_match else ""
    return title, publish_date, wrapper_html


def parse_detail_metadata(
    page_html: str,
    article_id: str,
    *,
    channel_id: str,
) -> NewsArticle:
    """Parse detail-page metadata: title, time, cover image and summary.

    Content blocks are not parsed; the returned article has ``blocks=[]`` and
    a summary built from the wrapper's plain text.
    """
    title, publish_date, wrapper_html = _parse_detail_header(page_html)
    return NewsArticle(
        channel_id=channel_id,
        id=article_id,
        title=title,
        publish_date=publish_date,
        summary=_plain_summary_from_html(wrapper_html),
        category_code=None,
        cover_image_url=_first_image_url(wrapper_html),
        source_page_url=DETAIL_URL_TEMPLATE.format(article_id=article_id),
        blocks=[],
    )


def parse_detail_page(
    page_html: str,
    article_id: str,
    *,
    channel_id: str,
) -> NewsArticle:
    """Parse a full detail page, including its content blocks.

    The summary is taken from the first non-blank text block.
    """
    title, publish_date, wrapper_html = _parse_detail_header(page_html)
    blocks = html_to_blocks(wrapper_html, base_url=SITE_ORIGIN)
    return NewsArticle(
        channel_id=channel_id,
        id=article_id,
        title=title,
        publish_date=publish_date,
        summary=_summary_from_blocks(blocks),
        category_code=None,
        cover_image_url=_first_image_url(wrapper_html),
        source_page_url=DETAIL_URL_TEMPLATE.format(article_id=article_id),
        blocks=blocks,
    )
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _first_image_url(wrapper_html: str) -> str | None:
    """Return the ``src`` of the first ``<img>`` in the fragment, if any.

    The value is stripped; None is returned when no image matches or the
    attribute is blank.
    """
    found = _FIRST_IMG_RE.search(wrapper_html)
    src = found.group(1).strip() if found else ""
    return src or None
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
_TAG_RE = re.compile(r"<[^>]+>")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _plain_summary_from_html(wrapper_html: str, max_len: int = 200) -> str:
    """Build a plain-text summary from an HTML fragment.

    Tags are replaced by spaces, entities are unescaped and whitespace runs
    are collapsed to single spaces. Text longer than ``max_len`` is cut so
    the result ends with an ellipsis and is exactly ``max_len`` long.
    """
    without_tags = _TAG_RE.sub(" ", wrapper_html)
    decoded = html.unescape(without_tags)
    collapsed = re.sub(r"\s+", " ", decoded).strip()
    if len(collapsed) > max_len:
        return collapsed[: max_len - 1] + "…"
    return collapsed
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _summary_from_blocks(blocks: list[NewsContentBlock], max_len: int = 200) -> str:
|
|
122
|
+
for block in blocks:
|
|
123
|
+
if block.text and block.text.strip():
|
|
124
|
+
text = block.text.strip()
|
|
125
|
+
if len(text) <= max_len:
|
|
126
|
+
return text
|
|
127
|
+
return text[: max_len - 1] + "…"
|
|
128
|
+
return ""
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import urllib.error
|
|
2
|
+
import urllib.request
|
|
3
|
+
|
|
4
|
+
_USER_AGENT = "Mozilla/5.0 (compatible; python-library-ff14-news/0.1)"
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def fetch_html(url: str, *, timeout_seconds: float) -> str:
    """Download ``url`` and return the body decoded as UTF-8.

    Args:
        url: Address to fetch.
        timeout_seconds: Timeout passed to ``urlopen``.

    Returns:
        The response body; undecodable bytes are replaced, not raised.

    Raises:
        ValueError: If the server answers with an HTTP error status. Other
            failures (e.g. ``urllib.error.URLError``) propagate unchanged.
    """
    req = urllib.request.Request(
        url,
        headers={"User-Agent": _USER_AGENT, "Accept": "text/html"},
    )
    try:
        # The context manager closes the response (and its socket) even when
        # read() fails; the bare urlopen(...).read() left that to the GC.
        with urllib.request.urlopen(req, timeout=timeout_seconds) as response:
            raw = response.read()
    except urllib.error.HTTPError as exc:
        raise ValueError(f"HTTP {exc.code} for {url}") from exc
    return raw.decode("utf-8", "replace")
|