python-library-ff14-news 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
ff14_news/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ from ff14_news.channel_protocol import NewsChannel
2
+ from ff14_news.channels.cn_official import CnOfficialChannel
3
+ from ff14_news.channels.cn_weibo import CnWeiboChannel
4
+ from ff14_news.channels.jp_official import JpOfficialChannel
5
+ from ff14_news.ff14_news import FF14News
6
+ from ff14_news.models import (
7
+ NewsArticle,
8
+ NewsBlockType,
9
+ NewsContentBlock,
10
+ NewsFeed,
11
+ NewsListItem,
12
+ )
13
+
14
+ __all__ = [
15
+ "FF14News",
16
+ "NewsChannel",
17
+ "CnOfficialChannel",
18
+ "CnWeiboChannel",
19
+ "JpOfficialChannel",
20
+ "NewsArticle",
21
+ "NewsBlockType",
22
+ "NewsContentBlock",
23
+ "NewsFeed",
24
+ "NewsListItem",
25
+ ]
@@ -0,0 +1,41 @@
1
+ from typing import Protocol, runtime_checkable
2
+
3
+ from ff14_news.models import NewsArticle, NewsFeed, NewsListItem
4
+
5
+
6
@runtime_checkable
class NewsChannel(Protocol):
    """Structural interface shared by all news channels.

    Every channel is implemented independently but emits the same
    feed / article structures.
    """

    channel_id: str
    display_name: str

    def list_items(
        self, *, limit: int = 10, page_index: int = 0
    ) -> list[NewsListItem]:
        """Fetch list metadata, ordered like the source site's own list."""

    def fetch_article_detail(self, article_id: str) -> NewsArticle:
        """Fetch one article and parse its body blocks (blocks non-empty)."""

    def fetch_article(self, article_id: str) -> NewsArticle:
        """Fetch one article; same as ``fetch_article_detail``."""

    def fetch_articles(
        self, *, limit: int = 10, page_index: int = 0
    ) -> NewsFeed:
        """Fetch list-level fields in list order, leaving blocks empty.

        List-level fields are title, summary, cover image and link.
        """

    def fetch_articles_by_ids(self, article_ids: list[str]) -> NewsFeed:
        """Fetch list-level fields in the given ID order, without body blocks."""
@@ -0,0 +1 @@
1
+ """各新闻渠道实现(按子目录拆分,互不混用)。"""
@@ -0,0 +1,3 @@
1
+ from ff14_news.channels.cn_official.channel import CnOfficialChannel
2
+
3
+ __all__ = ["CnOfficialChannel"]
@@ -0,0 +1,112 @@
1
+ from datetime import datetime, timezone
2
+
3
+ from ff14_news.channels.cn_official.constants import (
4
+ CHANNEL_ID,
5
+ DISPLAY_NAME,
6
+ NEWS_LIST_CATEGORY_CODE,
7
+ OFFICIAL_NEWS_DETAIL_URL_TEMPLATE,
8
+ OFFICIAL_NEWS_LIST_URL,
9
+ )
10
+ from ff14_news.channels.cn_official.cqnews_client import CqNewsClient, parse_publish_date
11
+ from ff14_news.channels.cn_official.html_content import html_to_blocks
12
+ from ff14_news.common.list_feed import article_from_list_item
13
+ from ff14_news.models import NewsArticle, NewsFeed, NewsListItem
14
+
15
+
16
class CnOfficialChannel:
    """News from the FF14 China official site (ff.web.sdo.com / cqnews).

    List-level fetches carry cover image, title, summary and detail-page
    link; body blocks are only filled by an explicit
    ``fetch_article_detail`` call.
    """

    channel_id = CHANNEL_ID
    display_name = DISPLAY_NAME

    def __init__(
        self,
        *,
        category_code: int = NEWS_LIST_CATEGORY_CODE,
        timeout_seconds: float = 60.0,
    ) -> None:
        """Bind the channel to a news category and create its API client."""
        self.category_code = category_code
        self._client = CqNewsClient(timeout_seconds=timeout_seconds)

    def list_items(
        self,
        *,
        limit: int = 10,
        page_index: int = 0,
    ) -> list[NewsListItem]:
        """Return at most ``limit`` list items from the requested page.

        Raises:
            ValueError: if ``limit`` is smaller than 1.
        """
        if limit < 1:
            raise ValueError("limit must be >= 1")
        # The total count reported by the API is not needed here.
        page_items, _ = self._client.fetch_list_page(
            self.category_code, page_index, limit
        )
        return page_items[:limit]

    def fetch_article_detail(self, article_id: str) -> NewsArticle:
        """Fetch one article including its parsed body blocks."""
        raw_detail = self._client.fetch_detail_raw(article_id)
        return self._detail_to_article(raw_detail)

    def fetch_article(self, article_id: str) -> NewsArticle:
        """Alias of ``fetch_article_detail``."""
        return self.fetch_article_detail(article_id)

    def fetch_articles(
        self,
        *,
        limit: int = 10,
        page_index: int = 0,
    ) -> NewsFeed:
        """Build a feed of list-level articles for one list page."""
        feed_articles = []
        for entry in self.list_items(limit=limit, page_index=page_index):
            feed_articles.append(
                article_from_list_item(entry, category_code=self.category_code)
            )
        return self._build_feed(feed_articles)

    def fetch_articles_by_ids(self, article_ids: list[str]) -> NewsFeed:
        """Build a feed of list-level articles for the given IDs, in order.

        Raises:
            ValueError: if ``article_ids`` is empty.
        """
        if not article_ids:
            raise ValueError("article_ids must not be empty")
        feed_articles = []
        for wanted_id in article_ids:
            raw_detail = self._client.fetch_detail_raw(wanted_id)
            feed_articles.append(self._detail_to_list_article(raw_detail))
        return self._build_feed(feed_articles)

    def _detail_to_list_article(self, data: dict) -> NewsArticle:
        """Map a detail payload to an article with an empty block list."""
        ident = str(int(data["Id"]))
        raw_cover = data.get("HomeImagePath")
        # A falsy or whitespace-only cover path means "no cover image".
        cover_url = (str(raw_cover).strip() or None) if raw_cover else None
        publish_text = str(data.get("PublishDate") or "")
        title = str(data.get("Title") or "")
        published = parse_publish_date(publish_text)
        summary = str(data.get("Summary") or "")
        # Fall back to the channel's own category when the payload has none.
        category = int(data.get("CategoryCode") or self.category_code)
        page_url = OFFICIAL_NEWS_DETAIL_URL_TEMPLATE.format(article_id=ident)
        return NewsArticle(
            channel_id=self.channel_id,
            id=ident,
            title=title,
            publish_date=published,
            summary=summary,
            category_code=category,
            cover_image_url=cover_url,
            source_page_url=page_url,
            blocks=[],
        )

    def _detail_to_article(self, data: dict) -> NewsArticle:
        """Map a detail payload to an article with parsed body blocks."""
        base_article = self._detail_to_list_article(data)
        parsed_blocks = html_to_blocks(str(data.get("Content") or ""))
        return base_article.model_copy(update={"blocks": parsed_blocks})

    def _build_feed(self, articles: list[NewsArticle]) -> NewsFeed:
        """Wrap articles in a feed stamped with the current UTC time."""
        now_utc = datetime.now(timezone.utc)
        return NewsFeed(
            channel_id=self.channel_id,
            source_list_url=OFFICIAL_NEWS_LIST_URL,
            category_code=self.category_code,
            fetched_at=now_utc,
            articles=articles,
        )
@@ -0,0 +1,13 @@
1
# Identity of the China official-site channel.
CHANNEL_ID = "cn_official"
DISPLAY_NAME = "FF14 国服官网"

# cqnews API parameters: game code, default list category, endpoint root.
GAME_CODE = "ff"
NEWS_LIST_CATEGORY_CODE = 5310
CQNEWS_API_BASE = "https://cqnews.web.sdo.com/api/news"

# Public pages on the official site; the detail template takes ``article_id``.
OFFICIAL_NEWS_LIST_URL = (
    "https://ff.web.sdo.com"
    "/web8/index.html#/newstab/newslist"
)
OFFICIAL_NEWS_DETAIL_URL_TEMPLATE = (
    "https://ff.web.sdo.com"
    "/web8/index.html#/newstab/newscont/{article_id}"
)

# Default base URL handed to the HTML-to-blocks conversion.
HTML_BASE_URL = "https://ff.web.sdo.com"
@@ -0,0 +1,112 @@
1
+ import json
2
+ import urllib.error
3
+ import urllib.parse
4
+ import urllib.request
5
+ from datetime import datetime
6
+ from typing import Any
7
+
8
+ from ff14_news.channels.cn_official.constants import (
9
+ CHANNEL_ID,
10
+ CQNEWS_API_BASE,
11
+ GAME_CODE,
12
+ OFFICIAL_NEWS_DETAIL_URL_TEMPLATE,
13
+ )
14
+ from ff14_news.models import NewsListItem
15
+
16
# Request headers sent with every cqnews API call.
_DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; python-library-ff14-news/0.1)",
    "Accept": "application/json",
}


class CqNewsClient:
    """Client for the Shengqu cqnews news JSON API.

    The original documentation describes the API as same-origin with the
    China official-site SPA.
    """

    def __init__(self, timeout_seconds: float = 60.0) -> None:
        """Store the per-request timeout, in seconds."""
        self._timeout = timeout_seconds

    def fetch_list_page(
        self,
        category_code: int,
        page_index: int,
        page_size: int,
    ) -> tuple[list[NewsListItem], int]:
        """Fetch one page of the news list.

        Args:
            category_code: sent as the ``CategoryCode`` query parameter.
            page_index: sent as the ``pageIndex`` query parameter.
            page_size: sent as the ``pageSize`` query parameter.

        Returns:
            The parsed list items and the payload's ``TotalCount``
            (0 when the field is missing or falsy).

        Raises:
            ValueError: on an HTTP error status, a non-object JSON root,
                or a non-zero API ``Code``.
        """
        query = urllib.parse.urlencode(
            {
                "gameCode": GAME_CODE,
                "CategoryCode": str(category_code),
                "pageIndex": str(page_index),
                "pageSize": str(page_size),
            }
        )
        url = f"{CQNEWS_API_BASE}/newsList?{query}"
        payload = self._get_json(url)
        rows = payload.get("Data") or []
        total = int(payload.get("TotalCount") or 0)
        items = [self._parse_list_row(row) for row in rows]
        return items, total

    def fetch_detail_raw(self, article_id: str) -> dict[str, Any]:
        """Fetch the ``Data`` object of the detail JSON for one article.

        Raises:
            ValueError: when the request fails (see ``_get_json``) or the
                payload's ``Data`` field is not an object.
        """
        query = urllib.parse.urlencode(
            {"gameCode": GAME_CODE, "id": str(article_id).strip()}
        )
        url = f"{CQNEWS_API_BASE}/newsDetail?{query}"
        payload = self._get_json(url)
        data = payload.get("Data")
        if not isinstance(data, dict):
            msg = payload.get("Message") or "empty detail"
            raise ValueError(f"news detail {article_id} failed: {msg}")
        return data

    def _get_json(self, url: str) -> dict[str, Any]:
        """GET ``url`` and return the decoded JSON object.

        Raises:
            ValueError: on an HTTP error status, a JSON root that is not an
                object, or an API ``Code`` other than 0 / "0" / missing.
        """
        req = urllib.request.Request(url, headers=_DEFAULT_HEADERS)
        try:
            # The context manager closes the response (and its connection)
            # even if reading fails part-way.
            with urllib.request.urlopen(req, timeout=self._timeout) as response:
                raw = response.read()
        except urllib.error.HTTPError as exc:
            raise ValueError(f"HTTP {exc.code} for {url}") from exc
        payload = json.loads(raw.decode("utf-8"))
        if not isinstance(payload, dict):
            raise ValueError(f"unexpected JSON root from {url}")
        code = payload.get("Code")
        if code not in (0, "0", None):
            raise ValueError(
                f"API error Code={code} Message={payload.get('Message')}"
            )
        return payload

    def _parse_list_row(self, row: dict[str, Any]) -> NewsListItem:
        """Convert one list row into a ``NewsListItem``."""
        # Round-trip through int so the id is a plain decimal string.
        article_id = str(int(row["Id"]))
        return NewsListItem(
            channel_id=CHANNEL_ID,
            id=article_id,
            title=str(row.get("Title") or ""),
            publish_date=parse_publish_date(str(row.get("PublishDate") or "")),
            summary=str(row.get("Summary") or ""),
            cover_image_url=_optional_str(row.get("HomeImagePath")),
            source_page_url=OFFICIAL_NEWS_DETAIL_URL_TEMPLATE.format(
                article_id=article_id
            ),
        )
96
+
97
+
98
+ def _optional_str(value: Any) -> str | None:
99
+ if value is None:
100
+ return None
101
+ text = str(value).strip()
102
+ return text or None
103
+
104
+
105
def parse_publish_date(text: str) -> datetime:
    """Parse a ``PublishDate`` string into a naive ``datetime``.

    Accepts ``YYYY/MM/DD HH:MM:SS`` and ``YYYY-MM-DD HH:MM:SS`` after
    stripping surrounding whitespace.

    Raises:
        ValueError: if the text matches neither layout.
    """
    cleaned = text.strip()
    parsed = None
    for layout in ("%Y/%m/%d %H:%M:%S", "%Y-%m-%d %H:%M:%S"):
        try:
            parsed = datetime.strptime(cleaned, layout)
        except ValueError:
            continue
        break
    if parsed is None:
        raise ValueError(f"unsupported PublishDate: {cleaned!r}")
    return parsed
@@ -0,0 +1,11 @@
1
+ from ff14_news.channels.cn_official.constants import HTML_BASE_URL
2
+ from ff14_news.common.html_blocks import html_to_blocks as _html_to_blocks
3
+ from ff14_news.models import NewsContentBlock
4
+
5
+
6
def html_to_blocks(
    html: str,
    *,
    base_url: str = HTML_BASE_URL,
) -> list[NewsContentBlock]:
    """Convert article HTML into content blocks using the shared converter.

    ``base_url`` is forwarded unchanged to the shared ``html_to_blocks``
    helper and defaults to this channel's ``HTML_BASE_URL``.
    """
    blocks = _html_to_blocks(html, base_url=base_url)
    return blocks
@@ -0,0 +1,3 @@
1
+ from ff14_news.channels.cn_weibo.channel import CnWeiboChannel
2
+
3
+ __all__ = ["CnWeiboChannel"]
@@ -0,0 +1,93 @@
1
+ from __future__ import annotations
2
+
3
+ import random
4
+ import time
5
+ from pathlib import Path
6
+
7
# User agent presented to m.weibo.cn by the browser context.
_MOBILE_UA = (
    "Mozilla/5.0 (Linux; Android 13; SM-G9980) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/112.0.5615.135 Mobile Safari/537.36"
)
# Page opened to obtain the session cookies.
_MOBILE_HOME = "https://m.weibo.cn/"


def fetch_mobile_cookies(
    *,
    proxy_url: str | None = None,
    headless: bool = True,
    storage_state_path: Path | None = None,
    timeout_seconds: float = 60.0,
) -> dict[str, str]:
    """Open m.weibo.cn with Playwright and return its cookies as a dict.

    Args:
        proxy_url: HTTP proxy, e.g. ``http://127.0.0.1:7897``.
        headless: whether Chromium runs headless.
        storage_state_path: Playwright storage-state file. It is loaded when
            it already exists and (re)written after the page visit. A
            leading ``~`` is expanded.
        timeout_seconds: page-load timeout in seconds.

    Returns:
        Mapping of cookie name to cookie value for the browser context.

    Raises:
        ImportError: if Playwright is not installed.
    """
    try:
        from playwright.sync_api import sync_playwright
    except ImportError as exc:
        raise ImportError(
            "Playwright 未安装。请执行:pip install crawl4weibo && "
            "python -m example.ensure_browser --proxy 127.0.0.1:7897"
        ) from exc

    # Expand "~" once so the existence check, the directory creation and the
    # save all operate on the same real path.
    state_path = (
        storage_state_path.expanduser()
        if storage_state_path is not None
        else None
    )
    storage = None
    if state_path is not None and state_path.is_file():
        storage = str(state_path)

    launch_kwargs: dict = {
        "headless": headless,
        "args": [
            "--disable-blink-features=AutomationControlled",
            "--no-sandbox",
            "--disable-setuid-sandbox",
        ],
    }
    if proxy_url:
        launch_kwargs["proxy"] = {"server": proxy_url}

    cookies_dict: dict[str, str] = {}
    with sync_playwright() as playwright:
        browser = playwright.chromium.launch(**launch_kwargs)
        # Nested try/finally: the browser is closed even when creating or
        # closing the context fails.
        try:
            context = browser.new_context(
                user_agent=_MOBILE_UA,
                viewport={"width": 393, "height": 851},
                locale="zh-CN",
                timezone_id="Asia/Shanghai",
                device_scale_factor=2.75,
                is_mobile=True,
                has_touch=True,
                storage_state=storage,
            )
            try:
                context.set_extra_http_headers(
                    {
                        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
                        "Accept": (
                            "text/html,application/xhtml+xml,"
                            "application/xml;q=0.9,image/webp,*/*;q=0.8"
                        ),
                    },
                )
                page = context.new_page()
                page.goto(
                    _MOBILE_HOME,
                    timeout=int(timeout_seconds * 1000),
                    wait_until="networkidle",
                )
                # Randomised pauses and a small scroll before reading cookies.
                time.sleep(random.uniform(2, 4))
                page.evaluate("window.scrollBy(0, 300)")
                time.sleep(random.uniform(0.5, 1))
                for cookie in context.cookies():
                    cookies_dict[cookie["name"]] = cookie["value"]
                if state_path is not None:
                    state_path.parent.mkdir(parents=True, exist_ok=True)
                    context.storage_state(path=str(state_path))
            finally:
                context.close()
        finally:
            browser.close()
    return cookies_dict
@@ -0,0 +1,141 @@
1
+ from datetime import datetime, timezone
2
+ from pathlib import Path
3
+
4
+ from ff14_news.channels.cn_weibo.constants import (
5
+ CHANNEL_ID,
6
+ DEFAULT_UID,
7
+ DISPLAY_NAME,
8
+ MOBILE_ORIGIN,
9
+ SCREEN_NAME,
10
+ )
11
+ from ff14_news.channels.cn_weibo.crawl_backend import WeiboCrawlBackend
12
+ from ff14_news.channels.cn_weibo.post_adapter import post_to_article, post_to_list_item
13
+ from ff14_news.common.list_feed import article_from_list_item
14
+ from ff14_news.models import NewsArticle, NewsFeed, NewsListItem
15
+
16
# Bounds for scanning the timeline when looking articles up by id.
_TIMELINE_SCAN_PAGE_SIZE = 20
_TIMELINE_SCAN_MAX_PAGES = 30


class CnWeiboChannel:
    """FF14 official Weibo account (m.weibo.cn timeline).

    The original documentation states that fetching goes through
    crawl4weibo plus Playwright. Default fetches return list-level fields;
    long-post body blocks require ``fetch_article_detail``.
    """

    channel_id = CHANNEL_ID
    display_name = DISPLAY_NAME

    def __init__(
        self,
        *,
        screen_name: str = SCREEN_NAME,
        uid: str | None = None,
        cookie: str | None = None,
        cookie_storage_path: Path | None = None,
        browser_headless: bool = True,
        proxy_url: str | None = None,
        timeout_seconds: float = 60.0,
    ) -> None:
        """Bind the channel to a Weibo account.

        Args:
            screen_name: Weibo screen name; defaults to ``SCREEN_NAME``.
            uid: numeric uid. When given, no name resolution happens; when
                omitted for the default account, ``DEFAULT_UID`` is used;
                otherwise the backend resolves ``screen_name``.
            cookie: full m.weibo.cn Cookie string, forwarded to the backend
                (per the original docs, obtained via Playwright if omitted).
            cookie_storage_path: Playwright session cache path, forwarded to
                the backend.
            browser_headless: forwarded to the backend; per the original
                docs, whether the cookie-fetching browser runs headless.
            proxy_url: HTTP proxy such as ``127.0.0.1:7897``, forwarded to
                the backend.
            timeout_seconds: reserved; stored on the instance but not passed
                to the backend by this class.
        """
        self.screen_name = screen_name
        self._timeout = timeout_seconds
        self._backend = WeiboCrawlBackend(
            cookie=cookie,
            cookie_storage_path=cookie_storage_path,
            browser_headless=browser_headless,
            proxy_url=proxy_url,
        )
        if uid is not None:
            self._uid = str(uid).strip()
        elif screen_name == SCREEN_NAME:
            self._uid = DEFAULT_UID
        else:
            self._uid = self._backend.resolve_screen_name(screen_name)

    @property
    def uid(self) -> str:
        """Numeric uid of the bound account."""
        return self._uid

    def list_items(
        self,
        *,
        limit: int = 10,
        page_index: int = 0,
    ) -> list[NewsListItem]:
        """Return at most ``limit`` list items from one timeline page.

        ``limit`` only truncates the page returned by the backend; it is not
        sent to the backend.

        Raises:
            ValueError: if ``limit`` is smaller than 1.
        """
        if limit < 1:
            raise ValueError("limit must be >= 1")
        # The backend is called with a page number one above page_index.
        page = page_index + 1
        posts = self._backend.fetch_timeline_posts(self._uid, page=page)
        return [
            post_to_list_item(post, channel_id=self.channel_id)
            for post in posts[:limit]
        ]

    def fetch_article_detail(self, article_id: str) -> NewsArticle:
        """Fetch one post's detail and convert it to an article.

        Raises:
            ValueError: if ``article_id`` is blank.
        """
        article_id = str(article_id).strip()
        if not article_id:
            raise ValueError("article_id must not be empty")
        post = self._backend.fetch_post_detail(article_id)
        return post_to_article(post, channel_id=self.channel_id)

    def fetch_article(self, article_id: str) -> NewsArticle:
        """Alias of ``fetch_article_detail``."""
        return self.fetch_article_detail(article_id)

    def fetch_articles(
        self,
        *,
        limit: int = 10,
        page_index: int = 0,
    ) -> NewsFeed:
        """Build a feed of list-level articles for one timeline page."""
        items = self.list_items(limit=limit, page_index=page_index)
        articles = [article_from_list_item(item) for item in items]
        return self._build_feed(articles)

    def fetch_articles_by_ids(self, article_ids: list[str]) -> NewsFeed:
        """Build a feed of list-level articles for the given ids, in order.

        The timeline is scanned once for all requested ids. Ids not found in
        the scanned pages are fetched individually from the backend.

        Raises:
            ValueError: if ``article_ids`` is empty.
        """
        if not article_ids:
            raise ValueError("article_ids must not be empty")
        wanted = [str(aid).strip() for aid in article_ids]
        # One pass over the timeline instead of one full rescan per id.
        found = self._find_list_items_by_ids(set(wanted))
        articles = []
        for aid in wanted:
            item = found.get(aid)
            if item is None:
                post = self._backend.fetch_post(aid)
                item = post_to_list_item(post, channel_id=self.channel_id)
            articles.append(article_from_list_item(item))
        return self._build_feed(articles)

    def _article_list_level_by_id(self, article_id: str) -> NewsArticle:
        """Return the list-level article for one id (timeline, then backend)."""
        item = self._find_list_item_by_id(article_id)
        if item is not None:
            return article_from_list_item(item)
        post = self._backend.fetch_post(article_id)
        return article_from_list_item(
            post_to_list_item(post, channel_id=self.channel_id)
        )

    def _find_list_item_by_id(self, article_id: str) -> NewsListItem | None:
        """Return the timeline item with this id, or ``None`` if not found."""
        return self._find_list_items_by_ids({article_id}).get(article_id)

    def _find_list_items_by_ids(
        self, article_ids: set[str]
    ) -> dict[str, NewsListItem]:
        """Scan the timeline once and map each found id to its list item.

        Scans at most ``_TIMELINE_SCAN_MAX_PAGES`` pages and stops early once
        every id is found or a short page is returned. The first occurrence
        of an id wins.
        """
        remaining = set(article_ids)
        found: dict[str, NewsListItem] = {}
        for page_index in range(_TIMELINE_SCAN_MAX_PAGES):
            if not remaining:
                break
            items = self.list_items(
                limit=_TIMELINE_SCAN_PAGE_SIZE,
                page_index=page_index,
            )
            for item in items:
                if item.id in remaining:
                    found[item.id] = item
                    remaining.discard(item.id)
            # NOTE(review): assumes a full backend page holds at least
            # _TIMELINE_SCAN_PAGE_SIZE posts; a smaller backend page size
            # would end the scan after the first page. TODO confirm.
            if len(items) < _TIMELINE_SCAN_PAGE_SIZE:
                break
        return found

    def _build_feed(self, articles: list[NewsArticle]) -> NewsFeed:
        """Wrap articles in a feed stamped with the current UTC time."""
        return NewsFeed(
            channel_id=self.channel_id,
            source_list_url=f"{MOBILE_ORIGIN}/u/{self._uid}",
            category_code=None,
            fetched_at=datetime.now(timezone.utc),
            articles=articles,
        )
@@ -0,0 +1,10 @@
1
# Identity of the Weibo channel and the account it follows by default.
CHANNEL_ID = "cn_weibo"
DISPLAY_NAME = "FF14 官方微博"
SCREEN_NAME = "cnff14"
DEFAULT_UID = "1784473157"

# Site origins and the URLs derived from them.
WEB_ORIGIN = "https://weibo.com"
MOBILE_ORIGIN = "https://m.weibo.cn"
PROFILE_URL = WEB_ORIGIN + "/" + SCREEN_NAME
# Both templates keep a literal ``{article_id}`` placeholder for ``str.format``.
DETAIL_URL_TEMPLATE = MOBILE_ORIGIN + "/detail/{article_id}"
PERMALINK_TEMPLATE = WEB_ORIGIN + "/" + DEFAULT_UID + "/{article_id}"