PyPI - python-library-ff14-news - Versions diffs - 0.0.0__py3-none-any.whl - Mend

python-library-ff14-news 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

ff14_news/__init__.py +25 -0
ff14_news/channel_protocol.py +41 -0
ff14_news/channels/__init__.py +1 -0
ff14_news/channels/cn_official/__init__.py +3 -0
ff14_news/channels/cn_official/channel.py +112 -0
ff14_news/channels/cn_official/constants.py +13 -0
ff14_news/channels/cn_official/cqnews_client.py +112 -0
ff14_news/channels/cn_official/html_content.py +11 -0
ff14_news/channels/cn_weibo/__init__.py +3 -0
ff14_news/channels/cn_weibo/browser_cookies.py +93 -0
ff14_news/channels/cn_weibo/channel.py +141 -0
ff14_news/channels/cn_weibo/constants.py +10 -0
ff14_news/channels/cn_weibo/crawl_backend.py +129 -0
ff14_news/channels/cn_weibo/exceptions.py +2 -0
ff14_news/channels/cn_weibo/mblog_parser.py +161 -0
ff14_news/channels/cn_weibo/post_adapter.py +105 -0
ff14_news/channels/cn_weibo/profile.py +10 -0
ff14_news/channels/cn_weibo/proxy_url.py +14 -0
ff14_news/channels/jp_official/__init__.py +3 -0
ff14_news/channels/jp_official/channel.py +110 -0
ff14_news/channels/jp_official/constants.py +6 -0
ff14_news/channels/jp_official/detail_parser.py +128 -0
ff14_news/channels/jp_official/http_client.py +16 -0
ff14_news/channels/jp_official/list_parser.py +112 -0
ff14_news/common/html_blocks.py +183 -0
ff14_news/common/list_feed.py +20 -0
ff14_news/ff14_news.py +64 -0
ff14_news/models.py +74 -0
python_library_ff14_news-0.0.0.dist-info/METADATA +8 -0
python_library_ff14_news-0.0.0.dist-info/RECORD +31 -0
python_library_ff14_news-0.0.0.dist-info/WHEEL +4 -0

ff14_news/__init__.py ADDED Viewed

@@ -0,0 +1,25 @@
+from ff14_news.channel_protocol import NewsChannel
+from ff14_news.channels.cn_official import CnOfficialChannel
+from ff14_news.channels.cn_weibo import CnWeiboChannel
+from ff14_news.channels.jp_official import JpOfficialChannel
+from ff14_news.ff14_news import FF14News
+from ff14_news.models import (
+    NewsArticle,
+    NewsBlockType,
+    NewsContentBlock,
+    NewsFeed,
+    NewsListItem,
+)
+# Names re-exported as the package's public API: the facade, the channel
+# protocol, the three channel implementations, and the shared data models.
+__all__ = [
+    "FF14News",
+    "NewsChannel",
+    "CnOfficialChannel",
+    "CnWeiboChannel",
+    "JpOfficialChannel",
+    "NewsArticle",
+    "NewsBlockType",
+    "NewsContentBlock",
+    "NewsFeed",
+    "NewsListItem",
+]

ff14_news/channel_protocol.py ADDED Viewed

@@ -0,0 +1,41 @@
+from typing import Protocol, runtime_checkable
+from ff14_news.models import NewsArticle, NewsFeed, NewsListItem
+@runtime_checkable
+class NewsChannel(Protocol):
+    """News channel: each channel is implemented independently and emits the unified Feed / Article structures.
+    Declared ``runtime_checkable`` so ``isinstance`` checks against this protocol are permitted.
+    """
+    # Identifier of the channel.
+    channel_id: str
+    # Display name of the channel.
+    display_name: str
+    def list_items(
+        self,
+        *,
+        limit: int = 10,
+        page_index: int = 0,
+    ) -> list[NewsListItem]:
+        """Fetch list metadata, in the same order as the corresponding site's list."""
+        ...
+    def fetch_article_detail(self, article_id: str) -> NewsArticle:
+        """Fetch a single article's detail and parse its body blocks (blocks is non-empty)."""
+        ...
+    def fetch_article(self, article_id: str) -> NewsArticle:
+        """Fetch a single article's detail; same as fetch_article_detail."""
+        ...
+    def fetch_articles(
+        self,
+        *,
+        limit: int = 10,
+        page_index: int = 0,
+    ) -> NewsFeed:
+        """Fetch list-level fields (title, summary, cover image, link) in list order; blocks is empty."""
+        ...
+    def fetch_articles_by_ids(self, article_ids: list[str]) -> NewsFeed:
+        """Fetch list-level fields in the order of the given IDs, without expanding body blocks."""
+        ...

ff14_news/channels/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """News channel implementations (one subdirectory per channel, kept separate from each other)."""

ff14_news/channels/cn_official/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from ff14_news.channels.cn_official.channel import CnOfficialChannel
+__all__ = ["CnOfficialChannel"]

ff14_news/channels/cn_official/channel.py ADDED Viewed

@@ -0,0 +1,112 @@
+from datetime import datetime, timezone
+from ff14_news.channels.cn_official.constants import (
+    CHANNEL_ID,
+    DISPLAY_NAME,
+    NEWS_LIST_CATEGORY_CODE,
+    OFFICIAL_NEWS_DETAIL_URL_TEMPLATE,
+    OFFICIAL_NEWS_LIST_URL,
+)
+from ff14_news.channels.cn_official.cqnews_client import CqNewsClient, parse_publish_date
+from ff14_news.channels.cn_official.html_content import html_to_blocks
+from ff14_news.common.list_feed import article_from_list_item
+from ff14_news.models import NewsArticle, NewsFeed, NewsListItem
+class CnOfficialChannel:
+    """FF14 CN official-site news channel (ff.web.sdo.com / cqnews).
+    Listing calls return list-level fields only: cover image, title, summary and
+    detail-page link (per the original author's note, the same fields as the
+    legacy Selenium list). Body blocks are parsed only when
+    fetch_article_detail is called explicitly.
+    """
+    channel_id = CHANNEL_ID
+    display_name = DISPLAY_NAME
+    def __init__(
+        self,
+        *,
+        category_code: int = NEWS_LIST_CATEGORY_CODE,
+        timeout_seconds: float = 60.0,
+    ) -> None:
+        """Create the channel.
+        Args:
+            category_code: Category code sent with list requests and stamped on feeds.
+            timeout_seconds: Timeout handed to the underlying CqNewsClient.
+        """
+        self._client = CqNewsClient(timeout_seconds=timeout_seconds)
+        self.category_code = category_code
+    def list_items(
+        self,
+        *,
+        limit: int = 10,
+        page_index: int = 0,
+    ) -> list[NewsListItem]:
+        """Return at most ``limit`` list items from page ``page_index``.
+        Raises:
+            ValueError: If ``limit`` is smaller than 1.
+        """
+        if limit < 1:
+            raise ValueError("limit must be >= 1")
+        # ``limit`` doubles as the page size; the reported total count is not used.
+        page_items, _ = self._client.fetch_list_page(
+            self.category_code, page_index, limit
+        )
+        return page_items[:limit]
+    def fetch_article_detail(self, article_id: str) -> NewsArticle:
+        """Fetch one article and return it with parsed body blocks."""
+        raw_detail = self._client.fetch_detail_raw(article_id)
+        return self._detail_to_article(raw_detail)
+    def fetch_article(self, article_id: str) -> NewsArticle:
+        """Alias of fetch_article_detail."""
+        return self.fetch_article_detail(article_id)
+    def fetch_articles(
+        self,
+        *,
+        limit: int = 10,
+        page_index: int = 0,
+    ) -> NewsFeed:
+        """Build a feed of list-level articles (no body blocks) for one list page."""
+        collected: list[NewsArticle] = []
+        for entry in self.list_items(limit=limit, page_index=page_index):
+            collected.append(
+                article_from_list_item(entry, category_code=self.category_code)
+            )
+        return self._build_feed(collected)
+    def fetch_articles_by_ids(self, article_ids: list[str]) -> NewsFeed:
+        """Build a feed of list-level articles for the given IDs, in the given order.
+        Raises:
+            ValueError: If ``article_ids`` is empty.
+        """
+        if not article_ids:
+            raise ValueError("article_ids must not be empty")
+        collected: list[NewsArticle] = []
+        for wanted_id in article_ids:
+            raw_detail = self._client.fetch_detail_raw(wanted_id)
+            collected.append(self._detail_to_list_article(raw_detail))
+        return self._build_feed(collected)
+    def _detail_to_list_article(self, data: dict) -> NewsArticle:
+        """Map a detail ``Data`` dict to an article carrying list-level fields only."""
+        normalized_id = str(int(data["Id"]))
+        raw_cover = data.get("HomeImagePath")
+        # Falsy values and whitespace-only strings both collapse to None.
+        cover_url = (str(raw_cover).strip() or None) if raw_cover else None
+        title = str(data.get("Title") or "")
+        published = parse_publish_date(str(data.get("PublishDate") or ""))
+        summary = str(data.get("Summary") or "")
+        # Fall back to the channel's own category when the payload has none.
+        category = int(data.get("CategoryCode") or self.category_code)
+        detail_url = OFFICIAL_NEWS_DETAIL_URL_TEMPLATE.format(
+            article_id=normalized_id
+        )
+        return NewsArticle(
+            channel_id=self.channel_id,
+            id=normalized_id,
+            title=title,
+            publish_date=published,
+            summary=summary,
+            category_code=category,
+            cover_image_url=cover_url,
+            source_page_url=detail_url,
+            blocks=[],
+        )
+    def _detail_to_article(self, data: dict) -> NewsArticle:
+        """Map a detail ``Data`` dict to an article including parsed body blocks."""
+        body_html = str(data.get("Content") or "")
+        return self._detail_to_list_article(data).model_copy(
+            update={"blocks": html_to_blocks(body_html)}
+        )
+    def _build_feed(self, articles: list[NewsArticle]) -> NewsFeed:
+        """Wrap articles in a NewsFeed stamped with the current UTC time."""
+        fetched_at = datetime.now(timezone.utc)
+        return NewsFeed(
+            channel_id=self.channel_id,
+            source_list_url=OFFICIAL_NEWS_LIST_URL,
+            category_code=self.category_code,
+            fetched_at=fetched_at,
+            articles=articles,
+        )

ff14_news/channels/cn_official/constants.py ADDED Viewed

@@ -0,0 +1,13 @@
+# Channel identifier and display name of the CN official-site channel.
+CHANNEL_ID = "cn_official"
+DISPLAY_NAME = "FF14 国服官网"
+# Value sent as the ``gameCode`` query parameter of cqnews requests.
+GAME_CODE = "ff"
+# Default ``CategoryCode`` used for list requests.
+NEWS_LIST_CATEGORY_CODE = 5310
+# Base URL of the cqnews JSON API (``/newsList`` and ``/newsDetail`` are appended).
+CQNEWS_API_BASE = "https://cqnews.web.sdo.com/api/news"
+# Public list page, recorded on feeds as the source list URL.
+OFFICIAL_NEWS_LIST_URL = (
+    "https://ff.web.sdo.com/web8/index.html#/newstab/newslist"
+)
+# Public detail page; ``{article_id}`` is filled in with ``str.format``.
+OFFICIAL_NEWS_DETAIL_URL_TEMPLATE = (
+    "https://ff.web.sdo.com/web8/index.html#/newstab/newscont/{article_id}"
+)
+# Default base URL passed to the HTML-to-blocks conversion.
+HTML_BASE_URL = "https://ff.web.sdo.com"

ff14_news/channels/cn_official/cqnews_client.py ADDED Viewed

@@ -0,0 +1,112 @@
+import json
+import urllib.error
+import urllib.parse
+import urllib.request
+from datetime import datetime
+from typing import Any
+from ff14_news.channels.cn_official.constants import (
+    CHANNEL_ID,
+    CQNEWS_API_BASE,
+    GAME_CODE,
+    OFFICIAL_NEWS_DETAIL_URL_TEMPLATE,
+)
+from ff14_news.models import NewsListItem
+# Headers attached to every cqnews request.
+# NOTE(review): the User-Agent advertises version 0.1 while the wheel is 0.0.0 — confirm which is intended.
+_DEFAULT_HEADERS = {
+    "User-Agent": "Mozilla/5.0 (compatible; python-library-ff14-news/0.1)",
+    "Accept": "application/json",
+}
+class CqNewsClient:
+    """Client for the Shengqu cqnews news JSON API (same source as the CN official-site SPA)."""
+    def __init__(self, timeout_seconds: float = 60.0) -> None:
+        """Store the per-request timeout in seconds."""
+        self._timeout = timeout_seconds
+    def fetch_list_page(
+        self,
+        category_code: int,
+        page_index: int,
+        page_size: int,
+    ) -> tuple[list[NewsListItem], int]:
+        """Fetch one page of the news list.
+        Args:
+            category_code: Sent as the ``CategoryCode`` query parameter.
+            page_index: Sent as the ``pageIndex`` query parameter.
+            page_size: Sent as the ``pageSize`` query parameter.
+        Returns:
+            The parsed list items and the payload's ``TotalCount`` (0 when absent).
+        Raises:
+            ValueError: On an HTTP error status, a non-object JSON root, or a non-zero API ``Code``.
+        """
+        query = urllib.parse.urlencode(
+            {
+                "gameCode": GAME_CODE,
+                "CategoryCode": str(category_code),
+                "pageIndex": str(page_index),
+                "pageSize": str(page_size),
+            }
+        )
+        url = f"{CQNEWS_API_BASE}/newsList?{query}"
+        payload = self._get_json(url)
+        rows = payload.get("Data") or []
+        total = int(payload.get("TotalCount") or 0)
+        items = [self._parse_list_row(row) for row in rows]
+        return items, total
+    def fetch_detail_raw(self, article_id: str) -> dict[str, Any]:
+        """Fetch the ``Data`` field of the detail JSON for one article.
+        Raises:
+            ValueError: If the request fails (see ``_get_json``) or ``Data`` is not a dict.
+        """
+        query = urllib.parse.urlencode(
+            {"gameCode": GAME_CODE, "id": str(article_id).strip()}
+        )
+        url = f"{CQNEWS_API_BASE}/newsDetail?{query}"
+        payload = self._get_json(url)
+        data = payload.get("Data")
+        if not isinstance(data, dict):
+            msg = payload.get("Message") or "empty detail"
+            raise ValueError(f"news detail {article_id} failed: {msg}")
+        return data
+    def _get_json(self, url: str) -> dict[str, Any]:
+        """GET ``url`` and return the decoded JSON object.
+        Raises:
+            ValueError: On an HTTP error status, a JSON root that is not an object,
+                or an API ``Code`` other than 0 / "0" / missing.
+        """
+        req = urllib.request.Request(url, headers=_DEFAULT_HEADERS)
+        try:
+            # The context manager closes the HTTP response (and its socket) even
+            # if read() fails; previously the response was never closed.
+            with urllib.request.urlopen(req, timeout=self._timeout) as response:
+                raw = response.read()
+        except urllib.error.HTTPError as exc:
+            raise ValueError(f"HTTP {exc.code} for {url}") from exc
+        payload = json.loads(raw.decode("utf-8"))
+        if not isinstance(payload, dict):
+            raise ValueError(f"unexpected JSON root from {url}")
+        code = payload.get("Code")
+        if code not in (0, "0", None):
+            raise ValueError(
+                f"API error Code={code} Message={payload.get('Message')}"
+            )
+        return payload
+    def _parse_list_row(self, row: dict[str, Any]) -> NewsListItem:
+        """Convert one list row into a NewsListItem; ``Id`` is normalized through ``int``."""
+        article_id = str(int(row["Id"]))
+        return NewsListItem(
+            channel_id=CHANNEL_ID,
+            id=article_id,
+            title=str(row.get("Title") or ""),
+            publish_date=parse_publish_date(str(row.get("PublishDate") or "")),
+            summary=str(row.get("Summary") or ""),
+            cover_image_url=_optional_str(row.get("HomeImagePath")),
+            source_page_url=OFFICIAL_NEWS_DETAIL_URL_TEMPLATE.format(
+                article_id=article_id
+            ),
+        )
+def _optional_str(value: Any) -> str | None:
+    """Return ``value`` as stripped text, or None when it is None or blank."""
+    stripped = "" if value is None else str(value).strip()
+    return stripped if stripped else None
+def parse_publish_date(text: str) -> datetime:
+    """Parse a ``PublishDate`` string written with ``/`` or ``-`` date separators.
+    The result carries no timezone information.
+    Raises:
+        ValueError: If the stripped text matches neither supported format.
+    """
+    candidate = text.strip()
+    for pattern in ("%Y/%m/%d %H:%M:%S", "%Y-%m-%d %H:%M:%S"):
+        try:
+            parsed = datetime.strptime(candidate, pattern)
+        except ValueError:
+            # Not this format; try the next one.
+            continue
+        else:
+            return parsed
+    raise ValueError(f"unsupported PublishDate: {candidate!r}")

ff14_news/channels/cn_official/html_content.py ADDED Viewed

@@ -0,0 +1,11 @@
+from ff14_news.channels.cn_official.constants import HTML_BASE_URL
+from ff14_news.common.html_blocks import html_to_blocks as _html_to_blocks
+from ff14_news.models import NewsContentBlock
+def html_to_blocks(
+    html: str,
+    *,
+    base_url: str = HTML_BASE_URL,
+) -> list[NewsContentBlock]:
+    """Convert article HTML to content blocks via the shared converter.
+    ``base_url`` defaults to the CN official-site origin and is passed through unchanged.
+    """
+    blocks = _html_to_blocks(html, base_url=base_url)
+    return blocks

ff14_news/channels/cn_weibo/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from ff14_news.channels.cn_weibo.channel import CnWeiboChannel
+__all__ = ["CnWeiboChannel"]

ff14_news/channels/cn_weibo/browser_cookies.py ADDED Viewed

@@ -0,0 +1,93 @@
+from __future__ import annotations
+import random
+import time
+from pathlib import Path
+# User-Agent string given to the Playwright browser context (Android / Chrome Mobile).
+_MOBILE_UA = (
+    "Mozilla/5.0 (Linux; Android 13; SM-G9980) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/112.0.5615.135 Mobile Safari/537.36"
+)
+# Page opened to obtain the cookies.
+_MOBILE_HOME = "https://m.weibo.cn/"
+def fetch_mobile_cookies(
+    *,
+    proxy_url: str | None = None,
+    headless: bool = True,
+    storage_state_path: Path | None = None,
+    timeout_seconds: float = 60.0,
+) -> dict[str, str]:
+    """Open m.weibo.cn with Playwright and return the browser cookies as a dict.
+    Args:
+        proxy_url: HTTP proxy, e.g. ``http://127.0.0.1:7897``.
+        headless: Whether to run Chromium headless.
+        storage_state_path: Path of a reusable Playwright storage state. ``~`` is
+            expanded; the file is loaded when it exists and rewritten after the visit.
+        timeout_seconds: Page-load timeout in seconds.
+    Returns:
+        Mapping of cookie name to cookie value taken from the browser context.
+    Raises:
+        ImportError: If Playwright is not installed.
+    """
+    try:
+        from playwright.sync_api import sync_playwright
+    except ImportError as exc:
+        raise ImportError(
+            "Playwright 未安装。请执行：pip install crawl4weibo && "
+            "python -m example.ensure_browser --proxy 127.0.0.1:7897"
+        ) from exc
+    # Expand ``~`` once, up front, so the existence check, the load and the save
+    # all use the same path (previously only the load used the expanded path).
+    state_path = (
+        storage_state_path.expanduser() if storage_state_path is not None else None
+    )
+    storage = None
+    if state_path is not None and state_path.is_file():
+        storage = str(state_path)
+    launch_kwargs: dict = {
+        "headless": headless,
+        "args": [
+            "--disable-blink-features=AutomationControlled",
+            "--no-sandbox",
+            "--disable-setuid-sandbox",
+        ],
+    }
+    if proxy_url:
+        launch_kwargs["proxy"] = {"server": proxy_url}
+    cookies_dict: dict[str, str] = {}
+    with sync_playwright() as playwright:
+        browser = playwright.chromium.launch(**launch_kwargs)
+        # Nested try/finally: the browser is closed even when creating or
+        # closing the context fails.
+        try:
+            context = browser.new_context(
+                user_agent=_MOBILE_UA,
+                viewport={"width": 393, "height": 851},
+                locale="zh-CN",
+                timezone_id="Asia/Shanghai",
+                device_scale_factor=2.75,
+                is_mobile=True,
+                has_touch=True,
+                storage_state=storage,
+            )
+            try:
+                context.set_extra_http_headers(
+                    {
+                        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
+                        "Accept": (
+                            "text/html,application/xhtml+xml,"
+                            "application/xml;q=0.9,image/webp,*/*;q=0.8"
+                        ),
+                    },
+                )
+                page = context.new_page()
+                page.goto(
+                    _MOBILE_HOME,
+                    timeout=int(timeout_seconds * 1000),
+                    wait_until="networkidle",
+                )
+                # Randomized pauses and a small scroll before reading cookies.
+                time.sleep(random.uniform(2, 4))
+                page.evaluate("window.scrollBy(0, 300)")
+                time.sleep(random.uniform(0.5, 1))
+                for cookie in context.cookies():
+                    cookies_dict[cookie["name"]] = cookie["value"]
+                if state_path is not None:
+                    state_path.parent.mkdir(parents=True, exist_ok=True)
+                    context.storage_state(path=str(state_path))
+            finally:
+                context.close()
+        finally:
+            browser.close()
+    return cookies_dict

ff14_news/channels/cn_weibo/channel.py ADDED Viewed

@@ -0,0 +1,141 @@
+from datetime import datetime, timezone
+from pathlib import Path
+from ff14_news.channels.cn_weibo.constants import (
+    CHANNEL_ID,
+    DEFAULT_UID,
+    DISPLAY_NAME,
+    MOBILE_ORIGIN,
+    SCREEN_NAME,
+)
+from ff14_news.channels.cn_weibo.crawl_backend import WeiboCrawlBackend
+from ff14_news.channels.cn_weibo.post_adapter import post_to_article, post_to_list_item
+from ff14_news.common.list_feed import article_from_list_item
+from ff14_news.models import NewsArticle, NewsFeed, NewsListItem
+# ``limit`` requested per page while scanning the timeline for an ID; a page
+# returning fewer items than this ends the scan.
+_TIMELINE_SCAN_PAGE_SIZE = 20
+# Upper bound on the number of timeline pages scanned when looking up an ID.
+_TIMELINE_SCAN_MAX_PAGES = 30
+class CnWeiboChannel:
+    """FF14 official Weibo channel (m.weibo.cn timeline; crawl4weibo + Playwright to get past anti-scraping).
+    Listing calls return list-level fields only; body blocks of long posts
+    require fetch_article_detail.
+    """
+    channel_id = CHANNEL_ID
+    display_name = DISPLAY_NAME
+    def __init__(
+        self,
+        *,
+        screen_name: str = SCREEN_NAME,
+        uid: str | None = None,
+        cookie: str | None = None,
+        cookie_storage_path: Path | None = None,
+        browser_headless: bool = True,
+        proxy_url: str | None = None,
+        timeout_seconds: float = 60.0,
+    ) -> None:
+        """Bind the channel to a Weibo account.
+        Args:
+            screen_name: Weibo screen_name, default cnff14.
+            uid: Numeric uid; when given, screen-name resolution is skipped. When
+                omitted and ``screen_name`` is the default account, DEFAULT_UID is used.
+            cookie: Full m.weibo.cn cookie string from a browser; when omitted,
+                cookies are obtained automatically via Playwright.
+            cookie_storage_path: Playwright session cache, for reusing cookies.
+            browser_headless: Whether the browser runs headless when fetching cookies.
+            proxy_url: HTTP proxy, e.g. ``127.0.0.1:7897``; used for cookie fetching and API requests.
+            timeout_seconds: Reserved parameter; stored on the instance but not passed
+                to the backend here (the original note says it matches crawl4weibo's internal timeout).
+        """
+        self.screen_name = screen_name
+        self._timeout = timeout_seconds
+        self._backend = WeiboCrawlBackend(
+            cookie=cookie,
+            cookie_storage_path=cookie_storage_path,
+            browser_headless=browser_headless,
+            proxy_url=proxy_url,
+        )
+        if uid is not None:
+            self._uid = str(uid).strip()
+        elif screen_name == SCREEN_NAME:
+            self._uid = DEFAULT_UID
+        else:
+            self._uid = self._backend.resolve_screen_name(screen_name)
+    @property
+    def uid(self) -> str:
+        """Uid the channel is bound to."""
+        return self._uid
+    def list_items(
+        self,
+        *,
+        limit: int = 10,
+        page_index: int = 0,
+    ) -> list[NewsListItem]:
+        """Return at most ``limit`` list items from timeline page ``page_index``.
+        Raises:
+            ValueError: If ``limit`` is smaller than 1.
+        """
+        if limit < 1:
+            raise ValueError("limit must be >= 1")
+        # The backend is called with a 1-based page number.
+        page = page_index + 1
+        posts = self._backend.fetch_timeline_posts(self._uid, page=page)
+        return [
+            post_to_list_item(post, channel_id=self.channel_id)
+            for post in posts[:limit]
+        ]
+    def fetch_article_detail(self, article_id: str) -> NewsArticle:
+        """Fetch one post's detail and convert it to an article.
+        Raises:
+            ValueError: If ``article_id`` is blank after stripping.
+        """
+        article_id = str(article_id).strip()
+        if not article_id:
+            raise ValueError("article_id must not be empty")
+        post = self._backend.fetch_post_detail(article_id)
+        return post_to_article(post, channel_id=self.channel_id)
+    def fetch_article(self, article_id: str) -> NewsArticle:
+        """Alias of fetch_article_detail."""
+        return self.fetch_article_detail(article_id)
+    def fetch_articles(
+        self,
+        *,
+        limit: int = 10,
+        page_index: int = 0,
+    ) -> NewsFeed:
+        """Build a feed of list-level articles for one timeline page."""
+        items = self.list_items(limit=limit, page_index=page_index)
+        articles = [article_from_list_item(item) for item in items]
+        return self._build_feed(articles)
+    def fetch_articles_by_ids(self, article_ids: list[str]) -> NewsFeed:
+        """Build a feed of list-level articles for the given IDs, in the given order.
+        The timeline is scanned once for the whole batch; IDs not found there are
+        fetched individually from the backend.
+        Raises:
+            ValueError: If ``article_ids`` is empty.
+        """
+        if not article_ids:
+            raise ValueError("article_ids must not be empty")
+        wanted = [str(aid).strip() for aid in article_ids]
+        # One scan for all IDs instead of restarting from page 0 for each ID.
+        located = self._find_list_items_by_ids(set(wanted))
+        articles: list[NewsArticle] = []
+        for aid in wanted:
+            item = located.get(aid)
+            if item is None:
+                # Not on the scanned timeline pages: fetch the post directly.
+                item = post_to_list_item(
+                    self._backend.fetch_post(aid), channel_id=self.channel_id
+                )
+            articles.append(article_from_list_item(item))
+        return self._build_feed(articles)
+    def _article_list_level_by_id(self, article_id: str) -> NewsArticle:
+        """Return the list-level article for one ID: timeline scan first, direct fetch as fallback."""
+        item = self._find_list_item_by_id(article_id)
+        if item is not None:
+            return article_from_list_item(item)
+        post = self._backend.fetch_post(article_id)
+        return article_from_list_item(
+            post_to_list_item(post, channel_id=self.channel_id)
+        )
+    def _find_list_item_by_id(self, article_id: str) -> NewsListItem | None:
+        """Scan the timeline for one ID; return its list item or None."""
+        return self._find_list_items_by_ids({article_id}).get(article_id)
+    def _find_list_items_by_ids(
+        self, article_ids: set[str]
+    ) -> dict[str, NewsListItem]:
+        """Scan timeline pages once and map each ID that was found to its list item.
+        Scanning stops when every ID has been found, when a page returns fewer
+        than ``_TIMELINE_SCAN_PAGE_SIZE`` items, or after ``_TIMELINE_SCAN_MAX_PAGES`` pages.
+        """
+        pending = set(article_ids)
+        found: dict[str, NewsListItem] = {}
+        for page_index in range(_TIMELINE_SCAN_MAX_PAGES):
+            if not pending:
+                break
+            items = self.list_items(
+                limit=_TIMELINE_SCAN_PAGE_SIZE,
+                page_index=page_index,
+            )
+            for item in items:
+                if item.id in pending:
+                    # Keep the first occurrence, as the single-ID scan did.
+                    found[item.id] = item
+                    pending.discard(item.id)
+            # NOTE(review): assumes a full timeline page holds at least
+            # _TIMELINE_SCAN_PAGE_SIZE posts; if the backend returns smaller
+            # pages the scan ends after the first one — confirm.
+            if len(items) < _TIMELINE_SCAN_PAGE_SIZE:
+                break
+        return found
+    def _build_feed(self, articles: list[NewsArticle]) -> NewsFeed:
+        """Wrap articles in a NewsFeed stamped with the current UTC time."""
+        return NewsFeed(
+            channel_id=self.channel_id,
+            source_list_url=f"{MOBILE_ORIGIN}/u/{self._uid}",
+            category_code=None,
+            fetched_at=datetime.now(timezone.utc),
+            articles=articles,
+        )

ff14_news/channels/cn_weibo/constants.py ADDED Viewed

@@ -0,0 +1,10 @@
+# Channel identifier and display name of the official Weibo channel.
+CHANNEL_ID = "cn_weibo"
+DISPLAY_NAME = "FF14 官方微博"
+# Default account: its screen name and the uid used for it when none is supplied.
+SCREEN_NAME = "cnff14"
+DEFAULT_UID = "1784473157"
+WEB_ORIGIN = "https://weibo.com"
+MOBILE_ORIGIN = "https://m.weibo.cn"
+PROFILE_URL = f"{WEB_ORIGIN}/{SCREEN_NAME}"
+# The doubled braces leave a literal ``{article_id}`` placeholder in the string.
+DETAIL_URL_TEMPLATE = f"{MOBILE_ORIGIN}/detail/{{article_id}}"
+# NOTE(review): embeds DEFAULT_UID, so permalinks built from this template always
+# use the default account's uid, even for a channel bound to another uid — confirm intended.
+PERMALINK_TEMPLATE = f"{WEB_ORIGIN}/{DEFAULT_UID}/{{article_id}}"