python-library-ff14-news 0.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,112 @@
1
+ import html
2
+ import re
3
+ from dataclasses import dataclass
4
+ from datetime import datetime, timezone
5
+
6
+ from ff14_news.channels.jp_official.constants import DETAIL_URL_TEMPLATE
7
+ from ff14_news.models import NewsListItem
8
+
9
# Upper bound, in characters, for a list-row summary (ellipsis included).
_SUMMARY_MAX_LEN = 200
# Matches any HTML tag; used to strip markup from banner content.
_TAG_RE = re.compile(r"<[^>]+>")
# Captures the lowercase-hex article id from a topics detail link.
_DETAIL_ID_RE = re.compile(r"/lodestone/topics/detail/([a-f0-9]+)/?")
# Captures the inner HTML of the banner <div> whose closing tag is directly
# followed by "</li>" or "<header".
_BANNER_RE = re.compile(
    r'<div class="news__list--banner">(.*?)</div>\s*(?=</li>|<header)',
    re.DOTALL,
)
# Captures the link text of the title anchor.
_TITLE_RE = re.compile(
    r'class="news__list--title"[^>]*>\s*<a[^>]*>([^<]+)</a>',
    re.DOTALL,
)
# Captures the numeric first argument of an ldst_strftime(...) call.
_TIMESTAMP_RE = re.compile(r"ldst_strftime\((\d+),")
# Captures the src attribute of the list image.
_COVER_RE = re.compile(
    r'class="news__list--img"[^>]*>\s*<img[^>]+src="([^"]+)"',
    re.DOTALL,
)
# Opening <li> of a topics entry; used as the split point between entries.
_ITEM_SPLIT_RE = re.compile(r'<li class="news__list--topics[^"]*">')
26
+
27
+
28
+ @dataclass(frozen=True)
29
+ class TopicsListRow:
30
+ article_id: str
31
+ title: str
32
+ publish_date: datetime
33
+ summary: str
34
+ cover_image_url: str | None
35
+
36
+
37
def topics_list_url(page_index: int) -> str:
    """Return the topics list URL for a zero-based page index.

    Args:
        page_index: Zero-based page number.

    Returns:
        The bare list URL when ``page_index`` is 0 or negative; otherwise the
        list URL with ``?page=<page_index + 1>`` appended.
    """
    # Function-scope import kept exactly as in the original block.
    from ff14_news.channels.jp_official.constants import TOPICS_LIST_URL

    if page_index <= 0:
        return TOPICS_LIST_URL
    return f"{TOPICS_LIST_URL}?page={page_index + 1}"
43
+
44
+
45
def parse_topics_list_page(html: str, *, limit: int) -> list[TopicsListRow]:
    """Parse up to ``limit`` rows from the HTML of a topics list page.

    Args:
        html: Raw HTML of the list page.
        limit: Maximum number of rows to return.

    Returns:
        Parsed rows in page order. Fragments without a detail link are
        skipped. An empty list is returned when ``limit`` is 0 or negative.
    """
    # Without this guard the cap was only checked after the first append,
    # so limit <= 0 still produced one row.
    if limit <= 0:
        return []
    rows: list[TopicsListRow] = []
    # Element 0 is whatever precedes the first <li>; it is not an entry.
    for fragment in _ITEM_SPLIT_RE.split(html)[1:]:
        row = _parse_item_fragment(fragment)
        if row is None:
            continue
        rows.append(row)
        if len(rows) >= limit:
            break
    return rows
55
+
56
+
57
def list_row_to_item(row: TopicsListRow, *, channel_id: str) -> NewsListItem:
    """Convert a parsed list row into a ``NewsListItem`` for ``channel_id``.

    The source page URL is built by formatting ``DETAIL_URL_TEMPLATE`` with
    the row's article id; every other field is copied from the row.
    """
    return NewsListItem(
        channel_id=channel_id,
        id=row.article_id,
        title=row.title,
        publish_date=row.publish_date,
        summary=row.summary,
        cover_image_url=row.cover_image_url,
        source_page_url=DETAIL_URL_TEMPLATE.format(article_id=row.article_id),
    )
67
+
68
+
69
def _parse_item_fragment(fragment: str) -> TopicsListRow | None:
    """Parse one list-entry HTML fragment into a ``TopicsListRow``.

    Args:
        fragment: HTML that follows one topics ``<li>`` opening tag.

    Returns:
        The parsed row, or ``None`` when the fragment has no detail link.
        A missing title yields ``""``, a missing cover yields ``None``, and
        a missing or unrepresentable timestamp yields the Unix epoch (UTC).
    """
    id_match = _DETAIL_ID_RE.search(fragment)
    if not id_match:
        return None
    article_id = id_match.group(1)

    title_match = _TITLE_RE.search(fragment)
    title = html.unescape(title_match.group(1).strip()) if title_match else ""

    epoch = datetime.fromtimestamp(0, tz=timezone.utc)
    publish_date = epoch
    ts_match = _TIMESTAMP_RE.search(fragment)
    if ts_match:
        try:
            publish_date = datetime.fromtimestamp(
                int(ts_match.group(1)),
                tz=timezone.utc,
            )
        except (OverflowError, OSError, ValueError):
            # An absurdly large digit run must not abort the whole page;
            # treat it like a missing timestamp.
            publish_date = epoch

    cover_match = _COVER_RE.search(fragment)
    cover = cover_match.group(1).strip() if cover_match else None
    summary = _banner_plain_summary(fragment)

    return TopicsListRow(
        article_id=article_id,
        title=title,
        publish_date=publish_date,
        summary=summary,
        cover_image_url=cover,
    )
98
+
99
+
100
def _banner_plain_summary(fragment: str) -> str:
    """Return the banner content of a list entry as a plain-text summary.

    Images and all other tags are replaced by spaces, HTML entities are
    unescaped, and whitespace runs are collapsed. Text longer than
    ``_SUMMARY_MAX_LEN`` is cut so that, with the trailing ellipsis, it is
    exactly ``_SUMMARY_MAX_LEN`` characters long.

    Returns:
        The summary, or ``""`` when there is no banner or it has no text.
    """
    match = _BANNER_RE.search(fragment)
    if not match:
        return ""
    inner = re.sub(r"<img[^>]*>", " ", match.group(1), flags=re.IGNORECASE)
    text = _TAG_RE.sub(" ", inner)
    text = html.unescape(text)
    text = re.sub(r"\s+", " ", text).strip()
    if not text:
        return ""
    if len(text) <= _SUMMARY_MAX_LEN:
        return text
    return text[: _SUMMARY_MAX_LEN - 1] + "…"
@@ -0,0 +1,183 @@
1
+ from html.parser import HTMLParser
2
+ from urllib.parse import urljoin
3
+
4
+ from ff14_news.models import NewsBlockType, NewsContentBlock
5
+
6
# NOTE(review): _BLOCK_TAGS is not referenced in the visible part of this
# module; kept unchanged in case other code imports it.
_BLOCK_TAGS = frozenset(
    {"p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "tr", "img", "br"}
)
# Elements whose content is ignored entirely.
_SKIP_TAGS = frozenset({"style", "script", "head", "meta", "link", "noscript"})
_HEADING_LEVEL = {"h1": 1, "h2": 2, "h3": 3, "h4": 4, "h5": 5, "h6": 6}
# HTML void elements: they never have an end tag, so they must not be
# tracked as "open" (neither on the element stack nor in the skip depth).
_VOID_TAGS = frozenset(
    {
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "link",
        "meta",
        "source",
        "track",
        "wbr",
    }
)
# Table cells; closing one inserts the separator that row joining splits on.
_CELL_TAGS = frozenset({"td", "th"})


class _ContentHTMLParser(HTMLParser):
    """Collect headings, text and images from HTML as ordered blocks.

    Text is buffered until a block boundary (end of a heading, ``p``,
    ``li`` or ``tr``, or an ``img`` with a ``src``) and then emitted as a
    ``NewsContentBlock`` appended to ``self.blocks``. Content inside
    ``_SKIP_TAGS`` elements is ignored.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.blocks: list[NewsContentBlock] = []
        # Nesting depth of currently open skipped (non-void) elements.
        self._skip_depth = 0
        # Text fragments collected since the last emitted block.
        self._text_buf: list[str] = []
        # Currently open non-void elements, outermost first.
        self._stack: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        tag = tag.lower()
        if tag in _SKIP_TAGS:
            # <meta>/<link> have no end tag; counting them would leave the
            # parser skipping everything that follows.
            if tag not in _VOID_TAGS:
                self._skip_depth += 1
            return
        if self._skip_depth:
            return
        if tag not in _VOID_TAGS:
            self._stack.append(tag)
            return
        if tag == "img":
            src = _attr(attrs, "src")
            if src:
                # An image ends the running text so block order is kept.
                self._flush_text()
                self.blocks.append(
                    NewsContentBlock(
                        type=NewsBlockType.IMAGE,
                        url=src,
                        text=_attr(attrs, "alt"),
                    )
                )
            return
        if tag == "br":
            self._text_buf.append("\n")

    def handle_endtag(self, tag: str) -> None:
        tag = tag.lower()
        if tag in _VOID_TAGS:
            # Reached for "<x/>" (the default handle_startendtag calls both
            # handlers) or a stray "</br>"; nothing was opened for it.
            return
        if tag in _SKIP_TAGS:
            self._skip_depth = max(0, self._skip_depth - 1)
            return
        if self._skip_depth:
            return
        if tag in self._stack:
            # Also closes any unclosed descendants left above the match.
            while self._stack.pop() != tag:
                pass
        if tag in _CELL_TAGS:
            # Separator consumed by _emit_text_block(join_cells=True).
            self._text_buf.append("\n")
            return
        if tag in _HEADING_LEVEL:
            self._emit_text_block(NewsBlockType.HEADING, _HEADING_LEVEL[tag])
            return
        if tag == "p" or tag == "li":
            self._emit_text_block(NewsBlockType.TEXT, None)
            return
        if tag == "tr":
            self._emit_text_block(NewsBlockType.TEXT, None, join_cells=True)
            return

    def handle_data(self, data: str) -> None:
        if self._skip_depth:
            return
        # Treat non-breaking spaces as ordinary spaces.
        text = data.replace("\xa0", " ")
        # Whitespace-only runs between tags are dropped.
        if text.strip():
            self._text_buf.append(text)

    def close(self) -> None:
        super().close()
        # Emit trailing text that was not closed by a block-level end tag.
        if not self._skip_depth:
            self._flush_text()

    def _emit_text_block(
        self,
        block_type: NewsBlockType,
        level: int | None,
        *,
        join_cells: bool = False,
    ) -> None:
        """Emit the buffered text as one block and clear the buffer.

        Nothing is emitted when the buffer holds no non-whitespace text.
        With ``join_cells`` the text is split on newlines and the non-empty
        parts are joined with ``" | "``.
        """
        text = "".join(self._text_buf).strip()
        self._text_buf.clear()
        if not text:
            return
        if join_cells:
            text = " | ".join(part.strip() for part in text.split("\n") if part.strip())
        self.blocks.append(
            NewsContentBlock(type=block_type, text=text, level=level)
        )

    def _flush_text(self) -> None:
        """Emit any buffered text as a plain TEXT block."""
        self._emit_text_block(NewsBlockType.TEXT, None)
96
+
97
+
98
def html_to_blocks(
    html: str,
    *,
    base_url: str,
    extra_boilerplate: frozenset[str] | None = None,
) -> list[NewsContentBlock]:
    """Convert an HTML fragment into an ordered list of content blocks.

    Args:
        html: HTML fragment; a falsy value is parsed as an empty string.
        base_url: Base against which non-absolute image URLs are resolved.
        extra_boilerplate: Additional exact text values to drop as
            boilerplate, or ``None``.

    Returns:
        The normalized blocks in document order.
    """
    parser = _ContentHTMLParser()
    parser.feed(html or "")
    parser.close()
    return _normalize_blocks(
        parser.blocks,
        base_url=base_url,
        extra_boilerplate=extra_boilerplate,
    )
113
+
114
+
115
def _normalize_blocks(
    blocks: list[NewsContentBlock],
    *,
    base_url: str,
    extra_boilerplate: frozenset[str] | None,
) -> list[NewsContentBlock]:
    """Clean raw parser blocks and merge adjacent text blocks.

    Image blocks get a stripped URL, resolved against ``base_url`` unless
    it already starts with ``http://`` or ``https://``, and an alt text
    that is ``None`` when blank. Other blocks are dropped when their text
    is blank, or when they are TEXT blocks recognized as boilerplate.
    """
    out: list[NewsContentBlock] = []
    for block in blocks:
        if block.type == NewsBlockType.IMAGE and block.url:
            url = block.url.strip()
            if not url.startswith(("http://", "https://")):
                url = urljoin(base_url, url)
            alt = (block.text or "").strip() or None
            out.append(
                NewsContentBlock(type=NewsBlockType.IMAGE, url=url, text=alt)
            )
            continue
        text = (block.text or "").strip()
        if not text:
            continue
        if block.type == NewsBlockType.TEXT and _is_boilerplate(
            text, extra_boilerplate
        ):
            continue
        out.append(
            NewsContentBlock(
                type=block.type,
                text=text,
                level=block.level,
                url=block.url,
            )
        )
    return _merge_adjacent_text(out)
148
+
149
+
150
def _merge_adjacent_text(blocks: list[NewsContentBlock]) -> list[NewsContentBlock]:
    """Merge each run of consecutive TEXT blocks into a single TEXT block.

    The texts of a run are joined with a newline. Blocks of any other type
    are passed through unchanged and end the current run.
    """
    merged: list[NewsContentBlock] = []
    for block in blocks:
        if (
            merged
            and block.type == NewsBlockType.TEXT
            and merged[-1].type == NewsBlockType.TEXT
        ):
            prev = merged[-1]
            merged[-1] = NewsContentBlock(
                type=NewsBlockType.TEXT,
                text=f"{prev.text}\n{block.text}",
            )
            continue
        merged.append(block)
    return merged
166
+
167
+
168
+ def _is_boilerplate(text: str, extra: frozenset[str] | None) -> bool:
169
+ lowered = text.lower()
170
+ if lowered in {"分享到:", "分享到:"}:
171
+ return True
172
+ if "copyright" in lowered and "square enix" in lowered:
173
+ return True
174
+ if extra and text.strip() in extra:
175
+ return True
176
+ return False
177
+
178
+
179
+ def _attr(attrs: list[tuple[str, str | None]], name: str) -> str | None:
180
+ for key, value in attrs:
181
+ if key.lower() == name and value:
182
+ return value.strip()
183
+ return None
@@ -0,0 +1,20 @@
1
+ from ff14_news.models import NewsArticle, NewsListItem
2
+
3
+
4
def article_from_list_item(
    item: NewsListItem,
    *,
    category_code: int | None = None,
) -> NewsArticle:
    """Convert a list item to an article: keep list-level fields, no blocks.

    Args:
        item: List item whose fields are copied onto the article.
        category_code: Optional category code to set on the article.

    Returns:
        A ``NewsArticle`` with an empty ``blocks`` list.
    """
    return NewsArticle(
        channel_id=item.channel_id,
        id=item.id,
        title=item.title,
        publish_date=item.publish_date,
        summary=item.summary,
        category_code=category_code,
        cover_image_url=item.cover_image_url,
        source_page_url=item.source_page_url,
        blocks=[],
    )
ff14_news/ff14_news.py ADDED
@@ -0,0 +1,64 @@
1
+ from pathlib import Path
2
+
3
+ from ff14_news.channel_protocol import NewsChannel
4
+ from ff14_news.channels.cn_official import CnOfficialChannel
5
+ from ff14_news.channels.cn_weibo import CnWeiboChannel
6
+ from ff14_news.channels.jp_official import JpOfficialChannel
7
+
8
+
9
class FF14News:
    """FF14 news aggregation facade.

    Each channel is implemented independently and exposed as an attribute.
    """

    def __init__(
        self,
        *,
        cn_official_timeout_seconds: float = 60.0,
        cn_weibo_timeout_seconds: float = 60.0,
        cn_weibo_cookie: str | None = None,
        cn_weibo_cookie_storage_path: Path | str | None = None,
        cn_weibo_browser_headless: bool = True,
        cn_weibo_proxy_url: str | None = None,
        jp_official_timeout_seconds: float = 120.0,
    ) -> None:
        """Create one instance of every channel.

        Args:
            cn_official_timeout_seconds: HTTP timeout for the CN official site.
            cn_weibo_timeout_seconds: HTTP timeout for the Weibo channel.
            cn_weibo_cookie: Full m.weibo.cn cookie string; when omitted the
                cookie is obtained automatically via Playwright.
            cn_weibo_cookie_storage_path: Playwright session cache path.
            cn_weibo_browser_headless: Whether the browser runs headless
                when the Weibo cookie is fetched automatically.
            cn_weibo_proxy_url: HTTP proxy for Weibo, e.g. ``127.0.0.1:7897``.
            jp_official_timeout_seconds: HTTP timeout for the Japanese
                Lodestone.
        """
        self.cn_official = CnOfficialChannel(
            timeout_seconds=cn_official_timeout_seconds,
        )
        # Accept str or Path and expand a leading "~"; None stays None.
        weibo_storage = (
            Path(cn_weibo_cookie_storage_path).expanduser()
            if cn_weibo_cookie_storage_path is not None
            else None
        )
        self.cn_weibo = CnWeiboChannel(
            timeout_seconds=cn_weibo_timeout_seconds,
            cookie=cn_weibo_cookie,
            cookie_storage_path=weibo_storage,
            browser_headless=cn_weibo_browser_headless,
            proxy_url=cn_weibo_proxy_url,
        )
        self.jp_official = JpOfficialChannel(
            timeout_seconds=jp_official_timeout_seconds,
        )

    def available_channels(self) -> list[str]:
        """Return the ids of all channels; each id is also an attribute name."""
        return ["cn_official", "cn_weibo", "jp_official"]

    def channel(self, channel_id: str) -> NewsChannel:
        """Return the channel instance registered under ``channel_id``.

        Raises:
            KeyError: If ``channel_id`` is not one of ``available_channels()``.
        """
        known_ids = self.available_channels()
        # Resolve through the id list so the two methods cannot drift apart.
        if channel_id in known_ids:
            return getattr(self, channel_id)
        known = ", ".join(known_ids)
        raise KeyError(f"unknown channel {channel_id!r}; known: {known}")
ff14_news/models.py ADDED
@@ -0,0 +1,74 @@
1
+ from datetime import datetime
2
+ from enum import StrEnum
3
+ from typing import Literal
4
+
5
+ from pydantic import BaseModel, Field
6
+
7
+
8
+ class NewsBlockType(StrEnum):
9
+ """正文块类型。"""
10
+
11
+ TEXT = "text"
12
+ HEADING = "heading"
13
+ IMAGE = "image"
14
+
15
+
16
class NewsContentBlock(BaseModel):
    """One ordered content block: plain text, a heading, or an image."""

    type: NewsBlockType = Field(description="块类型")
    text: str | None = Field(default=None, description="文本或标题内容")
    url: str | None = Field(default=None, description="图片绝对地址")
    level: int | None = Field(default=None, description="标题级别,1–6")
23
+
24
+
25
class NewsArticle(BaseModel):
    """A single news article.

    By default fetch_articles / fetch_articles_by_ids only populate the
    list-level fields and leave ``blocks`` empty. Content blocks must be
    fetched through each channel's fetch_article_detail.
    """

    channel_id: str = Field(description="渠道标识,如 cn_official")
    id: str = Field(description="渠道内文章 ID")
    title: str = Field(description="标题")
    publish_date: datetime = Field(description="发布时间")
    summary: str = Field(default="", description="摘要")
    category_code: int | None = Field(
        default=None,
        description="栏目编号;仅部分渠道有(如国服 cqnews)",
    )
    cover_image_url: str | None = Field(default=None, description="列表头图")
    source_page_url: str = Field(description="官网详情页 hash 链接")
    blocks: list[NewsContentBlock] = Field(
        default_factory=list,
        description="按阅读顺序排列的正文块",
    )
47
+
48
+
49
class NewsFeed(BaseModel):
    """Result of one fetch; article order matches the channel's news list."""

    channel_id: str = Field(description="渠道标识,如 cn_official")
    source_list_url: str = Field(description="列表页地址")
    category_code: int | None = Field(
        default=None,
        description="列表栏目编号;仅部分渠道有",
    )
    fetched_at: datetime = Field(description="抓取完成时间")
    articles: list[NewsArticle] = Field(
        default_factory=list,
        description="文章列表,顺序与列表 API 返回一致",
    )
63
+
64
+
65
class NewsListItem(BaseModel):
    """List-item metadata (article body not expanded)."""

    channel_id: str = Field(description="渠道标识,如 cn_official")
    id: str = Field(description="文章 ID")
    title: str = Field(description="标题")
    publish_date: datetime = Field(description="发布时间")
    summary: str = Field(default="", description="摘要")
    cover_image_url: str | None = Field(default=None, description="头图")
    source_page_url: str = Field(description="官网详情页链接")
@@ -0,0 +1,8 @@
1
+ Metadata-Version: 2.4
2
+ Name: python-library-ff14-news
3
+ Version: 0.0.0
4
+ Requires-Python: >=3.10
5
+ Requires-Dist: crawl4weibo>=0.1.0
6
+ Requires-Dist: pydantic>=2.0
7
+ Provides-Extra: dev
8
+ Requires-Dist: pytest>=8.0; extra == 'dev'
@@ -0,0 +1,31 @@
1
+ ff14_news/__init__.py,sha256=IZTOJge_RhqdAGpNwUbbWBKH_v7D5qTHcwJ4cVsR3sg,639
2
+ ff14_news/channel_protocol.py,sha256=p5dYCQK93jnRUQGrg6VvDCkR1FJkNbKFCXhoEVejA3Q,1260
3
+ ff14_news/ff14_news.py,sha256=eLz_8F7HCOiDCiJvbGqV-fmP3esqjIcvuX5j8G90ACU,2627
4
+ ff14_news/models.py,sha256=d8FrQcoBxWv5UEE7PdKr7Zd9-82w5PqrYfXpFjb46j8,2785
5
+ ff14_news/channels/__init__.py,sha256=SLvlzIcCrPiGxHh1DzMEZMa4myiTb_coAk7GUHVEBXM,71
6
+ ff14_news/channels/cn_official/__init__.py,sha256=JT6_rlJaSE5BUf7jmCGIvBfzx34DyYVi3S_Ua-6D1Oc,105
7
+ ff14_news/channels/cn_official/channel.py,sha256=YxPuHJczyZ8gFMKBGc7q_GJ-M-N0Gw2Op3z5xDlIn4c,3986
8
+ ff14_news/channels/cn_official/constants.py,sha256=9XfWkEFes82wOmKnuWraZeIlvgzB_BTLziRTIyz2pc4,429
9
+ ff14_news/channels/cn_official/cqnews_client.py,sha256=TpTLOVHgM_OP9LewvY9BQGryxQ6sZGh5-aqyev-yBLs,3829
10
+ ff14_news/channels/cn_official/html_content.py,sha256=kb8bfMwDXbnOnOlQLnnlQMKBjvNNrhs2m2k7kazniLI,359
11
+ ff14_news/channels/cn_weibo/__init__.py,sha256=8jhON62o1XsjUI8_3IN9p58a4YXXnLeE1z448ZFwxDQ,96
12
+ ff14_news/channels/cn_weibo/browser_cookies.py,sha256=IOi592r6mmNjeipgOMMhtMm9A8CcTeCEyRxOWCXZmik,3161
13
+ ff14_news/channels/cn_weibo/channel.py,sha256=YyGkvBK5ahh8FiegHw8UnqY7t45Zy1gaqwwm6AW_r44,5184
14
+ ff14_news/channels/cn_weibo/constants.py,sha256=ss0Zt27u_XPM1N7Dtei4MV6okB8LcM1VxIT53jwDBqA,363
15
+ ff14_news/channels/cn_weibo/crawl_backend.py,sha256=y_vd_gkywgyOCG2fkabFqgY-BxkNMp0op8kgpfCGmhg,5165
16
+ ff14_news/channels/cn_weibo/exceptions.py,sha256=A4BiOwzzR56yjmUE99xv0MOImoeyHMyjrHm4_HrI8ME,100
17
+ ff14_news/channels/cn_weibo/mblog_parser.py,sha256=ORohcZtToeDJJaXl9b13BxaIPBADT8XzKFfg8fbS53g,4965
18
+ ff14_news/channels/cn_weibo/post_adapter.py,sha256=bE7MdJYkEs61vusmGnpt2p5kkc2iLtd5FFvrlqg5aUc,3509
19
+ ff14_news/channels/cn_weibo/profile.py,sha256=8bntCHpR94Lu2Y54npss2NKD7_K3LXQ5_aMG464myio,292
20
+ ff14_news/channels/cn_weibo/proxy_url.py,sha256=0a7p-yICIJ6ICeCV_H9cVEx3058S2pOljId1IOuWGt8,416
21
+ ff14_news/channels/jp_official/__init__.py,sha256=7ro_82cfJgYJbGnyLWqRlcX21-o2wz_4Gwf8f418Nzw,105
22
+ ff14_news/channels/jp_official/channel.py,sha256=M0f3QSMfdYE_S7J8KMymlR8nsCl4mqx4ggJ1D7VhGW0,4021
23
+ ff14_news/channels/jp_official/constants.py,sha256=xh1PL70D69k7V1j2QZrMsBEilp7Fa7THxig3Jeft8uQ,263
24
+ ff14_news/channels/jp_official/detail_parser.py,sha256=jr8VEjjzf_degkiYKe04aH6Mz5g2NTphbv_ZS78DojM,3897
25
+ ff14_news/channels/jp_official/http_client.py,sha256=u4JqzRilOwouTLnc4amFqYF8RYn25p0dBZjbZHNyqbI,544
26
+ ff14_news/channels/jp_official/list_parser.py,sha256=bJ8sA3GQe_JFEVAp3igmpx7yJBGHmGbR0XVG8pvcPy4,3392
27
+ ff14_news/common/html_blocks.py,sha256=W3kz0hlti30Jc3R8lOIk43wevK5QDrO4KRHx84519jw,5778
28
+ ff14_news/common/list_feed.py,sha256=9-p3RRTD9g2zukeLvSJP7dlidrMl1hUHS-lUe23hYrI,596
29
+ python_library_ff14_news-0.0.0.dist-info/METADATA,sha256=UCq3aY8-HaIgyIRUqHceKidUkjZOEGF93rNvxlniZ-U,218
30
+ python_library_ff14_news-0.0.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
31
+ python_library_ff14_news-0.0.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.30.1
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any