wechat-article-parser 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wechat_article_parser/__init__.py +4 -0
- wechat_article_parser/models.py +49 -0
- wechat_article_parser/parser.py +472 -0
- wechat_article_parser-0.0.1.dist-info/METADATA +183 -0
- wechat_article_parser-0.0.1.dist-info/RECORD +7 -0
- wechat_article_parser-0.0.1.dist-info/WHEEL +4 -0
- wechat_article_parser-0.0.1.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class WeChatVerifyError(Exception):
    """Raised when WeChat returns a captcha / human-verification page
    instead of article content."""
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class ArticleResult:
    """Parsed result of a WeChat official-account (MP) article."""

    # Official-account information
    mp_id_b64: str = ""
    mp_id: int = 0
    mp_name: str = ""
    mp_alias: str = ""
    mp_image: str = ""
    mp_description: str = ""

    # Article information
    article_id: str = ""
    article_msg_id: int = 0
    article_idx: int = 0
    article_sn: str = ""
    article_title: str = ""
    article_cover_image: str = ""
    article_description: str = ""
    article_markdown: str = ""
    article_publish_time: int = 0

    # Image URLs extracted from the article body
    images: list[str] = field(default_factory=list)

    @property
    def is_valid(self) -> bool:
        """True when every key field was parsed (non-empty / non-zero)."""
        required = (
            self.mp_id,
            self.mp_name,
            self.article_id,
            self.article_msg_id,
            self.article_idx,
            self.article_sn,
            self.article_title,
            self.article_markdown,
            self.article_publish_time,
        )
        return all(required)
|
|
@@ -0,0 +1,472 @@
|
|
|
1
|
+
"""微信公众号文章解析器核心模块。"""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import base64
|
|
6
|
+
import html as html_module
|
|
7
|
+
import re
|
|
8
|
+
from urllib.parse import unquote
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
from bs4 import BeautifulSoup, Tag
|
|
12
|
+
from markdownify import MarkdownConverter
|
|
13
|
+
|
|
14
|
+
from .models import ArticleResult, WeChatVerifyError
|
|
15
|
+
|
|
16
|
+
# Default desktop-browser User-Agent sent with article requests; WeChat can
# serve different (or verification) pages to unrecognized clients.
_USER_AGENT = (
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/139.0.0.0 Safari/537.36"
)
# Default HTTP timeout in seconds, shared by the sync and async entry points.
_TIMEOUT = 15
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
# Markdown 转换器
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
class _MarkdownConverter(MarkdownConverter):
    """markdownify converter tuned for WeChat article HTML."""

    def convert_span(self, el, text, parent_tags):
        # WeChat wraps most text runs in <span>; emit just the stripped text.
        return text.strip()

    def convert_p(self, el, text, parent_tags):
        # Collapse internal whitespace runs before the base paragraph handling.
        text = " ".join(text.split())
        return super().convert_p(el, text, parent_tags)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _to_markdown(html: str) -> str:
    """Render an HTML fragment to Markdown using the tuned converter."""
    converter = _MarkdownConverter()
    return converter.convert(html)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ---------------------------------------------------------------------------
|
|
42
|
+
# 文本解码辅助函数
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
def _strip_html_tags(text: str) -> str:
    """Return only the text content of *text*, discarding any HTML tags."""
    parsed = BeautifulSoup(text, "html.parser")
    return parsed.get_text()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _decode_hex_escapes(text: str) -> str:
|
|
51
|
+
return re.sub(r"\\x([0-9a-fA-F]{2})", lambda m: chr(int(m.group(1), 16)), text)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _decode_text(text: str, *, preserve_newlines: bool = False) -> str:
    """Decode script-embedded text into displayable form.

    Applies ``\\xHH`` escape decoding, HTML entity unescaping, an extra
    ``&amp;`` -> ``&`` pass for double-escaped ampersands, then collapses
    whitespace.

    Args:
        text: Raw text captured from a page script.
        preserve_newlines: When True, newlines become ``<br>`` so line
            breaks survive the later Markdown conversion; otherwise
            newlines are removed.

    Returns:
        The decoded, whitespace-collapsed text ("" for falsy input).
    """
    if not text:
        return ""
    text = _decode_hex_escapes(text)
    text = html_module.unescape(text)
    # Handle double-escaped ampersands: "&amp;amp;" unescapes to "&amp;".
    # (The released code had a no-op `replace("&", "&")` here — almost
    # certainly a rendering artifact of the intended "&amp;" literal.)
    text = text.replace("&amp;", "&")
    if preserve_newlines:
        text = text.replace("\r", "").replace("\n", "<br>")
    else:
        text = text.replace("\r", "").replace("\n", "")
    return re.sub(r"\s+", " ", text)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# ---------------------------------------------------------------------------
|
|
68
|
+
# 图片链接标准化
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
|
|
71
|
+
def _normalize_image_url(url: str) -> str:
|
|
72
|
+
"""将微信图片链接标准化为 640px 宽度版本。"""
|
|
73
|
+
parts = url.split("/")
|
|
74
|
+
if len(parts) >= 5:
|
|
75
|
+
return f"{'/'.join(parts[:5])}/640"
|
|
76
|
+
return url
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _extract_picture_cdn_urls(script_text: str) -> list[str]:
    """Extract body-image ``cdn_url`` values from picture_page_info_list.

    Only top-level entries count: ``cdn_url`` occurrences nested under
    ``watermark_info`` or ``share_cover`` objects are excluded.
    """
    urls: list[str] = []
    seen: set[str] = set()
    # Group 1 captures an optional watermark_info/share_cover prefix so those
    # nested cdn_url fields can be recognized and skipped; group 2 is the
    # single-quoted URL itself.
    for m in re.finditer(r"(watermark_info|share_cover)?\s*(?::\s*\{[^}]*?)?\bcdn_url:\s*'([^']*)'", script_text):
        prefix = m.group(1)
        url = m.group(2)
        if prefix or not url:
            continue
        normalized = _normalize_image_url(url)
        # De-duplicate while preserving first-seen order.
        if normalized not in seen:
            seen.add(normalized)
            urls.append(normalized)
    return urls
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
# ---------------------------------------------------------------------------
|
|
96
|
+
# 内部提取函数
|
|
97
|
+
# ---------------------------------------------------------------------------
|
|
98
|
+
|
|
99
|
+
def _extract_article_id(url: str) -> str:
|
|
100
|
+
parts = url.split("/")
|
|
101
|
+
if len(parts) == 5 and len(parts[4]) == 22:
|
|
102
|
+
return parts[4]
|
|
103
|
+
return ""
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _extract_meta(soup: BeautifulSoup, result: ArticleResult) -> None:
    """Populate title / cover image / description on *result* from the
    page's Open Graph ``<meta>`` tags."""
    for prop, attr in [
        ("og:title", "article_title"),
        ("og:image", "article_cover_image"),
        ("og:description", "article_description"),
    ]:
        tag = soup.find("meta", attrs={"property": prop})
        if tag and tag.get("content"):
            value = tag["content"]
            # Cover-image URLs are kept verbatim; text fields are decoded
            # and stripped of any embedded HTML markup.
            if attr != "article_cover_image":
                value = _decode_text(value)
                value = _strip_html_tags(value)
            # Cap overly long descriptions at 2048 characters.
            if attr == "article_description" and len(value) > 2048:
                value = value[:2048]
            setattr(result, attr, value)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _extract_rich_text_meta(script_text: str, result: ArticleResult) -> None:
    """Extract account/article metadata from a rich-text article's scripts."""
    if "var hd_head_img" in script_text:
        m = re.search(r'var hd_head_img = "([^"]+)"', script_text)
        if m:
            result.mp_image = m.group(1)

        m = re.search(r"var nickname = htmlDecode\((.*)\);", script_text)
        if m:
            # The captured argument may use either quote style.
            result.mp_name = m.group(1).strip('"').strip("'")

        m = re.search(r'var profile_signature = "([^"]+)"', script_text)
        if m:
            result.mp_description = m.group(1)

        m = re.search(r"alias: '([^']+)'", script_text)
        if m:
            result.mp_alias = m.group(1)

    if "var oriCreateTime" in script_text:
        m = re.search(r"var oriCreateTime = '(\d+)'", script_text)
        if m:
            result.article_publish_time = int(m.group(1))

    if "window.__allowLoadResFromMp" in script_text:
        # Collect simple `var name = ...;` declarations, keeping the first
        # non-blank double-quoted literal on each right-hand side.
        variables: dict[str, str] = {}
        for m in re.finditer(r"var\s+(\w+)\s*=\s*(.*?);", script_text):
            literals = re.findall(r'"(.*?)"', m.group(2))
            variables[m.group(1)] = next((s for s in literals if s.strip()), "")

        biz = variables.get("biz", "")
        if biz:
            result.mp_id_b64 = biz
            try:
                result.mp_id = int(base64.b64decode(biz).decode())
            except Exception:
                # biz is not always a base64-encoded integer; leave mp_id 0.
                pass
        result.article_msg_id = int(variables["mid"]) if variables.get("mid") else 0
        result.article_idx = int(variables["idx"]) if variables.get("idx") else 0
        result.article_sn = variables.get("sn", "")
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _extract_swiper_meta(script_text: str, result: ArticleResult) -> None:
    """Extract metadata from the scripts of image-swiper / plain-text /
    video-share pages (the ``window.__initCgiDataConfig`` layout)."""
    if "window.__initCgiDataConfig =" in script_text:
        m = re.search(r"d\.hd_head_img.*?:\s*'([^']+)'", script_text)
        if m:
            result.mp_image = m.group(1)

        m = re.search(r"d\.nick_name.*?:\s*'([^']+)'", script_text)
        if m:
            result.mp_name = m.group(1).strip('"').strip("'")

        m = re.search(r"d\.biz.*?:\s*'([^']+)'", script_text)
        if m:
            biz = m.group(1)
            result.mp_id_b64 = biz
            try:
                result.mp_id = int(base64.b64decode(biz).decode())
            except Exception:
                # biz is not always a base64-encoded integer; leave mp_id 0.
                pass

        m = re.search(r"d\.mid.*?:\s*'([^']+)'", script_text)
        if m:
            result.article_msg_id = int(m.group(1))

        m = re.search(r"d\.idx.*?:\s*'([^']+)'", script_text)
        if m:
            result.article_idx = int(m.group(1))

        m = re.search(r"d\.sn.*?:\s*'([^']+)'", script_text)
        if m:
            result.article_sn = m.group(1)

        m = re.search(r"d\.create_time.*?:\s*'([^']+)'", script_text)
        if m:
            result.article_publish_time = int(m.group(1))

        if not result.article_id:
            # Fall back to deriving the article ID from the share link URL.
            m = re.search(r"d\.msg_link.*?:\s*'([^']+)'", script_text)
            if m:
                parts = m.group(1).split("/")
                if len(parts) == 5 and len(parts[4]) == 22:
                    result.article_id = parts[4]

    if "window.alias =" in script_text:
        m = re.search(r'window.alias = "([^"]+)"', script_text)
        if m:
            result.mp_alias = m.group(1)
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
# ---------------------------------------------------------------------------
|
|
215
|
+
# 内容提取
|
|
216
|
+
# ---------------------------------------------------------------------------
|
|
217
|
+
|
|
218
|
+
def _extract_rich_media_content(content_tag: Tag, result: ArticleResult) -> None:
    """Convert the ``rich_media_content`` block to Markdown, collecting body
    image URLs into ``result.images`` along the way."""
    soup = BeautifulSoup(content_tag.prettify(), "html.parser")
    seen: set[str] = set()

    # Handle <img> tags: keep http(s) sources (normalized to the 640px
    # variant) and drop anything else (data URIs, placeholders).
    to_remove = []
    for img in soup.find_all("img"):
        src = img.get("src") or img.get("data-src")
        if src and src.startswith("http"):
            normalized = _normalize_image_url(src)
            img["src"] = normalized
            if normalized not in seen:
                seen.add(normalized)
                result.images.append(normalized)
        else:
            to_remove.append(img)
    for tag in to_remove:
        tag.decompose()

    # Handle <svg> tags carrying a background-image.
    # These are usually decorative (dividers, textures): convert them to
    # <img> so they render in Markdown, but do NOT add them to `images`.
    to_remove = []
    for svg in soup.find_all("svg"):
        style = svg.get("style", "")
        if "background-image" in style:
            m = re.search(r'url\("([^"]+)"\)', style)
            if m:
                normalized = _normalize_image_url(m.group(1))
                new_img = soup.new_tag("img", src=normalized)
                svg.replace_with(new_img)
                continue
        to_remove.append(svg)
    for tag in to_remove:
        tag.decompose()

    result.article_markdown = _to_markdown(soup.prettify())
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _extract_repost_content(content_tag: Tag, result: ArticleResult) -> None:
    """Extract content from a repost-style ("share") article.

    The shared text lives in an inline ``innerHTML = "..."`` assignment next
    to the ``js_share_notice`` element; a link back to the original article
    is appended when available.
    """
    soup = BeautifulSoup(content_tag.prettify(), "html.parser")
    notice = soup.find("p", id="js_share_notice")
    if not notice:
        return

    m = re.search(r'innerHTML = "([^"]+)"', str(notice))
    if not m:
        return

    text = _decode_text(m.group(1), preserve_newlines=True)
    html_content = f"<p>{text}</p>"

    share_link = soup.find("span", id="js_share_source")
    if share_link:
        href = share_link.get("data-url")
        if href:
            # "查看原文" = "view original article" link back to the source.
            html_content += f'<p><a href="{href}">查看原文</a></p>'

    result.article_markdown = _to_markdown(html_content)
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _extract_plain_text_content(soup: BeautifulSoup, result: ArticleResult) -> None:
    """Extract content from a plain-text article.

    NOTE(review): the guard checks for ``var TextContentNoEncode =`` while
    the regex matches ``var ContentNoEncode = ...`` — presumably both appear
    in the same script on these pages; confirm against a live sample.
    """
    for script in soup.find_all("script", attrs={"type": "text/javascript"}):
        if "var TextContentNoEncode =" in script.text:
            m = re.search(
                r"var ContentNoEncode = window\.a_value_which_never_exists \|\| '([^']+)';",
                script.text,
            )
            if m:
                text = _decode_text(m.group(1), preserve_newlines=True)
                # The payload is additionally percent-encoded.
                text = unquote(text)
                result.article_markdown = _to_markdown(f"<p>{text}</p>")
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def _extract_swiper_content(soup: BeautifulSoup, result: ArticleResult) -> None:
    """Extract content from a Xiaohongshu-style image-swiper article:
    the carousel images followed by the optional ``window.desc`` caption."""
    for script in soup.find_all("script", attrs={"type": "text/javascript"}):
        if "window.picture_page_info_list =" not in script.text:
            continue

        result.images = _extract_picture_cdn_urls(script.text)

        html_parts = [f'<img src="{img}" /><br>' for img in result.images]

        m = re.search(r'window.desc = "([^"]+)"', script.text)
        if m:
            text = _decode_text(m.group(1), preserve_newlines=True)
            html_parts.append(f"<p>{text}</p>")

        if html_parts:
            result.article_markdown = _to_markdown("".join(html_parts))
|
|
311
|
+
|
|
312
|
+
|
|
313
|
+
def _extract_fullscreen_content(soup: BeautifulSoup, result: ArticleResult) -> None:
    """Extract content from a full-screen layout article (appmsg_type 10002).

    Text for these posts is stored in ``text_page_info.content``, encoded via
    ``JsDecode()``; images live in the ``cdn_url`` fields of
    ``picture_page_info_list``.
    """
    for script in soup.find_all("script", attrs={"type": "text/javascript"}):
        if "picture_page_info_list" not in script.text:
            continue

        html_parts: list[str] = []

        # Images from picture_page_info_list.
        result.images = _extract_picture_cdn_urls(script.text)
        for img in result.images:
            html_parts.append(f'<img src="{img}" /><br>')

        # Text from text_page_info.content_noencode or content — first
        # matching field wins.
        for field in ("content_noencode", "content"):
            m = re.search(
                rf"{field}:\s*JsDecode\('(.*?)'\)",
                script.text,
                re.DOTALL,
            )
            if m:
                text = _decode_text(m.group(1), preserve_newlines=True)
                text = unquote(text)
                html_parts.append(f"<p>{text}</p>")
                break

        if html_parts:
            result.article_markdown = _to_markdown("".join(html_parts))
        return
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
# ---------------------------------------------------------------------------
|
|
349
|
+
# 主解析流程
|
|
350
|
+
# ---------------------------------------------------------------------------
|
|
351
|
+
|
|
352
|
+
def _parse_html(url: str, html: str) -> ArticleResult:
    """Parse raw article HTML into an ArticleResult.

    Tries each known WeChat page layout in turn (rich text, repost, plain
    text, video share, image swiper, full-screen) and falls back to
    best-effort metadata extraction when none matches.

    Raises:
        WeChatVerifyError: If the HTML is a captcha / verification page.
    """
    # Detect WeChat's captcha / anti-bot verification page.
    if "secitptpage/template/verify.js" in html or "register_code" in html[:3000]:
        raise WeChatVerifyError(
            f"WeChat returned a verification page for {url}. "
            "This usually means the IP has been rate-limited. "
            "Try again later or use a different IP."
        )

    result = ArticleResult()
    soup = BeautifulSoup(html, "html.parser")

    result.article_id = _extract_article_id(url)
    _extract_meta(soup, result)

    scripts = soup.find_all("script", attrs={"type": "text/javascript"})

    # Try a standard rich-text article.
    content = soup.find("div", class_="rich_media_content")
    if content:
        for s in scripts:
            _extract_rich_text_meta(s.text, result)
        _extract_rich_media_content(content, result)
        return result

    # Try a repost-style article.
    content = soup.find("div", class_="original_page")
    if content:
        for s in scripts:
            _extract_rich_text_meta(s.text, result)
        _extract_repost_content(content, result)
        return result

    # Try a plain-text article.
    content = soup.find("p", id="js_text_desc")
    if content:
        for s in scripts:
            _extract_swiper_meta(s.text, result)
        _extract_plain_text_content(soup, result)
        # Long og:title values get trimmed to the first sentence (split on
        # the Chinese full stop), or to 30 chars if that is still too long.
        if result.article_title and len(result.article_title) > 50:
            short = result.article_title.split("。")[0]
            result.article_title = short if len(short) <= 50 else result.article_title[:30]
        return result

    # Try a video-share article.
    content = soup.find("div", id="js_common_share_desc_wrap")
    if content:
        for s in scripts:
            _extract_swiper_meta(s.text, result)
        _extract_plain_text_content(soup, result)
        return result

    # Try a Xiaohongshu-style image-swiper article.
    content = soup.find("div", class_="share_media_swiper_content")
    if content:
        for s in scripts:
            _extract_swiper_meta(s.text, result)
        _extract_swiper_content(soup, result)
        return result

    # Try a full-screen layout article (short text posts, appmsg_type 10002).
    content = soup.find("div", id="js_fullscreen_layout_padding")
    if content:
        for s in scripts:
            _extract_swiper_meta(s.text, result)
        _extract_fullscreen_content(soup, result)
        # Same long-title trimming as the plain-text branch.
        if result.article_title and len(result.article_title) > 50:
            short = result.article_title.split("。")[0]
            result.article_title = short if len(short) <= 50 else result.article_title[:30]
        return result

    # Fallback: extract whatever metadata is available.
    for s in scripts:
        _extract_rich_text_meta(s.text, result)
        _extract_swiper_meta(s.text, result)

    return result
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
# ---------------------------------------------------------------------------
|
|
433
|
+
# 公开接口
|
|
434
|
+
# ---------------------------------------------------------------------------
|
|
435
|
+
|
|
436
|
+
def parse(url: str, *, timeout: int = _TIMEOUT, user_agent: str | None = None) -> ArticleResult:
    """Fetch and parse a WeChat official-account article (synchronous).

    Args:
        url: The WeChat article URL.
        timeout: Request timeout in seconds.
        user_agent: Optional custom User-Agent; the built-in default is
            used when omitted.

    Returns:
        An ArticleResult holding the parsed data.

    Raises:
        WeChatVerifyError: If WeChat served a verification page.
        httpx.HTTPStatusError: If the HTTP status is not 2xx.
    """
    headers = {"User-Agent": user_agent or _USER_AGENT}
    response = httpx.get(
        url,
        headers=headers,
        timeout=timeout,
        follow_redirects=True,
    )
    response.raise_for_status()
    return _parse_html(url, response.text)
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
async def parse_async(url: str, *, timeout: int = _TIMEOUT, user_agent: str | None = None) -> ArticleResult:
    """Fetch and parse a WeChat official-account article (asynchronous).

    Args:
        url: The WeChat article URL.
        timeout: Request timeout in seconds.
        user_agent: Optional custom User-Agent; the built-in default is
            used when omitted.

    Returns:
        An ArticleResult holding the parsed data.

    Raises:
        WeChatVerifyError: If WeChat served a verification page.
        httpx.HTTPStatusError: If the HTTP status is not 2xx.
    """
    headers = {"User-Agent": user_agent or _USER_AGENT}
    async with httpx.AsyncClient(
        headers=headers,
        timeout=timeout,
        follow_redirects=True,
    ) as client:
        response = await client.get(url)
        response.raise_for_status()
        return _parse_html(url, response.text)
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: wechat-article-parser
|
|
3
|
+
Version: 0.0.1
|
|
4
|
+
Summary: WeChat MP article parser - extract metadata and content from WeChat public account articles
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
9
|
+
Requires-Dist: httpx>=0.27
|
|
10
|
+
Requires-Dist: markdownify>=0.13
|
|
11
|
+
Provides-Extra: dev
|
|
12
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == 'dev'
|
|
13
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
14
|
+
Description-Content-Type: text/markdown
|
|
15
|
+
|
|
16
|
+
# wechat-article-parser
|
|
17
|
+
|
|
18
|
+
微信公众号文章解析器。传入一个公众号文章链接,自动抓取页面并提取公众号信息、文章元数据及正文的 Markdown 内容。
|
|
19
|
+
|
|
20
|
+
支持的文章类型:
|
|
21
|
+
|
|
22
|
+
- 标准富文本文章
|
|
23
|
+
- 转载式文章
|
|
24
|
+
- 纯文本文章
|
|
25
|
+
- 视频分享类文章
|
|
26
|
+
- 小红书风格图片轮播文章
|
|
27
|
+
- 全屏布局短文本文章
|
|
28
|
+
|
|
29
|
+
## 安装
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install wechat-article-parser
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
**本地开发安装:**
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
git clone https://github.com/huanjuedadehen/wechat-article-parser.git
|
|
39
|
+
cd wechat-article-parser
|
|
40
|
+
pip install -e ".[dev]"
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
依赖项(会自动安装):
|
|
44
|
+
|
|
45
|
+
- Python >= 3.10
|
|
46
|
+
- httpx
|
|
47
|
+
- beautifulsoup4
|
|
48
|
+
- markdownify
|
|
49
|
+
|
|
50
|
+
## 使用方式
|
|
51
|
+
|
|
52
|
+
### 同步调用
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from wechat_article_parser import parse
|
|
56
|
+
|
|
57
|
+
result = parse("https://mp.weixin.qq.com/s/xxxxx")
|
|
58
|
+
print(result.article_title)
|
|
59
|
+
print(result.article_markdown)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### 异步调用
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from wechat_article_parser import parse_async
|
|
66
|
+
|
|
67
|
+
result = await parse_async("https://mp.weixin.qq.com/s/xxxxx")
|
|
68
|
+
print(result.article_title)
|
|
69
|
+
print(result.article_markdown)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### 可选参数
|
|
73
|
+
|
|
74
|
+
两个方法都支持以下可选参数:
|
|
75
|
+
|
|
76
|
+
- `timeout`:请求超时时间,单位秒,默认 15
|
|
77
|
+
- `user_agent`:自定义 User-Agent,不传则使用内置默认值
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
result = parse(
|
|
81
|
+
"https://mp.weixin.qq.com/s/xxxxx",
|
|
82
|
+
timeout=30,
|
|
83
|
+
user_agent="MyBot/1.0",
|
|
84
|
+
)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
## 返回结构
|
|
88
|
+
|
|
89
|
+
`parse` 和 `parse_async` 均返回 `ArticleResult` 数据类,包含以下字段:
|
|
90
|
+
|
|
91
|
+
| 字段 | 类型 | 说明 |
|
|
92
|
+
|---|---|---|
|
|
93
|
+
| `mp_id_b64` | `str` | 公众号 ID(Base64 编码原始值) |
|
|
94
|
+
| `mp_id` | `int` | 公众号 ID(解码后的整数) |
|
|
95
|
+
| `mp_name` | `str` | 公众号名称 |
|
|
96
|
+
| `mp_alias` | `str` | 公众号别名 |
|
|
97
|
+
| `mp_image` | `str` | 公众号头像链接 |
|
|
98
|
+
| `mp_description` | `str` | 公众号简介 |
|
|
99
|
+
| `article_id` | `str` | 文章 ID |
|
|
100
|
+
| `article_msg_id` | `int` | 文章所在的群发消息 ID |
|
|
101
|
+
| `article_idx` | `int` | 群发图文中的位置(从 1 开始) |
|
|
102
|
+
| `article_sn` | `str` | 文章签名(防伪校验) |
|
|
103
|
+
| `article_title` | `str` | 文章标题 |
|
|
104
|
+
| `article_cover_image` | `str` | 文章封面图链接 |
|
|
105
|
+
| `article_description` | `str` | 文章摘要 |
|
|
106
|
+
| `article_markdown` | `str` | 文章正文的 Markdown 内容 |
|
|
107
|
+
| `article_publish_time` | `int` | 发布时间(Unix 时间戳) |
|
|
108
|
+
| `images` | `list[str]` | 文章中提取的所有图片链接 |
|
|
109
|
+
| `is_valid` | `bool` | 关键字段是否全部解析成功(属性) |
|
|
110
|
+
|
|
111
|
+
`is_valid` 为 `True` 的条件:`mp_id`、`mp_name`、`article_id`、`article_msg_id`、`article_idx`、`article_sn`、`article_title`、`article_markdown`、`article_publish_time` 均不为空/零。
|
|
112
|
+
|
|
113
|
+
## 异常处理
|
|
114
|
+
|
|
115
|
+
### WeChatVerifyError
|
|
116
|
+
|
|
117
|
+
当请求频率过高或 IP 被限流时,微信会返回验证码页面而非文章内容。此时会抛出 `WeChatVerifyError`:
|
|
118
|
+
|
|
119
|
+
```python
|
|
120
|
+
from wechat_article_parser import parse, WeChatVerifyError
|
|
121
|
+
|
|
122
|
+
try:
|
|
123
|
+
result = parse("https://mp.weixin.qq.com/s/xxxxx")
|
|
124
|
+
except WeChatVerifyError:
|
|
125
|
+
print("触发了微信人机验证,请稍后重试或更换 IP")
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### httpx.HTTPStatusError
|
|
129
|
+
|
|
130
|
+
当 HTTP 请求返回非 2xx 状态码时,会抛出 `httpx.HTTPStatusError`:
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
import httpx
|
|
134
|
+
from wechat_article_parser import parse
|
|
135
|
+
|
|
136
|
+
try:
|
|
137
|
+
result = parse("https://mp.weixin.qq.com/s/xxxxx")
|
|
138
|
+
except httpx.HTTPStatusError as e:
|
|
139
|
+
print(f"请求失败: {e.response.status_code}")
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
### 解析不完整
|
|
143
|
+
|
|
144
|
+
部分文章类型可能无法提取所有字段(如某些文章没有封面图或摘要)。这种情况不会抛出异常,但 `result.is_valid` 会返回 `False`。建议在业务逻辑中检查:
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
result = parse("https://mp.weixin.qq.com/s/xxxxx")
|
|
148
|
+
if not result.is_valid:
|
|
149
|
+
print("部分关键字段未能解析")
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
## 测试
|
|
153
|
+
|
|
154
|
+
安装开发依赖:
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
pip install -e ".[dev]"
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### 运行全部测试
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
pytest tests/test_parser.py -v -s
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
### 只运行同步 / 异步测试
|
|
167
|
+
|
|
168
|
+
```bash
|
|
169
|
+
pytest tests/test_parser.py -v -s -k "sync"
|
|
170
|
+
pytest tests/test_parser.py -v -s -k "async"
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
### 测试单个链接 - 查看所有字段
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
pytest tests/test_parser.py::test_fetch_all -s --url "https://mp.weixin.qq.com/s/xxxxx"
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
### 测试单个链接 - 只看 Markdown 内容
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
pytest tests/test_parser.py::test_fetch_markdown -s --url "https://mp.weixin.qq.com/s/xxxxx"
|
|
183
|
+
```
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
wechat_article_parser/__init__.py,sha256=FPx8CRCMeG1IoFOsooFpwMa_p_vFbUZP0IxT_qiL_VY,166
|
|
2
|
+
wechat_article_parser/models.py,sha256=lVDZ2hhiVgzjP2bKg8-okOYq29uX_YI3NgA0XCWCNok,1264
|
|
3
|
+
wechat_article_parser/parser.py,sha256=4rno4tuUAyWb1ne5GK6aKzwHkDF1FWeGqt2mO9o7liw,16944
|
|
4
|
+
wechat_article_parser-0.0.1.dist-info/METADATA,sha256=d337mXYyCcHl3dDUp1Fpf9ZxhtWLzguDOX-LwSXt0C0,4852
|
|
5
|
+
wechat_article_parser-0.0.1.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
6
|
+
wechat_article_parser-0.0.1.dist-info/licenses/LICENSE,sha256=iMsnZ4LccklxTrSh5wtzlkVSfG4FLnO6o2lZnBWtCyk,1061
|
|
7
|
+
wechat_article_parser-0.0.1.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Gang
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|