wxmp 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
wxmp-1.0.0/PKG-INFO ADDED
@@ -0,0 +1,105 @@
1
+ Metadata-Version: 2.3
2
+ Name: wxmp
3
+ Version: 1.0.0
4
+ Summary: 微信公众号 API 等相关工具
5
+ Author: morning-start
6
+ Author-email: morning-start <morning-start@foxmail.com>
7
+ Requires-Dist: fake-useragent>=2.2.0
8
+ Requires-Dist: loguru>=0.7.3
9
+ Requires-Dist: pandas>=2.3.3
10
+ Requires-Dist: pydantic>=2.12.5
11
+ Requires-Dist: requests>=2.32.5
12
+ Requires-Dist: tqdm>=4.67.3
13
+ Requires-Dist: urllib3>=2.6.3
14
+ Requires-Python: >=3.10
15
+ Description-Content-Type: text/markdown
16
+
17
+ # 微信公众号 API 工具
18
+
19
+ 微信公众平台(微信公众号)API 相关工具,提供公众号搜索、文章列表获取、文章内容下载等功能。
20
+
21
+ **项目地址**: https://github.com/morning-start/wxmp
22
+
23
+ ## 项目简介
24
+
25
+ `wxmp` 是一个 Python 库,用于与微信公众平台后台 API 进行交互。通过本项目可以:
26
+
27
+ - 通过微信登录后的 cookies 获取访问 token
28
+ - 搜索公众号(根据关键词查找公众号信息)
29
+ - 获取指定公众号的文章列表
30
+ - 验证文章链接有效性
31
+ - 下载文章内容并转换为 Markdown 格式
32
+ - 支持时间范围缓存和增量更新
33
+ - 支持并发下载
34
+
35
+ ## 安装
36
+
37
+ ```bash
38
+ pip install wxmp
39
+ ```
40
+
41
+ 或从源码安装:
42
+
43
+ ```bash
44
+ git clone https://github.com/morning-start/wxmp.git
45
+ cd wxmp
46
+ pip install -e .
47
+ ```
48
+
49
+ ## 快速开始
50
+
51
+ ### 1. 初始化 API
52
+
53
+ ```python
54
+ from wxmp import WxMPAPI
55
+
56
+ cookies = {
57
+ "wxuin": "your_wxuin",
58
+ "pass_ticket": "your_pass_ticket",
59
+ }
60
+
61
+ api = WxMPAPI(cookies)
62
+ ```
63
+
64
+ ### 2. 搜索公众号
65
+
66
+ ```python
67
+ response = api.fetch_fakeid("Python")
68
+ for account in response.arr:
69
+ print(f"名称: {account.nickname}")
70
+ ```
71
+
72
+ ### 3. 使用时间范围爬虫(推荐)
73
+
74
+ ```python
75
+ from wxmp.spider.time_range_spider import TimeRange, TimeRangeSpider
76
+ from datetime import datetime
77
+
78
+ spider = TimeRangeSpider.from_cookies_file("cookies.json")
79
+ bizs = spider.load_or_search_bizs(["Python编程"])
80
+
81
+ time_range = TimeRange(
82
+ begin=datetime(2024, 1, 1),
83
+ end=datetime(2024, 12, 31)
84
+ )
85
+
86
+ df = spider.search_articles_content(bizs, time_range)
87
+ spider.save_all_article_content(df, save_dir="temp/article_content/")
88
+ ```
89
+
90
+ ## 文档
91
+
92
+ 详细文档请查看 [Wiki](./wiki/README.md):
93
+
94
+ - [项目概览](./wiki/项目概览.md) - 项目简介、技术栈、架构概览
95
+ - [架构设计](./wiki/架构设计.md) - 设计原则、模块设计、缓存策略
96
+ - [API 文档](./wiki/API文档.md) - API 层完整文档、数据模型、异常类
97
+ - [使用指南](./wiki/使用指南.md) - 快速开始、使用场景、最佳实践
98
+ - [数据流动与状态管理](./wiki/数据流动与状态管理.md) - 数据流转、状态机、缓存策略
99
+ - [贡献指南](./wiki/贡献指南.md) - 如何贡献代码、开发环境设置
100
+ - [常见问题](./wiki/常见问题.md) - 常见问题和解决方案
101
+ - [更新日志](./wiki/CHANGELOG.md) - 版本历史、变更记录
102
+
103
+ ## License
104
+
105
+ MIT License
wxmp-1.0.0/README.md ADDED
@@ -0,0 +1,89 @@
1
+ # 微信公众号 API 工具
2
+
3
+ 微信公众平台(微信公众号)API 相关工具,提供公众号搜索、文章列表获取、文章内容下载等功能。
4
+
5
+ **项目地址**: https://github.com/morning-start/wxmp
6
+
7
+ ## 项目简介
8
+
9
+ `wxmp` 是一个 Python 库,用于与微信公众平台后台 API 进行交互。通过本项目可以:
10
+
11
+ - 通过微信登录后的 cookies 获取访问 token
12
+ - 搜索公众号(根据关键词查找公众号信息)
13
+ - 获取指定公众号的文章列表
14
+ - 验证文章链接有效性
15
+ - 下载文章内容并转换为 Markdown 格式
16
+ - 支持时间范围缓存和增量更新
17
+ - 支持并发下载
18
+
19
+ ## 安装
20
+
21
+ ```bash
22
+ pip install wxmp
23
+ ```
24
+
25
+ 或从源码安装:
26
+
27
+ ```bash
28
+ git clone https://github.com/morning-start/wxmp.git
29
+ cd wxmp
30
+ pip install -e .
31
+ ```
32
+
33
+ ## 快速开始
34
+
35
+ ### 1. 初始化 API
36
+
37
+ ```python
38
+ from wxmp import WxMPAPI
39
+
40
+ cookies = {
41
+ "wxuin": "your_wxuin",
42
+ "pass_ticket": "your_pass_ticket",
43
+ }
44
+
45
+ api = WxMPAPI(cookies)
46
+ ```
47
+
48
+ ### 2. 搜索公众号
49
+
50
+ ```python
51
+ response = api.fetch_fakeid("Python")
52
+ for account in response.arr:
53
+ print(f"名称: {account.nickname}")
54
+ ```
55
+
56
+ ### 3. 使用时间范围爬虫(推荐)
57
+
58
+ ```python
59
+ from wxmp.spider.time_range_spider import TimeRange, TimeRangeSpider
60
+ from datetime import datetime
61
+
62
+ spider = TimeRangeSpider.from_cookies_file("cookies.json")
63
+ bizs = spider.load_or_search_bizs(["Python编程"])
64
+
65
+ time_range = TimeRange(
66
+ begin=datetime(2024, 1, 1),
67
+ end=datetime(2024, 12, 31)
68
+ )
69
+
70
+ df = spider.search_articles_content(bizs, time_range)
71
+ spider.save_all_article_content(df, save_dir="temp/article_content/")
72
+ ```
73
+
74
+ ## 文档
75
+
76
+ 详细文档请查看 [Wiki](./wiki/README.md):
77
+
78
+ - [项目概览](./wiki/项目概览.md) - 项目简介、技术栈、架构概览
79
+ - [架构设计](./wiki/架构设计.md) - 设计原则、模块设计、缓存策略
80
+ - [API 文档](./wiki/API文档.md) - API 层完整文档、数据模型、异常类
81
+ - [使用指南](./wiki/使用指南.md) - 快速开始、使用场景、最佳实践
82
+ - [数据流动与状态管理](./wiki/数据流动与状态管理.md) - 数据流转、状态机、缓存策略
83
+ - [贡献指南](./wiki/贡献指南.md) - 如何贡献代码、开发环境设置
84
+ - [常见问题](./wiki/常见问题.md) - 常见问题和解决方案
85
+ - [更新日志](./wiki/CHANGELOG.md) - 版本历史、变更记录
86
+
87
+ ## License
88
+
89
+ MIT License
@@ -0,0 +1,26 @@
1
+ [project]
2
+ name = "wxmp"
3
+ version = "1.0.0"
4
+ description = "微信公众号 API 等相关工具"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "morning-start", email = "morning-start@foxmail.com" }
8
+ ]
9
+ requires-python = ">=3.10"
10
+ dependencies = [
11
+ "fake-useragent>=2.2.0",
12
+ "loguru>=0.7.3",
13
+ "pandas>=2.3.3",
14
+ "pydantic>=2.12.5",
15
+ "requests>=2.32.5",
16
+ "tqdm>=4.67.3",
17
+ "urllib3>=2.6.3",
18
+ ]
19
+
20
+ [build-system]
21
+ requires = ["uv_build>=0.10.4,<0.11.0"]
22
+ build-backend = "uv_build"
23
+
24
+ [[tool.uv.index]]
25
+ url = "https://pypi.tuna.tsinghua.edu.cn/simple"
26
+ default = true
@@ -0,0 +1,22 @@
1
+ from .api import WxMPAPI
2
+ from .api.common import WxMPAPIError
3
+ from .api.list_ex import ArticleListItem, ListExError, ListExRequest, ListExResponse
4
+ from .api.search_biz import SearchBizError, SearchBizRequest, SearchBizResponse
5
+ from .api.token import TokenError, TokenResponse
6
+
7
# Keep in sync with `version` in pyproject.toml (this release is 1.0.0).
__version__ = "1.0.0"
8
+
9
+ __all__ = [
10
+ "__version__",
11
+ "WxMPAPI",
12
+ "WxMPAPIError",
13
+ "TokenError",
14
+ "TokenResponse",
15
+ "SearchBizError",
16
+ "SearchBizRequest",
17
+ "SearchBizResponse",
18
+ "ListExError",
19
+ "ListExRequest",
20
+ "ListExResponse",
21
+ "ArticleListItem",
22
+ ]
@@ -0,0 +1,18 @@
1
"""Public surface of the ``api`` sub-package.

Re-exports the client class, the request/response models and the exception
types so callers can import them from one place.
"""

# WxMPAPIError is listed in __all__ below, so it must actually be bound here;
# otherwise `from <package>.api import *` raises AttributeError.
from .common import WxMPAPIError
from .index import WxMPAPI
from .list_ex import ArticleListItem, ListExError, ListExRequest, ListExResponse
from .search_biz import SearchBizError, SearchBizRequest, SearchBizResponse
from .token import TokenError, TokenResponse

__all__ = [
    "TokenResponse",
    "SearchBizResponse",
    "ListExResponse",
    "SearchBizRequest",
    "ListExRequest",
    "WxMPAPI",
    "WxMPAPIError",
    "TokenError",
    "SearchBizError",
    "ListExError",
    "ArticleListItem",
]
@@ -0,0 +1,47 @@
1
+ from typing import Optional
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
+ class WxMPAPIError(Exception):
7
+ """微信MP API基础异常类"""
8
+
9
+ pass
10
+
11
+
12
+ class BaseRequest(BaseModel):
13
+ """API基础请求参数"""
14
+
15
+ token: str = Field(description="授权token")
16
+ begin: int = Field(default=0, description="列表起始位置")
17
+ count: int = Field(default=5, description="返回消息条数,一个消息可能包含多个文章")
18
+ lang: str = Field(default="zh_CN", description="语言")
19
+ f: str = Field(default="json", description="数据格式")
20
+ ajax: int = Field(default=1, description="是否为ajax请求")
21
+
22
+
23
+ class BaseResp(BaseModel):
24
+ """基础响应对象(微信API标准格式)"""
25
+
26
+ ret: int = Field(description="返回码,0表示成功")
27
+ err_msg: str = Field(default="ok", description="返回信息")
28
+
29
+
30
+ class BaseResponse(BaseModel):
31
+ """API基础响应模型(带base_resp嵌套格式)"""
32
+
33
+ base_resp: BaseResp = Field(description="基础响应对象")
34
+
35
+
36
+ class ErrorDetail(BaseModel):
37
+ """错误详情模型"""
38
+
39
+ code: int = Field(description="错误码")
40
+ message: str = Field(description="错误信息")
41
+ details: Optional[str] = Field(default=None, description="错误详情")
42
+
43
+
44
+ class ErrorResponse(BaseModel):
45
+ """错误响应模型"""
46
+
47
+ base_resp: ErrorDetail = Field(description="基础响应")
@@ -0,0 +1,135 @@
1
+ import asyncio
2
+ import re
3
+ import warnings
4
+
5
+ import requests
6
+ from fake_useragent import UserAgent
7
+ from tqdm.asyncio import tqdm as tqdm_asyncio
8
+ from urllib3.exceptions import InsecureRequestWarning
9
+
10
+ from .list_ex import ListExError, ListExRequest, ListExResponse
11
+ from .search_biz import SearchBizError, SearchBizRequest, SearchBizResponse
12
+ from .token import TokenError
13
+
14
+ warnings.filterwarnings("ignore", category=InsecureRequestWarning)
15
+
16
+
17
class WxMPAPI:
    """Minimal client for the WeChat Official Accounts Platform web backend.

    All backend calls go to ``https://mp.weixin.qq.com`` through one
    ``requests.Session`` and carry the caller-supplied login cookies. Those
    calls are made with ``verify=False`` (TLS certificate verification off).
    """

    def __init__(self, cookies: dict) -> None:
        """Store the login cookies and prepare headers and an HTTP session.

        Args:
            cookies: Cookie name/value pairs of a logged-in browser session.
        """
        self.cookies = cookies
        self.domain = "https://mp.weixin.qq.com"
        self.headers = {
            "User-Agent": UserAgent().random,
            "Host": "mp.weixin.qq.com",
            "Referer": "https://mp.weixin.qq.com/",
        }
        self.session = requests.Session()
        # Filled in lazily by _ensure_token(), or eagerly by subclasses.
        self.token = None

    def _fetch_token(self) -> str:
        """Request the site root and read the numeric token from the final URL.

        Returns:
            The digits following ``token=`` in the response URL.

        Raises:
            TokenError: On an HTTP error status, on any other request failure,
                or when the final URL carries no ``token=<digits>`` part.
        """
        url = self.domain
        try:
            res = self.session.get(
                url=url, headers=self.headers, cookies=self.cookies, verify=False
            )
            res.raise_for_status()
        except requests.HTTPError as e:
            raise TokenError(f"HTTP请求失败: {e.response.status_code}") from e
        except Exception as e:
            raise TokenError(f"获取token时发生错误: {str(e)}") from e

        # Kept outside the try block so this TokenError is not caught and
        # wrapped a second time by the generic handler above.
        token = re.findall(r".*?token=(\d+)", res.url)
        if token:
            return token[0]
        raise TokenError("从重定向URL中提取token失败")

    def _ensure_token(self) -> str:
        """Return ``self.token``, fetching it first if it is still ``None``.

        Raises:
            TokenError: If the token has to be fetched and that fails.
        """
        if self.token is None:
            self.token = self._fetch_token()
        return self.token

    def fetch_fakeid(
        self, query: str, begin: int = 0, count: int = 5
    ) -> SearchBizResponse:
        """Search official accounts by keyword.

        Args:
            query: Search keyword.
            begin: Offset of the first result.
            count: Number of results to request.

        Returns:
            The parsed search response.

        Raises:
            TokenError: If no token is set yet and fetching one fails.
            SearchBizError: On an HTTP error status or any other failure while
                requesting or parsing the response.
        """
        url = self.domain + "/cgi-bin/searchbiz"
        params = SearchBizRequest(
            action="search_biz",
            begin=begin,
            count=count,
            query=query,
            token=self._ensure_token(),
        )
        try:
            res = self.session.get(
                url=url,
                params=params.model_dump(),
                headers=self.headers,
                cookies=self.cookies,
                verify=False,
            )
            res.raise_for_status()
            return SearchBizResponse(**res.json())
        except requests.HTTPError as e:
            raise SearchBizError(f"HTTP请求失败: {e.response.status_code}") from e
        except Exception as e:
            raise SearchBizError(f"搜索公众号时发生错误: {str(e)}") from e

    def fetch_article_list(
        self, fakeid: str, begin: int = 0, count: int = 5
    ) -> ListExResponse:
        """Fetch one page of an account's article list.

        Args:
            fakeid: Account identifier, as returned by ``fetch_fakeid``.
            begin: Offset of the first message.
            count: Number of messages to request.

        Returns:
            The parsed article-list response.

        Raises:
            TokenError: If no token is set yet and fetching one fails.
            ListExError: On an HTTP error status or any other failure while
                requesting or parsing the response.
        """
        url = self.domain + "/cgi-bin/appmsg"
        params = ListExRequest(
            begin=begin,
            count=count,
            fakeid=fakeid,
            token=self._ensure_token(),
        )
        try:
            res = self.session.get(
                url=url,
                params=params.model_dump(),
                headers=self.headers,
                cookies=self.cookies,
                verify=False,
            )
            res.raise_for_status()
            return ListExResponse(**res.json())
        except requests.HTTPError as e:
            raise ListExError(f"HTTP请求失败: {e.response.status_code}") from e
        except Exception as e:
            raise ListExError(f"获取文章列表时发生错误: {str(e)}") from e

    @staticmethod
    def is_valid_article_link(link: str) -> bool:
        """Return whether an article link is considered usable.

        A link is rejected when it is empty or contains ``tempkey=``, which
        this project treats as the marker of a deleted or expired article.
        """
        if not link:
            return False
        return "tempkey=" not in link

    @staticmethod
    def fetch_article_content(link: str, timeout: int = 10) -> str:
        """Download an article page and return its text.

        Args:
            link: Article URL.
            timeout: Request timeout passed to ``requests.get``.

        Raises:
            requests.HTTPError: If the response has an error status.
        """
        headers = {"User-Agent": UserAgent().random}
        response = requests.get(link, headers=headers, timeout=timeout)
        response.raise_for_status()
        return response.text

    @staticmethod
    async def fetch_multi_article_content(
        links: list[str], timeout: int = 10
    ) -> list[str]:
        """Download several article pages, one worker thread per link.

        Args:
            links: Article URLs.
            timeout: Per-request timeout.

        Returns:
            Page texts, as returned by ``tqdm_asyncio.gather`` for ``links``.
        """
        if not links:
            return []

        # asyncio.to_thread needs a plain (synchronous) callable. Handing it a
        # coroutine function only yields un-awaited coroutine objects, so the
        # blocking fetch_article_content is used directly.
        tasks = [
            asyncio.to_thread(WxMPAPI.fetch_article_content, link, timeout)
            for link in links
        ]

        results = await tqdm_asyncio.gather(*tasks, desc="获取文章内容", unit="篇")

        return results
@@ -0,0 +1,55 @@
1
+ from datetime import datetime
2
+ from typing import List
3
+
4
+ from pydantic import BaseModel, Field, field_serializer
5
+
6
+ from .common import BaseRequest, BaseResponse, WxMPAPIError
7
+
8
+
9
+ class ListExError(WxMPAPIError):
10
+ """获取文章列表失败异常"""
11
+
12
+ pass
13
+
14
+
15
+ class ListExRequest(BaseRequest):
16
+ """获取文章列表请求参数"""
17
+
18
+ query: str = Field(default="", description="查询字符串")
19
+ action: str = Field(default="list_ex", description="动作")
20
+ fakeid: str = Field(description="公众号ID")
21
+ type: int = Field(default=9, description="类型")
22
+
23
+
24
+ class ArticleListItem(BaseModel):
25
+ """文章列表项"""
26
+
27
+ aid: str = Field(description="文章ID(格式:{appmsgid}_{idx})")
28
+ appmsgid: int = Field(description="文章消息ID")
29
+ cover: str = Field(description="封面图URL")
30
+ create_time: int = Field(description="创建时间戳")
31
+ digest: str = Field(description="文章摘要")
32
+ is_pay_subscribe: int = Field(default=0, description="是否付费订阅")
33
+ item_show_type: int = Field(default=0, description="展示类型")
34
+ itemidx: int = Field(description="文章索引")
35
+ link: str = Field(description="文章链接")
36
+ tagid: List[str] = Field(default_factory=list, description="标签ID列表")
37
+ title: str = Field(description="文章标题")
38
+ update_time: int = Field(description="更新时间戳")
39
+
40
+ @field_serializer("create_time", "update_time")
41
+ def serialize_timestamp(self, value: int) -> str:
42
+ """将时间戳转换为格式化的日期时间字符串"""
43
+ return datetime.fromtimestamp(value).strftime("%Y-%m-%d %H:%M:%S")
44
+
45
+ @field_serializer("tagid")
46
+ def serialize_tagid(self, value: List[str]) -> str:
47
+ """将标签ID列表转换为逗号分隔的字符串"""
48
+ return ",".join(value) if value else ""
49
+
50
+
51
+ class ListExResponse(BaseResponse):
52
+ """文章列表API响应(list_ex)"""
53
+
54
+ app_msg_cnt: int = Field(description="文章总数")
55
+ app_msg_list: List[ArticleListItem] = Field(description="文章列表")
@@ -0,0 +1,37 @@
1
+ from typing import List
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+ from .common import BaseRequest, BaseResponse, WxMPAPIError
6
+
7
+
8
+ class SearchBizError(WxMPAPIError):
9
+ """搜索公众号失败异常"""
10
+
11
+ pass
12
+
13
+
14
+ class SearchBizRequest(BaseRequest):
15
+ """搜索公众号请求参数"""
16
+
17
+ query: str = Field(description="查询字符串")
18
+ action: str = Field(default="search_biz", description="动作")
19
+
20
+
21
+ class AccountInfo(BaseModel):
22
+ """公众号信息"""
23
+
24
+ fakeid: str = Field(description="公众号fakeid(用于获取文章列表)")
25
+ nickname: str = Field(description="公众号名称")
26
+ alias: str = Field(default="", description="公众号微信号")
27
+ round_head_img: str = Field(description="公众号头像URL")
28
+ service_type: int = Field(description="服务类型")
29
+ signature: str = Field(description="公众号签名")
30
+ verify_status: int = Field(description="验证状态")
31
+
32
+
33
+ class SearchBizResponse(BaseResponse):
34
+ """搜索公众号API响应"""
35
+
36
+ arr: List[AccountInfo] = Field(description="公众号列表")
37
+ total: int = Field(description="搜索结果总数")
@@ -0,0 +1,15 @@
1
+ from pydantic import Field
2
+
3
+ from .common import BaseResponse, WxMPAPIError
4
+
5
+
6
+ class TokenError(WxMPAPIError):
7
+ """Token获取失败异常"""
8
+
9
+ pass
10
+
11
+
12
+ class TokenResponse(BaseResponse):
13
+ """Token获取响应"""
14
+
15
+ redirect_url: str = Field(description="重定向URL(包含token)")
File without changes
@@ -0,0 +1,3 @@
1
+ from .time_range_spider import TimeRangeSpider
2
+
3
+ __all__ = ["TimeRangeSpider"]
@@ -0,0 +1,433 @@
1
+ import time
2
+ from concurrent.futures import ThreadPoolExecutor, as_completed
3
+ from datetime import datetime
4
+ from pathlib import Path
5
+ from typing import Literal, NamedTuple
6
+
7
+ import pandas as pd
8
+ from loguru import logger
9
+ from pydantic import BaseModel, Field, field_serializer
10
+ from tqdm import tqdm
11
+
12
+ from wxmp.api import ArticleListItem, SearchBizError, TokenError, WxMPAPI
13
+ from wxmp.tools import load_json, sanitize_filename, save_article_content, save_json
14
+
15
+
16
+ class ArticleDownloadTask(NamedTuple):
17
+ url: str
18
+ title: str
19
+ save_dir: Path
20
+ save_file: Literal["md", "html"] = "md"
21
+ max_retries: int = 3
22
+ timeout: int = 30
23
+ date_str: str = ""
24
+ account_name: str = ""
25
+ digest: str = ""
26
+ min_file_size_kb: int = 3
27
+
28
+
29
class TimeRange(BaseModel):
    """Article time range, also used as the persisted cache metadata."""

    begin: datetime
    # default_factory evaluates "now" for every new instance. A plain
    # `= datetime.today()` default is computed once, when the class is
    # defined, and would go stale in a long-running process.
    end: datetime = Field(default_factory=datetime.today)

    @field_serializer("begin", "end")
    def serialize_datetime(self, dt: datetime) -> str:
        """Serialize a datetime as a ``YYYY-MM-DD`` string."""
        return dt.strftime("%Y-%m-%d")
39
+
40
+
41
class TimeRangeSpider(WxMPAPI):
    """Crawler on top of ``WxMPAPI`` with on-disk caches.

    It caches account name -> fakeid mappings in a JSON file, keeps one CSV of
    article metadata plus one JSON time-range record per account, and
    downloads article bodies concurrently.
    """

    def __init__(self, cookies: dict[str, str]) -> None:
        """Create the client and fetch the access token immediately.

        Raises:
            TokenError: If the token cannot be obtained (logged, then re-raised).
        """
        super().__init__(cookies)
        try:
            self.token = self._fetch_token()
            logger.info(f"获取token成功: {self.token}")
        except TokenError as e:
            logger.error(f"❌ 获取token失败: {str(e)}")
            raise

    @classmethod
    def from_cookies_file(cls, file_path: str) -> "TimeRangeSpider":
        """Build a spider from a JSON file holding cookies under "请求 Cookie"."""
        data = load_json(file_path)
        cookies = data["请求 Cookie"]
        return cls(cookies)

    def load_or_search_bizs(
        self,
        gzh_names: list[str] | None = None,
        cache_file: Path = Path("temp/fakeids.json"),
    ) -> dict[str, str]:
        """Resolve account names to fakeids, using and updating a JSON cache.

        Args:
            gzh_names: Account names to resolve. When empty or ``None``, every
                entry of the cache file is returned.
            cache_file: Path of the JSON cache.

        Returns:
            Mapping of account nickname to fakeid. Entries found by search are
            keyed by the nickname of the first search result, which may differ
            from the requested name.
        """
        cache_file = Path(cache_file)
        cached_bizs: dict[str, str] = {}
        bizs: dict[str, str] = {}

        if cache_file.exists():
            logger.info(f"从缓存加载fakeids: {cache_file}")
            cached_bizs = load_json(cache_file)

            if not gzh_names:
                bizs = cached_bizs
                logger.info(f"使用缓存文件中的所有公众号,共 {len(bizs)} 个")
                return bizs

            bizs = {
                name: cached_bizs[name]
                for name in set(gzh_names) & set(cached_bizs)
            }
        elif not gzh_names:
            logger.warning("缓存文件不存在且未指定公众号名称,返回空字典")
            return {}

        need_names = set(gzh_names) - set(bizs)
        if need_names:
            logger.info(f"从网络获取fakeids: {need_names}")
            for name in need_names:
                try:
                    result = self.fetch_fakeid(name)
                except SearchBizError as e:
                    logger.error(f"搜索公众号失败: {name}, 错误: {str(e)}")
                    continue
                if result.arr:
                    nickname = result.arr[0].nickname
                    fakeid = result.arr[0].fakeid
                    bizs[nickname] = fakeid
                    logger.info(f"成功获取公众号: {nickname} -> {fakeid}")
                else:
                    logger.warning(f"公众号搜索结果为空: {name}")

            # Merge into the existing cache instead of replacing it, so that
            # accounts cached earlier but not requested now are not lost.
            # The cache directory may not exist on the first run.
            cache_file.parent.mkdir(parents=True, exist_ok=True)
            save_json({**cached_bizs, **bizs}, cache_file)

        return bizs

    def search_article_list(
        self, fakeid: str, begin: int, count: int
    ) -> list[ArticleListItem]:
        """Fetch one page of articles and drop entries with invalid links.

        Args:
            fakeid: Account fakeid.
            begin: Offset of the first message.
            count: Number of messages to request.

        Returns:
            The page's articles whose link passes ``is_valid_article_link``.
        """
        page = self.fetch_article_list(fakeid, begin, count)
        return [
            article
            for article in page.app_msg_list
            if self.is_valid_article_link(article.link)
        ]

    def search_articles(
        self,
        fakeid: str,
        max_count: int | None = None,
        time_range: TimeRange | None = None,
        nickname: str | None = None,
    ) -> list[ArticleListItem]:
        """Page through an account's articles until a stop condition is met.

        Paging stops when a page comes back empty, when the last article of a
        page was created before ``time_range.begin``, or when at least
        ``max_count`` valid articles have been collected.

        Args:
            fakeid: Account fakeid.
            max_count: Optional cap on the number of collected articles; the
                result is not truncated to exactly this number.
            time_range: Optional range; only its ``begin`` bounds the paging.
            nickname: Optional account name used in log messages; the fakeid
                is used when it is not given.

        Returns:
            Valid articles from every fetched page. The last page may contain
            articles older than ``time_range.begin``.
        """
        EACH_COUNT = 5
        label = nickname if nickname else fakeid
        all_articles: list[ArticleListItem] = []
        begin = 0
        while True:
            # The inline comments of the original state that pages arrive
            # newest first.
            page = self.fetch_article_list(fakeid, begin, EACH_COUNT).app_msg_list
            begin += EACH_COUNT
            # Test the raw page for emptiness: a page made up solely of
            # invalid links must not be mistaken for the end of the list.
            if not page:
                logger.warning(f"公众号「{label}」获取到的文章为空,停止获取")
                break
            all_articles += [
                article
                for article in page
                if self.is_valid_article_link(article.link)
            ]
            if time_range and page[-1].create_time < time_range.begin.timestamp():
                break
            if max_count and len(all_articles) >= max_count:
                logger.warning(
                    f"公众号「{label}」获取到的文章数量 {len(all_articles)} 已超过最大数量 {max_count},停止获取"
                )
                break

        return all_articles

    @staticmethod
    def get_remaining_time_range(
        meta_file: Path, need_time: TimeRange
    ) -> tuple[TimeRange | None, TimeRange]:
        """Work out which part of ``need_time`` is not covered by the cache.

        Args:
            meta_file: JSON file holding the cached time range of an account.
            need_time: Requested time range.

        Returns:
            ``(remaining_range, new_meta)``. Without a metadata file both
            elements are ``need_time``. ``remaining_range`` is ``None`` when
            nothing is left to fetch.
        """
        if not meta_file.exists():
            return need_time, need_time
        meta_time = TimeRange(**load_json(meta_file))
        return match_remaining_time_range(meta_time, need_time)

    def search_articles_content(
        self,
        bizs: dict[str, str],
        time_range: TimeRange,
        save_dir: Path = Path("temp/articles_info/"),
    ) -> pd.DataFrame:
        """Fetch article metadata for several accounts and merge it.

        For each account the uncached part of ``time_range`` is fetched,
        merged with the account's existing CSV, de-duplicated by title,
        sorted by creation time (newest first) and written back together with
        the updated time-range metadata.

        Args:
            bizs: Mapping of account nickname to fakeid.
            time_range: Requested time range.
            save_dir: Directory for the per-account CSV and JSON files.

        Returns:
            All existing per-account CSVs of ``bizs`` concatenated, with a
            ``nickname`` column; an empty DataFrame when there are none.
        """
        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)

        for nickname, fakeid in bizs.items():
            safe_nickname = sanitize_filename(nickname)
            save_path = save_dir / f"{safe_nickname}.csv"
            meta_path = save_dir / f"{safe_nickname}.json"
            remaining_range, new_meta_info = self.get_remaining_time_range(
                meta_path, time_range
            )
            if remaining_range is None:
                logger.debug(
                    f"公众号 {nickname} 剩余时间范围 {remaining_range},元数据 {new_meta_info}"
                )
                logger.info(f"公众号 {nickname} 已经获取到所有文章,跳过")
                continue
            articles = self.search_articles(
                fakeid, time_range=remaining_range, nickname=nickname
            )
            if not articles:
                logger.warning(f"公众号 {nickname} 没有获取到有效文章")
                continue

            df_articles = pd.DataFrame([article.model_dump() for article in articles])

            if save_path.exists():
                df_existing = pd.read_csv(save_path)
                df_articles = pd.concat([df_articles, df_existing], ignore_index=True)

            # Freshly fetched rows come first, so keep="first" prefers them.
            df_articles = df_articles.drop_duplicates(
                subset=["title"], keep="first", ignore_index=True
            )
            df_articles["create_time"] = pd.to_datetime(df_articles["create_time"])
            df_articles = df_articles.sort_values(
                by="create_time", ascending=False, ignore_index=True
            )

            df_articles.to_csv(save_path, index=False, encoding="utf-8-sig")
            save_json(new_meta_info.model_dump(), meta_path)

        # Label rows with the real account name rather than the sanitised
        # file stem, which can differ when the name has illegal characters.
        frames = []
        for nickname in bizs:
            csv_path = save_dir / f"{sanitize_filename(nickname)}.csv"
            if csv_path.exists():
                frames.append(pd.read_csv(csv_path).assign(nickname=nickname))
        if frames:
            return pd.concat(frames, ignore_index=True)
        return pd.DataFrame()

    @staticmethod
    def download_article_content(task: ArticleDownloadTask) -> bool:
        """Download one article and save it, retrying on errors.

        Args:
            task: Download parameters for one article.

        Returns:
            ``True`` if the target file already exists or was saved;
            ``False`` if saving was rejected or every attempt raised.
        """
        max_retries = task.max_retries

        task.save_dir.mkdir(parents=True, exist_ok=True)

        safe_title = sanitize_filename(task.title)
        save_path = task.save_dir / f"{safe_title}.{task.save_file}"

        if save_path.exists():
            return True

        for attempt in range(max_retries):
            try:
                content = WxMPAPI.fetch_article_content(task.url, timeout=task.timeout)
                return save_article_content(
                    content,
                    save_path,
                    task.save_file,
                    title=task.title,
                    date_str=task.date_str,
                    link=task.url,
                    account_name=task.account_name,
                    digest=task.digest,
                    min_file_size_kb=task.min_file_size_kb,
                )
            except Exception as e:
                if attempt == max_retries - 1:
                    logger.error(
                        f"获取文章内容失败(重试{max_retries}次后): {task.title}, 错误: {e}"
                    )
                    return False
                time.sleep(1)
        return False

    @staticmethod
    def save_all_article_content(
        df: pd.DataFrame,
        save_dir: Path = Path("temp/article_content/"),
        max_workers: int = 5,
        time_range: TimeRange | None = None,
        save_file: Literal["md", "html"] = "md",
        min_file_size_kb: int = 3,
    ):
        """Download every article of ``df`` concurrently.

        Args:
            df: Article metadata; the ``create_time``, ``nickname``, ``link``
                and ``title`` columns are read, ``digest`` if present.
            save_dir: Root output directory (a ``str`` is accepted too); each
                account gets its own sub-directory.
            max_workers: Size of the download thread pool.
            time_range: Optional filter on ``create_time`` (inclusive).
            save_file: Output format, ``"md"`` or ``"html"``.
            min_file_size_kb: Minimum size of a saved file, in KB.
        """
        save_dir = Path(save_dir)
        save_dir.mkdir(parents=True, exist_ok=True)

        # search_articles_content returns a column-less frame when nothing
        # was found; indexing "create_time" on it would raise KeyError.
        if df.empty:
            logger.warning("没有可下载的文章,跳过")
            return

        # assign() builds a new frame, leaving the caller's DataFrame alone.
        df = df.assign(create_time=pd.to_datetime(df["create_time"]))
        if time_range is not None:
            df = df[
                (df["create_time"] >= time_range.begin)
                & (df["create_time"] <= time_range.end)
            ]

        def text_or_empty(value) -> str:
            # Empty CSV cells come back as NaN/NaT; render them as "".
            return "" if pd.isna(value) else str(value)

        total_count = 0
        success_count = 0
        fail_count = 0
        skip_count = 0

        tasks: list[ArticleDownloadTask] = []
        for _, row in df.iterrows():
            total_count += 1
            target_dir = save_dir / sanitize_filename(row["nickname"])
            target = target_dir / f"{sanitize_filename(row['title'])}.{save_file}"
            # Count files that are already on disk as skipped, not as
            # successful downloads.
            if target.exists():
                skip_count += 1
                continue
            tasks.append(
                ArticleDownloadTask(
                    url=row["link"],
                    title=row["title"],
                    save_dir=target_dir,
                    save_file=save_file,
                    max_retries=3,
                    timeout=30,
                    date_str=text_or_empty(row.get("create_time", "")),
                    account_name=text_or_empty(row.get("nickname", "")),
                    digest=text_or_empty(row.get("digest", "")),
                    min_file_size_kb=min_file_size_kb,
                )
            )

        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(TimeRangeSpider.download_article_content, task): task
                for task in tasks
            }

            with tqdm(total=len(futures), desc="下载文章", unit="篇") as pbar:
                for future in as_completed(futures):
                    title = futures[future].title
                    try:
                        if future.result():
                            success_count += 1
                        else:
                            fail_count += 1
                    except Exception as e:
                        fail_count += 1
                        logger.error(f"处理文章时发生异常: {title}, 错误: {e}")
                    pbar.update(1)

        logger.info(
            f"文章下载完成: 成功 {success_count} 篇, 失败 {fail_count} 篇, "
            f"跳过 {skip_count} 篇, 总计 {total_count} 篇"
        )
397
+
398
+
399
def match_remaining_time_range(
    meta_time: TimeRange, need_time: TimeRange
) -> tuple[TimeRange | None, TimeRange]:
    """Compare a cached range with a requested one.

    Args:
        meta_time: Cached time range; it is updated in place and returned.
        need_time: Requested time range.

    Returns:
        ``(remaining_range, meta_time)``, where ``remaining_range`` is:

        * the whole requested range when the two ranges do not overlap, or
          when the request extends past the cache on both sides (the gap
          would be two separate ranges, so everything is requested again);
        * ``meta_time.end .. need_time.end`` when the request only extends
          later than the cache;
        * ``need_time.begin .. meta_time.begin`` when the request only
          extends earlier than the cache;
        * ``None`` when the cache already covers the request.
    """
    disjoint = meta_time.end < need_time.begin or need_time.end < meta_time.begin
    extends_before = need_time.begin < meta_time.begin
    extends_after = meta_time.end < need_time.end

    if disjoint or (extends_before and extends_after):
        remaining_range = TimeRange(begin=need_time.begin, end=need_time.end)
        meta_time.begin = need_time.begin
        meta_time.end = need_time.end
        return remaining_range, meta_time

    if extends_after:
        remaining_range = TimeRange(begin=meta_time.end, end=need_time.end)
        meta_time.end = need_time.end
        return remaining_range, meta_time

    if extends_before:
        remaining_range = TimeRange(begin=need_time.begin, end=meta_time.begin)
        meta_time.begin = need_time.begin
        return remaining_range, meta_time

    # The request lies entirely inside the cached range.
    return None, meta_time
@@ -0,0 +1,20 @@
1
"""Public surface of the ``tools`` sub-package: article and file helpers."""

from .article import generate_yaml_front_matter, save_article_content
from .file import (
    load_html,
    load_json,
    load_text,
    sanitize_filename,
    save_html,
    save_json,
    save_markdown,
    save_text,
)

# Every name imported above is part of the public API, including the JSON and
# file-name helpers that the spider module imports from this package.
__all__ = [
    "generate_yaml_front_matter",
    "save_article_content",
    "save_markdown",
    "load_html",
    "load_json",
    "load_text",
    "sanitize_filename",
    "save_html",
    "save_json",
    "save_text",
]
@@ -0,0 +1,199 @@
1
+ import re
2
+ from pathlib import Path
3
+ from typing import Literal
4
+
5
+ from .file import save_html, save_markdown
6
+
7
+
8
def generate_yaml_front_matter(
    title: str = "",
    date_str: str = "",
    link: str = "",
    account_name: str = "",
    digest: str = "",
) -> str:
    """Build a YAML front-matter block from article metadata.

    Only non-empty values produce a line. Values are written verbatim,
    without quoting or escaping.

    Args:
        title: Article title, written as ``title``.
        date_str: Date string, written as ``date``.
        link: Article link, written as ``link``.
        account_name: Account name, written as ``account``.
        digest: Article digest, written as ``summary``.

    Returns:
        The block, opened and closed by a ``---`` line and ending in a newline.
    """
    entries = (
        ("title", title),
        ("date", date_str),
        ("link", link),
        ("account", account_name),
        ("summary", digest),
    )
    lines = ["---"]
    lines.extend(f"{key}: {value}" for key, value in entries if value)
    lines.append("---")
    return "\n".join(lines) + "\n"
41
+
42
+
43
def save_article_content(
    html: str,
    save_path: Path,
    save_file: Literal["md", "html"] = "md",
    title: str = "",
    date_str: str = "",
    link: str = "",
    account_name: str = "",
    digest: str = "",
    min_file_size_kb: int = 3,
) -> bool:
    """Save an article as Markdown or HTML, rejecting files that are too small.

    For ``"md"`` the content of the ``js_content`` div (or, failing that, the
    ``<body>``, or the whole document) is converted to Markdown and prefixed
    with YAML front matter. For ``"html"`` the page is written unchanged.

    Args:
        html: Article HTML.
        save_path: Target file path; its parent directory is created.
        save_file: Output format, ``"md"`` or ``"html"``.
        title: Article title (front matter).
        date_str: Date string (front matter).
        link: Article link (front matter).
        account_name: Account name (front matter).
        digest: Article digest (front matter).
        min_file_size_kb: Files smaller than this many KB are deleted.

    Returns:
        ``True`` if the file already existed or was written and is large
        enough; ``False`` if it was too small or any error occurred.
    """
    save_path.parent.mkdir(parents=True, exist_ok=True)

    if save_path.exists():
        return True

    try:
        if save_file == "md":
            yaml_front_matter = generate_yaml_front_matter(
                title=title,
                date_str=date_str,
                link=link,
                account_name=account_name,
                digest=digest,
            )

            # NOTE(review): the non-greedy match ends at the first "</div>",
            # so content with nested divs would be cut short - confirm
            # against real article markup.
            content_match = re.search(
                r'<div[^>]*id="js_content"[^>]*>(.*?)</div>', html, re.DOTALL
            )
            if content_match:
                main_content = content_match.group(1)
            else:
                body_match = re.search(r"<body[^>]*>(.*?)</body>", html, re.DOTALL)
                main_content = body_match.group(1) if body_match else html

            markdown_content = yaml_front_matter + html_to_markdown(main_content)
            save_markdown(markdown_content, save_path)
        elif save_file == "html":
            save_html(html, save_path)

        if save_path.stat().st_size < min_file_size_kb * 1024:
            save_path.unlink()
            return False

        return True

    except Exception:
        # Best-effort contract: report failure through the return value and
        # do not leave a partial file behind. Only file-system errors are
        # ignored during cleanup (a bare `except:` would also swallow
        # KeyboardInterrupt and SystemExit).
        try:
            save_path.unlink(missing_ok=True)
        except OSError:
            pass
        return False
118
+
119
+
120
def html_to_markdown(html: str) -> str:
    """Convert an HTML fragment to Markdown with regular expressions.

    Handles style/script removal, images, ``<pre>`` code blocks, inline
    ``<code>``, headings, paragraphs, line breaks, bold text and list items.
    Every other tag is dropped and a small fixed set of entities is decoded.

    The content of ``<pre>`` blocks is converted once and then set aside
    until the end, so the later tag-stripping and whitespace-collapsing
    passes cannot alter it.

    Args:
        html: HTML source.

    Returns:
        The Markdown text, stripped of leading and trailing whitespace.
    """

    def decode_entities(text: str) -> str:
        # "&amp;" is decoded last: decoding it earlier would turn
        # "&amp;quot;" into "&quot;" and then into '"' (decoded twice).
        return (
            text.replace("&nbsp;", " ")
            .replace("&lt;", "<")
            .replace("&gt;", ">")
            .replace("&quot;", '"')
            .replace("&amp;", "&")
        )

    code_blocks: list[str] = []

    def stash_code_block(match):
        code = match.group(1)
        # Unwrap inner <code> tags, keep line breaks, drop other markup.
        code = re.sub(r"<code[^>]*>(.*?)</code>", r"\1", code, flags=re.DOTALL)
        code = re.sub(r"<br\s*/?>", "\n", code)
        code = re.sub(r"<[^>]+>", "", code)
        code_blocks.append(f"```\n{decode_entities(code)}\n```")
        # The placeholder contains no "<", "&" or space, so none of the
        # passes below can change it.
        return f"\n\x00CODE{len(code_blocks) - 1}\x00\n"

    def replace_img(match):
        return f"\n![]({match.group(1)})\n"

    # Remove style and script elements together with their content.
    html = re.sub(r"<style.*?>.*?</style>", "", html, flags=re.DOTALL)
    html = re.sub(r"<script.*?>.*?</script>", "", html, flags=re.DOTALL)

    # Images: prefer data-src, then fall back to src on what is left.
    html = re.sub(r'<img[^>]+data-src="([^"]+)"[^>]*>', replace_img, html)
    html = re.sub(r'<img[^>]+src="([^"]+)"[^>]*>', replace_img, html)

    # Code blocks are replaced by placeholders and restored at the very end.
    html = re.sub(r"<pre[^>]*>(.*?)</pre>", stash_code_block, html, flags=re.DOTALL)

    # Inline code.
    html = re.sub(r"<code[^>]*>(.*?)</code>", r"`\1`", html, flags=re.DOTALL)

    # Drop lines that consist only of leftover HTML attributes.
    html = re.sub(
        r"^\s*(class|data-|style|width|height|type|from|wx_fmt|data-ratio|data-type|data-w|data-imgfileid|data-aistatus|data-s)=[^>]*>\s*$",
        "",
        html,
        flags=re.MULTILINE,
    )

    # Headings, from h6 down to h1.
    for level in range(6, 0, -1):
        html = re.sub(
            f"<h{level}[^>]*>(.*?)</h{level}>", "#" * level + r" \1\n", html
        )

    # Paragraphs and line breaks.
    html = re.sub(r"<p[^>]*>", "\n", html)
    html = re.sub(r"</p>", "\n", html)
    html = re.sub(r"<br\s*/?>", "\n", html)

    # Bold text.
    html = re.sub(r"<(b|strong)[^>]*>(.*?)</\1>", r"**\2**", html)

    # List items.
    html = re.sub(r"<li[^>]*>(.*?)</li>", r"- \1\n", html)

    # Remove every remaining tag.
    html = re.sub(r"<[^>]+>", "", html)

    html = decode_entities(html)

    # Collapse runs of blank lines and of spaces.
    html = re.sub(r"\n{3,}", "\n\n", html)
    html = re.sub(r" +", " ", html)

    # Put the untouched code blocks back.
    html = re.sub(
        r"\x00CODE(\d+)\x00", lambda m: code_blocks[int(m.group(1))], html
    )

    return html.strip()
@@ -0,0 +1,86 @@
1
+ import json
2
+ import re
3
+ from pathlib import Path
4
+ from typing import Union
5
+
6
+
7
def sanitize_filename(filename: str, max_length: int = 200) -> str:
    """Turn arbitrary text into a usable file name.

    Characters that Windows forbids (``\\ / : * ? " < > |``) become ``_``,
    control characters (code points below 32) are removed, leading and
    trailing dots and spaces are stripped, and the result is cut to
    ``max_length`` characters.

    Args:
        filename: Original name.
        max_length: Maximum length of the result.

    Returns:
        The cleaned name, or ``"untitled"`` if nothing is left.
    """
    # One translation table does both jobs: forbidden characters map to "_"
    # and control characters map to None (deleted).
    table = {ord(ch): "_" for ch in '\\/:*?"<>|'}
    table.update({code: None for code in range(32)})

    cleaned = filename.translate(table).strip(". ")[:max_length]
    return cleaned or "untitled"
39
+
40
+
41
+ def load_json(file_path: Union[str, Path]) -> dict:
42
+ file_path = Path(file_path)
43
+ with open(file_path, "r", encoding="utf-8") as f:
44
+ return json.load(f)
45
+
46
+
47
def save_json(data: dict, file_path: Union[str, Path]):
    """Write ``data`` to ``file_path`` as UTF-8 JSON, indented by 4 spaces.

    Non-ASCII characters are written as-is. Missing parent directories are
    created first, so a path such as ``temp/fakeids.json`` works on a first
    run when ``temp/`` does not exist yet.

    Args:
        data: JSON-serialisable object.
        file_path: Target file path.
    """
    file_path = Path(file_path)
    file_path.parent.mkdir(parents=True, exist_ok=True)
    with open(file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
51
+
52
+
53
+ def load_html(file_path: Union[str, Path]) -> str:
54
+ file_path = Path(file_path)
55
+ with open(file_path, "r", encoding="utf-8") as f:
56
+ return f.read()
57
+
58
+
59
+ def save_html(html: str, file_path: Union[str, Path]):
60
+ file_path = Path(file_path)
61
+ with open(file_path, "w", encoding="utf-8") as f:
62
+ f.write(html)
63
+
64
+
65
+ def load_text(file_path: Union[str, Path]) -> str:
66
+ file_path = Path(file_path)
67
+ with open(file_path, "r", encoding="utf-8") as f:
68
+ return f.read()
69
+
70
+
71
+ def save_text(text: str, file_path: Union[str, Path]):
72
+ file_path = Path(file_path)
73
+ with open(file_path, "w", encoding="utf-8") as f:
74
+ f.write(text)
75
+
76
+
77
+ def load_markdown(file_path: Union[str, Path]) -> str:
78
+ file_path = Path(file_path)
79
+ with open(file_path, "r", encoding="utf-8") as f:
80
+ return f.read()
81
+
82
+
83
+ def save_markdown(markdown: str, file_path: Union[str, Path]):
84
+ file_path = Path(file_path)
85
+ with open(file_path, "w", encoding="utf-8") as f:
86
+ f.write(markdown)