cfspider-1.7.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cfspider/__init__.py +230 -0
- cfspider/api.py +937 -0
- cfspider/async_api.py +418 -0
- cfspider/async_session.py +281 -0
- cfspider/browser.py +335 -0
- cfspider/cli.py +81 -0
- cfspider/impersonate.py +388 -0
- cfspider/ip_map.py +522 -0
- cfspider/mirror.py +682 -0
- cfspider/session.py +239 -0
- cfspider/stealth.py +537 -0
- cfspider/vless_client.py +572 -0
- cfspider-1.7.4.dist-info/METADATA +1390 -0
- cfspider-1.7.4.dist-info/RECORD +18 -0
- cfspider-1.7.4.dist-info/WHEEL +5 -0
- cfspider-1.7.4.dist-info/entry_points.txt +2 -0
- cfspider-1.7.4.dist-info/licenses/LICENSE +201 -0
- cfspider-1.7.4.dist-info/top_level.txt +1 -0
cfspider/mirror.py
ADDED
@@ -0,0 +1,682 @@
"""
CFspider web page mirroring module.

Saves a live web page to local disk in full, including:
- the HTML page (asset links are automatically rewritten to relative paths)
- CSS stylesheets (including @import and url() references)
- JavaScript files
- images (PNG, JPG, WebP, SVG, etc.)
- font files (WOFF, WOFF2, TTF, etc.)
- other assets (favicon, video, audio, etc.)

Features:
- Browser rendering: captures content generated dynamically by JavaScript
- Concurrent downloads: assets are fetched by multiple threads for speed
- Stealth mode: full browser request headers are applied automatically to avoid CDN blocking
- Auto preview: the mirrored page is opened in a browser once the download finishes

Usage:
    >>> import cfspider
    >>>
    >>> # Basic usage
    >>> result = cfspider.mirror("https://example.com")
    >>> print(result.index_file)  # ./mirror/index.html
    >>>
    >>> # Custom save directory
    >>> result = cfspider.mirror(
    ...     "https://example.com",
    ...     save_dir="./my_backup",
    ...     open_browser=False
    ... )
    >>>
    >>> # Through a VLESS proxy
    >>> result = cfspider.mirror(
    ...     "https://example.com",
    ...     cf_proxies="vless://uuid@host:443?path=/"
    ... )

Directory layout:
    save_dir/
    ├── index.html        # main page
    └── assets/
        ├── css/          # CSS files
        ├── js/           # JavaScript files
        ├── images/       # image files
        ├── fonts/        # font files
        ├── media/        # video/audio files
        └── other/        # everything else
"""

import os
import re
import hashlib
import webbrowser
from pathlib import Path
from urllib.parse import urljoin, urlparse, unquote
from dataclasses import dataclass, field
from typing import Optional, Dict, Set, List
from concurrent.futures import ThreadPoolExecutor, as_completed

try:
    from bs4 import BeautifulSoup
    BS4_AVAILABLE = True
except ImportError:
    BS4_AVAILABLE = False


@dataclass
class MirrorResult:
    """
    Result of a web page mirroring run.

    Holds all output information for a mirror operation.

    Attributes:
        index_file (str): Full path of the main HTML file,
            e.g. "/home/user/mirror/index.html".
        assets_dir (str): Full path of the assets directory,
            e.g. "/home/user/mirror/assets".
        total_files (int): Total number of files downloaded (including index.html).
        total_size (int): Combined size of all files, in bytes;
            divide by 1024 to get KB.
        failed_urls (List[str]): URLs that failed to download,
            formatted as ["url: error_message", ...].
        success (bool): Whether the mirror succeeded. True means the main
            page was saved (individual asset failures do not change this).

    Example:
        >>> result = cfspider.mirror("https://example.com")
        >>> if result.success:
        ...     print(f"Saved to: {result.index_file}")
        ...     print(f"Files: {result.total_files}")
        ...     print(f"Size: {result.total_size / 1024:.2f} KB")
        ... else:
        ...     print(f"Failed: {result.failed_urls}")
    """
    index_file: str = ""    # path of the main HTML file
    assets_dir: str = ""    # path of the assets directory
    total_files: int = 0    # total number of downloaded files
    total_size: int = 0     # total size in bytes
    failed_urls: List[str] = field(default_factory=list)  # URLs that failed to download
    success: bool = True    # whether the mirror succeeded


class WebMirror:
    """
    Web page mirrorer.

    Downloads a page and all of its assets, rewriting links to local
    relative paths.

    Workflow:
    1. Render the page in a Playwright browser (to capture JS-generated content)
    2. Parse the HTML and extract all asset URLs
    3. Download the assets concurrently
    4. Process extra assets referenced from CSS files (@import, url())
    5. Rewrite every asset link to a relative path
    6. Save the final HTML file

    Attributes:
        ASSET_TYPES (dict): Asset type mapping used to sort downloaded files.

    Example:
        >>> mirrorer = WebMirror(max_workers=20)
        >>> result = mirrorer.mirror("https://example.com", save_dir="./backup")

    Note:
        The cfspider.mirror() function is the more convenient entry point;
        it creates a WebMirror instance for you.
    """

    # Asset type mapping, used to sort assets into separate directories
    ASSET_TYPES = {
        'css': ['css'],
        'js': ['js', 'mjs'],
        'images': ['png', 'jpg', 'jpeg', 'gif', 'webp', 'svg', 'ico', 'bmp', 'avif'],
        'fonts': ['woff', 'woff2', 'ttf', 'otf', 'eot'],
        'media': ['mp4', 'webm', 'mp3', 'ogg', 'wav'],
        'other': []
    }

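    # Illustrative sketch (not part of the package source): because
    # _get_asset_type() reads self.ASSET_TYPES, a subclass can extend the
    # mapping, e.g. to give PDFs their own directory:
    #
    #   class DocMirror(WebMirror):
    #       ASSET_TYPES = {**WebMirror.ASSET_TYPES, 'docs': ['pdf']}
    #
    # Files with a .pdf extension would then be saved under assets/docs/.
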
    def __init__(self, cf_proxies=None, vless_uuid=None, timeout=30, max_workers=10):
        """
        Initialize the mirrorer.

        Args:
            cf_proxies (str, optional): Proxy address in one of these formats:
                - VLESS link: "vless://uuid@host:port?path=/..."
                - HTTP proxy: "http://ip:port"
                - SOCKS5 proxy: "socks5://ip:port"
                - omitted: direct requests (no proxy)
                Note: browser rendering uses the VLESS proxy, while asset
                downloads use a direct connection.
            vless_uuid (str, optional): VLESS UUID. Only needed when
                cf_proxies is a bare domain rather than a full link.
            timeout (int): Request timeout in seconds, default 30.
                Applies to both browser rendering and asset downloads.
            max_workers (int): Number of concurrent download threads, default 10.
                Higher values speed up downloads but may trigger rate limits
                on the target site.

        Example:
            >>> # No proxy
            >>> mirrorer = WebMirror()
            >>>
            >>> # VLESS proxy
            >>> mirrorer = WebMirror(cf_proxies="vless://uuid@host:443?path=/")
            >>>
            >>> # Higher concurrency
            >>> mirrorer = WebMirror(max_workers=20, timeout=60)
        """
        self.cf_proxies = cf_proxies
        self.vless_uuid = vless_uuid
        self.timeout = timeout
        self.max_workers = max_workers
        self._browser = None
        self._downloaded: Dict[str, str] = {}  # URL -> local path mapping
        self._failed: Set[str] = set()

    def _get_browser(self):
        """Get (lazily creating) the browser instance."""
        if self._browser is None:
            from .browser import Browser
            self._browser = Browser(
                cf_proxies=self.cf_proxies,
                headless=True,
                timeout=self.timeout,
                vless_uuid=self.vless_uuid
            )
        return self._browser

    def _close_browser(self):
        """Close the browser instance."""
        if self._browser:
            try:
                self._browser.close()
            except Exception:
                pass
            self._browser = None

    def _get_asset_type(self, url: str) -> str:
        """Classify an asset by the file extension in its URL path."""
        parsed = urlparse(url)
        path = parsed.path.lower()
        ext = path.rsplit('.', 1)[-1] if '.' in path else ''

        for asset_type, extensions in self.ASSET_TYPES.items():
            if ext in extensions:
                return asset_type
        return 'other'

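    # Illustrative behavior (not part of the package source; "site.test" is a
    # placeholder host). Classification keys off urlparse(url).path, so query
    # strings are ignored:
    #
    #   >>> from cfspider.mirror import WebMirror
    #   >>> m = WebMirror()
    #   >>> m._get_asset_type("https://site.test/app.js?v=1.2.3")
    #   'js'
    #   >>> m._get_asset_type("https://site.test/font.woff2")
    #   'fonts'
    #   >>> m._get_asset_type("https://site.test/api/data")  # no known extension
    #   'other'
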
    def _generate_local_path(self, url: str, base_url: str, assets_dir: Path) -> str:
        """Generate the local file path for an asset URL."""
        parsed = urlparse(url)
        path = unquote(parsed.path)

        # No path, or the root path: fall back to a hash-based name
        if not path or path == '/':
            ext = '.html'
            filename = hashlib.md5(url.encode()).hexdigest()[:12] + ext
        else:
            # Extract the file name
            filename = path.rsplit('/', 1)[-1]
            if not filename or '.' not in filename:
                ext = self._guess_extension(url)
                filename = hashlib.md5(url.encode()).hexdigest()[:12] + ext

        # Pick the asset type directory
        asset_type = self._get_asset_type(url)

        # Build a filesystem-safe file name
        safe_filename = re.sub(r'[<>:"/\\|?*]', '_', filename)
        if len(safe_filename) > 100:
            ext = safe_filename.rsplit('.', 1)[-1] if '.' in safe_filename else ''
            safe_filename = hashlib.md5(filename.encode()).hexdigest()[:12]
            if ext:
                safe_filename += '.' + ext

        return str(assets_dir / asset_type / safe_filename)

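    # Illustrative result on POSIX (not part of the package source;
    # "site.test" is a placeholder host):
    #
    #   >>> from pathlib import Path
    #   >>> from cfspider.mirror import WebMirror
    #   >>> m = WebMirror()
    #   >>> m._generate_local_path("https://site.test/css/main.css",
    #   ...                        "https://site.test/", Path("assets"))
    #   'assets/css/main.css'
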
    def _guess_extension(self, url: str) -> str:
        """Guess a file extension from the URL (a coarse substring heuristic)."""
        url_lower = url.lower()
        if 'css' in url_lower:
            return '.css'
        elif 'js' in url_lower or 'javascript' in url_lower:
            return '.js'
        elif any(ext in url_lower for ext in ['png', 'jpg', 'jpeg', 'gif', 'webp', 'svg']):
            for ext in ['png', 'jpg', 'jpeg', 'gif', 'webp', 'svg']:
                if ext in url_lower:
                    return '.' + ext
        return '.bin'

    def _extract_urls_from_html(self, html: str, base_url: str) -> Set[str]:
        """Extract asset URLs from an HTML document."""
        if not BS4_AVAILABLE:
            raise ImportError("beautifulsoup4 is required: pip install beautifulsoup4")

        urls = set()
        soup = BeautifulSoup(html, 'html.parser')
        base_domain = urlparse(base_url).netloc

        # Extract the various asset kinds
        # CSS links
        for link in soup.find_all('link', rel='stylesheet'):
            href = link.get('href')
            if href:
                urls.add(urljoin(base_url, href))

        # Other link tags (favicon, etc.); rel is a list, so skip exactly ['stylesheet']
        for link in soup.find_all('link'):
            href = link.get('href')
            if href and link.get('rel') not in [['stylesheet']]:
                full_url = urljoin(base_url, href)
                urls.add(full_url)

        # JavaScript
        for script in soup.find_all('script', src=True):
            src = script.get('src')
            if src:
                urls.add(urljoin(base_url, src))

        # Images
        for img in soup.find_all('img', src=True):
            src = img.get('src')
            if src and not src.startswith('data:'):
                urls.add(urljoin(base_url, src))
            # srcset (guard against empty entries from trailing commas)
            srcset = img.get('srcset')
            if srcset:
                for item in srcset.split(','):
                    parts = item.strip().split()
                    if not parts:
                        continue
                    url = parts[0]
                    if not url.startswith('data:'):
                        urls.add(urljoin(base_url, url))

        # Background images and other style attributes
        for elem in soup.find_all(style=True):
            style = elem.get('style')
            css_urls = self._extract_urls_from_css(style, base_url)
            urls.update(css_urls)

        # style tags
        for style_tag in soup.find_all('style'):
            if style_tag.string:
                css_urls = self._extract_urls_from_css(style_tag.string, base_url)
                urls.update(css_urls)

        # video/audio
        for media in soup.find_all(['video', 'audio']):
            src = media.get('src')
            if src:
                urls.add(urljoin(base_url, src))
            poster = media.get('poster')
            if poster:
                urls.add(urljoin(base_url, poster))
            for source in media.find_all('source'):
                src = source.get('src')
                if src:
                    urls.add(urljoin(base_url, src))

        # Keep only same-origin assets
        filtered_urls = set()
        for url in urls:
            parsed = urlparse(url)
            if parsed.netloc == base_domain or not parsed.netloc:
                filtered_urls.add(url)

        return filtered_urls

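    # Illustrative behavior (not part of the package source; requires
    # beautifulsoup4, and "site.test"/"cdn.other" are placeholder hosts).
    # Cross-origin assets are filtered out:
    #
    #   >>> from cfspider.mirror import WebMirror
    #   >>> html = '<img src="/a.png"><script src="https://cdn.other/b.js"></script>'
    #   >>> WebMirror()._extract_urls_from_html(html, "https://site.test/")
    #   {'https://site.test/a.png'}
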
    def _extract_urls_from_css(self, css_content: str, base_url: str) -> Set[str]:
        """Extract url() references from CSS."""
        urls = set()
        pattern = r'url\(["\']?([^"\')\s]+)["\']?\)'
        matches = re.findall(pattern, css_content)

        for match in matches:
            if not match.startswith('data:'):
                urls.add(urljoin(base_url, match))

        return urls

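    # Illustrative behavior (not part of the package source; "site.test" is a
    # placeholder host). Relative references are resolved against base_url,
    # and data: URIs are skipped:
    #
    #   >>> from cfspider.mirror import WebMirror
    #   >>> css = 'body { background: url("../img/bg.png"); }'
    #   >>> WebMirror()._extract_urls_from_css(css, "https://site.test/css/")
    #   {'https://site.test/img/bg.png'}
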
    def _download_resource(self, url: str, local_path: str, referer: str = None) -> tuple:
        """Download a single asset (in stealth mode, to dodge anti-bot checks)."""
        try:
            from . import get

            # Stealth mode adds a full set of browser request headers;
            # a caller-supplied Referer overrides the default
            extra_headers = {}
            if referer:
                extra_headers['Referer'] = referer

            # VLESS links only support browser mode, not plain HTTP requests,
            # so asset downloads fall back to a direct connection
            proxies_for_download = self.cf_proxies
            if self.cf_proxies and str(self.cf_proxies).lower().startswith('vless://'):
                proxies_for_download = None

            response = get(
                url,
                cf_proxies=proxies_for_download,
                timeout=self.timeout,
                headers=extra_headers,
                stealth=True,  # enable stealth mode: full browser request headers
                stealth_browser='chrome'
            )

            if response.status_code == 200:
                content = response.content

                # Detect an error page served in place of the expected asset
                content_type = response.headers.get('content-type', '').lower()
                expected_type = self._get_asset_type(url)

                # Expecting JS/CSS but got HTML: probably an error page
                if expected_type in ['js', 'css'] and 'text/html' in content_type:
                    # Check for an nginx default page or similar error page
                    content_str = content.decode('utf-8', errors='ignore')[:500]
                    if 'nginx' in content_str.lower() or '<!doctype html>' in content_str.lower():
                        return (url, None, 0, "got an error page (possibly CDN protection)")

                # Create the target directory
                os.makedirs(os.path.dirname(local_path), exist_ok=True)

                # Write the file
                with open(local_path, 'wb') as f:
                    f.write(content)

                return (url, local_path, len(content), None)
            else:
                return (url, None, 0, f"HTTP {response.status_code}")
        except Exception as e:
            return (url, None, 0, str(e))

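    # Return contract, for reference (a summary, not part of the package
    # source): the method always yields a 4-tuple and never raises, which is
    # what lets the ThreadPoolExecutor loop in mirror() consume results
    # uniformly:
    #
    #   success -> (url, local_path, size_in_bytes, None)
    #   failure -> (url, None, 0, "error message")
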
    def _rewrite_html(self, html: str, base_url: str, url_mapping: Dict[str, str], save_dir: Path) -> str:
        """Rewrite asset links inside the HTML."""
        if not BS4_AVAILABLE:
            return html

        soup = BeautifulSoup(html, 'html.parser')

        def get_relative_path(local_path: str) -> str:
            """Path relative to index.html"""
            try:
                rel_path = os.path.relpath(local_path, save_dir)
                return rel_path.replace('\\', '/')
            except Exception:
                return local_path

        def replace_url(url: str) -> Optional[str]:
            """Swap a URL for its local path, if it was downloaded"""
            full_url = urljoin(base_url, url)
            if full_url in url_mapping:
                return get_relative_path(url_mapping[full_url])
            return None

        # Rewrite link href
        for link in soup.find_all('link', href=True):
            new_path = replace_url(link['href'])
            if new_path:
                link['href'] = new_path

        # Rewrite script src
        for script in soup.find_all('script', src=True):
            new_path = replace_url(script['src'])
            if new_path:
                script['src'] = new_path

        # Rewrite img src
        for img in soup.find_all('img', src=True):
            new_path = replace_url(img['src'])
            if new_path:
                img['src'] = new_path

        # Rewrite video/audio
        for media in soup.find_all(['video', 'audio']):
            if media.get('src'):
                new_path = replace_url(media['src'])
                if new_path:
                    media['src'] = new_path
            if media.get('poster'):
                new_path = replace_url(media['poster'])
                if new_path:
                    media['poster'] = new_path

        # Rewrite url() inside style tags
        for style_tag in soup.find_all('style'):
            if style_tag.string:
                new_css = self._rewrite_css(style_tag.string, base_url, url_mapping, save_dir)
                style_tag.string = new_css

        # Rewrite url() inside style attributes
        for elem in soup.find_all(style=True):
            style = elem.get('style')
            new_style = self._rewrite_css(style, base_url, url_mapping, save_dir)
            elem['style'] = new_style

        return str(soup)

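    # Illustrative round trip on POSIX (not part of the package source;
    # requires beautifulsoup4, "site.test" and /tmp/m are placeholders).
    # Only URLs present in url_mapping are rewritten:
    #
    #   >>> from pathlib import Path
    #   >>> from cfspider.mirror import WebMirror
    #   >>> mapping = {"https://site.test/app.js": "/tmp/m/assets/js/app.js"}
    #   >>> WebMirror()._rewrite_html('<script src="/app.js"></script>',
    #   ...                           "https://site.test/", mapping, Path("/tmp/m"))
    #   '<script src="assets/js/app.js"></script>'
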
    def _rewrite_css(self, css_content: str, base_url: str, url_mapping: Dict[str, str], save_dir: Path) -> str:
        """Rewrite url() references inside CSS."""
        def replace_url(match):
            url = match.group(1).strip('"\'')
            full_url = urljoin(base_url, url)
            if full_url in url_mapping:
                local_path = url_mapping[full_url]
                try:
                    rel_path = os.path.relpath(local_path, save_dir)
                    rel_path = rel_path.replace('\\', '/')
                    return f'url("{rel_path}")'
                except Exception:
                    pass
            return match.group(0)

        pattern = r'url\(["\']?([^"\')\s]+)["\']?\)'
        return re.sub(pattern, replace_url, css_content)

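    # Illustrative behavior on POSIX (not part of the package source;
    # "site.test" and /tmp/m are placeholders). The emitted path is relative
    # to the save_dir argument:
    #
    #   >>> from pathlib import Path
    #   >>> from cfspider.mirror import WebMirror
    #   >>> mapping = {"https://site.test/bg.png": "/tmp/m/assets/images/bg.png"}
    #   >>> WebMirror()._rewrite_css('a { background: url(/bg.png); }',
    #   ...                          "https://site.test/", mapping, Path("/tmp/m"))
    #   'a { background: url("assets/images/bg.png"); }'
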
    def _process_css_file(self, css_path: str, css_url: str, base_url: str,
                          assets_dir: Path, url_mapping: Dict[str, str]) -> List[str]:
        """Scan a downloaded CSS file for assets it references."""
        new_urls = []

        try:
            with open(css_path, 'r', encoding='utf-8', errors='ignore') as f:
                css_content = f.read()

            # Extract URLs from the CSS, resolved against the CSS file's own URL
            css_base = css_url.rsplit('/', 1)[0] + '/'
            urls_in_css = self._extract_urls_from_css(css_content, css_base)

            for url in urls_in_css:
                if url not in url_mapping and url not in self._failed:
                    new_urls.append(url)
        except Exception:
            pass

        return new_urls

    def mirror(self, url: str, save_dir: str = "./mirror", open_browser: bool = True) -> MirrorResult:
        """
        Mirror a web page to local disk.

        Args:
            url: Target page URL
            save_dir: Directory to save into
            open_browser: Whether to open a browser preview automatically

        Returns:
            MirrorResult: the mirror result
        """
        if not BS4_AVAILABLE:
            raise ImportError("beautifulsoup4 is required: pip install beautifulsoup4")

        result = MirrorResult()
        save_path = Path(save_dir).resolve()
        assets_path = save_path / "assets"

        try:
            # Create the directory layout
            save_path.mkdir(parents=True, exist_ok=True)
            assets_path.mkdir(exist_ok=True)
            for asset_type in self.ASSET_TYPES.keys():
                (assets_path / asset_type).mkdir(exist_ok=True)

            # Render the page in a browser
            print(f"[Mirror] Rendering page: {url}")
            browser = self._get_browser()
            html = browser.html(url)

            # Extract asset URLs
            print("[Mirror] Extracting asset links...")
            resource_urls = self._extract_urls_from_html(html, url)
            print(f"[Mirror] Found {len(resource_urls)} assets")

            # Download assets concurrently
            url_mapping: Dict[str, str] = {}
            total_size = 0

            if resource_urls:
                print("[Mirror] Downloading assets...")
                with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                    futures = {}
                    for res_url in resource_urls:
                        local_path = self._generate_local_path(res_url, url, assets_path)
                        futures[executor.submit(self._download_resource, res_url, local_path, url)] = res_url

                    completed = 0
                    for future in as_completed(futures):
                        res_url, local_path, size, error = future.result()
                        completed += 1

                        if local_path:
                            url_mapping[res_url] = local_path
                            total_size += size
                            self._downloaded[res_url] = local_path
                        else:
                            self._failed.add(res_url)
                            result.failed_urls.append(f"{res_url}: {error}")

                        # Progress display
                        if completed % 10 == 0 or completed == len(futures):
                            print(f"[Mirror] Download progress: {completed}/{len(futures)}")

            # Process extra assets referenced from CSS files
            css_files = [(path, u) for u, path in url_mapping.items()
                         if path.endswith('.css')]

            additional_urls = set()
            for css_path, css_url in css_files:
                new_urls = self._process_css_file(css_path, css_url, url, assets_path, url_mapping)
                additional_urls.update(new_urls)

            # Download the extra assets found in CSS
            if additional_urls:
                print(f"[Mirror] Found {len(additional_urls)} extra assets in CSS")
                with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
                    futures = {}
                    for res_url in additional_urls:
                        local_path = self._generate_local_path(res_url, url, assets_path)
                        futures[executor.submit(self._download_resource, res_url, local_path, url)] = res_url

                    for future in as_completed(futures):
                        res_url, local_path, size, error = future.result()
                        if local_path:
                            url_mapping[res_url] = local_path
                            total_size += size

            # Rewrite links in the HTML
            print("[Mirror] Rewriting asset links...")
            rewritten_html = self._rewrite_html(html, url, url_mapping, save_path)

            # Rewrite links inside the CSS files; url() in a stylesheet resolves
            # relative to the stylesheet itself, so use the CSS file's directory
            for css_path, css_url in css_files:
                try:
                    with open(css_path, 'r', encoding='utf-8', errors='ignore') as f:
                        css_content = f.read()

                    css_base = css_url.rsplit('/', 1)[0] + '/'
                    new_css = self._rewrite_css(css_content, css_base, url_mapping,
                                                Path(css_path).parent)

                    with open(css_path, 'w', encoding='utf-8') as f:
                        f.write(new_css)
                except Exception:
                    pass

            # Save the HTML
            index_file = save_path / "index.html"
            with open(index_file, 'w', encoding='utf-8') as f:
                f.write(rewritten_html)

            # Fill in the result
            result.index_file = str(index_file)
            result.assets_dir = str(assets_path)
            result.total_files = len(url_mapping) + 1  # +1 for index.html
            result.total_size = total_size + len(rewritten_html.encode('utf-8'))
            result.success = True

            print("[Mirror] Mirror complete!")
            print(f"[Mirror] Saved to: {index_file}")
            print(f"[Mirror] Total files: {result.total_files}")
            print(f"[Mirror] Total size: {result.total_size / 1024:.2f} KB")
            if result.failed_urls:
                print(f"[Mirror] Failed assets: {len(result.failed_urls)}")

            # Open a browser preview (as_uri() builds a valid file:// URL on
            # all platforms)
            if open_browser:
                print("[Mirror] Opening browser preview...")
                webbrowser.open(index_file.as_uri())

        except Exception as e:
            result.success = False
            result.failed_urls.append(str(e))
            print(f"[Mirror] Error: {e}")
        finally:
            self._close_browser()

        return result


def mirror(url: str, save_dir: str = "./mirror", open_browser: bool = True,
           cf_proxies: str = None, vless_uuid: str = None,
           timeout: int = 30, max_workers: int = 10) -> MirrorResult:
    """
    Mirror a web page to local disk.

    Crawls a page and all of its assets (CSS, JS, images, fonts, etc.),
    saves them locally, and opens a browser preview automatically.

    Args:
        url: Target page URL
        save_dir: Directory to save into, default "./mirror"
        open_browser: Whether to open a browser preview automatically, default True
        cf_proxies: Proxy address; VLESS link / HTTP / SOCKS5 supported
        vless_uuid: VLESS UUID (only needed with a bare domain)
        timeout: Request timeout in seconds, default 30
        max_workers: Number of concurrent download threads, default 10

    Returns:
        MirrorResult: the mirror result, including save paths, file counts, etc.

    Examples:
        >>> import cfspider
        >>>
        >>> # Basic usage
        >>> result = cfspider.mirror("https://example.com")
        >>> print(result.index_file)  # path of the saved HTML
        >>>
        >>> # Custom save directory
        >>> result = cfspider.mirror(
        ...     "https://example.com",
        ...     save_dir="./my_mirror",
        ...     open_browser=False
        ... )
        >>>
        >>> # Through a VLESS proxy
        >>> result = cfspider.mirror(
        ...     "https://example.com",
        ...     cf_proxies="vless://uuid@host:443?path=/"
        ... )
    """
    mirrorer = WebMirror(
        cf_proxies=cf_proxies,
        vless_uuid=vless_uuid,
        timeout=timeout,
        max_workers=max_workers
    )
    return mirrorer.mirror(url, save_dir, open_browser)
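
# Serving a finished mirror, for reference (an illustrative note, not part of
# the package source). Pages that fetch assets via JavaScript can be blocked
# by file:// restrictions, so a local HTTP server gives a more faithful
# preview; "./mirror" is the default save_dir:
#
#   python -m http.server 8000 --directory ./mirror
#   # then browse to http://localhost:8000/index.html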