cf-killer 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cf_killer/__init__.py +38 -0
- cf_killer/core.py +901 -0
- cf_killer-0.1.0.dist-info/METADATA +290 -0
- cf_killer-0.1.0.dist-info/RECORD +6 -0
- cf_killer-0.1.0.dist-info/WHEEL +5 -0
- cf_killer-0.1.0.dist-info/top_level.txt +1 -0
cf_killer/__init__.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
cf-killer: Cloudflare 5s 盾自动求解 + 页面批量抓取工具
|
|
4
|
+
|
|
5
|
+
基于 CloakBrowser(Chromium C++ 源码级反检测浏览器)。
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
from cf_killer import fetch_all, fetch_url, CFPageFetcher
|
|
9
|
+
|
|
10
|
+
# 批量抓取
|
|
11
|
+
results = fetch_all(["https://example.com", ...], headless=True)
|
|
12
|
+
|
|
13
|
+
# 单页抓取
|
|
14
|
+
result = fetch_url("https://example.com")
|
|
15
|
+
|
|
16
|
+
# 异步上下文管理器
|
|
17
|
+
async with CFPageFetcher(headless=True) as fetcher:
|
|
18
|
+
results = await fetcher.fetch_batch(urls)
|
|
19
|
+
await fetcher.download_file(pdf_url, "output.pdf")
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from .core import (
|
|
23
|
+
CFSolver,
|
|
24
|
+
CFPageFetcher,
|
|
25
|
+
fetch_all,
|
|
26
|
+
fetch_urls,
|
|
27
|
+
fetch_url,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
"CFSolver",
|
|
32
|
+
"CFPageFetcher",
|
|
33
|
+
"fetch_all",
|
|
34
|
+
"fetch_urls",
|
|
35
|
+
"fetch_url",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
__version__ = "0.1.0"
|
cf_killer/core.py
ADDED
|
@@ -0,0 +1,901 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
Cloudflare 5s 盾自动求解器 — 精简版(CloakBrowser 集成)。
|
|
4
|
+
保留内容:
|
|
5
|
+
- CF challenge 检测 + Turnstile 自动求解(CFSolver)
|
|
6
|
+
- CloakBrowser 持久化浏览器封装
|
|
7
|
+
- 多实例并行抓取 + 上下文自动回收
|
|
8
|
+
|
|
9
|
+
使用前:
|
|
10
|
+
pip install cloakbrowser
|
|
11
|
+
python -c "import cloakbrowser; cloakbrowser.ensure_binary()"
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import asyncio
|
|
15
|
+
import concurrent.futures
|
|
16
|
+
import re
|
|
17
|
+
import tempfile
|
|
18
|
+
import time
|
|
19
|
+
import random
|
|
20
|
+
from random import randint
|
|
21
|
+
from typing import Dict, List, Optional
|
|
22
|
+
|
|
23
|
+
from playwright.async_api import Page
|
|
24
|
+
|
|
25
|
+
# ============================================================
|
|
26
|
+
# CF 解盾常量
|
|
27
|
+
# ============================================================
|
|
28
|
+
# Page titles treated as "this is a challenge / interstitial page" (English).
# NOTE(review): 'DDoS-Guard' is a different vendor's interstitial title, not
# Cloudflare's; it is matched here all the same.
CF_CHALLENGE_TITLES_EN = [
    'Just a moment...',
    'DDoS-Guard',
    'Attention Required! | Cloudflare',
]
# Chinese-locale challenge titles.
CF_CHALLENGE_TITLES_CN = [
    '请稍候…',
    '稍候…',
    '正在检查',
]
# Combined list used by the title-based detection helpers in this module.
CF_CHALLENGE_TITLES = CF_CHALLENGE_TITLES_EN + CF_CHALLENGE_TITLES_CN

# Matches challenge-platform URLs served from challenges.cloudflare.com.
CF_IFRAME_PATTERN = re.compile(
    r"^https?://challenges\.cloudflare\.com/cdn-cgi/challenge-platform/.*"
)

# CSS selectors (evaluated on the main page) for elements that may contain the
# Turnstile widget; used as a click target when the iframe cannot be clicked.
TURNSTILE_CONTAINER_SELECTORS = [
    '#cf_turnstile div', '#cf-turnstile div',
    '.turnstile>div>div', '.main-content p+div>div>div',
    '.cf-turnstile', '#challenge-stage',
]
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ============================================================
|
|
52
|
+
# CF 检测函数
|
|
53
|
+
# ============================================================
|
|
54
|
+
def _detect_cf_challenge(html: str) -> Optional[str]:
    """Classify the Cloudflare challenge embedded in *html*.

    Looks for a ``cType: '<kind>'`` / ``cType: "<kind>"`` marker, testing the
    kinds in the fixed order ``non-interactive``, ``managed``,
    ``interactive``.  When no marker is present but the Turnstile script URL
    appears in the markup, the page is reported as ``'embedded'``.

    Returns:
        The challenge kind, or ``None`` for empty input or when nothing
        challenge-related is found.
    """
    if not html:
        return None

    for kind in ('non-interactive', 'managed', 'interactive'):
        single_quoted = "cType: '" + kind + "'"
        double_quoted = 'cType: "' + kind + '"'
        if single_quoted in html or double_quoted in html:
            return kind

    has_turnstile_script = 'challenges.cloudflare.com/turnstile/v' in html
    return 'embedded' if has_turnstile_script else None
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _cf_title_present(html: str) -> bool:
    """Return True if *html* contains a ``<title>`` equal to a known challenge title.

    The check is a literal substring search for ``<title>{t}</title>`` for
    every entry of ``CF_CHALLENGE_TITLES``.

    Empty or ``None`` input yields ``False`` (previously ``None`` raised
    ``TypeError``), matching the guards in the sibling detection helpers.
    """
    if not html:
        return False
    return any(f'<title>{title}</title>' in html for title in CF_CHALLENGE_TITLES)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _is_cf_page(html: str) -> bool:
    """Heuristically decide whether *html* is a Cloudflare challenge page.

    Only the first 3000 characters are inspected.  The page counts as a
    challenge page when any of the following holds for that window:

    * it carries a known challenge ``<title>``;
    * it references ``challenges.cloudflare.com/turnstile``;
    * it contains both ``cType:`` and ``challenges.cloudflare.com``;
    * it contains one of the challenge marker strings listed below.

    Empty or ``None`` input is never a challenge page.
    """
    if not html:
        return False

    window = html[:3000]
    marker_strings = (
        'cf-challenge-running',
        'challenge-spinner',
        'trk_jschal_js',
        'cf-please-wait',
    )
    return (
        _cf_title_present(window)
        or 'challenges.cloudflare.com/turnstile' in window
        or ("cType:" in window and 'challenges.cloudflare.com' in window)
        or any(marker in window for marker in marker_strings)
    )
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
async def _quick_cf_check(page: Page) -> bool:
    """Report whether *page* currently shows a challenge page.

    The page title is compared (case-insensitively, exact match) against
    ``CF_CHALLENGE_TITLES`` first.  If that does not match, the full page
    content is fetched and checked for a challenge ``<title>`` or for the
    Turnstile URL within its first 3000 characters.

    Any exception raised while querying the page is swallowed and reported
    as ``False``.
    """
    try:
        current_title = await page.title()
        for known in CF_CHALLENGE_TITLES:
            if known.lower() == current_title.lower():
                return True

        # Title did not match: confirm against the markup itself.
        markup = await page.content()
        if _cf_title_present(markup):
            return True
        return 'challenges.cloudflare.com/turnstile' in markup[:3000]
    except Exception:
        return False
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# ============================================================
|
|
107
|
+
# CF 求解器
|
|
108
|
+
# ============================================================
|
|
109
|
+
class CFSolver:
    """Detect and solve Cloudflare challenge pages on a Playwright ``Page``.

    Typical use is ``await CFSolver().solve(page)`` right after
    ``page.goto()``.  The solver classifies the challenge from the page HTML,
    then either waits for it to clear (``non-interactive``) or clicks the
    Turnstile widget (every other type) and polls until the challenge markers
    disappear.

    Args:
        logger: Optional logger-like object.  ``_log`` calls the attribute
            named after the level (``info``, ``debug``, ``warning`` ...) and
            falls back to ``print`` when the object has no such attribute.
        max_retries: Maximum number of solve attempts made by ``_solve_cb``.
        headless: Selects the timing profile; the headless profile uses the
            longer timeouts.
        verbose: When False, ``info`` and ``debug`` messages are dropped.
    """

    # Substring identifying challenge frames by URL.
    _CF_FRAME_HOST = 'challenges.cloudflare.com'

    # Selectors probed inside the challenge iframe to decide it has rendered.
    _READY_SELECTORS = (
        'input[type=checkbox]',
        'div[role=checkbox]',
        '.cb-lb',
        'label',
        '.mark',
        '[class*="checkbox"]',
        '[class*="spinner"]',
    )

    # Selectors tried, in order, as click targets inside the challenge iframe.
    _CLICK_SELECTORS = (
        'input[type=checkbox]',
        'div[role=checkbox]',
        'label.cb-lb',
        '.cb-lb',
        'label',
        '.mark',
        'div.mark',
        '[id*="checkbox"]',
        '[class*="checkbox"]',
    )

    # Offsets (px, relative to the iframe's top-left corner) tried when
    # clicking by coordinates; the checkbox is expected around (25-35, 25-35).
    _CLICK_OFFSETS = ((27, 27), (30, 30), (35, 25), (25, 35))

    def __init__(self, logger=None, max_retries: int = 5,
                 headless: bool = True, verbose: bool = True):
        self.logger = logger
        self.max_retries = max_retries
        self.headless = headless
        self.verbose = verbose

    def _log(self, msg: str, level: str = "info"):
        """Emit *msg* at *level*.

        With ``verbose=False`` only levels other than ``info``/``debug`` are
        emitted.  Without a logger the message is printed with a
        ``[LEVEL]`` prefix.
        """
        if not self.verbose and level in ("info", "debug"):
            return
        if self.logger:
            getattr(self.logger, level, print)(msg)
        else:
            print(f"[{level.upper()}] {msg}")

    @classmethod
    def _cf_frames(cls, page: "Page") -> list:
        """Return the frames of *page* whose URL points at the challenge host."""
        return [f for f in page.frames if cls._CF_FRAME_HOST in (f.url or '')]

    async def solve(self, page: "Page") -> bool:
        """Detect and solve a challenge on *page*; call after ``page.goto()``.

        Returns:
            True when no challenge was detected, or when the page no longer
            looks like a challenge page after the solve attempts; False
            otherwise.
        """
        await asyncio.sleep(0.5)

        html = await self._get_content(page)
        challenge = _detect_cf_challenge(html)

        # No cType marker but the page still looks like a challenge page:
        # fall back to the click-based ("managed") flow.
        if not challenge and _is_cf_page(html):
            self._log("cType 未检测到但页面判定为 CF,兜底按 managed 求解")
            challenge = 'managed'

        if not challenge:
            return True  # nothing to solve

        self._log(f"检测到 CF Challenge: {challenge}")
        await self._solve_cb(page, challenge, retry=0)

        # Give the post-challenge page a chance to settle; a timeout here is
        # not an error.
        try:
            await page.wait_for_load_state('networkidle', timeout=15000)
        except Exception:
            pass

        html = await self._get_content(page)
        return not _is_cf_page(html)

    async def _get_content(self, page: "Page", retries: int = 10,
                           delay: float = 0.5) -> str:
        """Return ``page.content()``, retrying on errors and empty results.

        Up to *retries* attempts are made, sleeping *delay* seconds between
        attempts (not after the last one).  Never raises: when every attempt
        fails, an empty string is returned.
        """
        for attempt in range(retries):
            try:
                content = await page.content()
                if content:
                    return content
            except Exception:
                pass  # typically the page is mid-navigation; retry
            if attempt < retries - 1:
                await asyncio.sleep(delay)
        return ""

    # ---------- helpers: waiting for the challenge to appear / vanish ----------

    async def _find_cf_frame(self, page: "Page", deadline: float) -> bool:
        """Poll until a challenge iframe is visible.

        Args:
            deadline: Absolute ``time.time()`` value at which to give up.

        Returns:
            True if a visible challenge iframe was found before the deadline.
        """
        while time.time() < deadline:
            for frame in self._cf_frames(page):
                try:
                    element = await frame.frame_element()
                    if await element.is_visible():
                        return True
                except Exception:
                    pass  # frame may detach between listing and inspection
            await asyncio.sleep(0.2)
        return False

    async def _wait_for_iframe_ready(self, page: "Page", deadline: float):
        """Wait until the challenge iframe has rendered a clickable element.

        Returns as soon as any of ``_READY_SELECTORS`` is visible in any
        challenge frame, or once *deadline* (absolute ``time.time()`` value)
        has passed.  Each selector is probed independently so that an error
        on one selector does not prevent the remaining ones from being tried.
        """
        while time.time() < deadline:
            for frame in self._cf_frames(page):
                for sel in self._READY_SELECTORS:
                    try:
                        if await frame.locator(sel).first.is_visible(timeout=500):
                            self._log(f"iframe 内容就绪 ({sel})", "debug")
                            return
                    except Exception:
                        continue
            await asyncio.sleep(0.3)
        self._log("iframe 内容就绪等待超时,继续点击", "debug")

    def _is_cf_gone_by_title(self, title: str) -> bool:
        """Return True when *title* equals none of the known challenge titles.

        The comparison is case-insensitive and exact (not a substring test).
        """
        title_lower = title.lower()
        return not any(t.lower() == title_lower for t in CF_CHALLENGE_TITLES)

    async def _wait_for_cf_gone(self, page: "Page", timeout: float,
                                interval: float = 0.25) -> bool:
        """Poll until the challenge appears to be gone.

        On every poll the page title is checked first; if it is still a
        challenge title, the page content is checked with ``_is_cf_page``.
        The method returns True as soon as EITHER check reports that the
        challenge is gone.  Errors while querying the page are ignored and
        polling continues.

        Returns:
            True if the challenge disappeared within *timeout* seconds,
            False on timeout.
        """
        deadline = time.time() + timeout
        while time.time() < deadline:
            try:
                title = await page.title()
                if self._is_cf_gone_by_title(title):
                    return True

                # Title unchanged (possibly a slow network): confirm via HTML.
                html_snippet = await page.content()
                if not _is_cf_page(html_snippet):
                    return True
            except Exception:
                pass
            await asyncio.sleep(interval)
        return False

    # ---------- Turnstile click ----------

    async def _click_turnstile(self, page: "Page"):
        """Click the Turnstile checkbox, trying progressively cruder methods.

        A. click a matching element inside each challenge iframe;
        B. click by coordinates near the iframe's top-left corner;
        C. click by coordinates inside a known container on the main page;
        D. press Tab then Space.

        The method returns after the first path that performs its click.
        """
        for frame in self._cf_frames(page):
            # --- Path A: selector click inside the iframe ---
            for sel in self._CLICK_SELECTORS:
                try:
                    target = frame.locator(sel).first
                    if await target.is_visible(timeout=1000):
                        await target.click(delay=randint(100, 200))
                        self._log(f"iframe 内选择器点击成功: {sel}", "debug")
                        return
                except Exception:
                    continue

            # --- Path B: coordinate click relative to the iframe box ---
            try:
                frame_el = await frame.frame_element()
                box = await frame_el.bounding_box()
                if box and box['width'] > 0 and box['height'] > 0:
                    # Page coordinate = iframe origin + offset inside iframe.
                    for offset_x, offset_y in self._CLICK_OFFSETS:
                        cx = box['x'] + offset_x
                        cy = box['y'] + offset_y
                        await page.mouse.click(cx, cy, delay=randint(80, 150), button='left')
                        await asyncio.sleep(0.3)
                        # A spinner appearing means the click registered.
                        try:
                            spinner = frame.locator('[class*="spinner"], [id*="spinner"]').first
                            if await spinner.is_visible(timeout=500):
                                self._log(f"iframe 坐标点击触发 spinner @ ({cx:.0f},{cy:.0f})", "debug")
                                return
                        except Exception:
                            pass
                    # None of the fixed offsets produced a spinner: one last
                    # click at a slightly randomised position.
                    cx = box['x'] + randint(26, 30)
                    cy = box['y'] + randint(26, 30)
                    await page.mouse.click(cx, cy, delay=randint(100, 200), button='left')
                    self._log(f"iframe 坐标兜底点击 @ ({cx:.0f},{cy:.0f})", "debug")
                    return
            except Exception:
                pass

        # --- Path C: container selectors on the main page ---
        for sel in TURNSTILE_CONTAINER_SELECTORS:
            try:
                container = page.locator(sel).last
                if await container.is_visible(timeout=1000):
                    box = await container.bounding_box()
                    if box and box['width'] > 0:
                        cx = box['x'] + randint(10, 30)
                        cy = box['y'] + randint(10, 30)
                        await page.mouse.click(cx, cy, delay=randint(100, 200), button='left')
                        self._log(f"主页面容器坐标点击 @ ({cx:.0f},{cy:.0f})", "debug")
                        return
            except Exception:
                continue

        # --- Path D: keyboard fallback ---
        try:
            await page.keyboard.press('Tab')
            await asyncio.sleep(0.15)
            await page.keyboard.press('Space')
            self._log("Tab+Space 兜底点击", "debug")
        except Exception:
            pass

    async def _spinner_visible(self, page: "Page") -> bool:
        """Return True if any challenge frame shows a spinner/loading element."""
        for frame in self._cf_frames(page):
            try:
                spinner = frame.locator(
                    '[class*="spinner"], [id*="spinner"], '
                    '[class*="loading"], [class*="verifying"]'
                ).first
                if await spinner.is_visible(timeout=2000):
                    return True
            except Exception:
                pass
        return False

    # ---------- core solve loop ----------

    async def _solve_cb(self, page: "Page", challenge_type: str,
                        retry: int = 0):
        """Try to clear the challenge, making at most ``max_retries`` attempts.

        Args:
            challenge_type: ``'non-interactive'`` only waits for the challenge
                to clear; any other value goes through the wait-for-iframe,
                click, wait-for-clear flow.
            retry: Number of attempts already used; attempts are numbered
                from this value up to ``max_retries - 1``.

        Returns nothing; callers re-inspect the page to learn the outcome.
        """
        # Timing profile: headless runs get the longer timeouts.
        max_polls = 200 if self.headless else 120
        poll_interval = 0.2
        iframe_deadline = 12 if self.headless else 8
        post_click_timeout = 45 if self.headless else 30

        for attempt in range(retry, self.max_retries):
            # ---- non-interactive: just wait ----
            if challenge_type == 'non-interactive':
                self._log("non-interactive 模式:轮询等待 CF 自动消失...")
                cleared = await self._wait_for_cf_gone(
                    page, timeout=max_polls * poll_interval, interval=poll_interval,
                )
                if cleared:
                    return
                self._log(f"non-interactive 超时,进入重试 {attempt + 1}/{self.max_retries}")
                continue

            # ---- managed / interactive / embedded: iframe -> click -> wait ----
            self._log(f"等待 CF iframe 就绪 (deadline={iframe_deadline}s)...")
            if not await self._find_cf_frame(page, time.time() + iframe_deadline):
                self._log("iframe 未在时限内就绪,仍尝试点击", "warning")

            # The checkbox/spinner inside the iframe needs JS initialisation.
            await self._wait_for_iframe_ready(page, deadline=time.time() + 5)

            self._log("执行 Turnstile 点击...")
            await self._click_turnstile(page)

            # Short pause so the challenge script can react to the click.
            await asyncio.sleep(1.0)

            if await self._spinner_visible(page):
                self._log("检测到 spinner — 点击已被 CF 接受,等待验证完成...")
            else:
                self._log("未检测到 spinner,可能点击未生效或 challenge 类型不同")

            self._log(f"轮询等待 CF 消失 (最长 {post_click_timeout}s)...")
            cleared = await self._wait_for_cf_gone(
                page, timeout=post_click_timeout, interval=poll_interval,
            )
            if cleared:
                self._log("CF 已消失!")
                return

            # The page may have navigated away after a successful check.
            # The title is awaited BEFORE the any(): an ``await`` inside a
            # generator expression creates an async generator, which any()
            # cannot iterate (TypeError) - that made this check dead code.
            try:
                current_url = page.url
                title_lower = (await page.title()).lower()
                if not any(t.lower() in title_lower for t in CF_CHALLENGE_TITLES):
                    self._log(f"title 已非 CF,页面可能已跳转 url={current_url[:80]}")
                    return
            except Exception:
                pass

            self._log(f"CF 仍未消失,进入重试 {attempt + 1}/{self.max_retries}")
            html = await self._get_content(page)
            if not _cf_title_present(html):
                # No challenge title in the markup any more: stop retrying.
                return

        self._log(f"CF 求解达到最大重试 {self.max_retries}", "warning")
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
# ============================================================
|
|
419
|
+
# CloakBrowser 页面抓取器(含 CF 自动解盾)
|
|
420
|
+
# ============================================================
|
|
421
|
+
class CFPageFetcher:
    """Fetch pages through a persistent CloakBrowser context, solving CF challenges.

    Usage::

        async with CFPageFetcher(headless=True) as fetcher:
            results = await fetcher.fetch_batch(urls)

    Args:
        headless: Passed to the browser launch and to ``CFSolver``.
        humanize: Passed to the browser launch.
        solve_cf: When True, every fetched page is checked for a challenge
            and ``CFSolver`` is run if one is detected.
        cf_max_retries: ``max_retries`` given to ``CFSolver``.
        timeout: Default navigation/operation timeout set on each page
            (milliseconds).
        profile_dir: Browser profile directory.  When omitted, a directory is
            created with ``tempfile.mkdtemp``; NOTE: this class never removes
            that directory.
        proxy: Passed to the browser launch.
        verbose: Print progress messages.
        max_pages_per_context: After this many fetched pages the browser
            context is closed and relaunched; the recycle is deferred until
            no page is active.
        return_cookies: When True, ``fetch_page`` results carry a
            ``"cookies"`` entry.
    """

    # Runs inside a page: fetches the URL received as argument and returns the
    # response body base64-encoded, or null for a non-OK response.  The URL is
    # an argument (not spliced into the source) so that quotes/backslashes in
    # it cannot alter the script.
    _FETCH_AS_BASE64_JS = """
        async (targetUrl) => {
            const resp = await fetch(targetUrl);
            if (!resp.ok) return null;
            const buf = await resp.arrayBuffer();
            const bytes = new Uint8Array(buf);
            let binary = '';
            for (let i = 0; i < bytes.length; i++)
                binary += String.fromCharCode(bytes[i]);
            return btoa(binary);
        }
    """

    def __init__(
        self,
        headless: bool = True,
        humanize: bool = False,
        solve_cf: bool = True,
        cf_max_retries: int = 5,
        timeout: int = 90000,
        profile_dir: Optional[str] = None,
        proxy: Optional[str] = None,
        verbose: bool = True,
        max_pages_per_context: int = 20,
        return_cookies: bool = False,
    ):
        self.headless = headless
        self.humanize = humanize
        self.solve_cf = solve_cf
        self.cf_max_retries = cf_max_retries
        self.timeout = timeout
        self.profile_dir = profile_dir or tempfile.mkdtemp(prefix="cb_cf_")
        self.proxy = proxy
        self.verbose = verbose
        self.max_pages_per_context = max_pages_per_context
        self.return_cookies = return_cookies
        self._context = None
        self._page_count = 0          # pages fetched since the last (re)launch
        self._context_lock = asyncio.Lock()
        self._active_pages = 0        # pages currently open
        self._pending_recycle = False  # recycle requested while pages were open

    # ---------- context lifecycle ----------

    async def _launch_context(self):
        """Launch the persistent browser context and reset the recycle state."""
        from cloakbrowser import launch_persistent_context_async
        self._context = await launch_persistent_context_async(
            self.profile_dir,
            headless=self.headless,
            proxy=self.proxy,
            viewport={"width": 1920, "height": 1080},
            locale="zh-CN",
            timezone="Asia/Shanghai",
            humanize=self.humanize,
        )
        self._page_count = 0
        self._pending_recycle = False
        if self.verbose:
            print("[上下文] 已创建")

    async def _close_context(self):
        """Close the browser context if one is open; close errors are ignored."""
        if self._context:
            ctx = self._context
            # Clear the reference first so a failing close cannot leave a
            # half-closed context behind.
            self._context = None
            try:
                await ctx.close()
            except Exception:
                pass

    async def _do_recycle(self):
        """Close and relaunch the context (caller must hold ``_context_lock``)."""
        if self.verbose:
            print(f"[上下文] 已达 {self._page_count} 页,回收重建...")
        await self._close_context()
        await self._launch_context()

    async def _ensure_context(self):
        """Make sure a context exists, recycling it when the page budget is used up.

        A due recycle is performed immediately only when no page is active;
        otherwise it is flagged and carried out later by ``_maybe_recycle``.
        """
        async with self._context_lock:
            if self._context is None:
                await self._launch_context()
            elif self._page_count >= self.max_pages_per_context:
                if self._active_pages == 0:
                    await self._do_recycle()
                else:
                    self._pending_recycle = True
                    if self.verbose:
                        print(f"[上下文] 回收延后({self._active_pages} 个活跃页面使用中)")

    async def _maybe_recycle(self):
        """Run a deferred recycle once no page is active (called after a page closes)."""
        async with self._context_lock:
            if self._pending_recycle and self._active_pages == 0:
                await self._do_recycle()

    async def __aenter__(self):
        await self._launch_context()
        return self

    async def __aexit__(self, *args):
        await self._close_context()

    # ---------- fetching ----------

    async def _get_cookies(self, url: Optional[str] = None) -> dict:
        """Return the context's cookies (optionally for *url*) in three shapes.

        Returns:
            ``{"dict": {name: value}, "raw": <list from the context>,
            "header": "name=value; ..."}``.
        """
        raw = await self._context.cookies(url)
        return {
            "dict": {c["name"]: c["value"] for c in raw},  # e.g. for requests.Session
            "raw": raw,  # full records (domain/path/expires...)
            "header": "; ".join(f"{c['name']}={c['value']}" for c in raw),  # Cookie header value
        }

    async def download_file(self, url: str, output_path: str,
                            warmup_url: Optional[str] = None) -> bool:
        """Download a binary file (PDF, image, ...) and save it to *output_path*.

        The warm-up URL is first fetched with ``fetch_page`` (which handles a
        challenge if present).  A page is then opened on the warm-up URL and
        the file is requested from inside that page with ``fetch()``, so the
        request is made by the browser itself.
        NOTE(review): this presumably requires *url* to be same-origin with
        (or CORS-readable from) the warm-up page - confirm for your targets.

        Args:
            url: URL of the file to download.
            output_path: Local path the bytes are written to.
            warmup_url: Page to open first; defaults to the root
                (``scheme://netloc/``) of *url*.

        Returns:
            True when the file was written; False when the warm-up failed,
            the in-page fetch returned a non-OK response, or any error
            occurred.
        """
        import base64
        from urllib.parse import urlparse

        if warmup_url is None:
            parsed = urlparse(url)
            warmup_url = f"{parsed.scheme}://{parsed.netloc}/"

        if self.verbose:
            print(f"[下载] 预热: {warmup_url}")
        warmup = await self.fetch_page(warmup_url, wait_until="load")
        if not warmup["success"]:
            print("[下载] 预热失败")
            return False

        await self._ensure_context()
        self._active_pages += 1
        page = None
        try:
            page = await self._context.new_page()
            await page.goto(warmup_url, wait_until="load", timeout=30000)

            encoded = await page.evaluate(self._FETCH_AS_BASE64_JS, url)

            # ``None`` means the response was not OK; an empty string is a
            # legitimate zero-byte file.
            if encoded is None:
                print("[下载] fetch 失败")
                return False

            content = base64.b64decode(encoded)
            with open(output_path, "wb") as f:
                f.write(content)
            if self.verbose:
                print(f"[下载] 已保存: {output_path} ({len(content)/1024:.0f}KB)")
            return True
        except Exception as e:
            print(f"[下载] 失败: {e}")
            return False
        finally:
            self._active_pages -= 1
            if page:
                try:
                    await page.close()
                except Exception:
                    pass
            await self._maybe_recycle()

    async def _detect_cf_with_retry(self, page: "Page", max_wait: float = 5.0) -> bool:
        """Repeatedly check *page* for a challenge for up to *max_wait* seconds.

        Some sites set the challenge title from JavaScript after load, so a
        single check can miss it.  The pause between checks grows by 30 %
        each round, capped at 1.5 s.

        Returns:
            True as soon as a challenge is detected, False after *max_wait*.
        """
        deadline = time.time() + max_wait
        interval = 0.3
        while time.time() < deadline:
            if await _quick_cf_check(page):
                return True
            # Not detected yet: let page scripts run, then look again.
            try:
                await page.wait_for_load_state('load', timeout=2000)
            except Exception:
                pass
            await asyncio.sleep(interval)
            interval = min(interval * 1.3, 1.5)
        return False

    async def fetch_page(
        self,
        url: str,
        wait_until: str = 'domcontentloaded',
        wait_selector: Optional[str] = None,
        wait_network_idle: bool = False,
        wait_timeout: int = 15000,
        scroll_times: int = 0,
        final_delay: float = 2.0,
    ) -> Dict:
        """Fetch one page, handling a challenge when ``solve_cf`` is enabled.

        Args:
            url: Page to open.
            wait_until: ``wait_until`` value passed to ``page.goto``.
            wait_selector: Optional selector to wait for (state ``attached``).
            wait_network_idle: Also wait for the ``networkidle`` load state.
            wait_timeout: Timeout (ms) for the two optional waits above.
            scroll_times: Number of scroll-to-bottom steps (0.5 s apart).
            final_delay: Seconds to sleep before reading title and HTML.

        Returns:
            ``{"url", "title", "html", "success"}`` plus ``"cookies"`` when
            ``return_cookies`` is set.  ``success`` is False when an
            exception occurred (including failure to open a page); in that
            case ``title``/``html`` hold whatever could still be read, or
            None.  This method does not raise for page-level errors.
        """
        await self._ensure_context()
        self._active_pages += 1
        page = None
        try:
            # Opening the page inside the try keeps ``_active_pages`` balanced
            # (via ``finally``) even when the context cannot open a page.
            page = await self._context.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)

            try:
                await page.goto(url, wait_until=wait_until)
            except Exception:
                # A goto timeout is not fatal: the page may be sitting on a
                # challenge that never reaches the requested load state.
                if self.verbose:
                    print(f"goto 超时 url={url},检查页面状态...")
                try:
                    await page.wait_for_load_state('load', timeout=5000)
                except Exception:
                    pass
                await asyncio.sleep(1.0)

            # ---- challenge detection (retried for late JS-set titles) ----
            if self.solve_cf:
                cf_detected = await self._detect_cf_with_retry(page)

                if cf_detected:
                    if self.verbose:
                        print(f"检测到 CF 盾 url={url}")
                    solver = CFSolver(
                        max_retries=self.cf_max_retries,
                        headless=self.headless,
                        verbose=self.verbose,
                    )
                    cf_ok = await solver.solve(page)
                    if self.verbose and not cf_ok:
                        print(f"CF 未过 url={url}")
                elif self.verbose:
                    print(f"非 CF url={url}")

            for _ in range(scroll_times):
                try:
                    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                    await asyncio.sleep(0.5)
                except Exception:
                    pass

            if wait_selector:
                try:
                    await page.wait_for_selector(wait_selector, timeout=wait_timeout, state='attached')
                except Exception:
                    pass

            if wait_network_idle:
                try:
                    await page.wait_for_load_state('networkidle', timeout=wait_timeout)
                except Exception:
                    pass

            await asyncio.sleep(final_delay)

            title = await page.title()
            html = await page.content()
            result = {"url": url, "title": title, "html": html, "success": True}

            if self.return_cookies:
                result["cookies"] = await self._get_cookies(url)

            self._page_count += 1
            return result

        except Exception as e:
            # Record whatever can still be read; the page may be closed or
            # may never have been opened.
            title = None
            html = None
            if page is not None:
                try:
                    title = await page.title()
                except Exception:
                    title = None
                try:
                    html = await page.content()
                except Exception:
                    html = None
            result = {"url": url, "title": title, "html": html, "success": False}

            if self.return_cookies:
                try:
                    result["cookies"] = await self._get_cookies(url)
                except Exception:
                    result["cookies"] = {}

            print(f"抓取失败 url={url}: {e}")
            self._page_count += 1
            return result
        finally:
            self._active_pages -= 1
            if page is not None:
                try:
                    await page.close()
                except Exception:
                    pass
            await self._maybe_recycle()

    async def fetch_batch(
        self,
        urls: List[str],
        concurrency: int = 3,
        **kwargs,
    ) -> List[Dict]:
        """Fetch many URLs: the first one serially, the rest concurrently.

        The first URL is fetched on its own so that a challenge is handled
        once before parallel tabs are opened.

        Args:
            urls: URLs to fetch.
            concurrency: Maximum number of simultaneous ``fetch_page`` calls
                for the remaining URLs; values below 1 are treated as 1.
            **kwargs: Forwarded to ``fetch_page``.

        Returns:
            One result dict per URL, in the same order as *urls*.
        """
        if not urls:
            return []

        results = [None] * len(urls)

        # ---- phase 1: serial warm-up ----
        if self.verbose:
            print(f"[预热] {urls[0][:80]}")
        results[0] = await self.fetch_page(urls[0], **kwargs)

        if len(urls) == 1:
            return results

        # ---- phase 2: concurrent fetch ----
        # Semaphore(0) would block forever and a negative value raises.
        concurrency = max(1, concurrency)
        sem = asyncio.Semaphore(concurrency)

        async def _fetch_one(i: int, url: str):
            async with sem:
                results[i] = await self.fetch_page(url, **kwargs)

        tasks = [_fetch_one(i, url) for i, url in enumerate(urls[1:], start=1)]
        if self.verbose:
            print(f"[并发] {len(tasks)} 个,并发 {concurrency}")
        await asyncio.gather(*tasks)

        return results
|
|
764
|
+
|
|
765
|
+
|
|
766
|
+
# ============================================================
|
|
767
|
+
# 同步入口
|
|
768
|
+
# ============================================================
|
|
769
|
+
def _run_in_thread(urls_chunk: List[str], index: int, concurrency: int = 3, **fetcher_kwargs) -> List[Dict]:
    """Fetch *urls_chunk* with a dedicated event loop and browser instance.

    Meant to be the body of a worker thread: a fresh ``CFPageFetcher`` is
    built from *fetcher_kwargs*, used for one ``fetch_batch`` call and closed
    again, all inside its own ``asyncio.run``.

    Args:
        urls_chunk: URLs handled by this worker.
        index: Worker index; accepted but not used in this function.
        concurrency: Forwarded to ``fetch_batch``.
        **fetcher_kwargs: Forwarded to the ``CFPageFetcher`` constructor.

    Returns:
        The list returned by ``fetch_batch``.
    """
    async def _fetch_chunk() -> List[Dict]:
        fetcher = CFPageFetcher(**fetcher_kwargs)
        async with fetcher as active:
            batch = await active.fetch_batch(urls_chunk, concurrency=concurrency)
        return batch

    return asyncio.run(_fetch_chunk())
|
|
775
|
+
|
|
776
|
+
|
|
777
|
+
def fetch_all(
    urls: List[str],
    instances: int = 1,
    concurrency: int = 3,
    max_pages_per_context: int = 20,
    verbose: bool = True,
    return_cookies: bool = False,
    **fetcher_kwargs,
) -> List[Dict]:
    """
    Fetch all URLs, optionally in parallel across several browser instances.

    Args:
        urls: List of URLs to fetch.
        instances: Number of parallel browser instances (default 1). Values
            of 1 or less run everything in the calling thread.
        concurrency: Concurrent tabs per instance (default 3).
        max_pages_per_context: Forwarded to CFPageFetcher (recycle the
            context after N pages).
        verbose: Whether to print log output.
        return_cookies: Whether results should include cookies (default False).
        **fetcher_kwargs: Extra arguments forwarded to CFPageFetcher
            (headless, solve_cf, cf_max_retries, timeout, humanize, ...).
            proxy: accepted in three forms:
                - str: shared by every instance (not recommended; a warning
                  is printed when ``instances > 1`` and ``verbose`` is set)
                - list: one entry per instance, cycled when there are fewer
                  proxies than instances; an empty list means "no proxy"
                - callable: called once per instance, returns a proxy string

    Returns:
        List of results in the same order as ``urls``. An empty ``urls``
        yields an empty list without starting any browser instance.

    Usage:
        # A different proxy per instance
        results = fetch_all(urls, instances=3,
                            proxy=["http://p1", "http://p2", "http://p3"])
        # Or a callable
        results = fetch_all(urls, instances=3, proxy=get_next_proxy)
    """
    # Nothing to fetch: avoid launching a browser just to return nothing.
    if not urls:
        return []

    # ---- proxy assignment ----
    raw_proxy = fetcher_kwargs.pop("proxy", None)

    def _resolve_proxy(idx: int):
        """Return the proxy (str or None) for the instance at position idx."""
        if raw_proxy is None:
            return None
        if isinstance(raw_proxy, list):
            # An empty list would make the modulo below divide by zero;
            # treat it the same as "no proxy configured".
            if not raw_proxy:
                return None
            return raw_proxy[idx % len(raw_proxy)]
        if callable(raw_proxy):
            return raw_proxy()
        # Single string: shared proxy, warn once (on the first instance only).
        if idx == 0 and instances > 1 and verbose:
            print("[警告] 多实例共享同一代理,可能触发服务端限流!"
                  " 建议传入 proxy=[] 列表,每个实例不同代理")
        return raw_proxy

    # Keyword arguments shared by every instance; only the proxy differs.
    common_kwargs = dict(
        max_pages_per_context=max_pages_per_context,
        verbose=verbose,
        return_cookies=return_cookies,
        **fetcher_kwargs,
    )

    if instances <= 1:
        return _run_in_thread(urls, 0, concurrency=concurrency,
                              proxy=_resolve_proxy(0),
                              **common_kwargs)

    # Distribute URLs round-robin: URL i goes to instance i % instances.
    chunks = [[] for _ in range(instances)]
    for i, url in enumerate(urls):
        chunks[i % instances].append(url)

    if verbose:
        proxy_info = ""
        if isinstance(raw_proxy, list):
            proxy_info = f",代理数={len(raw_proxy)}"
        elif callable(raw_proxy):
            proxy_info = ",代理=callable"
        elif raw_proxy:
            proxy_info = ",代理=共享(⚠️)"
        print(f"[多实例] {instances} 个实例并行,共 {len(urls)} 个 URL{proxy_info}")

    all_results = [None] * len(urls)
    with concurrent.futures.ThreadPoolExecutor(max_workers=instances) as pool:
        futures = {}
        for idx, chunk in enumerate(chunks):
            # More instances than URLs leaves some chunks empty; skip them.
            if not chunk:
                continue
            fut = pool.submit(
                _run_in_thread, chunk, idx, concurrency,
                proxy=_resolve_proxy(idx),
                **common_kwargs,
            )
            futures[fut] = idx

        for fut in concurrent.futures.as_completed(futures):
            idx = futures[fut]
            chunk_results = fut.result()
            for j, result in enumerate(chunk_results):
                # Invert the round-robin split: the j-th URL of chunk idx
                # was originally at position idx + j * instances.
                original_index = idx + j * instances
                all_results[original_index] = result

    return all_results
|
|
877
|
+
|
|
878
|
+
|
|
879
|
+
def fetch_urls(
    urls: List[str],
    instances: int = 1,
    concurrency: int = 3,
    max_pages_per_context: int = 20,
    verbose: bool = True,
    return_cookies: bool = False,
    **fetcher_kwargs,
) -> List[Dict]:
    """Synchronous batch fetch; an alias that forwards everything to fetch_all."""
    # Collect the explicit options next to the pass-through ones so a single
    # keyword expansion carries the full configuration to fetch_all.
    forwarded = dict(fetcher_kwargs)
    forwarded.update(
        instances=instances,
        concurrency=concurrency,
        max_pages_per_context=max_pages_per_context,
        verbose=verbose,
        return_cookies=return_cookies,
    )
    return fetch_all(urls, **forwarded)
|
|
893
|
+
|
|
894
|
+
|
|
895
|
+
def fetch_url(
    url: str,
    verbose: bool = True,
    **fetcher_kwargs,
) -> Dict:
    """Synchronously fetch a single URL and return its result dict.

    Wraps ``url`` in a one-element list, runs ``fetch_all`` with a single
    instance, and unwraps the only entry of the returned list.
    """
    single_batch = [url]
    results = fetch_all(single_batch, instances=1, verbose=verbose, **fetcher_kwargs)
    return results[0]
|
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cf-killer
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Cloudflare 5s 盾自动求解 + 页面批量抓取工具(基于 CloakBrowser)
|
|
5
|
+
Author: cf-killer contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/CloakHQ/CloakBrowser
|
|
8
|
+
Project-URL: Repository, https://github.com/CloakHQ/CloakBrowser
|
|
9
|
+
Keywords: cloudflare,anti-bot,web-scraping,cloakbrowser,turnstile-solver,playwright
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.9
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
Requires-Dist: cloakbrowser>=0.3.0
|
|
24
|
+
Requires-Dist: playwright>=1.40
|
|
25
|
+
|
|
26
|
+
# CF Killer
|
|
27
|
+
|
|
28
|
+
基于 [CloakBrowser](https://github.com/CloakHQ/CloakBrowser)(Chromium C++ 源码级反检测浏览器)的 **Cloudflare 5 秒盾自动求解 + 页面批量抓取** 工具。
|
|
29
|
+
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## 1. 运行环境
|
|
33
|
+
|
|
34
|
+
| 项目 | 说明 |
|
|
35
|
+
|------|------|
|
|
36
|
+
| OS | Windows 10+ / Linux / macOS |
|
|
37
|
+
| Python | 3.9+(推荐 3.11) |
|
|
38
|
+
| 浏览器 | CloakBrowser 专用 Chromium(自动下载,~200MB) |
|
|
39
|
+
|
|
40
|
+
## 2. 依赖安装
|
|
41
|
+
|
|
42
|
+
```bash
|
|
43
|
+
# 1. 安装 cloakbrowser(含 Playwright)
|
|
44
|
+
pip install cloakbrowser
|
|
45
|
+
|
|
46
|
+
# 2. 下载特制 Chromium 二进制(首次运行前执行一次)
|
|
47
|
+
python -c "import cloakbrowser; cloakbrowser.ensure_binary()"
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
核心依赖链:
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
cloakbrowser (C++ 源码级反检测 Chromium)
|
|
54
|
+
├── playwright >= 1.40 # 浏览器自动化
|
|
55
|
+
├── httpx >= 0.24 # HTTP 客户端
|
|
56
|
+
└── greenlet >= 3.1.1 # 协程支持
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## 3. 功能概述
|
|
62
|
+
|
|
63
|
+
### 3.1 Cloudflare 自动解盾 (`CFSolver`)
|
|
64
|
+
|
|
65
|
+
自动检测并求解 Cloudflare Turnstile 挑战,支持多种 challenge 类型:
|
|
66
|
+
|
|
67
|
+
| 类型 | 策略 |
|
|
68
|
+
|------|------|
|
|
69
|
+
| `non-interactive` | 纯轮询等待 CF 自动放行 |
|
|
70
|
+
| `managed` | 等待 iframe → 点击 checkbox → 轮询消失 |
|
|
71
|
+
| `interactive` | 同上,带更复杂的点击路径 |
|
|
72
|
+
| `embedded` | 嵌入式 Turnstile 求解 |
|
|
73
|
+
|
|
74
|
+
点击采用四路径递进策略:iframe 内精确选择器 → iframe 坐标点击 → 主页面容器坐标 → Tab+Space 兜底。
|
|
75
|
+
|
|
76
|
+
### 3.2 页面批量抓取 (`CFPageFetcher`)
|
|
77
|
+
|
|
78
|
+
- 基于 CloakBrowser 持久化上下文,复用浏览器指纹和 cookie
|
|
79
|
+
- 内置 CF 检测(支持 JS 延迟写入标题的站点,如 ScienceDirect)
|
|
80
|
+
- 自动 context 回收:处理 N 页后重建浏览器上下文,防止内存泄漏
|
|
81
|
+
- **延迟回收机制**:并发场景下等活跃页面全部完成后再回收,避免竞态崩溃
|
|
82
|
+
- 支持代理(单个代理字符串 / 代理列表 / callable 代理工厂 三种形式)
|
|
83
|
+
|
|
84
|
+
### 3.3 文件下载 (`download_file`)
|
|
85
|
+
|
|
86
|
+
过 CF 后,通过页内 `fetch()` 直接下载二进制文件(PDF、图片等),复用浏览器 cookie 和 TLS 指纹,绕过反爬限制。
|
|
87
|
+
|
|
88
|
+
### 3.4 多实例并行 (`fetch_all`)
|
|
89
|
+
|
|
90
|
+
将 URL 均匀分配到多个浏览器实例,每个实例独立 event loop + 独立代理,ThreadPoolExecutor 并行执行,最大化吞吐量。
|
|
91
|
+
|
|
92
|
+
### 3.5 主入口函数
|
|
93
|
+
|
|
94
|
+
| 函数 | 用途 |
|
|
95
|
+
|------|------|
|
|
96
|
+
| `fetch_url(url, ...)` | 同步抓取单个 URL |
|
|
97
|
+
| `fetch_urls(urls, ...)` | 同步批量抓取(`fetch_all` 别名) |
|
|
98
|
+
| `fetch_all(urls, ...)` | 多实例并行抓取,支持分片代理 |
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## 4. 测试案例
|
|
103
|
+
|
|
104
|
+
### 案例 A:批量页面抓取
|
|
105
|
+
|
|
106
|
+
测试 31 个混合 URL(Gut 医学期刊 + American Football Wiki + ScienceDirect),验证 CF 解盾和页面抓取能力。
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
# -*- coding: utf-8 -*-
|
|
110
|
+
"""CF 自动解盾 + 页面抓取 — 测试脚本"""
|
|
111
|
+
import os
|
|
112
|
+
import sys
|
|
113
|
+
|
|
114
|
+
if sys.platform == "win32":
|
|
115
|
+
import io
|
|
116
|
+
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
|
|
117
|
+
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace")
|
|
118
|
+
|
|
119
|
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
120
|
+
from cf_killer import fetch_all
|
|
121
|
+
|
|
122
|
+
HEADLESS = True
|
|
123
|
+
PROXY = None
|
|
124
|
+
CONCURRENCY = 3
|
|
125
|
+
INSTANCES = 2
|
|
126
|
+
MAX_PAGES_PER_CONTEXT = 10
|
|
127
|
+
RETURN_COOKIES = False
|
|
128
|
+
|
|
129
|
+
URLS = [
|
|
130
|
+
"https://gut.bmj.com/content/75/6/1085",
|
|
131
|
+
"https://gut.bmj.com/content/75/6/1087",
|
|
132
|
+
"https://gut.bmj.com/content/75/6/1090",
|
|
133
|
+
"https://gut.bmj.com/content/75/6/1092",
|
|
134
|
+
"https://gut.bmj.com/content/75/6/1094",
|
|
135
|
+
"https://gut.bmj.com/content/75/6/1097",
|
|
136
|
+
"https://gut.bmj.com/content/75/6/1110",
|
|
137
|
+
"https://gut.bmj.com/content/75/6/1123",
|
|
138
|
+
"https://gut.bmj.com/content/75/6/1136",
|
|
139
|
+
"https://gut.bmj.com/content/75/6/1147",
|
|
140
|
+
"https://gut.bmj.com/content/75/6/1160",
|
|
141
|
+
"https://gut.bmj.com/content/75/6/1169",
|
|
142
|
+
"https://gut.bmj.com/content/75/6/1186",
|
|
143
|
+
"https://gut.bmj.com/content/75/6/1201",
|
|
144
|
+
"https://gut.bmj.com/content/75/6/1211",
|
|
145
|
+
"https://gut.bmj.com/content/75/6/1226",
|
|
146
|
+
"https://gut.bmj.com/content/75/6/1237",
|
|
147
|
+
"https://gut.bmj.com/content/75/6/1248",
|
|
148
|
+
"https://gut.bmj.com/content/75/6/1264",
|
|
149
|
+
"https://gut.bmj.com/content/75/6/1266.1",
|
|
150
|
+
"https://gut.bmj.com/content/75/6/1266.2",
|
|
151
|
+
"https://gut.bmj.com/content/75/6/1267",
|
|
152
|
+
"https://gut.bmj.com/content/75/6/1109",
|
|
153
|
+
"http://americanfootball.fandom.com/1993_Kentucky_vs._Mississippi",
|
|
154
|
+
"http://americanfootball.fandom.com/Isaiah_Foskey",
|
|
155
|
+
"http://americanfootball.fandom.com/wiki/2014_Susquehanna_Crusaders",
|
|
156
|
+
"http://americanfootball.fandom.com/wiki/2015_Lake_Forest_Foresters",
|
|
157
|
+
"http://americanfootball.fandom.com/wiki/2023_Colorado_State_Rams",
|
|
158
|
+
"http://americanfootballdatabase.fandom.com/Paul_Hackett_(American_football)",
|
|
159
|
+
"http://americanfootballdatabase.fandom.com/wiki/100th_Grey_Cup",
|
|
160
|
+
"https://www.sciencedirect.com/science/article/pii/S0039606025002491",
|
|
161
|
+
]
|
|
162
|
+
|
|
163
|
+
if __name__ == "__main__":
|
|
164
|
+
print(f"测试: {len(URLS)} 个 URL")
|
|
165
|
+
|
|
166
|
+
results = fetch_all(
|
|
167
|
+
URLS,
|
|
168
|
+
instances=INSTANCES,
|
|
169
|
+
concurrency=CONCURRENCY,
|
|
170
|
+
max_pages_per_context=MAX_PAGES_PER_CONTEXT,
|
|
171
|
+
headless=HEADLESS,
|
|
172
|
+
solve_cf=True,
|
|
173
|
+
proxy=PROXY,
|
|
174
|
+
return_cookies=RETURN_COOKIES,
|
|
175
|
+
verbose=False,
|
|
176
|
+
)
|
|
177
|
+
|
|
178
|
+
ok = sum(1 for r in results if r["success"])
|
|
179
|
+
print(f"\n{'='*50}")
|
|
180
|
+
for r in results:
|
|
181
|
+
status = "✓" if r["success"] else "✗"
|
|
182
|
+
print(f" {status} {(r['title'] or 'FAILED')[:60]}")
|
|
183
|
+
print(f"{'='*50}")
|
|
184
|
+
print(f"结果: {ok}/{len(results)} 成功")
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
运行:`python test.py`
|
|
188
|
+
|
|
189
|
+
预期输出:
|
|
190
|
+
|
|
191
|
+
```
|
|
192
|
+
测试: 31 个 URL
|
|
193
|
+
|
|
194
|
+
==================================================
|
|
195
|
+
✓ Gut-peritoneal-multisystem axis in endometriosis | Gut
|
|
196
|
+
✓ Hitting the mitotic spot of fibrolamellar carcinoma | Gut
|
|
197
|
+
...
|
|
198
|
+
✓ 100th Grey Cup | American Football Database | Fandom
|
|
199
|
+
✓ Guidelines for perioperative care in elective colorectal sur
|
|
200
|
+
==================================================
|
|
201
|
+
结果: 31/31 成功
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
---
|
|
205
|
+
|
|
206
|
+
### 案例 B:PDF 文件下载
|
|
207
|
+
|
|
208
|
+
通过过 CF 后的浏览器页面发起 `fetch()` 下载 PDF,复用 TLS 指纹和 cookie。
|
|
209
|
+
|
|
210
|
+
```python
|
|
211
|
+
# -*- coding: utf-8 -*-
|
|
212
|
+
"""测试 download_file 方法 — PDF 下载"""
|
|
213
|
+
import asyncio
|
|
214
|
+
import os
|
|
215
|
+
import sys
|
|
216
|
+
|
|
217
|
+
if sys.platform == "win32":
|
|
218
|
+
import io
|
|
219
|
+
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding="utf-8", errors="replace")
|
|
220
|
+
sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding="utf-8", errors="replace")
|
|
221
|
+
|
|
222
|
+
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
|
|
223
|
+
from cf_killer import CFPageFetcher
|
|
224
|
+
|
|
225
|
+
PDF_URL = "https://www.myavls.org/assets/pdf/SuperficialVenousDiseaseGuidelinesPMS313-02.03.16.pdf"
|
|
226
|
+
OUTPUT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "SuperficialVenousDiseaseGuidelines.pdf")
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
async def main():
|
|
230
|
+
print(f"目标: {PDF_URL}")
|
|
231
|
+
print(f"保存: {OUTPUT}")
|
|
232
|
+
|
|
233
|
+
async with CFPageFetcher(
|
|
234
|
+
headless=True,
|
|
235
|
+
verbose=True,
|
|
236
|
+
solve_cf=True,
|
|
237
|
+
) as fetcher:
|
|
238
|
+
ok = await fetcher.download_file(PDF_URL, OUTPUT)
|
|
239
|
+
if ok:
|
|
240
|
+
size_kb = os.path.getsize(OUTPUT) / 1024
|
|
241
|
+
print(f"\n✅ 下载成功! 文件: {OUTPUT} ({size_kb:.0f} KB)")
|
|
242
|
+
else:
|
|
243
|
+
print(f"\n❌ 下载失败")
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
if __name__ == "__main__":
|
|
247
|
+
asyncio.run(main())
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
运行:`python test_download.py`
|
|
251
|
+
|
|
252
|
+
预期输出:
|
|
253
|
+
|
|
254
|
+
```
|
|
255
|
+
目标: https://www.myavls.org/assets/pdf/SuperficialVenousDiseaseGuidelinesPMS313-02.03.16.pdf
|
|
256
|
+
保存: ...\SuperficialVenousDiseaseGuidelines.pdf
|
|
257
|
+
[上下文] 已创建
|
|
258
|
+
[下载] 预热: https://www.myavls.org/
|
|
259
|
+
非 CF url=https://www.myavls.org/
|
|
260
|
+
[下载] 已保存: ...\SuperficialVenousDiseaseGuidelines.pdf (121KB)
|
|
261
|
+
|
|
262
|
+
✅ 下载成功! ... (121 KB)
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
---
|
|
266
|
+
|
|
267
|
+
## 5. 主要 API 参数
|
|
268
|
+
|
|
269
|
+
### `CFPageFetcher`
|
|
270
|
+
|
|
271
|
+
| 参数 | 类型 | 默认值 | 说明 |
|
|
272
|
+
|------|------|--------|------|
|
|
273
|
+
| `headless` | bool | True | 无头模式 |
|
|
274
|
+
| `humanize` | bool | False | 人类化鼠标轨迹/键盘时序 |
|
|
275
|
+
| `solve_cf` | bool | True | 自动求解 CF 挑战 |
|
|
276
|
+
| `cf_max_retries` | int | 5 | CF 求解最大重试次数 |
|
|
277
|
+
| `timeout` | int | 90000 | 页面导航超时 (ms) |
|
|
278
|
+
| `proxy` | str | None | 代理 URL |
|
|
279
|
+
| `max_pages_per_context` | int | 20 | 每 N 页回收浏览器上下文 |
|
|
280
|
+
| `return_cookies` | bool | False | 结果中是否包含 cookies |
|
|
281
|
+
|
|
282
|
+
### `fetch_all`
|
|
283
|
+
|
|
284
|
+
| 参数 | 类型 | 默认值 | 说明 |
|
|
285
|
+
|------|------|--------|------|
|
|
286
|
+
| `urls` | list | - | URL 列表 |
|
|
287
|
+
| `instances` | int | 1 | 并行浏览器实例数 |
|
|
288
|
+
| `concurrency` | int | 3 | 每实例并发 tab 数 |
|
|
289
|
+
| `max_pages_per_context` | int | 20 | 每 N 页自动回收 |
|
|
290
|
+
| `proxy` | str/list/callable | None | 单代理/代理列表/代理工厂函数 |
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
cf_killer/__init__.py,sha256=0aFOmuZMW7KyoWqDY-4cMabvimDIIrpKjazu0t2BRCA,821
|
|
2
|
+
cf_killer/core.py,sha256=ebcCQ21uHZIhQyWagqcrKydNDJSpyODnyOpe8WBilAU,34863
|
|
3
|
+
cf_killer-0.1.0.dist-info/METADATA,sha256=jnpZBkWFrkKI1Un1lQ5JpliDrdcGQu7tqpUUhE-Kjpg,10100
|
|
4
|
+
cf_killer-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
5
|
+
cf_killer-0.1.0.dist-info/top_level.txt,sha256=5uL1uAXfDxgHyngBtOhrXdBdQlYpyILreuQaZ0iFv1o,10
|
|
6
|
+
cf_killer-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
cf_killer
|