cfspider 1.7.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cfspider/stealth.py ADDED
@@ -0,0 +1,537 @@
1
+ """
2
+ CFspider 隐身模式模块
3
+
4
+ 提供完整的反爬虫规避能力,解决以下常见问题:
5
+
6
+ 1. 请求头不完整或不真实
7
+ - 问题:缺少 User-Agent, Accept-Language, Sec-Fetch-* 等头
8
+ - 解决:自动添加 15+ 个真实浏览器请求头
9
+
10
+ 2. 缺乏会话一致性
11
+ - 问题:频繁更换 IP、User-Agent,不处理 Cookie
12
+ - 解决:StealthSession 固定 User-Agent,自动管理 Cookie
13
+
14
+ 3. 行为模式单一
15
+ - 问题:只访问特定 API,没有随机停留等行为
16
+ - 解决:random_delay 随机延迟,auto_referer 自动添加来源
17
+
18
+ 使用方式:
19
+
20
+ 方式一:单次请求启用隐身模式
21
+ >>> response = cfspider.get(url, stealth=True, stealth_browser='chrome')
22
+
23
+ 方式二:使用 StealthSession 保持会话一致性
24
+ >>> with cfspider.StealthSession(browser='chrome', delay=(1, 3)) as session:
25
+ ... response1 = session.get(url1) # 自动添加请求头
26
+ ... response2 = session.get(url2) # 自动添加 Referer = url1
27
+
28
+ 支持的浏览器:
29
+ - chrome: Chrome 131(推荐,15 个请求头)
30
+ - firefox: Firefox 133(12 个请求头,含隐私保护头)
31
+ - safari: Safari 18(5 个请求头,macOS 风格)
32
+ - edge: Edge 131(14 个请求头)
33
+ - chrome_mobile: Chrome Mobile(12 个请求头,Android)
34
+ """
35
+
36
+ import random
37
+ import time
38
+ from typing import Optional, Dict, List, Tuple, Any
39
+ from urllib.parse import urlparse
40
+
41
+
42
# Request-header template imitating desktop Chrome 131 on Windows.
CHROME_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Sec-CH-UA': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    'Sec-CH-UA-Mobile': '?0',
    'Sec-CH-UA-Platform': '"Windows"',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'DNT': '1',
}

# Request-header template imitating desktop Firefox 133 on Windows
# (includes the DNT and Sec-GPC privacy headers).
FIREFOX_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/png,image/svg+xml,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'Connection': 'keep-alive',
    'DNT': '1',
    'Sec-GPC': '1',
}

# Request-header template imitating Safari 18 on macOS.
SAFARI_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.0 Safari/605.1.15',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh-Hans;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
}

# Request-header template imitating desktop Edge 131 on Windows.
EDGE_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Accept-Encoding': 'gzip, deflate, br, zstd',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Sec-CH-UA': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    'Sec-CH-UA-Mobile': '?0',
    'Sec-CH-UA-Platform': '"Windows"',
    'Upgrade-Insecure-Requests': '1',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
}

# Request-header template imitating Chrome 131 on Android (mobile).
CHROME_MOBILE_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 14; Pixel 8 Pro) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Sec-CH-UA': '"Google Chrome";v="131", "Chromium";v="131"',
    'Sec-CH-UA-Mobile': '?1',
    'Sec-CH-UA-Platform': '"Android"',
    'Upgrade-Insecure-Requests': '1',
}

# Lookup table from browser name to its header template.
BROWSER_PROFILES = {
    'chrome': CHROME_HEADERS,
    'firefox': FIREFOX_HEADERS,
    'safari': SAFARI_HEADERS,
    'edge': EDGE_HEADERS,
    'chrome_mobile': CHROME_MOBILE_HEADERS,
}

# Name of the profile used by default.
DEFAULT_BROWSER = 'chrome'
131
+
132
+
133
def get_stealth_headers(browser: str = 'chrome', custom_headers: Dict = None) -> Dict[str, str]:
    """
    Build the request headers for a browser profile.

    Args:
        browser: Profile name (chrome/firefox/safari/edge/chrome_mobile).
            A name that is not in BROWSER_PROFILES falls back to the
            Chrome template.
        custom_headers: Extra headers merged on top of the template;
            entries with the same key replace the template value.

    Returns:
        A new dict, so callers may modify it without affecting the
        shared template.
    """
    template = BROWSER_PROFILES.get(browser, CHROME_HEADERS)
    merged = template.copy()
    if not custom_headers:
        return merged
    merged.update(custom_headers)
    return merged
148
+
149
+
150
def get_random_browser_headers() -> Dict[str, str]:
    """Return the request headers of one randomly chosen browser profile."""
    profile_names = list(BROWSER_PROFILES)
    chosen = random.choice(profile_names)
    return get_stealth_headers(chosen)
154
+
155
+
156
def random_delay(min_sec: float = 0.5, max_sec: float = 2.0) -> float:
    """
    Sleep for a random duration to imitate human pacing.

    Args:
        min_sec: Lower bound of the pause, in seconds.
        max_sec: Upper bound of the pause, in seconds.

    Returns:
        The number of seconds that was slept.
    """
    pause = random.uniform(min_sec, max_sec)
    time.sleep(pause)
    return pause
170
+
171
+
172
def get_referer(current_url: str, previous_url: Optional[str] = None) -> Optional[str]:
    """
    Choose a Referer value for a request.

    Args:
        current_url: URL that is about to be requested.
        previous_url: URL visited just before, if any.

    Returns:
        ``previous_url`` when it is given (non-empty). Otherwise the root
        page of ``current_url`` (``scheme://host/``). ``None`` when no
        previous URL exists and ``current_url`` has no scheme or host, so
        no meaningful Referer can be derived.
    """
    if previous_url:
        return previous_url

    # No navigation history: pretend the visit started from the site's home page.
    parsed = urlparse(current_url)
    if not parsed.scheme or not parsed.netloc:
        # Without both parts the f-string below would yield junk such as ":///".
        return None
    return f"{parsed.scheme}://{parsed.netloc}/"
189
+
190
+
191
def update_sec_fetch_headers(headers: Dict, site_type: str = 'none') -> Dict:
    """
    Return a copy of ``headers`` with the Sec-Fetch-* navigation headers set.

    Args:
        headers: Original request headers; left unmodified.
        site_type: Value for ``Sec-Fetch-Site``
            (none/same-origin/same-site/cross-site).

    Returns:
        A copy of ``headers`` where ``Sec-Fetch-Site`` is ``site_type`` and
        ``Sec-Fetch-Mode``/``Sec-Fetch-Dest`` describe a top-level document
        navigation. Mode and Dest are the same for every ``site_type``.
    """
    updated = headers.copy()
    navigation_fields = (
        ('Sec-Fetch-Site', site_type),
        ('Sec-Fetch-Mode', 'navigate'),
        ('Sec-Fetch-Dest', 'document'),
    )
    for field, value in navigation_fields:
        updated[field] = value
    return updated
219
+
220
+
221
class StealthSession:
    """
    Session that keeps one consistent browser identity across requests.

    Every request sent through the session:

    1. uses the header template (and therefore the User-Agent) selected
       when the session was created;
    2. sends the cookies collected from earlier responses and stores any
       cookies found on the new response;
    3. gets a ``Referer`` set to the previously requested URL when
       ``auto_referer`` is enabled;
    4. is preceded by a random pause when ``delay`` is set (the first
       request of a session is never delayed);
    5. has its ``Sec-Fetch-Site`` header set to ``same-origin`` or
       ``cross-site`` depending on the previous URL.

    Attributes:
        browser (str): Name of the browser profile in use.
        cf_proxies (str): Proxy address forwarded to the request functions.
        cf_workers (bool): Forwarded to the request functions.
        delay (tuple): ``(min, max)`` range in seconds for the random pause.
        auto_referer (bool): Whether a Referer is added automatically.
        token (str): Forwarded unchanged to the request functions.
        last_url (str): URL of the most recent successful request.
        request_count (int): Number of requests completed by this session.

    Example:
        >>> import cfspider
        >>>
        >>> # Basic usage
        >>> with cfspider.StealthSession(browser='chrome') as session:
        ...     # First request: Sec-Fetch-Site: none
        ...     r1 = session.get("https://example.com")
        ...
        ...     # Second request: Referer: https://example.com
        ...     # and Sec-Fetch-Site: same-origin
        ...     r2 = session.get("https://example.com/page2")
        >>>
        >>> # With a random delay of 1-3 seconds before each later request
        >>> with cfspider.StealthSession(delay=(1, 3)) as session:
        ...     for url in urls:
        ...         response = session.get(url)
        >>>
        >>> # Combined with a proxy
        >>> with cfspider.StealthSession(
        ...     cf_proxies="https://your-workers.dev",
        ...     browser='firefox',
        ...     delay=(0.5, 2.0)
        ... ) as session:
        ...     response = session.get("https://example.com")
        ...     print(session.request_count)
        ...     print(session.get_cookies())
    """

    def __init__(
        self,
        browser: str = 'chrome',
        cf_proxies: Optional[str] = None,
        cf_workers: bool = True,
        delay: Optional[Tuple[float, float]] = None,
        auto_referer: bool = True,
        token: Optional[str] = None,
        **kwargs
    ):
        """
        Create a stealth session.

        Args:
            browser (str): Browser profile that provides the User-Agent and
                the header template:
                - 'chrome': Chrome 131 (15 headers)
                - 'firefox': Firefox 133 (12 headers, includes Sec-GPC)
                - 'safari': Safari 18 (5 headers, macOS style)
                - 'edge': Edge 131 (14 headers)
                - 'chrome_mobile': Chrome Mobile on Android (12 headers)
            cf_proxies (str, optional): Proxy address passed to every
                request. Per the original notes: use a Workers address
                together with ``cf_workers=True`` and an ordinary proxy
                together with ``cf_workers=False``.
            cf_workers (bool): Passed to every request (default True).
            delay (tuple, optional): ``(min, max)`` seconds; a random pause
                in that range is inserted before every request except the
                first one.
            auto_referer (bool): When True (default) the previous URL is
                sent as ``Referer``; a Referer can still be supplied
                manually through ``headers=``.
            token (str, optional): Passed unchanged to every request.
            **kwargs: Reserved for future use; stored but not used here.

        Example:
            >>> session = cfspider.StealthSession(
            ...     browser='chrome',
            ...     cf_proxies='https://your-workers.dev',
            ...     delay=(1, 3),
            ...     auto_referer=True
            ... )
        """
        self.browser = browser
        self.cf_proxies = cf_proxies
        self.cf_workers = cf_workers
        self.delay = delay
        self.auto_referer = auto_referer
        self.token = token
        self.last_url = None
        self.request_count = 0
        self._extra_kwargs = kwargs

        # Header template fixed for the lifetime of the session.
        self._base_headers = get_stealth_headers(browser)

        # Cookie store: name -> value.
        self._cookies = {}

    @staticmethod
    def _fetch_site(current_url: str, previous_url: str) -> str:
        """
        Classify the navigation from ``previous_url`` to ``current_url``.

        Returns 'same-origin' when scheme and network location both match
        (compared case-insensitively), otherwise 'cross-site'. 'same-site'
        is never returned: telling sibling subdomains apart from unrelated
        sites needs a public-suffix list, which is not available here.
        """
        current = urlparse(current_url)
        previous = urlparse(previous_url)
        same_origin = (
            current.scheme.lower() == previous.scheme.lower()
            and current.netloc.lower() == previous.netloc.lower()
        )
        return 'same-origin' if same_origin else 'cross-site'

    @staticmethod
    def _default_content_type(kwargs: Dict) -> Optional[str]:
        """
        Pick the default Content-Type for a request carrying a body.

        Returns None when neither ``json`` nor ``data`` was passed,
        'application/json' when a JSON payload is the body (``json`` is set
        and ``data`` is empty), and the form-encoded type otherwise.
        """
        if 'json' not in kwargs and 'data' not in kwargs:
            return None
        if kwargs.get('json') is not None and not kwargs.get('data'):
            return 'application/json'
        return 'application/x-www-form-urlencoded'

    def _prepare_headers(self, url: str, headers: Dict = None) -> Dict:
        """
        Build the headers for one request.

        Starts from the session template, adds Referer and Sec-Fetch-*
        based on the previous URL (when ``auto_referer`` is on and a
        previous URL exists), then applies the caller's ``headers`` last
        so they take precedence.
        """
        final_headers = self._base_headers.copy()

        if self.auto_referer and self.last_url:
            final_headers['Referer'] = self.last_url
            final_headers = update_sec_fetch_headers(
                final_headers, self._fetch_site(url, self.last_url)
            )

        if headers:
            final_headers.update(headers)

        return final_headers

    def _merge_cookies(self, cookies: Optional[Dict] = None) -> Dict:
        """
        Combine stored cookies with cookies passed for a single request.

        A new dict is returned so neither the caller's mapping nor the
        session store is modified. Per-request cookies win on a name clash,
        mirroring how per-request headers override the template.
        """
        merged = self._cookies.copy()
        if cookies:
            merged.update(cookies)
        return merged

    def _apply_delay(self):
        """Pause for a random time in ``self.delay``, except before the first request."""
        if self.delay and self.request_count > 0:
            random_delay(self.delay[0], self.delay[1])

    def _update_cookies(self, response):
        """Copy every cookie found on ``response.cookies`` into the session store."""
        if hasattr(response, 'cookies'):
            for cookie in response.cookies:
                self._cookies[cookie.name] = cookie.value

    def _send(self, sender: Any, url: str, kwargs: Dict,
              content_type: Optional[str] = None) -> Any:
        """
        Shared request pipeline used by all HTTP verb methods.

        Args:
            sender: Callable that performs the request (one of the
                functions imported from ``.api``).
            url: Target URL.
            kwargs: Keyword arguments given to the public method;
                ``headers`` and ``cookies`` are removed from it here.
            content_type: Content-Type to add when the prepared headers do
                not already contain one (checked case-insensitively).

        Returns:
            Whatever ``sender`` returns.
        """
        self._apply_delay()

        headers = self._prepare_headers(url, kwargs.pop('headers', None))
        if content_type is not None and not any(
            str(name).lower() == 'content-type' for name in headers
        ):
            headers['Content-Type'] = content_type

        cookies = self._merge_cookies(kwargs.pop('cookies', None))

        response = sender(
            url,
            cf_proxies=self.cf_proxies,
            cf_workers=self.cf_workers,
            token=self.token,
            headers=headers,
            cookies=cookies,
            **kwargs
        )

        # State advances only after the request returned without raising.
        self._update_cookies(response)
        self.last_url = url
        self.request_count += 1

        return response

    def get(self, url: str, **kwargs) -> Any:
        """
        Send a GET request.

        Args:
            url: Target URL.
            **kwargs: Extra arguments forwarded to the underlying request
                function; ``headers`` and ``cookies`` are merged with the
                session's own values.

        Returns:
            The response object returned by the request function.
        """
        from .api import get as _get

        return self._send(_get, url, kwargs)

    def post(self, url: str, **kwargs) -> Any:
        """
        Send a POST request.

        When ``json`` or ``data`` is passed and no Content-Type header is
        supplied, one is added: 'application/json' for a JSON body,
        'application/x-www-form-urlencoded' otherwise.
        """
        from .api import post as _post

        return self._send(
            _post, url, kwargs,
            content_type=self._default_content_type(kwargs),
        )

    def put(self, url: str, **kwargs) -> Any:
        """Send a PUT request."""
        from .api import put as _put

        return self._send(_put, url, kwargs)

    def delete(self, url: str, **kwargs) -> Any:
        """Send a DELETE request."""
        from .api import delete as _delete

        return self._send(_delete, url, kwargs)

    def head(self, url: str, **kwargs) -> Any:
        """Send a HEAD request."""
        from .api import head as _head

        return self._send(_head, url, kwargs)

    def get_cookies(self) -> Dict[str, str]:
        """Return a copy of all cookies currently stored in the session."""
        return self._cookies.copy()

    def set_cookie(self, name: str, value: str):
        """Store (or overwrite) a single cookie."""
        self._cookies[name] = value

    def clear_cookies(self):
        """Remove every stored cookie."""
        self._cookies.clear()

    def get_headers(self) -> Dict[str, str]:
        """Return a copy of the session's base request headers."""
        return self._base_headers.copy()

    def close(self):
        """Close the session; a no-op because nothing is held open between requests."""
        pass

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
528
+
529
+
530
# Names of all available browser profiles, in registration order.
SUPPORTED_BROWSERS = list(BROWSER_PROFILES)


def get_supported_browsers() -> List[str]:
    """Return a fresh list holding the names of the supported browsers."""
    return list(SUPPORTED_BROWSERS)
537
+