crawlo-1.3.3-py3-none-any.whl → crawlo-1.3.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic. Click here for more details.

Files changed (289)
  1. crawlo/__init__.py +87 -63
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +341 -323
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +46 -2
  16. crawlo/core/engine.py +439 -365
  17. crawlo/core/processor.py +40 -40
  18. crawlo/core/scheduler.py +257 -256
  19. crawlo/crawler.py +639 -1167
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +273 -273
  23. crawlo/downloader/aiohttp_downloader.py +228 -226
  24. crawlo/downloader/cffi_downloader.py +245 -245
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +39 -39
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +57 -57
  34. crawlo/extension/log_stats.py +81 -81
  35. crawlo/extension/logging_extension.py +61 -52
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +28 -0
  40. crawlo/factories/base.py +69 -0
  41. crawlo/factories/crawler.py +104 -0
  42. crawlo/factories/registry.py +85 -0
  43. crawlo/filters/__init__.py +154 -154
  44. crawlo/filters/aioredis_filter.py +257 -234
  45. crawlo/filters/memory_filter.py +269 -269
  46. crawlo/framework.py +292 -0
  47. crawlo/initialization/__init__.py +40 -0
  48. crawlo/initialization/built_in.py +426 -0
  49. crawlo/initialization/context.py +142 -0
  50. crawlo/initialization/core.py +194 -0
  51. crawlo/initialization/phases.py +149 -0
  52. crawlo/initialization/registry.py +146 -0
  53. crawlo/items/__init__.py +23 -23
  54. crawlo/items/base.py +23 -22
  55. crawlo/items/fields.py +52 -52
  56. crawlo/items/items.py +104 -104
  57. crawlo/logging/__init__.py +38 -0
  58. crawlo/logging/config.py +97 -0
  59. crawlo/logging/factory.py +129 -0
  60. crawlo/logging/manager.py +112 -0
  61. crawlo/middleware/__init__.py +21 -21
  62. crawlo/middleware/default_header.py +132 -132
  63. crawlo/middleware/download_delay.py +104 -104
  64. crawlo/middleware/middleware_manager.py +135 -135
  65. crawlo/middleware/offsite.py +123 -123
  66. crawlo/middleware/proxy.py +386 -386
  67. crawlo/middleware/request_ignore.py +86 -86
  68. crawlo/middleware/response_code.py +163 -163
  69. crawlo/middleware/response_filter.py +136 -136
  70. crawlo/middleware/retry.py +124 -124
  71. crawlo/middleware/simple_proxy.py +65 -65
  72. crawlo/mode_manager.py +212 -187
  73. crawlo/network/__init__.py +21 -21
  74. crawlo/network/request.py +379 -379
  75. crawlo/network/response.py +359 -359
  76. crawlo/pipelines/__init__.py +21 -21
  77. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  78. crawlo/pipelines/console_pipeline.py +39 -39
  79. crawlo/pipelines/csv_pipeline.py +316 -316
  80. crawlo/pipelines/database_dedup_pipeline.py +222 -222
  81. crawlo/pipelines/json_pipeline.py +218 -218
  82. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  83. crawlo/pipelines/mongo_pipeline.py +131 -131
  84. crawlo/pipelines/mysql_pipeline.py +325 -318
  85. crawlo/pipelines/pipeline_manager.py +76 -75
  86. crawlo/pipelines/redis_dedup_pipeline.py +166 -166
  87. crawlo/project.py +327 -325
  88. crawlo/queue/pqueue.py +43 -37
  89. crawlo/queue/queue_manager.py +503 -379
  90. crawlo/queue/redis_priority_queue.py +326 -306
  91. crawlo/settings/__init__.py +7 -7
  92. crawlo/settings/default_settings.py +321 -225
  93. crawlo/settings/setting_manager.py +214 -198
  94. crawlo/spider/__init__.py +657 -639
  95. crawlo/stats_collector.py +73 -59
  96. crawlo/subscriber.py +129 -129
  97. crawlo/task_manager.py +139 -30
  98. crawlo/templates/crawlo.cfg.tmpl +10 -10
  99. crawlo/templates/project/__init__.py.tmpl +3 -3
  100. crawlo/templates/project/items.py.tmpl +17 -17
  101. crawlo/templates/project/middlewares.py.tmpl +118 -118
  102. crawlo/templates/project/pipelines.py.tmpl +96 -96
  103. crawlo/templates/project/settings.py.tmpl +168 -267
  104. crawlo/templates/project/settings_distributed.py.tmpl +167 -180
  105. crawlo/templates/project/settings_gentle.py.tmpl +167 -61
  106. crawlo/templates/project/settings_high_performance.py.tmpl +168 -131
  107. crawlo/templates/project/settings_minimal.py.tmpl +66 -35
  108. crawlo/templates/project/settings_simple.py.tmpl +165 -102
  109. crawlo/templates/project/spiders/__init__.py.tmpl +10 -6
  110. crawlo/templates/run.py.tmpl +34 -38
  111. crawlo/templates/spider/spider.py.tmpl +143 -143
  112. crawlo/templates/spiders_init.py.tmpl +10 -0
  113. crawlo/tools/__init__.py +200 -200
  114. crawlo/tools/anti_crawler.py +268 -268
  115. crawlo/tools/authenticated_proxy.py +240 -240
  116. crawlo/tools/data_formatter.py +225 -225
  117. crawlo/tools/data_validator.py +180 -180
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +388 -388
  120. crawlo/tools/encoding_converter.py +127 -127
  121. crawlo/tools/network_diagnostic.py +365 -0
  122. crawlo/tools/request_tools.py +82 -82
  123. crawlo/tools/retry_mechanism.py +224 -224
  124. crawlo/tools/scenario_adapter.py +262 -262
  125. crawlo/tools/text_cleaner.py +232 -232
  126. crawlo/utils/__init__.py +34 -34
  127. crawlo/utils/batch_processor.py +259 -259
  128. crawlo/utils/class_loader.py +26 -0
  129. crawlo/utils/controlled_spider_mixin.py +439 -439
  130. crawlo/utils/db_helper.py +343 -343
  131. crawlo/utils/enhanced_error_handler.py +356 -356
  132. crawlo/utils/env_config.py +142 -142
  133. crawlo/utils/error_handler.py +165 -124
  134. crawlo/utils/func_tools.py +82 -82
  135. crawlo/utils/large_scale_config.py +286 -286
  136. crawlo/utils/large_scale_helper.py +344 -344
  137. crawlo/utils/log.py +80 -200
  138. crawlo/utils/performance_monitor.py +285 -285
  139. crawlo/utils/queue_helper.py +175 -175
  140. crawlo/utils/redis_connection_pool.py +388 -351
  141. crawlo/utils/redis_key_validator.py +198 -198
  142. crawlo/utils/request.py +267 -267
  143. crawlo/utils/request_serializer.py +225 -218
  144. crawlo/utils/spider_loader.py +61 -61
  145. crawlo/utils/system.py +11 -11
  146. crawlo/utils/tools.py +4 -4
  147. crawlo/utils/url.py +39 -39
  148. {crawlo-1.3.3.dist-info → crawlo-1.3.5.dist-info}/METADATA +1126 -1020
  149. crawlo-1.3.5.dist-info/RECORD +288 -0
  150. examples/__init__.py +7 -7
  151. tests/__init__.py +7 -7
  152. tests/advanced_tools_example.py +275 -275
  153. tests/authenticated_proxy_example.py +107 -107
  154. tests/baidu_performance_test.py +109 -0
  155. tests/baidu_test.py +60 -0
  156. tests/cleaners_example.py +160 -160
  157. tests/comprehensive_framework_test.py +213 -0
  158. tests/comprehensive_test.py +82 -0
  159. tests/comprehensive_testing_summary.md +187 -0
  160. tests/config_validation_demo.py +142 -142
  161. tests/controlled_spider_example.py +205 -205
  162. tests/date_tools_example.py +180 -180
  163. tests/debug_configure.py +70 -0
  164. tests/debug_framework_logger.py +85 -0
  165. tests/debug_log_config.py +127 -0
  166. tests/debug_log_levels.py +64 -0
  167. tests/debug_pipelines.py +66 -66
  168. tests/detailed_log_test.py +234 -0
  169. tests/distributed_test.py +67 -0
  170. tests/distributed_test_debug.py +77 -0
  171. tests/dynamic_loading_example.py +523 -523
  172. tests/dynamic_loading_test.py +104 -104
  173. tests/env_config_example.py +133 -133
  174. tests/error_handling_example.py +171 -171
  175. tests/final_command_test_report.md +0 -0
  176. tests/final_comprehensive_test.py +152 -0
  177. tests/final_log_test.py +261 -0
  178. tests/final_validation_test.py +183 -0
  179. tests/fix_log_test.py +143 -0
  180. tests/framework_performance_test.py +203 -0
  181. tests/log_buffering_test.py +112 -0
  182. tests/log_generation_timing_test.py +154 -0
  183. tests/optimized_performance_test.py +212 -0
  184. tests/performance_comparison.py +246 -0
  185. tests/queue_blocking_test.py +114 -0
  186. tests/queue_test.py +90 -0
  187. tests/redis_key_validation_demo.py +130 -130
  188. tests/request_params_example.py +150 -150
  189. tests/response_improvements_example.py +144 -144
  190. tests/scrapy_comparison/ofweek_scrapy.py +139 -0
  191. tests/scrapy_comparison/scrapy_test.py +134 -0
  192. tests/simple_command_test.py +120 -0
  193. tests/simple_crawlo_test.py +128 -0
  194. tests/simple_log_test.py +58 -0
  195. tests/simple_log_test2.py +138 -0
  196. tests/simple_optimization_test.py +129 -0
  197. tests/simple_spider_test.py +50 -0
  198. tests/simple_test.py +48 -0
  199. tests/spider_log_timing_test.py +178 -0
  200. tests/test_advanced_tools.py +148 -148
  201. tests/test_all_commands.py +231 -0
  202. tests/test_all_redis_key_configs.py +145 -145
  203. tests/test_authenticated_proxy.py +141 -141
  204. tests/test_batch_processor.py +179 -0
  205. tests/test_cleaners.py +54 -54
  206. tests/test_component_factory.py +175 -0
  207. tests/test_comprehensive.py +146 -146
  208. tests/test_config_consistency.py +80 -80
  209. tests/test_config_merge.py +152 -152
  210. tests/test_config_validator.py +182 -182
  211. tests/test_controlled_spider_mixin.py +80 -0
  212. tests/test_crawlo_proxy_integration.py +108 -108
  213. tests/test_date_tools.py +123 -123
  214. tests/test_default_header_middleware.py +158 -158
  215. tests/test_distributed.py +65 -65
  216. tests/test_double_crawlo_fix.py +207 -207
  217. tests/test_double_crawlo_fix_simple.py +124 -124
  218. tests/test_download_delay_middleware.py +221 -221
  219. tests/test_downloader_proxy_compatibility.py +268 -268
  220. tests/test_dynamic_downloaders_proxy.py +124 -124
  221. tests/test_dynamic_proxy.py +92 -92
  222. tests/test_dynamic_proxy_config.py +146 -146
  223. tests/test_dynamic_proxy_real.py +109 -109
  224. tests/test_edge_cases.py +303 -303
  225. tests/test_enhanced_error_handler.py +270 -270
  226. tests/test_enhanced_error_handler_comprehensive.py +246 -0
  227. tests/test_env_config.py +121 -121
  228. tests/test_error_handler_compatibility.py +112 -112
  229. tests/test_factories.py +253 -0
  230. tests/test_final_validation.py +153 -153
  231. tests/test_framework_env_usage.py +103 -103
  232. tests/test_framework_logger.py +67 -0
  233. tests/test_framework_startup.py +65 -0
  234. tests/test_get_component_logger.py +84 -0
  235. tests/test_integration.py +169 -169
  236. tests/test_item_dedup_redis_key.py +122 -122
  237. tests/test_large_scale_config.py +113 -0
  238. tests/test_large_scale_helper.py +236 -0
  239. tests/test_logging_system.py +283 -0
  240. tests/test_mode_change.py +73 -0
  241. tests/test_mode_consistency.py +51 -51
  242. tests/test_offsite_middleware.py +221 -221
  243. tests/test_parsel.py +29 -29
  244. tests/test_performance.py +327 -327
  245. tests/test_performance_monitor.py +116 -0
  246. tests/test_proxy_api.py +264 -264
  247. tests/test_proxy_health_check.py +32 -32
  248. tests/test_proxy_middleware.py +121 -121
  249. tests/test_proxy_middleware_enhanced.py +216 -216
  250. tests/test_proxy_middleware_integration.py +136 -136
  251. tests/test_proxy_middleware_refactored.py +184 -184
  252. tests/test_proxy_providers.py +56 -56
  253. tests/test_proxy_stats.py +19 -19
  254. tests/test_proxy_strategies.py +59 -59
  255. tests/test_queue_empty_check.py +42 -0
  256. tests/test_queue_manager_double_crawlo.py +173 -173
  257. tests/test_queue_manager_redis_key.py +176 -176
  258. tests/test_random_user_agent.py +72 -72
  259. tests/test_real_scenario_proxy.py +195 -195
  260. tests/test_redis_config.py +28 -28
  261. tests/test_redis_connection_pool.py +294 -294
  262. tests/test_redis_key_naming.py +181 -181
  263. tests/test_redis_key_validator.py +123 -123
  264. tests/test_redis_queue.py +224 -224
  265. tests/test_request_ignore_middleware.py +182 -182
  266. tests/test_request_params.py +111 -111
  267. tests/test_request_serialization.py +70 -70
  268. tests/test_response_code_middleware.py +349 -349
  269. tests/test_response_filter_middleware.py +427 -427
  270. tests/test_response_improvements.py +152 -152
  271. tests/test_retry_middleware.py +241 -241
  272. tests/test_scheduler.py +252 -252
  273. tests/test_scheduler_config_update.py +133 -133
  274. tests/test_simple_response.py +61 -61
  275. tests/test_telecom_spider_redis_key.py +205 -205
  276. tests/test_template_content.py +87 -87
  277. tests/test_template_redis_key.py +134 -134
  278. tests/test_tools.py +159 -159
  279. tests/test_user_agents.py +96 -96
  280. tests/tools_example.py +260 -260
  281. tests/untested_features_report.md +139 -0
  282. tests/verify_debug.py +52 -0
  283. tests/verify_distributed.py +117 -117
  284. tests/verify_log_fix.py +112 -0
  285. crawlo-1.3.3.dist-info/RECORD +0 -219
  286. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
  287. {crawlo-1.3.3.dist-info → crawlo-1.3.5.dist-info}/WHEEL +0 -0
  288. {crawlo-1.3.3.dist-info → crawlo-1.3.5.dist-info}/entry_points.txt +0 -0
  289. {crawlo-1.3.3.dist-info → crawlo-1.3.5.dist-info}/top_level.txt +0 -0
crawlo/tools/anti_crawler.py
@@ -1,269 +1,269 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- # @Time : 2025-09-10 22:00
5
- # @Author : crawl-coder
6
- # @Desc : 反爬虫应对工具
7
- """
8
-
9
- import asyncio
10
- import random
11
- import time
12
- from typing import Dict, Any, Optional, List, Callable
13
-
14
-
15
- class ProxyPoolManager:
16
- """代理池管理器类"""
17
-
18
- def __init__(self, proxies: Optional[List[Dict[str, str]]] = None):
19
- """
20
- 初始化代理池管理器
21
-
22
- Args:
23
- proxies (Optional[List[Dict[str, str]]]): 代理列表
24
- """
25
- self.proxies = proxies or [
26
- {"http": "http://proxy1.example.com:8080", "https": "https://proxy1.example.com:8080"},
27
- {"http": "http://proxy2.example.com:8080", "https": "https://proxy2.example.com:8080"},
28
- {"http": "http://proxy3.example.com:8080", "https": "https://proxy3.example.com:8080"}
29
- ]
30
- self.proxy_status = {id(proxy): {"last_used": 0, "success_count": 0, "fail_count": 0}
31
- for proxy in self.proxies}
32
-
33
- def get_random_proxy(self) -> Dict[str, str]:
34
- """
35
- 获取随机代理
36
-
37
- Returns:
38
- Dict[str, str]: 代理配置
39
- """
40
- return random.choice(self.proxies)
41
-
42
- def get_best_proxy(self) -> Dict[str, str]:
43
- """
44
- 根据成功率获取最佳代理
45
-
46
- Returns:
47
- Dict[str, str]: 代理配置
48
- """
49
- if not self.proxy_status:
50
- return self.get_random_proxy()
51
-
52
- # 计算每个代理的成功率
53
- proxy_scores = []
54
- for proxy in self.proxies:
55
- proxy_id = id(proxy)
56
- status = self.proxy_status.get(proxy_id, {"success_count": 0, "fail_count": 0})
57
- total = status["success_count"] + status["fail_count"]
58
-
59
- if total == 0:
60
- score = 0.5 # 默认成功率
61
- else:
62
- score = status["success_count"] / total
63
-
64
- proxy_scores.append((proxy, score))
65
-
66
- # 按成功率排序,返回成功率最高的代理
67
- proxy_scores.sort(key=lambda x: x[1], reverse=True)
68
- return proxy_scores[0][0]
69
-
70
- def report_proxy_result(self, proxy: Dict[str, str], success: bool) -> None:
71
- """
72
- 报告代理使用结果
73
-
74
- Args:
75
- proxy (Dict[str, str]): 代理配置
76
- success (bool): 是否成功
77
- """
78
- proxy_id = id(proxy)
79
- if proxy_id not in self.proxy_status:
80
- self.proxy_status[proxy_id] = {"last_used": 0, "success_count": 0, "fail_count": 0}
81
-
82
- status = self.proxy_status[proxy_id]
83
- status["last_used"] = time.time()
84
-
85
- if success:
86
- status["success_count"] += 1
87
- else:
88
- status["fail_count"] += 1
89
-
90
- def remove_invalid_proxy(self, proxy: Dict[str, str]) -> None:
91
- """
92
- 移除无效代理
93
-
94
- Args:
95
- proxy (Dict[str, str]): 代理配置
96
- """
97
- if proxy in self.proxies:
98
- self.proxies.remove(proxy)
99
- proxy_id = id(proxy)
100
- if proxy_id in self.proxy_status:
101
- del self.proxy_status[proxy_id]
102
-
103
-
104
- class CaptchaHandler:
105
- """验证码处理器类"""
106
-
107
- def __init__(self, captcha_service: Optional[Callable] = None):
108
- """
109
- 初始化验证码处理器
110
-
111
- Args:
112
- captcha_service (Optional[Callable]): 验证码识别服务
113
- """
114
- self.captcha_service = captcha_service
115
-
116
- async def recognize_captcha(self, image_data: bytes,
117
- captcha_type: str = "image") -> Optional[str]:
118
- """
119
- 识别验证码
120
-
121
- Args:
122
- image_data (bytes): 验证码图片数据
123
- captcha_type (str): 验证码类型
124
-
125
- Returns:
126
- Optional[str]: 识别结果
127
- """
128
- if self.captcha_service:
129
- try:
130
- return await self.captcha_service(image_data, captcha_type)
131
- except Exception:
132
- return None
133
- else:
134
- # 如果没有配置验证码服务,返回None
135
- return None
136
-
137
- async def handle_manual_captcha(self, prompt: str = "请输入验证码: ") -> str:
138
- """
139
- 处理手动验证码输入
140
-
141
- Args:
142
- prompt (str): 提示信息
143
-
144
- Returns:
145
- str: 用户输入的验证码
146
- """
147
- # 在实际应用中,这里可能需要与用户界面交互
148
- # 为了演示目的,我们模拟用户输入
149
- print(prompt)
150
- return input() if not asyncio.get_event_loop().is_running() else ""
151
-
152
-
153
- class AntiCrawler:
154
- """反爬虫应对工具类"""
155
-
156
- def __init__(self, proxies: Optional[List[Dict[str, str]]] = None,
157
- captcha_service: Optional[Callable] = None):
158
- """
159
- 初始化反爬虫应对工具
160
-
161
- Args:
162
- proxies (Optional[List[Dict[str, str]]]): 代理列表
163
- captcha_service (Optional[Callable]): 验证码识别服务
164
- """
165
- self.proxy_manager = ProxyPoolManager(proxies)
166
- self.captcha_handler = CaptchaHandler(captcha_service)
167
-
168
- def get_random_user_agent(self) -> str:
169
- """
170
- 获取随机User-Agent
171
-
172
- Returns:
173
- str: 随机User-Agent
174
- """
175
- user_agents = [
176
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
177
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
178
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
179
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
180
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59",
181
- "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1",
182
- "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
183
- ]
184
- return random.choice(user_agents)
185
-
186
- def rotate_proxy(self) -> Dict[str, str]:
187
- """
188
- 轮换代理
189
-
190
- Returns:
191
- Dict[str, str]: 代理配置
192
- """
193
- return self.proxy_manager.get_best_proxy()
194
-
195
- def handle_captcha(self, response_text: str) -> bool:
196
- """
197
- 检测是否遇到验证码
198
-
199
- Args:
200
- response_text (str): 响应文本
201
-
202
- Returns:
203
- bool: 是否遇到验证码
204
- """
205
- captcha_keywords = ["captcha", "verify", "验证", "验证码", "human verification"]
206
- return any(keyword in response_text.lower() for keyword in captcha_keywords)
207
-
208
- def detect_rate_limiting(self, status_code: int, response_headers: Dict[str, Any]) -> bool:
209
- """
210
- 检测是否遇到频率限制
211
-
212
- Args:
213
- status_code (int): HTTP状态码
214
- response_headers (Dict[str, Any]): 响应头
215
-
216
- Returns:
217
- bool: 是否遇到频率限制
218
- """
219
- # 检查状态码
220
- if status_code in [429, 503]:
221
- return True
222
-
223
- # 检查响应头
224
- rate_limit_headers = ["x-ratelimit-remaining", "retry-after", "x-ratelimit-reset"]
225
- return any(header.lower() in [k.lower() for k in response_headers.keys()]
226
- for header in rate_limit_headers)
227
-
228
- def random_delay(self, min_delay: float = 1.0, max_delay: float = 3.0) -> None:
229
- """
230
- 随机延迟,避免请求过于频繁
231
-
232
- Args:
233
- min_delay (float): 最小延迟时间(秒)
234
- max_delay (float): 最大延迟时间(秒)
235
- """
236
- delay = random.uniform(min_delay, max_delay)
237
- time.sleep(delay)
238
-
239
- async def async_random_delay(self, min_delay: float = 1.0, max_delay: float = 3.0) -> None:
240
- """
241
- 异步随机延迟,避免请求过于频繁
242
-
243
- Args:
244
- min_delay (float): 最小延迟时间(秒)
245
- max_delay (float): 最大延迟时间(秒)
246
- """
247
- delay = random.uniform(min_delay, max_delay)
248
- await asyncio.sleep(delay)
249
-
250
-
251
- # 便捷函数
252
- def get_random_user_agent() -> str:
253
- """获取随机User-Agent"""
254
- return AntiCrawler().get_random_user_agent()
255
-
256
-
257
- def rotate_proxy(proxies: Optional[List[Dict[str, str]]] = None) -> Dict[str, str]:
258
- """轮换代理"""
259
- return AntiCrawler(proxies).rotate_proxy()
260
-
261
-
262
- def handle_captcha(response_text: str) -> bool:
263
- """检测是否遇到验证码"""
264
- return AntiCrawler().handle_captcha(response_text)
265
-
266
-
267
- def detect_rate_limiting(status_code: int, response_headers: Dict[str, Any]) -> bool:
268
- """检测是否遇到频率限制"""
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ # @Time : 2025-09-10 22:00
5
+ # @Author : crawl-coder
6
+ # @Desc : 反爬虫应对工具
7
+ """
8
+
9
+ import asyncio
10
+ import random
11
+ import time
12
+ from typing import Dict, Any, Optional, List, Callable
13
+
14
+
15
+ class ProxyPoolManager:
16
+ """代理池管理器类"""
17
+
18
+ def __init__(self, proxies: Optional[List[Dict[str, str]]] = None):
19
+ """
20
+ 初始化代理池管理器
21
+
22
+ Args:
23
+ proxies (Optional[List[Dict[str, str]]]): 代理列表
24
+ """
25
+ self.proxies = proxies or [
26
+ {"http": "http://proxy1.example.com:8080", "https": "https://proxy1.example.com:8080"},
27
+ {"http": "http://proxy2.example.com:8080", "https": "https://proxy2.example.com:8080"},
28
+ {"http": "http://proxy3.example.com:8080", "https": "https://proxy3.example.com:8080"}
29
+ ]
30
+ self.proxy_status = {id(proxy): {"last_used": 0, "success_count": 0, "fail_count": 0}
31
+ for proxy in self.proxies}
32
+
33
+ def get_random_proxy(self) -> Dict[str, str]:
34
+ """
35
+ 获取随机代理
36
+
37
+ Returns:
38
+ Dict[str, str]: 代理配置
39
+ """
40
+ return random.choice(self.proxies)
41
+
42
+ def get_best_proxy(self) -> Dict[str, str]:
43
+ """
44
+ 根据成功率获取最佳代理
45
+
46
+ Returns:
47
+ Dict[str, str]: 代理配置
48
+ """
49
+ if not self.proxy_status:
50
+ return self.get_random_proxy()
51
+
52
+ # 计算每个代理的成功率
53
+ proxy_scores = []
54
+ for proxy in self.proxies:
55
+ proxy_id = id(proxy)
56
+ status = self.proxy_status.get(proxy_id, {"success_count": 0, "fail_count": 0})
57
+ total = status["success_count"] + status["fail_count"]
58
+
59
+ if total == 0:
60
+ score = 0.5 # 默认成功率
61
+ else:
62
+ score = status["success_count"] / total
63
+
64
+ proxy_scores.append((proxy, score))
65
+
66
+ # 按成功率排序,返回成功率最高的代理
67
+ proxy_scores.sort(key=lambda x: x[1], reverse=True)
68
+ return proxy_scores[0][0]
69
+
70
+ def report_proxy_result(self, proxy: Dict[str, str], success: bool) -> None:
71
+ """
72
+ 报告代理使用结果
73
+
74
+ Args:
75
+ proxy (Dict[str, str]): 代理配置
76
+ success (bool): 是否成功
77
+ """
78
+ proxy_id = id(proxy)
79
+ if proxy_id not in self.proxy_status:
80
+ self.proxy_status[proxy_id] = {"last_used": 0, "success_count": 0, "fail_count": 0}
81
+
82
+ status = self.proxy_status[proxy_id]
83
+ status["last_used"] = time.time()
84
+
85
+ if success:
86
+ status["success_count"] += 1
87
+ else:
88
+ status["fail_count"] += 1
89
+
90
+ def remove_invalid_proxy(self, proxy: Dict[str, str]) -> None:
91
+ """
92
+ 移除无效代理
93
+
94
+ Args:
95
+ proxy (Dict[str, str]): 代理配置
96
+ """
97
+ if proxy in self.proxies:
98
+ self.proxies.remove(proxy)
99
+ proxy_id = id(proxy)
100
+ if proxy_id in self.proxy_status:
101
+ del self.proxy_status[proxy_id]
102
+
103
+
104
+ class CaptchaHandler:
105
+ """验证码处理器类"""
106
+
107
+ def __init__(self, captcha_service: Optional[Callable] = None):
108
+ """
109
+ 初始化验证码处理器
110
+
111
+ Args:
112
+ captcha_service (Optional[Callable]): 验证码识别服务
113
+ """
114
+ self.captcha_service = captcha_service
115
+
116
+ async def recognize_captcha(self, image_data: bytes,
117
+ captcha_type: str = "image") -> Optional[str]:
118
+ """
119
+ 识别验证码
120
+
121
+ Args:
122
+ image_data (bytes): 验证码图片数据
123
+ captcha_type (str): 验证码类型
124
+
125
+ Returns:
126
+ Optional[str]: 识别结果
127
+ """
128
+ if self.captcha_service:
129
+ try:
130
+ return await self.captcha_service(image_data, captcha_type)
131
+ except Exception:
132
+ return None
133
+ else:
134
+ # 如果没有配置验证码服务,返回None
135
+ return None
136
+
137
+ async def handle_manual_captcha(self, prompt: str = "请输入验证码: ") -> str:
138
+ """
139
+ 处理手动验证码输入
140
+
141
+ Args:
142
+ prompt (str): 提示信息
143
+
144
+ Returns:
145
+ str: 用户输入的验证码
146
+ """
147
+ # 在实际应用中,这里可能需要与用户界面交互
148
+ # 为了演示目的,我们模拟用户输入
149
+ print(prompt)
150
+ return input() if not asyncio.get_event_loop().is_running() else ""
151
+
152
+
153
+ class AntiCrawler:
154
+ """反爬虫应对工具类"""
155
+
156
+ def __init__(self, proxies: Optional[List[Dict[str, str]]] = None,
157
+ captcha_service: Optional[Callable] = None):
158
+ """
159
+ 初始化反爬虫应对工具
160
+
161
+ Args:
162
+ proxies (Optional[List[Dict[str, str]]]): 代理列表
163
+ captcha_service (Optional[Callable]): 验证码识别服务
164
+ """
165
+ self.proxy_manager = ProxyPoolManager(proxies)
166
+ self.captcha_handler = CaptchaHandler(captcha_service)
167
+
168
+ def get_random_user_agent(self) -> str:
169
+ """
170
+ 获取随机User-Agent
171
+
172
+ Returns:
173
+ str: 随机User-Agent
174
+ """
175
+ user_agents = [
176
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
177
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
178
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0",
179
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15",
180
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Edge/91.0.864.59",
181
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1",
182
+ "Mozilla/5.0 (Linux; Android 11; Pixel 5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Mobile Safari/537.36"
183
+ ]
184
+ return random.choice(user_agents)
185
+
186
+ def rotate_proxy(self) -> Dict[str, str]:
187
+ """
188
+ 轮换代理
189
+
190
+ Returns:
191
+ Dict[str, str]: 代理配置
192
+ """
193
+ return self.proxy_manager.get_best_proxy()
194
+
195
+ def handle_captcha(self, response_text: str) -> bool:
196
+ """
197
+ 检测是否遇到验证码
198
+
199
+ Args:
200
+ response_text (str): 响应文本
201
+
202
+ Returns:
203
+ bool: 是否遇到验证码
204
+ """
205
+ captcha_keywords = ["captcha", "verify", "验证", "验证码", "human verification"]
206
+ return any(keyword in response_text.lower() for keyword in captcha_keywords)
207
+
208
+ def detect_rate_limiting(self, status_code: int, response_headers: Dict[str, Any]) -> bool:
209
+ """
210
+ 检测是否遇到频率限制
211
+
212
+ Args:
213
+ status_code (int): HTTP状态码
214
+ response_headers (Dict[str, Any]): 响应头
215
+
216
+ Returns:
217
+ bool: 是否遇到频率限制
218
+ """
219
+ # 检查状态码
220
+ if status_code in [429, 503]:
221
+ return True
222
+
223
+ # 检查响应头
224
+ rate_limit_headers = ["x-ratelimit-remaining", "retry-after", "x-ratelimit-reset"]
225
+ return any(header.lower() in [k.lower() for k in response_headers.keys()]
226
+ for header in rate_limit_headers)
227
+
228
+ def random_delay(self, min_delay: float = 1.0, max_delay: float = 3.0) -> None:
229
+ """
230
+ 随机延迟,避免请求过于频繁
231
+
232
+ Args:
233
+ min_delay (float): 最小延迟时间(秒)
234
+ max_delay (float): 最大延迟时间(秒)
235
+ """
236
+ delay = random.uniform(min_delay, max_delay)
237
+ time.sleep(delay)
238
+
239
+ async def async_random_delay(self, min_delay: float = 1.0, max_delay: float = 3.0) -> None:
240
+ """
241
+ 异步随机延迟,避免请求过于频繁
242
+
243
+ Args:
244
+ min_delay (float): 最小延迟时间(秒)
245
+ max_delay (float): 最大延迟时间(秒)
246
+ """
247
+ delay = random.uniform(min_delay, max_delay)
248
+ await asyncio.sleep(delay)
249
+
250
+
251
+ # 便捷函数
252
+ def get_random_user_agent() -> str:
253
+ """获取随机User-Agent"""
254
+ return AntiCrawler().get_random_user_agent()
255
+
256
+
257
+ def rotate_proxy(proxies: Optional[List[Dict[str, str]]] = None) -> Dict[str, str]:
258
+ """轮换代理"""
259
+ return AntiCrawler(proxies).rotate_proxy()
260
+
261
+
262
+ def handle_captcha(response_text: str) -> bool:
263
+ """检测是否遇到验证码"""
264
+ return AntiCrawler().handle_captcha(response_text)
265
+
266
+
267
+ def detect_rate_limiting(status_code: int, response_headers: Dict[str, Any]) -> bool:
268
+ """检测是否遇到频率限制"""
269
269
  return AntiCrawler().detect_rate_limiting(status_code, response_headers)