crawlo-1.2.2-py3-none-any.whl → crawlo-1.2.4-py3-none-any.whl

This diff compares two publicly available versions of the package as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.

Potentially problematic release: this version of crawlo might be problematic.
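For reference, a comparison like the one on this page can be reproduced locally. The sketch below is not part of the published diff: it assumes both wheels have already been downloaded (for example with pip download crawlo==1.2.2 --no-deps -d wheels/, and again for 1.2.4), and the wheel paths it uses are illustrative assumptions based on the versions named above.

# Minimal reproduction sketch (not part of the published package diff).
# Assumes the two wheels have been downloaded locally; the paths below are
# illustrative assumptions.
import difflib
import zipfile

OLD_WHEEL = "wheels/crawlo-1.2.2-py3-none-any.whl"  # assumed local path
NEW_WHEEL = "wheels/crawlo-1.2.4-py3-none-any.whl"  # assumed local path

def read_wheel(path):
    """Return {member name: decoded text} for a wheel, which is a zip archive."""
    with zipfile.ZipFile(path) as zf:
        return {name: zf.read(name).decode("utf-8", errors="replace")
                for name in zf.namelist()}

old_files = read_wheel(OLD_WHEEL)
new_files = read_wheel(NEW_WHEEL)

# Walk the union of both member lists and print a unified diff for every
# file that was added, removed, or changed between the two versions.
for name in sorted(set(old_files) | set(new_files)):
    old_text = old_files.get(name, "")
    new_text = new_files.get(name, "")
    if old_text == new_text:
        continue
    print("".join(difflib.unified_diff(
        old_text.splitlines(keepends=True),
        new_text.splitlines(keepends=True),
        fromfile="crawlo-1.2.2/" + name,
        tofile="crawlo-1.2.4/" + name,
    )))

This prints one unified diff per changed file, corresponding to the per-file changes listed and shown further down the page.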

Files changed (222)
  1. crawlo/__init__.py +61 -61
  2. crawlo/__version__.py +1 -1
  3. crawlo/cleaners/__init__.py +60 -60
  4. crawlo/cleaners/data_formatter.py +225 -225
  5. crawlo/cleaners/encoding_converter.py +125 -125
  6. crawlo/cleaners/text_cleaner.py +232 -232
  7. crawlo/cli.py +81 -81
  8. crawlo/commands/__init__.py +14 -14
  9. crawlo/commands/check.py +594 -594
  10. crawlo/commands/genspider.py +151 -151
  11. crawlo/commands/help.py +144 -142
  12. crawlo/commands/list.py +155 -155
  13. crawlo/commands/run.py +323 -292
  14. crawlo/commands/startproject.py +420 -418
  15. crawlo/commands/stats.py +188 -188
  16. crawlo/commands/utils.py +186 -186
  17. crawlo/config.py +312 -312
  18. crawlo/config_validator.py +251 -252
  19. crawlo/core/__init__.py +2 -2
  20. crawlo/core/engine.py +354 -354
  21. crawlo/core/processor.py +40 -40
  22. crawlo/core/scheduler.py +143 -143
  23. crawlo/crawler.py +1110 -1027
  24. crawlo/data/__init__.py +6 -0
  25. crawlo/data/user_agents.py +108 -0
  26. crawlo/downloader/__init__.py +266 -266
  27. crawlo/downloader/aiohttp_downloader.py +220 -220
  28. crawlo/downloader/cffi_downloader.py +256 -256
  29. crawlo/downloader/httpx_downloader.py +259 -259
  30. crawlo/downloader/hybrid_downloader.py +212 -213
  31. crawlo/downloader/playwright_downloader.py +402 -402
  32. crawlo/downloader/selenium_downloader.py +472 -472
  33. crawlo/event.py +11 -11
  34. crawlo/exceptions.py +81 -81
  35. crawlo/extension/__init__.py +37 -37
  36. crawlo/extension/health_check.py +141 -141
  37. crawlo/extension/log_interval.py +57 -57
  38. crawlo/extension/log_stats.py +81 -81
  39. crawlo/extension/logging_extension.py +43 -43
  40. crawlo/extension/memory_monitor.py +104 -104
  41. crawlo/extension/performance_profiler.py +133 -133
  42. crawlo/extension/request_recorder.py +107 -107
  43. crawlo/filters/__init__.py +154 -154
  44. crawlo/filters/aioredis_filter.py +280 -280
  45. crawlo/filters/memory_filter.py +269 -269
  46. crawlo/items/__init__.py +23 -23
  47. crawlo/items/base.py +21 -21
  48. crawlo/items/fields.py +52 -53
  49. crawlo/items/items.py +104 -104
  50. crawlo/middleware/__init__.py +21 -21
  51. crawlo/middleware/default_header.py +131 -131
  52. crawlo/middleware/download_delay.py +104 -104
  53. crawlo/middleware/middleware_manager.py +135 -135
  54. crawlo/middleware/offsite.py +114 -115
  55. crawlo/middleware/proxy.py +367 -366
  56. crawlo/middleware/request_ignore.py +86 -87
  57. crawlo/middleware/response_code.py +163 -164
  58. crawlo/middleware/response_filter.py +136 -137
  59. crawlo/middleware/retry.py +124 -124
  60. crawlo/mode_manager.py +211 -211
  61. crawlo/network/__init__.py +21 -21
  62. crawlo/network/request.py +338 -338
  63. crawlo/network/response.py +359 -359
  64. crawlo/pipelines/__init__.py +21 -21
  65. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  66. crawlo/pipelines/console_pipeline.py +39 -39
  67. crawlo/pipelines/csv_pipeline.py +316 -316
  68. crawlo/pipelines/database_dedup_pipeline.py +222 -224
  69. crawlo/pipelines/json_pipeline.py +218 -218
  70. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  71. crawlo/pipelines/mongo_pipeline.py +131 -131
  72. crawlo/pipelines/mysql_pipeline.py +317 -316
  73. crawlo/pipelines/pipeline_manager.py +61 -61
  74. crawlo/pipelines/redis_dedup_pipeline.py +165 -167
  75. crawlo/project.py +279 -187
  76. crawlo/queue/pqueue.py +37 -37
  77. crawlo/queue/queue_manager.py +337 -337
  78. crawlo/queue/redis_priority_queue.py +298 -298
  79. crawlo/settings/__init__.py +7 -7
  80. crawlo/settings/default_settings.py +217 -226
  81. crawlo/settings/setting_manager.py +122 -122
  82. crawlo/spider/__init__.py +639 -639
  83. crawlo/stats_collector.py +59 -59
  84. crawlo/subscriber.py +129 -130
  85. crawlo/task_manager.py +30 -30
  86. crawlo/templates/crawlo.cfg.tmpl +10 -10
  87. crawlo/templates/project/__init__.py.tmpl +3 -3
  88. crawlo/templates/project/items.py.tmpl +17 -17
  89. crawlo/templates/project/middlewares.py.tmpl +118 -118
  90. crawlo/templates/project/pipelines.py.tmpl +96 -96
  91. crawlo/templates/project/run.py.tmpl +47 -45
  92. crawlo/templates/project/settings.py.tmpl +350 -327
  93. crawlo/templates/project/settings_distributed.py.tmpl +160 -119
  94. crawlo/templates/project/settings_gentle.py.tmpl +133 -94
  95. crawlo/templates/project/settings_high_performance.py.tmpl +155 -151
  96. crawlo/templates/project/settings_simple.py.tmpl +108 -68
  97. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  98. crawlo/templates/spider/spider.py.tmpl +143 -143
  99. crawlo/tools/__init__.py +182 -182
  100. crawlo/tools/anti_crawler.py +268 -268
  101. crawlo/tools/authenticated_proxy.py +240 -240
  102. crawlo/tools/data_validator.py +180 -180
  103. crawlo/tools/date_tools.py +35 -35
  104. crawlo/tools/distributed_coordinator.py +386 -386
  105. crawlo/tools/retry_mechanism.py +220 -220
  106. crawlo/tools/scenario_adapter.py +262 -262
  107. crawlo/utils/__init__.py +35 -35
  108. crawlo/utils/batch_processor.py +259 -260
  109. crawlo/utils/controlled_spider_mixin.py +439 -439
  110. crawlo/utils/date_tools.py +290 -290
  111. crawlo/utils/db_helper.py +343 -343
  112. crawlo/utils/enhanced_error_handler.py +356 -359
  113. crawlo/utils/env_config.py +105 -105
  114. crawlo/utils/error_handler.py +123 -125
  115. crawlo/utils/func_tools.py +82 -82
  116. crawlo/utils/large_scale_config.py +286 -286
  117. crawlo/utils/large_scale_helper.py +344 -343
  118. crawlo/utils/log.py +128 -128
  119. crawlo/utils/performance_monitor.py +285 -284
  120. crawlo/utils/queue_helper.py +175 -175
  121. crawlo/utils/redis_connection_pool.py +334 -334
  122. crawlo/utils/redis_key_validator.py +198 -199
  123. crawlo/utils/request.py +267 -267
  124. crawlo/utils/request_serializer.py +218 -219
  125. crawlo/utils/spider_loader.py +61 -62
  126. crawlo/utils/system.py +11 -11
  127. crawlo/utils/tools.py +4 -4
  128. crawlo/utils/url.py +39 -39
  129. {crawlo-1.2.2.dist-info → crawlo-1.2.4.dist-info}/METADATA +764 -692
  130. crawlo-1.2.4.dist-info/RECORD +206 -0
  131. examples/__init__.py +7 -7
  132. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
  133. tests/__init__.py +7 -7
  134. tests/advanced_tools_example.py +275 -275
  135. tests/authenticated_proxy_example.py +236 -236
  136. tests/cleaners_example.py +160 -160
  137. tests/config_validation_demo.py +102 -102
  138. tests/controlled_spider_example.py +205 -205
  139. tests/date_tools_example.py +180 -180
  140. tests/dynamic_loading_example.py +523 -523
  141. tests/dynamic_loading_test.py +104 -104
  142. tests/env_config_example.py +133 -133
  143. tests/error_handling_example.py +171 -171
  144. tests/redis_key_validation_demo.py +130 -130
  145. tests/response_improvements_example.py +144 -144
  146. tests/test_advanced_tools.py +148 -148
  147. tests/test_all_redis_key_configs.py +145 -145
  148. tests/test_authenticated_proxy.py +141 -141
  149. tests/test_cleaners.py +54 -54
  150. tests/test_comprehensive.py +146 -146
  151. tests/test_config_validator.py +193 -193
  152. tests/test_crawlo_proxy_integration.py +172 -172
  153. tests/test_date_tools.py +123 -123
  154. tests/test_default_header_middleware.py +158 -158
  155. tests/test_double_crawlo_fix.py +207 -207
  156. tests/test_double_crawlo_fix_simple.py +124 -124
  157. tests/test_download_delay_middleware.py +221 -221
  158. tests/test_downloader_proxy_compatibility.py +268 -268
  159. tests/test_dynamic_downloaders_proxy.py +124 -124
  160. tests/test_dynamic_proxy.py +92 -92
  161. tests/test_dynamic_proxy_config.py +146 -146
  162. tests/test_dynamic_proxy_real.py +109 -109
  163. tests/test_edge_cases.py +303 -303
  164. tests/test_enhanced_error_handler.py +270 -270
  165. tests/test_env_config.py +121 -121
  166. tests/test_error_handler_compatibility.py +112 -112
  167. tests/test_final_validation.py +153 -153
  168. tests/test_framework_env_usage.py +103 -103
  169. tests/test_integration.py +356 -356
  170. tests/test_item_dedup_redis_key.py +122 -122
  171. tests/test_offsite_middleware.py +221 -221
  172. tests/test_parsel.py +29 -29
  173. tests/test_performance.py +327 -327
  174. tests/test_proxy_api.py +264 -264
  175. tests/test_proxy_health_check.py +32 -32
  176. tests/test_proxy_middleware.py +121 -121
  177. tests/test_proxy_middleware_enhanced.py +216 -216
  178. tests/test_proxy_middleware_integration.py +136 -136
  179. tests/test_proxy_providers.py +56 -56
  180. tests/test_proxy_stats.py +19 -19
  181. tests/test_proxy_strategies.py +59 -59
  182. tests/test_queue_manager_double_crawlo.py +173 -173
  183. tests/test_queue_manager_redis_key.py +176 -176
  184. tests/test_real_scenario_proxy.py +195 -195
  185. tests/test_redis_config.py +28 -28
  186. tests/test_redis_connection_pool.py +294 -294
  187. tests/test_redis_key_naming.py +181 -181
  188. tests/test_redis_key_validator.py +123 -123
  189. tests/test_redis_queue.py +224 -224
  190. tests/test_request_ignore_middleware.py +182 -182
  191. tests/test_request_serialization.py +70 -70
  192. tests/test_response_code_middleware.py +349 -349
  193. tests/test_response_filter_middleware.py +427 -427
  194. tests/test_response_improvements.py +152 -152
  195. tests/test_retry_middleware.py +241 -241
  196. tests/test_scheduler.py +241 -241
  197. tests/test_simple_response.py +61 -61
  198. tests/test_telecom_spider_redis_key.py +205 -205
  199. tests/test_template_content.py +87 -87
  200. tests/test_template_redis_key.py +134 -134
  201. tests/test_tools.py +153 -153
  202. tests/tools_example.py +257 -257
  203. crawlo-1.2.2.dist-info/RECORD +0 -220
  204. examples/aiohttp_settings.py +0 -42
  205. examples/curl_cffi_settings.py +0 -41
  206. examples/default_header_middleware_example.py +0 -107
  207. examples/default_header_spider_example.py +0 -129
  208. examples/download_delay_middleware_example.py +0 -160
  209. examples/httpx_settings.py +0 -42
  210. examples/multi_downloader_proxy_example.py +0 -81
  211. examples/offsite_middleware_example.py +0 -55
  212. examples/offsite_spider_example.py +0 -107
  213. examples/proxy_spider_example.py +0 -166
  214. examples/request_ignore_middleware_example.py +0 -51
  215. examples/request_ignore_spider_example.py +0 -99
  216. examples/response_code_middleware_example.py +0 -52
  217. examples/response_filter_middleware_example.py +0 -67
  218. examples/tong_hua_shun_settings.py +0 -62
  219. examples/tong_hua_shun_spider.py +0 -170
  220. {crawlo-1.2.2.dist-info → crawlo-1.2.4.dist-info}/WHEEL +0 -0
  221. {crawlo-1.2.2.dist-info → crawlo-1.2.4.dist-info}/entry_points.txt +0 -0
  222. {crawlo-1.2.2.dist-info → crawlo-1.2.4.dist-info}/top_level.txt +0 -0
crawlo/crawler.py CHANGED
@@ -1,1028 +1,1111 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- Crawlo Crawler Module
5
- ====================
6
- 提供爬虫进程管理和运行时核心功能。
7
-
8
- 核心组件:
9
- - Crawler: 单个爬虫运行实例,管理Spider与引擎的生命周期
10
- - CrawlerProcess: 爬虫进程管理器,支持多爬虫并发调度和资源管理
11
-
12
- 功能特性:
13
- - 智能并发控制和资源管理
14
- - 优雅关闭和信号处理
15
- - 统计监控和性能追踪
16
- - 自动模块发现和注册
17
- - 错误恢复和重试机制
18
- - 大规模爬虫优化支持
19
-
20
- 示例用法:
21
- # 单个爬虫运行
22
- crawler = Crawler(MySpider, settings)
23
- await crawler.crawl()
24
-
25
- # 多爬虫并发管理
26
- process = CrawlerProcess()
27
- await process.crawl([Spider1, Spider2])
28
- """
29
- from __future__ import annotations
30
- import asyncio
31
- import signal
32
- import time
33
- import threading
34
- from typing import Type, Optional, Set, List, Union, Dict, Any
35
- from .spider import Spider, get_global_spider_registry
36
- from .core.engine import Engine
37
- from .utils.log import get_logger
38
- from .subscriber import Subscriber
39
- from .extension import ExtensionManager
40
- from .stats_collector import StatsCollector
41
- from .event import spider_opened, spider_closed
42
- from .settings.setting_manager import SettingManager
43
- from crawlo.project import merge_settings, get_settings
44
-
45
-
46
- logger = get_logger(__name__)
47
-
48
-
49
- class CrawlerContext:
50
- """
51
- 爬虫上下文管理器
52
- 提供共享状态和资源管理
53
- """
54
-
55
- def __init__(self):
56
- self.start_time = time.time()
57
- self.total_crawlers = 0
58
- self.active_crawlers = 0
59
- self.completed_crawlers = 0
60
- self.failed_crawlers = 0
61
- self.error_log = []
62
- self._lock = threading.RLock()
63
-
64
- def increment_total(self):
65
- with self._lock:
66
- self.total_crawlers += 1
67
-
68
- def increment_active(self):
69
- with self._lock:
70
- self.active_crawlers += 1
71
-
72
- def decrement_active(self):
73
- with self._lock:
74
- self.active_crawlers -= 1
75
-
76
- def increment_completed(self):
77
- with self._lock:
78
- self.completed_crawlers += 1
79
-
80
- def increment_failed(self, error: str):
81
- with self._lock:
82
- self.failed_crawlers += 1
83
- self.error_log.append({
84
- 'timestamp': time.time(),
85
- 'error': error
86
- })
87
-
88
- def get_stats(self) -> Dict[str, Any]:
89
- with self._lock:
90
- duration = time.time() - self.start_time
91
- return {
92
- 'total_crawlers': self.total_crawlers,
93
- 'active_crawlers': self.active_crawlers,
94
- 'completed_crawlers': self.completed_crawlers,
95
- 'failed_crawlers': self.failed_crawlers,
96
- 'success_rate': (self.completed_crawlers / max(1, self.total_crawlers)) * 100,
97
- 'duration_seconds': round(duration, 2),
98
- 'error_count': len(self.error_log)
99
- }
100
-
101
-
102
- class Crawler:
103
- """
104
- 单个爬虫运行实例,管理 Spider 与引擎的生命周期
105
-
106
- 提供功能:
107
- - Spider 生命周期管理(初始化、运行、关闭)
108
- - 引擎组件的协调管理
109
- - 配置合并和验证
110
- - 统计数据收集
111
- - 扩展管理
112
- - 异常处理和清理
113
- """
114
-
115
- def __init__(self, spider_cls: Type[Spider], settings: SettingManager, context: Optional[CrawlerContext] = None):
116
- self.spider_cls = spider_cls
117
- self.spider: Optional[Spider] = None
118
- self.engine: Optional[Engine] = None
119
- self.stats: Optional[StatsCollector] = None
120
- self.subscriber: Optional[Subscriber] = None
121
- self.extension: Optional[ExtensionManager] = None
122
- self.settings: SettingManager = settings.copy()
123
- self.context = context or CrawlerContext()
124
-
125
- # 状态管理
126
- self._closed = False
127
- self._close_lock = asyncio.Lock()
128
- self._start_time = None
129
- self._end_time = None
130
-
131
- # 性能监控
132
- self._performance_metrics = {
133
- 'initialization_time': 0,
134
- 'crawl_duration': 0,
135
- 'memory_peak': 0,
136
- 'request_count': 0,
137
- 'error_count': 0
138
- }
139
-
140
- async def crawl(self):
141
- """
142
- 启动爬虫核心流程
143
-
144
- 包含以下阶段:
145
- 1. 初始化阶段: 创建所有组件
146
- 2. 验证阶段: 检查配置和状态
147
- 3. 运行阶段: 启动爬虫引擎
148
- 4. 清理阶段: 资源释放
149
- """
150
- init_start = time.time()
151
- self._start_time = init_start
152
-
153
- try:
154
- # 更新上下文状态
155
- self.context.increment_active()
156
-
157
- # 阶段 1: 初始化组件
158
- self.subscriber = self._create_subscriber()
159
- self.spider = self._create_spider()
160
- self.engine = self._create_engine()
161
- self.stats = self._create_stats()
162
- self.extension = self._create_extension()
163
-
164
- # 记录初始化时间
165
- self._performance_metrics['initialization_time'] = time.time() - init_start
166
-
167
- # 阶段 2: 验证状态
168
- self._validate_crawler_state()
169
-
170
- # 阶段 3: 启动爬虫
171
- crawl_start = time.time()
172
- await self.engine.start_spider(self.spider)
173
-
174
- # 记录爬取时间
175
- self._performance_metrics['crawl_duration'] = time.time() - crawl_start
176
- self._end_time = time.time()
177
-
178
- # 更新上下文状态
179
- self.context.increment_completed()
180
-
181
- logger.info(f"爬虫 {self.spider.name} 完成,耗时 {self._get_total_duration():.2f}秒")
182
-
183
- except Exception as e:
184
- self._performance_metrics['error_count'] += 1
185
- self.context.increment_failed(str(e))
186
- logger.error(f"爬虫 {getattr(self.spider, 'name', 'Unknown')} 运行失败: {e}", exc_info=True)
187
- raise
188
- finally:
189
- self.context.decrement_active()
190
- # 确保资源清理
191
- await self._ensure_cleanup()
192
-
193
- def _validate_crawler_state(self):
194
- """
195
- 验证爬虫状态和配置
196
- 确保所有必要组件都已正确初始化
197
- """
198
- if not self.spider:
199
- raise RuntimeError("爬虫实例未初始化")
200
- if not self.engine:
201
- raise RuntimeError("引擎未初始化")
202
- if not self.stats:
203
- raise RuntimeError("统计收集器未初始化")
204
- if not self.subscriber:
205
- raise RuntimeError("事件订阅器未初始化")
206
-
207
- # 检查关键配置
208
- if not self.spider.name:
209
- raise ValueError("爬虫名称不能为空")
210
-
211
- logger.debug(f"爬虫 {self.spider.name} 状态验证通过")
212
-
213
- def _get_total_duration(self) -> float:
214
- """获取总运行时间"""
215
- if self._start_time and self._end_time:
216
- return self._end_time - self._start_time
217
- return 0.0
218
-
219
- async def _ensure_cleanup(self):
220
- """确保资源清理"""
221
- try:
222
- if not self._closed:
223
- await self.close()
224
- except Exception as e:
225
- logger.warning(f"清理资源时发生错误: {e}")
226
-
227
- def get_performance_metrics(self) -> Dict[str, Any]:
228
- """获取性能指标"""
229
- metrics = self._performance_metrics.copy()
230
- metrics['total_duration'] = self._get_total_duration()
231
- if self.stats:
232
- # 添加统计数据
233
- stats_data = getattr(self.stats, 'get_stats', lambda: {})()
234
- metrics.update(stats_data)
235
- return metrics
236
- @staticmethod
237
- def _create_subscriber() -> Subscriber:
238
- """创建事件订阅器"""
239
- return Subscriber()
240
-
241
- def _create_spider(self) -> Spider:
242
- """
243
- 创建并验证爬虫实例(增强版)
244
-
245
- 执行以下验证:
246
- - 爬虫名称必须存在
247
- - start_requests 方法必须可调用
248
- - start_urls 不能是字符串
249
- - parse 方法建议存在
250
- """
251
- spider = self.spider_cls.create_instance(self)
252
-
253
- # 必要属性检查
254
- if not getattr(spider, 'name', None):
255
- raise AttributeError(
256
- f"爬虫类 '{self.spider_cls.__name__}' 必须定义 'name' 属性。\n"
257
- f"示例: name = 'my_spider'"
258
- )
259
-
260
- if not callable(getattr(spider, 'start_requests', None)):
261
- raise AttributeError(
262
- f"爬虫 '{spider.name}' 必须实现可调用的 'start_requests' 方法。\n"
263
- f"示例: def start_requests(self): yield Request(url='...')"
264
- )
265
-
266
- # start_urls 类型检查
267
- start_urls = getattr(spider, 'start_urls', [])
268
- if isinstance(start_urls, str):
269
- raise TypeError(
270
- f"爬虫 '{spider.name}' 的 'start_urls' 必须是列表或元组,不能是字符串。\n"
271
- f"正确写法: start_urls = ['http://example.com']\n"
272
- f"错误写法: start_urls = 'http://example.com'"
273
- )
274
-
275
- # parse 方法检查(警告而非错误)
276
- if not callable(getattr(spider, 'parse', None)):
277
- logger.warning(
278
- f"爬虫 '{spider.name}' 未定义 'parse' 方法。\n"
279
- f"请确保所有 Request 都指定了回调函数,否则响应将被忽略。"
280
- )
281
-
282
- # 设置爬虫配置
283
- self._set_spider(spider)
284
-
285
- logger.debug(f"爬虫 '{spider.name}' 初始化完成")
286
- return spider
287
-
288
- def _create_engine(self) -> Engine:
289
- """创建并初始化引擎"""
290
- engine = Engine(self)
291
- engine.engine_start()
292
- logger.debug(f"引擎初始化完成,爬虫: {getattr(self.spider, 'name', 'Unknown')}")
293
- return engine
294
-
295
- def _create_stats(self) -> StatsCollector:
296
- """创建统计收集器"""
297
- stats = StatsCollector(self)
298
- logger.debug(f"统计收集器初始化完成,爬虫: {getattr(self.spider, 'name', 'Unknown')}")
299
- return stats
300
-
301
- def _create_extension(self) -> ExtensionManager:
302
- """创建扩展管理器"""
303
- extension = ExtensionManager.create_instance(self)
304
- logger.debug(f"扩展管理器初始化完成,爬虫: {getattr(self.spider, 'name', 'Unknown')}")
305
- return extension
306
-
307
- def _set_spider(self, spider: Spider):
308
- """
309
- 设置爬虫配置和事件订阅
310
- 将爬虫的生命周期事件与订阅器绑定
311
- """
312
- # 订阅爬虫生命周期事件
313
- self.subscriber.subscribe(spider.spider_opened, event=spider_opened)
314
- self.subscriber.subscribe(spider.spider_closed, event=spider_closed)
315
-
316
- # 合并爬虫自定义配置
317
- merge_settings(spider, self.settings)
318
-
319
- logger.debug(f"爬虫 '{spider.name}' 配置合并完成")
320
-
321
- async def close(self, reason='finished') -> None:
322
- """
323
- 关闭爬虫并清理资源(增强版)
324
-
325
- 确保只关闭一次,并处理所有清理操作
326
- """
327
- async with self._close_lock:
328
- if self._closed:
329
- return
330
-
331
- self._closed = True
332
- self._end_time = time.time()
333
-
334
- try:
335
- # 通知爬虫关闭事件
336
- if self.subscriber:
337
- await self.subscriber.notify(spider_closed)
338
-
339
- # 统计数据收集
340
- if self.stats and self.spider:
341
- self.stats.close_spider(spider=self.spider, reason=reason)
342
- # 记录统计数据
343
- try:
344
- from crawlo.commands.stats import record_stats
345
- record_stats(self)
346
- except ImportError:
347
- logger.debug("统计记录模块不存在,跳过统计记录")
348
-
349
- logger.info(
350
- f"爬虫 '{getattr(self.spider, 'name', 'Unknown')}' 已关闭,"
351
- f"原因: {reason},耗时: {self._get_total_duration():.2f}秒"
352
- )
353
-
354
- except Exception as e:
355
- logger.error(f"关闭爬虫时发生错误: {e}", exc_info=True)
356
- finally:
357
- # 确保资源清理
358
- await self._cleanup_resources()
359
-
360
- async def _cleanup_resources(self):
361
- """清理所有资源"""
362
- cleanup_tasks = []
363
-
364
- # 引擎清理
365
- if self.engine:
366
- try:
367
- cleanup_tasks.append(self.engine.close())
368
- except AttributeError:
369
- pass # 引擎没有close方法
370
-
371
- # 扩展清理
372
- if self.extension:
373
- try:
374
- cleanup_tasks.append(self.extension.close())
375
- except AttributeError:
376
- pass
377
-
378
- # 统计收集器清理
379
- if self.stats:
380
- try:
381
- cleanup_tasks.append(self.stats.close())
382
- except AttributeError:
383
- pass
384
-
385
- # 并发执行清理任务
386
- if cleanup_tasks:
387
- await asyncio.gather(*cleanup_tasks, return_exceptions=True)
388
-
389
- logger.debug("资源清理完成")
390
-
391
-
392
- class CrawlerProcess:
393
- """
394
- 爬虫进程管理器
395
-
396
- 支持功能:
397
- - 多爬虫并发调度和资源管理
398
- - 自动模块发现和爬虫注册
399
- - 智能并发控制和负载均衡
400
- - 优雅关闭和信号处理
401
- - 实时状态监控和统计
402
- - 错误恢复和重试机制
403
- - 大规模爬虫优化支持
404
-
405
- 使用示例:
406
- # 基本用法
407
- process = CrawlerProcess()
408
- await process.crawl(MySpider)
409
-
410
- # 多爬虫并发
411
- await process.crawl([Spider1, Spider2, 'spider_name'])
412
-
413
- # 自定义并发数
414
- process = CrawlerProcess(max_concurrency=8)
415
- """
416
-
417
- def __init__(
418
- self,
419
- settings: Optional[SettingManager] = None,
420
- max_concurrency: Optional[int] = None,
421
- spider_modules: Optional[List[str]] = None,
422
- enable_monitoring: bool = True
423
- ):
424
- # 基础配置
425
- self.settings: SettingManager = settings or self._get_default_settings()
426
- self.crawlers: Set[Crawler] = set()
427
- self._active_tasks: Set[asyncio.Task] = set()
428
-
429
- # 上下文管理器
430
- self.context = CrawlerContext()
431
-
432
- # 并发控制配置
433
- self.max_concurrency: int = (
434
- max_concurrency
435
- or self.settings.get('MAX_RUNNING_SPIDERS')
436
- or self.settings.get('CONCURRENCY', 3)
437
- )
438
- self.semaphore = asyncio.Semaphore(self.max_concurrency)
439
-
440
- # 监控配置
441
- self.enable_monitoring = enable_monitoring
442
- self._monitoring_task = None
443
- self._shutdown_event = asyncio.Event()
444
-
445
- # 自动发现并导入爬虫模块
446
- if spider_modules:
447
- self.auto_discover(spider_modules)
448
-
449
- # 使用全局注册表的快照(避免后续导入影响)
450
- self._spider_registry: Dict[str, Type[Spider]] = get_global_spider_registry()
451
-
452
- # 性能监控
453
- self._performance_stats = {
454
- 'total_requests': 0,
455
- 'successful_requests': 0,
456
- 'failed_requests': 0,
457
- 'memory_usage_mb': 0,
458
- 'cpu_usage_percent': 0
459
- }
460
-
461
- # 注册信号量
462
- signal.signal(signal.SIGINT, self._shutdown)
463
- signal.signal(signal.SIGTERM, self._shutdown)
464
-
465
- logger.info(
466
- f"CrawlerProcess 初始化完成\n"
467
- f" - 最大并行爬虫数: {self.max_concurrency}\n"
468
- f" - 已注册爬虫数: {len(self._spider_registry)}\n"
469
- f" - 监控启用: {self.enable_monitoring}"
470
- )
471
-
472
- async def start_monitoring(self):
473
- """启动监控任务"""
474
- if not self.enable_monitoring:
475
- return
476
-
477
- self._monitoring_task = asyncio.create_task(self._monitor_loop())
478
- logger.debug("监控任务已启动")
479
-
480
- async def stop_monitoring(self):
481
- """停止监控任务"""
482
- if self._monitoring_task and not self._monitoring_task.done():
483
- self._monitoring_task.cancel()
484
- try:
485
- await self._monitoring_task
486
- except asyncio.CancelledError:
487
- pass
488
- logger.debug("监控任务已停止")
489
-
490
- async def _monitor_loop(self):
491
- """监控循环,定期收集和报告状态"""
492
- try:
493
- while not self._shutdown_event.is_set():
494
- await self._collect_performance_stats()
495
-
496
- # 每30秒输出一次状态
497
- stats = self.context.get_stats()
498
- if stats['active_crawlers'] > 0:
499
- logger.info(
500
- f"爬虫状态: 活跃 {stats['active_crawlers']}, "
501
- f"完成 {stats['completed_crawlers']}, "
502
- f"失败 {stats['failed_crawlers']}, "
503
- f"成功率 {stats['success_rate']:.1f}%"
504
- )
505
-
506
- await asyncio.sleep(30) # 30秒间隔
507
-
508
- except asyncio.CancelledError:
509
- logger.debug("监控循环被取消")
510
- except Exception as e:
511
- logger.error(f"监控循环错误: {e}", exc_info=True)
512
-
513
- async def _collect_performance_stats(self):
514
- """收集性能统计数据"""
515
- try:
516
- import psutil
517
- import os
518
-
519
- process = psutil.Process(os.getpid())
520
- memory_info = process.memory_info()
521
-
522
- self._performance_stats.update({
523
- 'memory_usage_mb': round(memory_info.rss / 1024 / 1024, 2),
524
- 'cpu_usage_percent': round(process.cpu_percent(), 2)
525
- })
526
-
527
- except ImportError:
528
- # psutil 不存在时跳过性能监控
529
- pass
530
- except Exception as e:
531
- logger.debug(f"收集性能统计失败: {e}")
532
- @staticmethod
533
- def auto_discover(modules: List[str]):
534
- """
535
- 自动导入模块,触发 Spider 类定义和注册(增强版)
536
-
537
- 支持递归扫描和错误恢复
538
- """
539
- import importlib
540
- import pkgutil
541
-
542
- discovered_count = 0
543
- error_count = 0
544
-
545
- for module_name in modules:
546
- try:
547
- module = importlib.import_module(module_name)
548
-
549
- if hasattr(module, '__path__'):
550
- # 包模块,递归扫描
551
- for _, name, _ in pkgutil.walk_packages(module.__path__, module.__name__ + "."):
552
- try:
553
- importlib.import_module(name)
554
- discovered_count += 1
555
- except Exception as sub_e:
556
- error_count += 1
557
- logger.warning(f"导入子模块 {name} 失败: {sub_e}")
558
- else:
559
- # 单个模块
560
- importlib.import_module(module_name)
561
- discovered_count += 1
562
-
563
- logger.debug(f"已扫描模块: {module_name}")
564
-
565
- except Exception as e:
566
- error_count += 1
567
- logger.error(f"扫描模块 {module_name} 失败: {e}", exc_info=True)
568
-
569
- logger.info(
570
- f"模块发现完成: 成功 {discovered_count} 个,失败 {error_count} 个"
571
- )
572
-
573
- # === 公共只读接口:避免直接访问 _spider_registry ===
574
-
575
- def get_spider_names(self) -> List[str]:
576
- """获取所有已注册的爬虫名称"""
577
- return list(self._spider_registry.keys())
578
-
579
- def get_spider_class(self, name: str) -> Optional[Type[Spider]]:
580
- """根据 name 获取爬虫类"""
581
- return self._spider_registry.get(name)
582
-
583
- def is_spider_registered(self, name: str) -> bool:
584
- """检查某个 name 是否已注册"""
585
- return name in self._spider_registry
586
-
587
- async def crawl(self, spiders: Union[Type[Spider], str, List[Union[Type[Spider], str]]]):
588
- """
589
- 启动一个或多个爬虫
590
-
591
- 增强功能:
592
- - 智能并发控制
593
- - 实时监控和统计
594
- - 错误恢复和重试
595
- - 优雅关闭处理
596
- """
597
- # 阶段 1: 预处理和验证
598
- spider_classes_to_run = self._resolve_spiders_to_run(spiders)
599
- total = len(spider_classes_to_run)
600
-
601
- if total == 0:
602
- raise ValueError("至少需要提供一个爬虫类或名称")
603
-
604
- # 阶段 2: 初始化上下文和监控
605
- for _ in range(total):
606
- self.context.increment_total()
607
-
608
- # 启动监控任务
609
- await self.start_monitoring()
610
-
611
- try:
612
- # 阶段 3: 按类名排序,保证启动顺序可预测
613
- spider_classes_to_run.sort(key=lambda cls: cls.__name__.lower())
614
-
615
- logger.info(
616
- f"开始启动 {total} 个爬虫\n"
617
- f" - 最大并发数: {self.max_concurrency}\n"
618
- f" - 爬虫列表: {[cls.__name__ for cls in spider_classes_to_run]}"
619
- )
620
-
621
- # 阶段 4: 流式启动所有爬虫任务
622
- tasks = [
623
- asyncio.create_task(
624
- self._run_spider_with_limit(spider_cls, index + 1, total),
625
- name=f"spider-{spider_cls.__name__}-{index+1}"
626
- )
627
- for index, spider_cls in enumerate(spider_classes_to_run)
628
- ]
629
-
630
- # 阶段 5: 等待所有任务完成(失败不中断)
631
- results = await asyncio.gather(*tasks, return_exceptions=True)
632
-
633
- # 阶段 6: 统计异常和结果
634
- failed = [i for i, r in enumerate(results) if isinstance(r, Exception)]
635
- successful = total - len(failed)
636
-
637
- if failed:
638
- failed_spiders = [spider_classes_to_run[i].__name__ for i in failed]
639
- logger.error(
640
- f"爬虫执行结果: 成功 {successful}/{total},失败 {len(failed)}/{total}\n"
641
- f" - 失败爬虫: {failed_spiders}"
642
- )
643
-
644
- # 记录详细错误信息
645
- for i in failed:
646
- error = results[i]
647
- logger.error(f"爬虫 {spider_classes_to_run[i].__name__} 错误详情: {error}")
648
- else:
649
- logger.info(f"所有 {total} 个爬虫均成功完成! 🎉")
650
-
651
- # 返回统计结果
652
- return {
653
- 'total': total,
654
- 'successful': successful,
655
- 'failed': len(failed),
656
- 'success_rate': (successful / total) * 100 if total > 0 else 0,
657
- 'context_stats': self.context.get_stats()
658
- }
659
-
660
- finally:
661
- # 阶段 7: 清理和关闭
662
- await self.stop_monitoring()
663
- await self._cleanup_process()
664
-
665
- async def _cleanup_process(self):
666
- """清理进程资源"""
667
- try:
668
- # 等待所有活跃爬虫完成
669
- if self.crawlers:
670
- close_tasks = [crawler.close() for crawler in self.crawlers]
671
- await asyncio.gather(*close_tasks, return_exceptions=True)
672
- self.crawlers.clear()
673
-
674
- # 清理活跃任务
675
- if self._active_tasks:
676
- for task in list(self._active_tasks):
677
- if not task.done():
678
- task.cancel()
679
- await asyncio.gather(*self._active_tasks, return_exceptions=True)
680
- self._active_tasks.clear()
681
-
682
- logger.debug("进程资源清理完成")
683
-
684
- except Exception as e:
685
- logger.error(f"清理进程资源时发生错误: {e}", exc_info=True)
686
-
687
- def get_process_stats(self) -> Dict[str, Any]:
688
- """获取进程统计信息"""
689
- context_stats = self.context.get_stats()
690
-
691
- return {
692
- 'context': context_stats,
693
- 'performance': self._performance_stats.copy(),
694
- 'crawlers': {
695
- 'total_registered': len(self._spider_registry),
696
- 'active_crawlers': len(self.crawlers),
697
- 'max_concurrency': self.max_concurrency
698
- },
699
- 'registry': {
700
- 'spider_names': list(self._spider_registry.keys()),
701
- 'spider_classes': [cls.__name__ for cls in self._spider_registry.values()]
702
- }
703
- }
704
- def _resolve_spiders_to_run(
705
- self,
706
- spiders_input: Union[Type[Spider], str, List[Union[Type[Spider], str]]]
707
- ) -> List[Type[Spider]]:
708
- """
709
- 解析输入为爬虫类列表
710
-
711
- 支持各种输入格式并验证唯一性
712
- """
713
- inputs = self._normalize_inputs(spiders_input)
714
- seen_spider_names: Set[str] = set()
715
- spider_classes: List[Type[Spider]] = []
716
-
717
- for item in inputs:
718
- try:
719
- spider_cls = self._resolve_spider_class(item)
720
- spider_name = getattr(spider_cls, 'name', None)
721
-
722
- if not spider_name:
723
- raise ValueError(f"爬虫类 {spider_cls.__name__} 缺少 'name' 属性")
724
-
725
- if spider_name in seen_spider_names:
726
- raise ValueError(
727
- f"本次运行中爬虫名称 '{spider_name}' 重复。\n"
728
- f"请确保每个爬虫的 name 属性在本次运行中唯一。"
729
- )
730
-
731
- seen_spider_names.add(spider_name)
732
- spider_classes.append(spider_cls)
733
-
734
- logger.debug(f"解析爬虫成功: {item} -> {spider_cls.__name__} (name='{spider_name}')")
735
-
736
- except Exception as e:
737
- logger.error(f"解析爬虫失败: {item} - {e}")
738
- raise
739
-
740
- return spider_classes
741
-
742
- @staticmethod
743
- def _normalize_inputs(spiders_input) -> List[Union[Type[Spider], str]]:
744
- """
745
- 标准化输入为列表
746
-
747
- 支持更多输入类型并提供更好的错误信息
748
- """
749
- if isinstance(spiders_input, (type, str)):
750
- return [spiders_input]
751
- elif isinstance(spiders_input, (list, tuple, set)):
752
- spider_list = list(spiders_input)
753
- if not spider_list:
754
- raise ValueError("爬虫列表不能为空")
755
- return spider_list
756
- else:
757
- raise TypeError(
758
- f"spiders 参数类型不支持: {type(spiders_input)}\n"
759
- f"支持的类型: Spider类、name字符串,或它们的列表/元组/集合"
760
- )
761
-
762
- def _resolve_spider_class(self, item: Union[Type[Spider], str]) -> Type[Spider]:
763
- """
764
- 解析单个输入项为爬虫类
765
-
766
- 提供更好的错误提示和调试信息
767
- """
768
- if isinstance(item, type) and issubclass(item, Spider):
769
- # 直接是 Spider 类
770
- return item
771
- elif isinstance(item, str):
772
- # 是字符串名称,需要查找注册表
773
- spider_cls = self._spider_registry.get(item)
774
- if not spider_cls:
775
- available_spiders = list(self._spider_registry.keys())
776
- raise ValueError(
777
- f"未找到名为 '{item}' 的爬虫。\n"
778
- f"已注册的爬虫: {available_spiders}\n"
779
- f"请检查爬虫名称是否正确,或者确保爬虫已被正确导入和注册。"
780
- )
781
- return spider_cls
782
- else:
783
- raise TypeError(
784
- f"无效类型 {type(item)}: {item}\n"
785
- f"必须是 Spider 类或字符串 name。\n"
786
- f"示例: MySpider 或 'my_spider'"
787
- )
788
-
789
- async def _run_spider_with_limit(self, spider_cls: Type[Spider], seq: int, total: int):
790
- """
791
- 受信号量限制的爬虫运行函数
792
-
793
- 包含增强的错误处理和监控功能
794
- """
795
- task = asyncio.current_task()
796
- crawler = None
797
-
798
- try:
799
- # 注册任务
800
- if task:
801
- self._active_tasks.add(task)
802
-
803
- # 获取并发许可
804
- await self.semaphore.acquire()
805
-
806
- start_msg = f"[{seq}/{total}] 启动爬虫: {spider_cls.__name__}"
807
- logger.info(start_msg)
808
-
809
- # 创建并运行爬虫
810
- crawler = Crawler(spider_cls, self.settings, self.context)
811
- self.crawlers.add(crawler)
812
-
813
- # 记录启动时间
814
- start_time = time.time()
815
-
816
- # 运行爬虫
817
- await crawler.crawl()
818
-
819
- # 计算运行时间
820
- duration = time.time() - start_time
821
-
822
- end_msg = (
823
- f"[{seq}/{total}] 爬虫完成: {spider_cls.__name__}, "
824
- f"耗时: {duration:.2f}秒"
825
- )
826
- logger.info(end_msg)
827
-
828
- # 记录成功统计
829
- self._performance_stats['successful_requests'] += 1
830
-
831
- except Exception as e:
832
- # 记录失败统计
833
- self._performance_stats['failed_requests'] += 1
834
-
835
- error_msg = f"爬虫 {spider_cls.__name__} 执行失败: {e}"
836
- logger.error(error_msg, exc_info=True)
837
-
838
- # 将错误信息记录到上下文
839
- if hasattr(self, 'context'):
840
- self.context.increment_failed(error_msg)
841
-
842
- raise
843
- finally:
844
- # 清理资源
845
- try:
846
- if crawler and crawler in self.crawlers:
847
- self.crawlers.remove(crawler)
848
-
849
- if task and task in self._active_tasks:
850
- self._active_tasks.remove(task)
851
-
852
- self.semaphore.release()
853
-
854
- except Exception as cleanup_error:
855
- logger.warning(f"清理资源时发生错误: {cleanup_error}")
856
-
857
- def _shutdown(self, _signum, _frame):
858
- """
859
- 优雅关闭信号处理
860
-
861
- 提供更好的关闭体验和资源清理
862
- """
863
- signal_name = {signal.SIGINT: 'SIGINT', signal.SIGTERM: 'SIGTERM'}.get(_signum, str(_signum))
864
- logger.warning(f"收到关闭信号 {signal_name},正在停止所有爬虫...")
865
-
866
- # 设置关闭事件
867
- if hasattr(self, '_shutdown_event'):
868
- self._shutdown_event.set()
869
-
870
- # 停止所有爬虫引擎
871
- for crawler in list(self.crawlers):
872
- if crawler.engine:
873
- crawler.engine.running = False
874
- crawler.engine.normal = False
875
- logger.debug(f"已停止爬虫引擎: {getattr(crawler.spider, 'name', 'Unknown')}")
876
-
877
- # 创建关闭任务
878
- asyncio.create_task(self._wait_for_shutdown())
879
-
880
- logger.info("关闭指令已发送,等待爬虫完成当前任务...")
881
-
882
- async def _wait_for_shutdown(self):
883
- """
884
- 等待所有活跃任务完成
885
-
886
- 提供更好的关闭时间控制和进度反馈
887
- """
888
- try:
889
- # 停止监控任务
890
- await self.stop_monitoring()
891
-
892
- # 等待活跃任务完成
893
- pending = [t for t in self._active_tasks if not t.done()]
894
-
895
- if pending:
896
- logger.info(
897
- f"等待 {len(pending)} 个活跃任务完成..."
898
- f"(最大等待时间: 30秒)"
899
- )
900
-
901
- # 设置超时时间
902
- try:
903
- await asyncio.wait_for(
904
- asyncio.gather(*pending, return_exceptions=True),
905
- timeout=30.0
906
- )
907
- except asyncio.TimeoutError:
908
- logger.warning("部分任务超时,强制取消中...")
909
-
910
- # 强制取消超时任务
911
- for task in pending:
912
- if not task.done():
913
- task.cancel()
914
-
915
- # 等待取消完成
916
- await asyncio.gather(*pending, return_exceptions=True)
917
-
918
- # 最终清理
919
- await self._cleanup_process()
920
-
921
- # 输出最终统计
922
- final_stats = self.context.get_stats()
923
- logger.info(
924
- f"所有爬虫已优雅关闭 👋\n"
925
- f" - 总计爬虫: {final_stats['total_crawlers']}\n"
926
- f" - 成功完成: {final_stats['completed_crawlers']}\n"
927
- f" - 失败数量: {final_stats['failed_crawlers']}\n"
928
- f" - 成功率: {final_stats['success_rate']:.1f}%\n"
929
- f" - 总运行时间: {final_stats['duration_seconds']}"
930
- )
931
-
932
- except Exception as e:
933
- logger.error(f"关闭过程中发生错误: {e}", exc_info=True)
934
-
935
- @classmethod
936
- def _get_default_settings(cls) -> SettingManager:
937
- """
938
- 加载默认配置
939
-
940
- 提供更好的错误处理和降级策略
941
- """
942
- try:
943
- settings = get_settings()
944
- logger.debug("成功加载默认配置")
945
- return settings
946
- except Exception as e:
947
- logger.warning(f"无法加载默认配置: {e},使用空配置")
948
- return SettingManager()
949
-
950
-
951
- # === 工具函数 ===
952
-
953
- def create_crawler_with_optimizations(
954
- spider_cls: Type[Spider],
955
- settings: Optional[SettingManager] = None,
956
- **optimization_kwargs
957
- ) -> Crawler:
958
- """
959
- 创建优化的爬虫实例
960
-
961
- :param spider_cls: 爬虫类
962
- :param settings: 设置管理器
963
- :param optimization_kwargs: 优化参数
964
- :return: 爬虫实例
965
- """
966
- if settings is None:
967
- settings = SettingManager()
968
-
969
- # 应用优化配置
970
- for key, value in optimization_kwargs.items():
971
- settings.set(key, value)
972
-
973
- context = CrawlerContext()
974
- return Crawler(spider_cls, settings, context)
975
-
976
-
977
- def create_process_with_large_scale_config(
978
- config_type: str = 'balanced',
979
- concurrency: int = 16,
980
- **kwargs
981
- ) -> CrawlerProcess:
982
- """
983
- 创建支持大规模优化的进程管理器
984
-
985
- :param config_type: 配置类型 ('conservative', 'balanced', 'aggressive', 'memory_optimized')
986
- :param concurrency: 并发数
987
- :param kwargs: 其他参数
988
- :return: 进程管理器
989
- """
990
- try:
991
- from crawlo.utils.large_scale_config import LargeScaleConfig
992
-
993
- # 获取优化配置
994
- config_methods = {
995
- 'conservative': LargeScaleConfig.conservative_config,
996
- 'balanced': LargeScaleConfig.balanced_config,
997
- 'aggressive': LargeScaleConfig.aggressive_config,
998
- 'memory_optimized': LargeScaleConfig.memory_optimized_config
999
- }
1000
-
1001
- if config_type not in config_methods:
1002
- logger.warning(f"未知的配置类型: {config_type},使用默认配置")
1003
- settings = SettingManager()
1004
- else:
1005
- config = config_methods[config_type](concurrency)
1006
- settings = SettingManager()
1007
- settings.update(config)
1008
-
1009
- return CrawlerProcess(
1010
- settings=settings,
1011
- max_concurrency=concurrency,
1012
- **kwargs
1013
- )
1014
-
1015
- except ImportError:
1016
- logger.warning("大规模配置模块不存在,使用默认配置")
1017
- return CrawlerProcess(max_concurrency=concurrency, **kwargs)
1018
-
1019
-
1020
- # === 导出接口 ===
1021
-
1022
- __all__ = [
1023
- 'Crawler',
1024
- 'CrawlerProcess',
1025
- 'CrawlerContext',
1026
- 'create_crawler_with_optimizations',
1027
- 'create_process_with_large_scale_config'
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ Crawlo Crawler Module
5
+ ====================
6
+ 提供爬虫进程管理和运行时核心功能。
7
+
8
+ 核心组件:
9
+ - Crawler: 单个爬虫运行实例,管理Spider与引擎的生命周期
10
+ - CrawlerProcess: 爬虫进程管理器,支持多爬虫并发调度和资源管理
11
+
12
+ 功能特性:
13
+ - 智能并发控制和资源管理
14
+ - 优雅关闭和信号处理
15
+ - 统计监控和性能追踪
16
+ - 自动模块发现和注册
17
+ - 错误恢复和重试机制
18
+ - 大规模爬虫优化支持
19
+
20
+ 示例用法:
21
+ # 单个爬虫运行
22
+ crawler = Crawler(MySpider, settings)
23
+ await crawler.crawl()
24
+
25
+ # 多爬虫并发管理
26
+ process = CrawlerProcess()
27
+ await process.crawl([Spider1, Spider2])
28
+ """
29
+ from __future__ import annotations
30
+ import asyncio
31
+ import signal
32
+ import time
33
+ import threading
34
+ from typing import Type, Optional, Set, List, Union, Dict, Any
35
+ from .spider import Spider, get_global_spider_registry
36
+ from .core.engine import Engine
37
+ from .utils.log import get_logger
38
+ from .subscriber import Subscriber
39
+ from .extension import ExtensionManager
40
+ from .stats_collector import StatsCollector
41
+ from .event import spider_opened, spider_closed
42
+ from .settings.setting_manager import SettingManager
43
+ from crawlo.project import merge_settings, get_settings
44
+
45
+
46
+ logger = get_logger(__name__)
47
+
48
+
49
+ class CrawlerContext:
50
+ """
51
+ 爬虫上下文管理器
52
+ 提供共享状态和资源管理
53
+ """
54
+
55
+ def __init__(self):
56
+ self.start_time = time.time()
57
+ self.total_crawlers = 0
58
+ self.active_crawlers = 0
59
+ self.completed_crawlers = 0
60
+ self.failed_crawlers = 0
61
+ self.error_log = []
62
+ self._lock = threading.RLock()
63
+
64
+ def increment_total(self):
65
+ with self._lock:
66
+ self.total_crawlers += 1
67
+
68
+ def increment_active(self):
69
+ with self._lock:
70
+ self.active_crawlers += 1
71
+
72
+ def decrement_active(self):
73
+ with self._lock:
74
+ self.active_crawlers -= 1
75
+
76
+ def increment_completed(self):
77
+ with self._lock:
78
+ self.completed_crawlers += 1
79
+
80
+ def increment_failed(self, error: str):
81
+ with self._lock:
82
+ self.failed_crawlers += 1
83
+ self.error_log.append({
84
+ 'timestamp': time.time(),
85
+ 'error': error
86
+ })
87
+
88
+ def get_stats(self) -> Dict[str, Any]:
89
+ with self._lock:
90
+ duration = time.time() - self.start_time
91
+ return {
92
+ 'total_crawlers': self.total_crawlers,
93
+ 'active_crawlers': self.active_crawlers,
94
+ 'completed_crawlers': self.completed_crawlers,
95
+ 'failed_crawlers': self.failed_crawlers,
96
+ 'success_rate': (self.completed_crawlers / max(1, self.total_crawlers)) * 100,
97
+ 'duration_seconds': round(duration, 2),
98
+ 'error_count': len(self.error_log)
99
+ }
100
+
101
+
102
+ class Crawler:
103
+ """
104
+ 单个爬虫运行实例,管理 Spider 与引擎的生命周期
105
+
106
+ 提供功能:
107
+ - Spider 生命周期管理(初始化、运行、关闭)
108
+ - 引擎组件的协调管理
109
+ - 配置合并和验证
110
+ - 统计数据收集
111
+ - 扩展管理
112
+ - 异常处理和清理
113
+ """
114
+
115
+ def __init__(self, spider_cls: Type[Spider], settings: SettingManager, context: Optional[CrawlerContext] = None):
116
+ self.spider_cls = spider_cls
117
+ self.spider: Optional[Spider] = None
118
+ self.engine: Optional[Engine] = None
119
+ self.stats: Optional[StatsCollector] = None
120
+ self.subscriber: Optional[Subscriber] = None
121
+ self.extension: Optional[ExtensionManager] = None
122
+ self.settings: SettingManager = settings.copy()
123
+ self.context = context or CrawlerContext()
124
+
125
+ # 状态管理
126
+ self._closed = False
127
+ self._close_lock = asyncio.Lock()
128
+ self._start_time = None
129
+ self._end_time = None
130
+
131
+ # 性能监控
132
+ self._performance_metrics = {
133
+ 'initialization_time': 0,
134
+ 'crawl_duration': 0,
135
+ 'memory_peak': 0,
136
+ 'request_count': 0,
137
+ 'error_count': 0
138
+ }
139
+
140
+ async def crawl(self):
141
+ """
142
+ 启动爬虫核心流程
143
+
144
+ 包含以下阶段:
145
+ 1. 初始化阶段: 创建所有组件
146
+ 2. 验证阶段: 检查配置和状态
147
+ 3. 运行阶段: 启动爬虫引擎
148
+ 4. 清理阶段: 资源释放
149
+ """
150
+ init_start = time.time()
151
+ self._start_time = init_start
152
+
153
+ try:
154
+ # 更新上下文状态
155
+ self.context.increment_active()
156
+
157
+ # 阶段 1: 初始化组件
158
+ self.subscriber = self._create_subscriber()
159
+ self.spider = self._create_spider()
160
+ self.engine = self._create_engine()
161
+ self.stats = self._create_stats()
162
+ self.extension = self._create_extension()
163
+
164
+ # 记录初始化时间
165
+ self._performance_metrics['initialization_time'] = time.time() - init_start
166
+
167
+ # 阶段 2: 验证状态
168
+ self._validate_crawler_state()
169
+
170
+ # 阶段 3: 启动爬虫
171
+ crawl_start = time.time()
172
+ await self.engine.start_spider(self.spider)
173
+
174
+ # 记录爬取时间
175
+ self._performance_metrics['crawl_duration'] = time.time() - crawl_start
176
+ self._end_time = time.time()
177
+
178
+ # 更新上下文状态
179
+ self.context.increment_completed()
180
+
181
+ logger.info(f"爬虫 {self.spider.name} 完成,耗时 {self._get_total_duration():.2f}秒")
182
+
183
+ except Exception as e:
184
+ self._performance_metrics['error_count'] += 1
185
+ self.context.increment_failed(str(e))
186
+ logger.error(f"爬虫 {getattr(self.spider, 'name', 'Unknown')} 运行失败: {e}", exc_info=True)
187
+ raise
188
+ finally:
189
+ self.context.decrement_active()
190
+ # 确保资源清理
191
+ await self._ensure_cleanup()
192
+
193
+ def _validate_crawler_state(self):
194
+ """
195
+ 验证爬虫状态和配置
196
+ 确保所有必要组件都已正确初始化
197
+ """
198
+ if not self.spider:
199
+ raise RuntimeError("爬虫实例未初始化")
200
+ if not self.engine:
201
+ raise RuntimeError("引擎未初始化")
202
+ if not self.stats:
203
+ raise RuntimeError("统计收集器未初始化")
204
+ if not self.subscriber:
205
+ raise RuntimeError("事件订阅器未初始化")
206
+
207
+ # 检查关键配置
208
+ if not self.spider.name:
209
+ raise ValueError("爬虫名称不能为空")
210
+
211
+ logger.debug(f"爬虫 {self.spider.name} 状态验证通过")
212
+
213
+ def _get_total_duration(self) -> float:
214
+ """获取总运行时间"""
215
+ if self._start_time and self._end_time:
216
+ return self._end_time - self._start_time
217
+ return 0.0
218
+
219
+ async def _ensure_cleanup(self):
220
+ """确保资源清理"""
221
+ try:
222
+ if not self._closed:
223
+ await self.close()
224
+ except Exception as e:
225
+ logger.warning(f"清理资源时发生错误: {e}")
226
+
227
+ def get_performance_metrics(self) -> Dict[str, Any]:
228
+ """获取性能指标"""
229
+ metrics = self._performance_metrics.copy()
230
+ metrics['total_duration'] = self._get_total_duration()
231
+ if self.stats:
232
+ # 添加统计数据
233
+ stats_data = getattr(self.stats, 'get_stats', lambda: {})()
234
+ metrics.update(stats_data)
235
+ return metrics
236
+ @staticmethod
237
+ def _create_subscriber() -> Subscriber:
238
+ """创建事件订阅器"""
239
+ return Subscriber()
240
+
241
+ def _create_spider(self) -> Spider:
242
+ """
243
+ 创建并验证爬虫实例(增强版)
244
+
245
+ 执行以下验证:
246
+ - 爬虫名称必须存在
247
+ - start_requests 方法必须可调用
248
+ - start_urls 不能是字符串
249
+ - parse 方法建议存在
250
+ """
251
+ spider = self.spider_cls.create_instance(self)
252
+
253
+ # 必要属性检查
254
+ if not getattr(spider, 'name', None):
255
+ raise AttributeError(
256
+ f"爬虫类 '{self.spider_cls.__name__}' 必须定义 'name' 属性。\n"
257
+ f"示例: name = 'my_spider'"
258
+ )
259
+
260
+ if not callable(getattr(spider, 'start_requests', None)):
261
+ raise AttributeError(
262
+ f"爬虫 '{spider.name}' 必须实现可调用的 'start_requests' 方法。\n"
263
+ f"示例: def start_requests(self): yield Request(url='...')"
264
+ )
265
+
266
+ # start_urls 类型检查
267
+ start_urls = getattr(spider, 'start_urls', [])
268
+ if isinstance(start_urls, str):
269
+ raise TypeError(
270
+ f"爬虫 '{spider.name}' 的 'start_urls' 必须是列表或元组,不能是字符串。\n"
271
+ f"正确写法: start_urls = ['http://example.com']\n"
272
+ f"错误写法: start_urls = 'http://example.com'"
273
+ )
274
+
275
+ # parse 方法检查(警告而非错误)
276
+ if not callable(getattr(spider, 'parse', None)):
277
+ logger.warning(
278
+ f"爬虫 '{spider.name}' 未定义 'parse' 方法。\n"
279
+ f"请确保所有 Request 都指定了回调函数,否则响应将被忽略。"
280
+ )
281
+
282
+ # 设置爬虫配置
283
+ self._set_spider(spider)
284
+
285
+ logger.debug(f"爬虫 '{spider.name}' 初始化完成")
286
+ return spider
287
+
288
+ def _create_engine(self) -> Engine:
289
+ """创建并初始化引擎"""
290
+ engine = Engine(self)
291
+ engine.engine_start()
292
+ logger.debug(f"引擎初始化完成,爬虫: {getattr(self.spider, 'name', 'Unknown')}")
293
+ return engine
294
+
295
+ def _create_stats(self) -> StatsCollector:
296
+ """创建统计收集器"""
297
+ stats = StatsCollector(self)
298
+ logger.debug(f"统计收集器初始化完成,爬虫: {getattr(self.spider, 'name', 'Unknown')}")
299
+ return stats
300
+
301
+ def _create_extension(self) -> ExtensionManager:
302
+ """创建扩展管理器"""
303
+ extension = ExtensionManager.create_instance(self)
304
+ logger.debug(f"扩展管理器初始化完成,爬虫: {getattr(self.spider, 'name', 'Unknown')}")
305
+ return extension
306
+
307
+ def _set_spider(self, spider: Spider):
308
+ """
309
+ 设置爬虫配置和事件订阅
310
+ 将爬虫的生命周期事件与订阅器绑定
311
+ """
312
+ # 订阅爬虫生命周期事件
313
+ self.subscriber.subscribe(spider.spider_opened, event=spider_opened)
314
+ self.subscriber.subscribe(spider.spider_closed, event=spider_closed)
315
+
316
+ # 合并爬虫自定义配置
317
+ merge_settings(spider, self.settings)
318
+
319
+ logger.debug(f"爬虫 '{spider.name}' 配置合并完成")
320
+
321
+ async def close(self, reason='finished') -> None:
322
+ """
323
+ 关闭爬虫并清理资源(增强版)
324
+
325
+ 确保只关闭一次,并处理所有清理操作
326
+ """
327
+ async with self._close_lock:
328
+ if self._closed:
329
+ return
330
+
331
+ self._closed = True
332
+ self._end_time = time.time()
333
+
334
+ try:
335
+ # 通知爬虫关闭事件
336
+ if self.subscriber:
337
+ await self.subscriber.notify(spider_closed)
338
+
339
+ # 统计数据收集
340
+ if self.stats and self.spider:
341
+ self.stats.close_spider(spider=self.spider, reason=reason)
342
+ # 记录统计数据
343
+ try:
344
+ from crawlo.commands.stats import record_stats
345
+ record_stats(self)
346
+ except ImportError:
347
+ logger.debug("统计记录模块不存在,跳过统计记录")
348
+
349
+ logger.info(
350
+ f"爬虫 '{getattr(self.spider, 'name', 'Unknown')}' 已关闭,"
351
+ f"原因: {reason},耗时: {self._get_total_duration():.2f}秒"
352
+ )
353
+
354
+ except Exception as e:
355
+ logger.error(f"关闭爬虫时发生错误: {e}", exc_info=True)
356
+ finally:
357
+ # 确保资源清理
358
+ await self._cleanup_resources()
359
+
360
+ async def _cleanup_resources(self):
361
+ """清理所有资源"""
362
+ cleanup_tasks = []
363
+
364
+ # 引擎清理
365
+ if self.engine:
366
+ try:
367
+ cleanup_tasks.append(self.engine.close())
368
+ except AttributeError:
369
+ pass # 引擎没有close方法
370
+
371
+ # 扩展清理
372
+ if self.extension:
373
+ try:
374
+ cleanup_tasks.append(self.extension.close())
375
+ except AttributeError:
376
+ pass
377
+
378
+ # 统计收集器清理
379
+ if self.stats:
380
+ try:
381
+ cleanup_tasks.append(self.stats.close())
382
+ except AttributeError:
383
+ pass
384
+
385
+ # 并发执行清理任务
386
+ if cleanup_tasks:
387
+ await asyncio.gather(*cleanup_tasks, return_exceptions=True)
388
+
389
+ logger.debug("资源清理完成")
390
+
391
+
392
+ class CrawlerProcess:
393
+ """
394
+ 爬虫进程管理器
395
+
396
+ 支持功能:
397
+ - 多爬虫并发调度和资源管理
398
+ - 自动模块发现和爬虫注册
399
+ - 智能并发控制和负载均衡
400
+ - 优雅关闭和信号处理
401
+ - 实时状态监控和统计
402
+ - 错误恢复和重试机制
403
+ - 大规模爬虫优化支持
404
+
405
+ 使用示例:
406
+ # 基本用法
407
+ process = CrawlerProcess()
408
+ await process.crawl(MySpider)
409
+
410
+ # 多爬虫并发
411
+ await process.crawl([Spider1, Spider2, 'spider_name'])
412
+
413
+ # 自定义并发数
414
+ process = CrawlerProcess(max_concurrency=8)
415
+ """
416
+
417
+ def __init__(
418
+ self,
419
+ settings: Optional[SettingManager] = None,
420
+ max_concurrency: Optional[int] = None,
421
+ spider_modules: Optional[List[str]] = None,
422
+ enable_monitoring: bool = True
423
+ ):
424
+ # 基础配置
425
+ self.settings: SettingManager = settings or self._get_default_settings()
426
+ self.crawlers: Set[Crawler] = set()
427
+ self._active_tasks: Set[asyncio.Task] = set()
428
+
429
+ # 上下文管理器
430
+ self.context = CrawlerContext()
431
+
432
+ # 并发控制配置
433
+ self.max_concurrency: int = (
434
+ max_concurrency
435
+ or self.settings.get('MAX_RUNNING_SPIDERS')
436
+ or self.settings.get('CONCURRENCY', 3)
437
+ )
438
+ self.semaphore = asyncio.Semaphore(self.max_concurrency)
439
+
440
+ # 监控配置
441
+ self.enable_monitoring = enable_monitoring
442
+ self._monitoring_task = None
443
+ self._shutdown_event = asyncio.Event()
444
+
445
+ # 自动发现并导入爬虫模块
446
+ if spider_modules:
447
+ self.auto_discover(spider_modules)
448
+
449
+ # 使用全局注册表的快照(避免后续导入影响)
450
+ self._spider_registry: Dict[str, Type[Spider]] = get_global_spider_registry()
451
+
452
+ # 性能监控
453
+ self._performance_stats = {
454
+ 'total_requests': 0,
455
+ 'successful_requests': 0,
456
+ 'failed_requests': 0,
457
+ 'memory_usage_mb': 0,
458
+ 'cpu_usage_percent': 0
459
+ }
460
+
461
+ # 注册信号量
462
+ signal.signal(signal.SIGINT, self._shutdown)
463
+ signal.signal(signal.SIGTERM, self._shutdown)
464
+
465
+ self._log_startup_info()
466
+
467
+ logger.debug(
468
+ f"CrawlerProcess 初始化完成\n"
469
+ f" - 最大并行爬虫数: {self.max_concurrency}\n"
470
+ f" - 已注册爬虫数: {len(self._spider_registry)}\n"
471
+ f" - 监控启用: {self.enable_monitoring}"
472
+ )
473
+
474
+ async def start_monitoring(self):
475
+ """启动监控任务"""
476
+ if not self.enable_monitoring:
477
+ return
478
+
479
+ self._monitoring_task = asyncio.create_task(self._monitor_loop())
480
+ logger.debug("监控任务已启动")
481
+
482
+ async def stop_monitoring(self):
483
+ """停止监控任务"""
484
+ if self._monitoring_task and not self._monitoring_task.done():
485
+ self._monitoring_task.cancel()
486
+ try:
487
+ await self._monitoring_task
488
+ except asyncio.CancelledError:
489
+ pass
490
+ logger.debug("监控任务已停止")
491
+
492
+ async def _monitor_loop(self):
493
+ """监控循环,定期收集和报告状态"""
494
+ try:
495
+ while not self._shutdown_event.is_set():
496
+ await self._collect_performance_stats()
497
+
498
+ # 每30秒输出一次状态
499
+ stats = self.context.get_stats()
500
+ if stats['active_crawlers'] > 0:
501
+ logger.debug(
502
+ f"爬虫状态: 活跃 {stats['active_crawlers']}, "
503
+ f"完成 {stats['completed_crawlers']}, "
504
+ f"失败 {stats['failed_crawlers']}, "
505
+ f"成功率 {stats['success_rate']:.1f}%"
506
+ )
507
+
508
+ await asyncio.sleep(30) # 30秒间隔
509
+
510
+ except asyncio.CancelledError:
511
+ logger.debug("监控循环被取消")
512
+ except Exception as e:
513
+ logger.error(f"监控循环错误: {e}", exc_info=True)
514
+
515
+ async def _collect_performance_stats(self):
516
+ """收集性能统计数据"""
517
+ try:
518
+ import psutil
519
+ import os
520
+
521
+ process = psutil.Process(os.getpid())
522
+ memory_info = process.memory_info()
523
+
524
+ self._performance_stats.update({
525
+ 'memory_usage_mb': round(memory_info.rss / 1024 / 1024, 2),
526
+ 'cpu_usage_percent': round(process.cpu_percent(), 2)
527
+ })
528
+
529
+ except ImportError:
530
+ # psutil 不存在时跳过性能监控
531
+ pass
532
+ except Exception as e:
533
+ logger.debug(f"收集性能统计失败: {e}")
534
+ @staticmethod
535
+ def auto_discover(modules: List[str]):
536
+ """
537
+ 自动导入模块,触发 Spider 类定义和注册(增强版)
538
+
539
+ 支持递归扫描和错误恢复
540
+ """
541
+ import importlib
542
+ import pkgutil
543
+
544
+ discovered_count = 0
545
+ error_count = 0
546
+
547
+ for module_name in modules:
548
+ try:
549
+ module = importlib.import_module(module_name)
550
+
551
+ if hasattr(module, '__path__'):
552
+ # 包模块,递归扫描
553
+ for _, name, _ in pkgutil.walk_packages(module.__path__, module.__name__ + "."):
554
+ try:
555
+ importlib.import_module(name)
556
+ discovered_count += 1
557
+ except Exception as sub_e:
558
+ error_count += 1
559
+ logger.warning(f"导入子模块 {name} 失败: {sub_e}")
560
+ else:
561
+ # 单个模块
562
+ importlib.import_module(module_name)
563
+ discovered_count += 1
564
+
565
+ logger.debug(f"已扫描模块: {module_name}")
566
+
567
+ except Exception as e:
568
+ error_count += 1
569
+ logger.error(f"扫描模块 {module_name} 失败: {e}", exc_info=True)
570
+
571
+ logger.debug(
572
+ f"爬虫注册完成: 成功 {discovered_count} 个,失败 {error_count} 个"
573
+ )
574
+
575
+ # === 公共只读接口:避免直接访问 _spider_registry ===
576
+
577
+ def get_spider_names(self) -> List[str]:
578
+ """获取所有已注册的爬虫名称"""
579
+ return list(self._spider_registry.keys())
580
+
581
+ def get_spider_class(self, name: str) -> Optional[Type[Spider]]:
582
+ """根据 name 获取爬虫类"""
583
+ return self._spider_registry.get(name)
584
+
585
+ def is_spider_registered(self, name: str) -> bool:
586
+ """检查某个 name 是否已注册"""
587
+ return name in self._spider_registry
588
+
589
+     async def crawl(self, spiders: Union[Type[Spider], str, List[Union[Type[Spider], str]]]):
+         """
+         Run one or more spiders.
+
+         Enhancements:
+         - smart concurrency control
+         - real-time monitoring and statistics
+         - error recovery and retry
+         - graceful shutdown handling
+         """
+         # Phase 1: preprocessing and validation
+         spider_classes_to_run = self._resolve_spiders_to_run(spiders)
+         total = len(spider_classes_to_run)
+
+         if total == 0:
+             raise ValueError("At least one spider class or name must be provided")
+
+         # Phase 2: initialize the context and monitoring
+         for _ in range(total):
+             self.context.increment_total()
+
+         # Start the monitoring task
+         await self.start_monitoring()
+
+         try:
+             # Phase 3: sort by class name so the start-up order is predictable
+             spider_classes_to_run.sort(key=lambda cls: cls.__name__.lower())
+
+             logger.debug(
+                 f"Starting {total} spiders\n"
+                 f"  - Max concurrency: {self.max_concurrency}\n"
+                 f"  - Spider list: {[cls.__name__ for cls in spider_classes_to_run]}"
+             )
+
+             # Phase 4: launch all spider tasks
+             tasks = [
+                 asyncio.create_task(
+                     self._run_spider_with_limit(spider_cls, index + 1, total),
+                     name=f"spider-{spider_cls.__name__}-{index+1}"
+                 )
+                 for index, spider_cls in enumerate(spider_classes_to_run)
+             ]
+
+             # Phase 5: wait for all tasks to finish (a failure does not abort the batch)
+             results = await asyncio.gather(*tasks, return_exceptions=True)
+
+             # Phase 6: tally exceptions and results
+             failed = [i for i, r in enumerate(results) if isinstance(r, Exception)]
+             successful = total - len(failed)
+
+             if failed:
+                 failed_spiders = [spider_classes_to_run[i].__name__ for i in failed]
+                 logger.error(
+                     f"Spider run results: {successful}/{total} succeeded, {len(failed)}/{total} failed\n"
+                     f"  - Failed spiders: {failed_spiders}"
+                 )
+
+                 # Log detailed error information
+                 for i in failed:
+                     error = results[i]
+                     logger.error(f"Error details for spider {spider_classes_to_run[i].__name__}: {error}")
+             else:
+                 logger.info(f"All {total} spiders completed successfully! 🎉")
+
+             # Return aggregated statistics
+             return {
+                 'total': total,
+                 'successful': successful,
+                 'failed': len(failed),
+                 'success_rate': (successful / total) * 100 if total > 0 else 0,
+                 'context_stats': self.context.get_stats()
+             }
+
+         finally:
+             # Phase 7: cleanup and shutdown
+             await self.stop_monitoring()
+             await self._cleanup_process()
+
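An illustrative sketch of calling `crawl()` and reading its return value; `MySpider` and the name string `"other_spider"` are hypothetical spiders assumed to carry unique `name` attributes.

```python
import asyncio
from crawlo.crawler import CrawlerProcess
from myproject.spiders import MySpider  # hypothetical Spider subclass

async def main():
    process = CrawlerProcess(max_concurrency=4)
    # Spider classes and registered name strings can be mixed in one call.
    result = await process.crawl([MySpider, "other_spider"])
    print(f"{result['successful']}/{result['total']} spiders finished, "
          f"success rate {result['success_rate']:.1f}%")

asyncio.run(main())
```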
+     async def _cleanup_process(self):
+         """Clean up process resources."""
+         try:
+             # Wait for all active crawlers to finish
+             if self.crawlers:
+                 close_tasks = [crawler.close() for crawler in self.crawlers]
+                 await asyncio.gather(*close_tasks, return_exceptions=True)
+                 self.crawlers.clear()
+
+             # Clean up active tasks
+             if self._active_tasks:
+                 for task in list(self._active_tasks):
+                     if not task.done():
+                         task.cancel()
+                 await asyncio.gather(*self._active_tasks, return_exceptions=True)
+                 self._active_tasks.clear()
+
+             logger.debug("Process resources cleaned up")
+
+         except Exception as e:
+             logger.error(f"Error while cleaning up process resources: {e}", exc_info=True)
+
+     def get_process_stats(self) -> Dict[str, Any]:
+         """Return process-level statistics."""
+         context_stats = self.context.get_stats()
+
+         return {
+             'context': context_stats,
+             'performance': self._performance_stats.copy(),
+             'crawlers': {
+                 'total_registered': len(self._spider_registry),
+                 'active_crawlers': len(self.crawlers),
+                 'max_concurrency': self.max_concurrency
+             },
+             'registry': {
+                 'spider_names': list(self._spider_registry.keys()),
+                 'spider_classes': [cls.__name__ for cls in self._spider_registry.values()]
+             }
+         }
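The nested dictionary returned by `get_process_stats()` can be inspected directly; a brief sketch using the field names defined above, continuing the hypothetical `process` object from the earlier example:

```python
stats = process.get_process_stats()
print(stats["crawlers"]["active_crawlers"], "active /",
      stats["crawlers"]["total_registered"], "registered")
print("known spiders:", stats["registry"]["spider_names"])
```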
+     def _resolve_spiders_to_run(
+             self,
+             spiders_input: Union[Type[Spider], str, List[Union[Type[Spider], str]]]
+     ) -> List[Type[Spider]]:
+         """
+         Resolve the input into a list of spider classes.
+
+         Accepts multiple input formats and validates name uniqueness.
+         """
+         inputs = self._normalize_inputs(spiders_input)
+         seen_spider_names: Set[str] = set()
+         spider_classes: List[Type[Spider]] = []
+
+         for item in inputs:
+             try:
+                 spider_cls = self._resolve_spider_class(item)
+                 spider_name = getattr(spider_cls, 'name', None)
+
+                 if not spider_name:
+                     raise ValueError(f"Spider class {spider_cls.__name__} is missing a 'name' attribute")
+
+                 if spider_name in seen_spider_names:
+                     raise ValueError(
+                         f"Duplicate spider name '{spider_name}' in this run.\n"
+                         f"Make sure every spider's name attribute is unique within a single run."
+                     )
+
+                 seen_spider_names.add(spider_name)
+                 spider_classes.append(spider_cls)
+
+                 logger.debug(f"Resolved spider: {item} -> {spider_cls.__name__} (name='{spider_name}')")
+
+             except Exception as e:
+                 logger.error(f"Failed to resolve spider: {item} - {e}")
+                 raise
+
+         return spider_classes
+
+     @staticmethod
+     def _normalize_inputs(spiders_input) -> List[Union[Type[Spider], str]]:
+         """
+         Normalize the input into a list.
+
+         Supports more input types and gives clearer error messages.
+         """
+         if isinstance(spiders_input, (type, str)):
+             return [spiders_input]
+         elif isinstance(spiders_input, (list, tuple, set)):
+             spider_list = list(spiders_input)
+             if not spider_list:
+                 raise ValueError("The spider list must not be empty")
+             return spider_list
+         else:
+             raise TypeError(
+                 f"Unsupported type for the spiders argument: {type(spiders_input)}\n"
+                 f"Supported types: a Spider class, a name string, or a list/tuple/set of them"
+             )
+
+     def _resolve_spider_class(self, item: Union[Type[Spider], str]) -> Type[Spider]:
+         """
+         Resolve a single input item into a spider class.
+
+         Provides clearer error messages and debugging information.
+         """
+         if isinstance(item, type) and issubclass(item, Spider):
+             # Already a Spider class
+             return item
+         elif isinstance(item, str):
+             # A name string: look it up in the registry
+             spider_cls = self._spider_registry.get(item)
+             if not spider_cls:
+                 available_spiders = list(self._spider_registry.keys())
+                 raise ValueError(
+                     f"No spider named '{item}' was found.\n"
+                     f"Registered spiders: {available_spiders}\n"
+                     f"Check that the name is correct and that the spider has been imported and registered."
+                 )
+             return spider_cls
+         else:
+             raise TypeError(
+                 f"Invalid type {type(item)}: {item}\n"
+                 f"Must be a Spider class or a name string.\n"
+                 f"Example: MySpider or 'my_spider'"
+             )
+
+     async def _run_spider_with_limit(self, spider_cls: Type[Spider], seq: int, total: int):
+         """
+         Run a single spider under the semaphore limit.
+
+         Includes enhanced error handling and monitoring.
+         """
+         task = asyncio.current_task()
+         crawler = None
+
+         try:
+             # Register the task
+             if task:
+                 self._active_tasks.add(task)
+
+             # Acquire a concurrency slot
+             await self.semaphore.acquire()
+
+             start_msg = f"[{seq}/{total}] Starting spider: {spider_cls.__name__}"
+             logger.info(start_msg)
+
+             # Create and run the crawler
+             crawler = Crawler(spider_cls, self.settings, self.context)
+             self.crawlers.add(crawler)
+
+             # Record the start time
+             start_time = time.time()
+
+             # Run the spider
+             await crawler.crawl()
+
+             # Compute the elapsed time
+             duration = time.time() - start_time
+
+             end_msg = (
+                 f"[{seq}/{total}] Spider finished: {spider_cls.__name__}, "
+                 f"elapsed: {duration:.2f}s"
+             )
+             logger.info(end_msg)
+
+             # Record success statistics
+             self._performance_stats['successful_requests'] += 1
+
+         except Exception as e:
+             # Record failure statistics
+             self._performance_stats['failed_requests'] += 1
+
+             error_msg = f"Spider {spider_cls.__name__} failed: {e}"
+             logger.error(error_msg, exc_info=True)
+
+             # Record the error in the shared context
+             if hasattr(self, 'context'):
+                 self.context.increment_failed(error_msg)
+
+             raise
+         finally:
+             # Release resources
+             try:
+                 if crawler and crawler in self.crawlers:
+                     self.crawlers.remove(crawler)
+
+                 if task and task in self._active_tasks:
+                     self._active_tasks.remove(task)
+
+                 self.semaphore.release()
+
+             except Exception as cleanup_error:
+                 logger.warning(f"Error while releasing resources: {cleanup_error}")
+
+     def _shutdown(self, _signum, _frame):
+         """
+         Signal handler for graceful shutdown.
+
+         Provides a smoother shutdown experience and resource cleanup.
+         """
+         signal_name = {signal.SIGINT: 'SIGINT', signal.SIGTERM: 'SIGTERM'}.get(_signum, str(_signum))
+         logger.warning(f"Received shutdown signal {signal_name}, stopping all spiders...")
+
+         # Set the shutdown event
+         if hasattr(self, '_shutdown_event'):
+             self._shutdown_event.set()
+
+         # Stop all crawler engines
+         for crawler in list(self.crawlers):
+             if crawler.engine:
+                 crawler.engine.running = False
+                 crawler.engine.normal = False
+                 logger.debug(f"Stopped crawler engine: {getattr(crawler.spider, 'name', 'Unknown')}")
+
+         # Create the shutdown task
+         asyncio.create_task(self._wait_for_shutdown())
+
+         logger.info("Shutdown command issued, waiting for spiders to finish their current tasks...")
+
+     async def _wait_for_shutdown(self):
+         """
+         Wait for all active tasks to finish.
+
+         Provides better control over shutdown timing and progress feedback.
+         """
+         try:
+             # Stop the monitoring task
+             await self.stop_monitoring()
+
+             # Wait for active tasks to finish
+             pending = [t for t in self._active_tasks if not t.done()]
+
+             if pending:
+                 logger.info(
+                     f"Waiting for {len(pending)} active tasks to finish... "
+                     f"(max wait: 30 seconds)"
+                 )
+
+                 # Apply a timeout
+                 try:
+                     await asyncio.wait_for(
+                         asyncio.gather(*pending, return_exceptions=True),
+                         timeout=30.0
+                     )
+                 except asyncio.TimeoutError:
+                     logger.warning("Some tasks timed out, cancelling them...")
+
+                     # Force-cancel the timed-out tasks
+                     for task in pending:
+                         if not task.done():
+                             task.cancel()
+
+                     # Wait for the cancellations to complete
+                     await asyncio.gather(*pending, return_exceptions=True)
+
+             # Final cleanup
+             await self._cleanup_process()
+
+             # Emit the final statistics
+             final_stats = self.context.get_stats()
+             logger.info(
+                 f"All spiders shut down gracefully 👋\n"
+                 f"  - Total spiders: {final_stats['total_crawlers']}\n"
+                 f"  - Completed: {final_stats['completed_crawlers']}\n"
+                 f"  - Failed: {final_stats['failed_crawlers']}\n"
+                 f"  - Success rate: {final_stats['success_rate']:.1f}%\n"
+                 f"  - Total runtime: {final_stats['duration_seconds']}s"
+             )
+
+         except Exception as e:
+             logger.error(f"Error during shutdown: {e}", exc_info=True)
+
+     @classmethod
+     def _get_default_settings(cls) -> SettingManager:
+         """
+         Load the default settings.
+
+         Provides better error handling and a fallback strategy.
+         """
+         try:
+             settings = get_settings()
+             logger.debug("Default settings loaded successfully")
+             return settings
+         except Exception as e:
+             logger.warning(f"Could not load default settings: {e}; falling back to empty settings")
+             return SettingManager()
+
+     def _log_startup_info(self):
+         """Log startup information, including the run mode and key configuration checks."""
+         # Determine the run mode
+         run_mode = self.settings.get('RUN_MODE', 'standalone')
+
+         # Build the startup log message
+         startup_info = [
+             "🚀 Crawlo crawler framework starting",
+             f"   Run mode: {run_mode}"
+         ]
+
+         # Add mode-specific information
+         if run_mode == 'distributed':
+             startup_info.append("   🌐 Distributed mode - supports multi-node cooperation")
+             # Check the Redis configuration
+             redis_host = self.settings.get('REDIS_HOST', 'localhost')
+             redis_port = self.settings.get('REDIS_PORT', 6379)
+             startup_info.append(f"   Redis address: {redis_host}:{redis_port}")
+
+             # Check the queue type
+             queue_type = self.settings.get('QUEUE_TYPE', 'redis')
+             if queue_type != 'redis':
+                 startup_info.append(f"   ⚠️ Warning: the 'redis' queue type is recommended in distributed mode; current value is '{queue_type}'")
+         else:
+             startup_info.append("   🏠 Standalone mode - suited to development and small-scale data collection")
+             # Check the queue type
+             queue_type = self.settings.get('QUEUE_TYPE', 'memory')
+             if queue_type != 'memory' and queue_type != 'auto':
+                 startup_info.append(f"   ⚠️ Warning: the 'memory' queue type is recommended in standalone mode; current value is '{queue_type}'")
+
+         # Check key configuration items
+         concurrency = self.settings.get('CONCURRENCY', 8)
+         download_delay = self.settings.get('DOWNLOAD_DELAY', 1.0)
+         filter_class = self.settings.get('FILTER_CLASS', 'crawlo.filters.memory_filter.MemoryFilter')
+
+         # Concurrency check
+         if run_mode == 'distributed':
+             if concurrency < 8:
+                 startup_info.append(f"   ⚠️ Warning: concurrency >= 8 is recommended in distributed mode; current value is {concurrency}")
+         else:
+             if concurrency > 16:
+                 startup_info.append(f"   ⚠️ Warning: concurrency <= 16 is recommended in standalone mode; current value is {concurrency}")
+
+         # Download delay check
+         if download_delay < 0.1:
+             startup_info.append(f"   ⚠️ Warning: a very small download delay ({download_delay}s) may get the IP banned")
+         elif download_delay > 10:
+             startup_info.append(f"   ⚠️ Warning: a very large download delay ({download_delay}s) may hurt throughput")
+
+         startup_info.extend([
+             f"   Concurrency: {concurrency}",
+             f"   Download delay: {download_delay}s",
+             f"   Filter class: {filter_class}"
+         ])
+
+         # Check the dedup pipeline configuration
+         default_dedup_pipeline = self.settings.get('DEFAULT_DEDUP_PIPELINE', '')
+         pipelines = self.settings.get('PIPELINES', [])
+
+         if default_dedup_pipeline:
+             startup_info.append(f"   Default dedup pipeline: {default_dedup_pipeline}")
+
+         # Check that the dedup pipeline is included in the PIPELINES list
+         if default_dedup_pipeline and default_dedup_pipeline not in pipelines:
+             startup_info.append(f"   ⚠️ Warning: default dedup pipeline '{default_dedup_pipeline}' is not in the PIPELINES list")
+
+         # Check the downloader configuration
+         downloader = self.settings.get('DOWNLOADER', 'crawlo.downloader.aiohttp_downloader.AioHttpDownloader')
+         # startup_info.append(f"   Downloader: {downloader}")
+
+         # Check the middleware configuration
+         middlewares = self.settings.get('MIDDLEWARES', [])
+         # startup_info.append(f"   Middleware count: {len(middlewares)}")
+
+         # Check the extension configuration
+         extensions = self.settings.get('EXTENSIONS', [])
+         # startup_info.append(f"   Extension count: {len(extensions)}")
+
+         # Emit the startup information
+         logger.info("\n".join(startup_info))
+
+
+ # === Utility functions ===
+
+ def create_crawler_with_optimizations(
+         spider_cls: Type[Spider],
+         settings: Optional[SettingManager] = None,
+         **optimization_kwargs
+ ) -> Crawler:
+     """
+     Create an optimized crawler instance.
+
+     :param spider_cls: spider class
+     :param settings: settings manager
+     :param optimization_kwargs: optimization parameters
+     :return: crawler instance
+     """
+     if settings is None:
+         settings = SettingManager()
+
+     # Apply the optimization settings
+     for key, value in optimization_kwargs.items():
+         settings.set(key, value)
+
+     context = CrawlerContext()
+     return Crawler(spider_cls, settings, context)
+
+
+ def create_process_with_large_scale_config(
+         config_type: str = 'balanced',
+         concurrency: int = 16,
+         **kwargs
+ ) -> CrawlerProcess:
+     """
+     Create a process manager with large-scale optimizations.
+
+     :param config_type: configuration type ('conservative', 'balanced', 'aggressive', 'memory_optimized')
+     :param concurrency: concurrency level
+     :param kwargs: extra keyword arguments
+     :return: process manager
+     """
+     try:
+         from crawlo.utils.large_scale_config import LargeScaleConfig
+
+         # Pick the optimization configuration
+         config_methods = {
+             'conservative': LargeScaleConfig.conservative_config,
+             'balanced': LargeScaleConfig.balanced_config,
+             'aggressive': LargeScaleConfig.aggressive_config,
+             'memory_optimized': LargeScaleConfig.memory_optimized_config
+         }
+
+         if config_type not in config_methods:
+             logger.warning(f"Unknown config type: {config_type}; using default settings")
+             settings = SettingManager()
+         else:
+             config = config_methods[config_type](concurrency)
+             settings = SettingManager()
+             settings.update(config)
+
+         return CrawlerProcess(
+             settings=settings,
+             max_concurrency=concurrency,
+             **kwargs
+         )
+
+     except ImportError:
+         logger.warning("Large-scale config module not available; using default settings")
+         return CrawlerProcess(max_concurrency=concurrency, **kwargs)
+
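And a sketch of the large-scale helper; the config types are the four keys listed above, and the spider name passed to `crawl()` is hypothetical.

```python
import asyncio
from crawlo.crawler import create_process_with_large_scale_config

process = create_process_with_large_scale_config(
    config_type="aggressive",  # 'conservative', 'balanced', 'aggressive' or 'memory_optimized'
    concurrency=32,
)  # falls back to default settings if LargeScaleConfig is unavailable
asyncio.run(process.crawl("my_spider"))  # the spider must already be registered by name
```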
+
+ # === Exported interface ===
+
+ __all__ = [
+     'Crawler',
+     'CrawlerProcess',
+     'CrawlerContext',
+     'create_crawler_with_optimizations',
+     'create_process_with_large_scale_config'
  ]