crawlo-1.2.7-py3-none-any.whl → crawlo-1.2.9-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
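
The same comparison can be reproduced locally. The sketch below is a minimal illustration, assuming both wheels have already been downloaded (for example with pip download crawlo==1.2.7 --no-deps and pip download crawlo==1.2.9 --no-deps); it uses only the Python standard library to print a unified diff of the files the two archives share, and the file paths at the bottom are placeholders. Entries that exist in only one of the two archives (such as crawlo/middleware/simple_proxy.py, which is new in 1.2.9) are not covered by this sketch and would need to be listed separately.

    import difflib
    import zipfile

    def diff_wheels(old_wheel: str, new_wheel: str) -> None:
        """Print a unified diff of the text files shared by two wheel archives."""
        with zipfile.ZipFile(old_wheel) as old, zipfile.ZipFile(new_wheel) as new:
            shared = sorted(set(old.namelist()) & set(new.namelist()))
            for name in shared:
                if not name.endswith((".py", ".cfg", ".txt", ".tmpl")):
                    continue  # skip binary or irrelevant entries
                old_lines = old.read(name).decode("utf-8", "replace").splitlines(keepends=True)
                new_lines = new.read(name).decode("utf-8", "replace").splitlines(keepends=True)
                for line in difflib.unified_diff(old_lines, new_lines, fromfile=name, tofile=name):
                    print(line, end="")

    # Placeholder paths; adjust to wherever the wheels were downloaded.
    # diff_wheels("crawlo-1.2.7-py3-none-any.whl", "crawlo-1.2.9-py3-none-any.whl")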

Potentially problematic release: this version of crawlo might be problematic.

Files changed (221)
  1. crawlo/__init__.py +63 -61
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +323 -323
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +186 -186
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -251
  15. crawlo/core/__init__.py +2 -2
  16. crawlo/core/engine.py +366 -365
  17. crawlo/core/processor.py +40 -40
  18. crawlo/core/scheduler.py +256 -251
  19. crawlo/crawler.py +1103 -1100
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -107
  22. crawlo/downloader/__init__.py +273 -266
  23. crawlo/downloader/aiohttp_downloader.py +226 -228
  24. crawlo/downloader/cffi_downloader.py +245 -256
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +39 -39
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +57 -57
  34. crawlo/extension/log_stats.py +81 -81
  35. crawlo/extension/logging_extension.py +43 -43
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/filters/__init__.py +154 -154
  40. crawlo/filters/aioredis_filter.py +234 -234
  41. crawlo/filters/memory_filter.py +269 -269
  42. crawlo/items/__init__.py +23 -23
  43. crawlo/items/base.py +21 -21
  44. crawlo/items/fields.py +52 -52
  45. crawlo/items/items.py +104 -104
  46. crawlo/middleware/__init__.py +21 -21
  47. crawlo/middleware/default_header.py +132 -132
  48. crawlo/middleware/download_delay.py +104 -104
  49. crawlo/middleware/middleware_manager.py +136 -136
  50. crawlo/middleware/offsite.py +114 -114
  51. crawlo/middleware/proxy.py +386 -368
  52. crawlo/middleware/request_ignore.py +86 -86
  53. crawlo/middleware/response_code.py +163 -163
  54. crawlo/middleware/response_filter.py +136 -136
  55. crawlo/middleware/retry.py +124 -124
  56. crawlo/middleware/simple_proxy.py +65 -0
  57. crawlo/mode_manager.py +211 -211
  58. crawlo/network/__init__.py +21 -21
  59. crawlo/network/request.py +379 -338
  60. crawlo/network/response.py +359 -359
  61. crawlo/pipelines/__init__.py +21 -21
  62. crawlo/pipelines/bloom_dedup_pipeline.py +157 -157
  63. crawlo/pipelines/console_pipeline.py +39 -39
  64. crawlo/pipelines/csv_pipeline.py +316 -316
  65. crawlo/pipelines/database_dedup_pipeline.py +223 -223
  66. crawlo/pipelines/json_pipeline.py +218 -218
  67. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  68. crawlo/pipelines/mongo_pipeline.py +131 -131
  69. crawlo/pipelines/mysql_pipeline.py +317 -317
  70. crawlo/pipelines/pipeline_manager.py +62 -62
  71. crawlo/pipelines/redis_dedup_pipeline.py +167 -167
  72. crawlo/project.py +290 -315
  73. crawlo/queue/pqueue.py +37 -37
  74. crawlo/queue/queue_manager.py +379 -378
  75. crawlo/queue/redis_priority_queue.py +306 -306
  76. crawlo/settings/__init__.py +7 -7
  77. crawlo/settings/default_settings.py +216 -220
  78. crawlo/settings/setting_manager.py +163 -122
  79. crawlo/spider/__init__.py +639 -639
  80. crawlo/stats_collector.py +59 -59
  81. crawlo/subscriber.py +129 -129
  82. crawlo/task_manager.py +30 -30
  83. crawlo/templates/crawlo.cfg.tmpl +10 -10
  84. crawlo/templates/project/__init__.py.tmpl +3 -3
  85. crawlo/templates/project/items.py.tmpl +17 -17
  86. crawlo/templates/project/middlewares.py.tmpl +118 -118
  87. crawlo/templates/project/pipelines.py.tmpl +96 -96
  88. crawlo/templates/project/settings.py.tmpl +261 -288
  89. crawlo/templates/project/settings_distributed.py.tmpl +174 -157
  90. crawlo/templates/project/settings_gentle.py.tmpl +95 -100
  91. crawlo/templates/project/settings_high_performance.py.tmpl +125 -134
  92. crawlo/templates/project/settings_minimal.py.tmpl +30 -0
  93. crawlo/templates/project/settings_simple.py.tmpl +96 -98
  94. crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
  95. crawlo/templates/run.py.tmpl +47 -45
  96. crawlo/templates/spider/spider.py.tmpl +143 -143
  97. crawlo/tools/__init__.py +200 -182
  98. crawlo/tools/anti_crawler.py +268 -268
  99. crawlo/tools/authenticated_proxy.py +240 -240
  100. crawlo/{cleaners → tools}/data_formatter.py +225 -225
  101. crawlo/tools/data_validator.py +180 -180
  102. crawlo/tools/date_tools.py +290 -36
  103. crawlo/tools/distributed_coordinator.py +388 -387
  104. crawlo/{cleaners → tools}/encoding_converter.py +127 -126
  105. crawlo/tools/request_tools.py +83 -0
  106. crawlo/tools/retry_mechanism.py +224 -221
  107. crawlo/tools/scenario_adapter.py +262 -262
  108. crawlo/{cleaners → tools}/text_cleaner.py +232 -232
  109. crawlo/utils/__init__.py +35 -35
  110. crawlo/utils/batch_processor.py +259 -259
  111. crawlo/utils/controlled_spider_mixin.py +439 -439
  112. crawlo/utils/db_helper.py +343 -343
  113. crawlo/utils/enhanced_error_handler.py +356 -356
  114. crawlo/utils/env_config.py +142 -142
  115. crawlo/utils/error_handler.py +123 -123
  116. crawlo/utils/func_tools.py +82 -82
  117. crawlo/utils/large_scale_config.py +286 -286
  118. crawlo/utils/large_scale_helper.py +344 -344
  119. crawlo/utils/log.py +187 -128
  120. crawlo/utils/performance_monitor.py +285 -285
  121. crawlo/utils/queue_helper.py +175 -175
  122. crawlo/utils/redis_connection_pool.py +351 -351
  123. crawlo/utils/redis_key_validator.py +198 -198
  124. crawlo/utils/request.py +267 -267
  125. crawlo/utils/request_serializer.py +218 -218
  126. crawlo/utils/spider_loader.py +61 -61
  127. crawlo/utils/system.py +11 -11
  128. crawlo/utils/tools.py +4 -4
  129. crawlo/utils/url.py +39 -39
  130. {crawlo-1.2.7.dist-info → crawlo-1.2.9.dist-info}/METADATA +1011 -764
  131. crawlo-1.2.9.dist-info/RECORD +219 -0
  132. examples/__init__.py +7 -7
  133. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
  134. tests/__init__.py +7 -7
  135. tests/advanced_tools_example.py +275 -275
  136. tests/authenticated_proxy_example.py +107 -237
  137. tests/cleaners_example.py +160 -160
  138. tests/config_validation_demo.py +143 -103
  139. tests/controlled_spider_example.py +205 -205
  140. tests/date_tools_example.py +180 -180
  141. tests/debug_pipelines.py +67 -0
  142. tests/dynamic_loading_example.py +523 -523
  143. tests/dynamic_loading_test.py +104 -104
  144. tests/env_config_example.py +133 -133
  145. tests/error_handling_example.py +171 -171
  146. tests/redis_key_validation_demo.py +130 -130
  147. tests/request_params_example.py +151 -0
  148. tests/response_improvements_example.py +144 -144
  149. tests/test_advanced_tools.py +148 -148
  150. tests/test_all_redis_key_configs.py +145 -145
  151. tests/test_authenticated_proxy.py +141 -141
  152. tests/test_cleaners.py +54 -54
  153. tests/test_comprehensive.py +146 -146
  154. tests/test_config_consistency.py +80 -80
  155. tests/test_config_merge.py +153 -0
  156. tests/test_config_validator.py +182 -193
  157. tests/test_crawlo_proxy_integration.py +109 -173
  158. tests/test_date_tools.py +123 -123
  159. tests/test_default_header_middleware.py +158 -158
  160. tests/test_distributed.py +65 -0
  161. tests/test_double_crawlo_fix.py +207 -207
  162. tests/test_double_crawlo_fix_simple.py +124 -124
  163. tests/test_download_delay_middleware.py +221 -221
  164. tests/test_downloader_proxy_compatibility.py +268 -268
  165. tests/test_dynamic_downloaders_proxy.py +124 -124
  166. tests/test_dynamic_proxy.py +92 -92
  167. tests/test_dynamic_proxy_config.py +146 -146
  168. tests/test_dynamic_proxy_real.py +109 -109
  169. tests/test_edge_cases.py +303 -303
  170. tests/test_enhanced_error_handler.py +270 -270
  171. tests/test_env_config.py +121 -121
  172. tests/test_error_handler_compatibility.py +112 -112
  173. tests/test_final_validation.py +153 -153
  174. tests/test_framework_env_usage.py +103 -103
  175. tests/test_integration.py +169 -357
  176. tests/test_item_dedup_redis_key.py +122 -122
  177. tests/test_mode_consistency.py +51 -51
  178. tests/test_offsite_middleware.py +221 -221
  179. tests/test_parsel.py +29 -29
  180. tests/test_performance.py +327 -327
  181. tests/test_proxy_api.py +264 -264
  182. tests/test_proxy_health_check.py +32 -32
  183. tests/test_proxy_middleware.py +121 -121
  184. tests/test_proxy_middleware_enhanced.py +216 -216
  185. tests/test_proxy_middleware_integration.py +136 -136
  186. tests/test_proxy_middleware_refactored.py +185 -0
  187. tests/test_proxy_providers.py +56 -56
  188. tests/test_proxy_stats.py +19 -19
  189. tests/test_proxy_strategies.py +59 -59
  190. tests/test_queue_manager_double_crawlo.py +173 -173
  191. tests/test_queue_manager_redis_key.py +176 -176
  192. tests/test_random_user_agent.py +73 -0
  193. tests/test_real_scenario_proxy.py +195 -195
  194. tests/test_redis_config.py +28 -28
  195. tests/test_redis_connection_pool.py +294 -294
  196. tests/test_redis_key_naming.py +181 -181
  197. tests/test_redis_key_validator.py +123 -123
  198. tests/test_redis_queue.py +224 -224
  199. tests/test_request_ignore_middleware.py +182 -182
  200. tests/test_request_params.py +112 -0
  201. tests/test_request_serialization.py +70 -70
  202. tests/test_response_code_middleware.py +349 -349
  203. tests/test_response_filter_middleware.py +427 -427
  204. tests/test_response_improvements.py +152 -152
  205. tests/test_retry_middleware.py +241 -241
  206. tests/test_scheduler.py +252 -252
  207. tests/test_scheduler_config_update.py +133 -133
  208. tests/test_simple_response.py +61 -61
  209. tests/test_telecom_spider_redis_key.py +205 -205
  210. tests/test_template_content.py +87 -87
  211. tests/test_template_redis_key.py +134 -134
  212. tests/test_tools.py +159 -153
  213. tests/test_user_agents.py +97 -0
  214. tests/tools_example.py +260 -257
  215. tests/verify_distributed.py +117 -0
  216. crawlo/cleaners/__init__.py +0 -61
  217. crawlo/utils/date_tools.py +0 -290
  218. crawlo-1.2.7.dist-info/RECORD +0 -209
  219. {crawlo-1.2.7.dist-info → crawlo-1.2.9.dist-info}/WHEEL +0 -0
  220. {crawlo-1.2.7.dist-info → crawlo-1.2.9.dist-info}/entry_points.txt +0 -0
  221. {crawlo-1.2.7.dist-info → crawlo-1.2.9.dist-info}/top_level.txt +0 -0
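
Several entries above move modules rather than change them in place: the crawlo/cleaners package (data_formatter.py, encoding_converter.py, text_cleaner.py) is folded into crawlo/tools, and crawlo/utils/date_tools.py is superseded by crawlo/tools/date_tools.py. For code that imported the old locations, a hypothetical compatibility shim could look like the sketch below; only module-level imports are used because the public names inside those modules are not visible in this diff.

    # Guarded import that prefers the 1.2.9 layout and falls back to 1.2.7.
    # Module paths come from the rename entries above; anything more specific
    # is an assumption not confirmed by this diff.
    try:
        from crawlo.tools import text_cleaner  # layout in crawlo >= 1.2.9
    except ImportError:
        from crawlo.cleaners import text_cleaner  # layout in crawlo <= 1.2.7
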
crawlo/crawler.py CHANGED
@@ -1,1100 +1,1103 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- Crawlo Crawler Module
5
- ====================
6
- 提供爬虫进程管理和运行时核心功能。
7
-
8
- 核心组件:
9
- - Crawler: 单个爬虫运行实例,管理Spider与引擎的生命周期
10
- - CrawlerProcess: 爬虫进程管理器,支持多爬虫并发调度和资源管理
11
-
12
- 功能特性:
13
- - 智能并发控制和资源管理
14
- - 优雅关闭和信号处理
15
- - 统计监控和性能追踪
16
- - 自动模块发现和注册
17
- - 错误恢复和重试机制
18
- - 大规模爬虫优化支持
19
-
20
- 示例用法:
21
- # 单个爬虫运行
22
- crawler = Crawler(MySpider, settings)
23
- await crawler.crawl()
24
-
25
- # 多爬虫并发管理
26
- process = CrawlerProcess()
27
- await process.crawl([Spider1, Spider2])
28
- """
29
- from __future__ import annotations
30
- import asyncio
31
- import signal
32
- import time
33
- import threading
34
- from typing import Type, Optional, Set, List, Union, Dict, Any
35
- from .spider import Spider, get_global_spider_registry
36
- from .core.engine import Engine
37
- from .utils.log import get_logger
38
- from .subscriber import Subscriber
39
- from .extension import ExtensionManager
40
- from .stats_collector import StatsCollector
41
- from .event import spider_opened, spider_closed
42
- from .settings.setting_manager import SettingManager
43
- from crawlo.project import merge_settings, get_settings
44
-
45
-
46
- logger = get_logger(__name__)
47
-
48
-
49
- class CrawlerContext:
50
- """
51
- 爬虫上下文管理器
52
- 提供共享状态和资源管理
53
- """
54
-
55
- def __init__(self):
56
- self.start_time = time.time()
57
- self.total_crawlers = 0
58
- self.active_crawlers = 0
59
- self.completed_crawlers = 0
60
- self.failed_crawlers = 0
61
- self.error_log = []
62
- self._lock = threading.RLock()
63
-
64
- def increment_total(self):
65
- with self._lock:
66
- self.total_crawlers += 1
67
-
68
- def increment_active(self):
69
- with self._lock:
70
- self.active_crawlers += 1
71
-
72
- def decrement_active(self):
73
- with self._lock:
74
- self.active_crawlers -= 1
75
-
76
- def increment_completed(self):
77
- with self._lock:
78
- self.completed_crawlers += 1
79
-
80
- def increment_failed(self, error: str):
81
- with self._lock:
82
- self.failed_crawlers += 1
83
- self.error_log.append({
84
- 'timestamp': time.time(),
85
- 'error': error
86
- })
87
-
88
- def get_stats(self) -> Dict[str, Any]:
89
- with self._lock:
90
- duration = time.time() - self.start_time
91
- return {
92
- 'total_crawlers': self.total_crawlers,
93
- 'active_crawlers': self.active_crawlers,
94
- 'completed_crawlers': self.completed_crawlers,
95
- 'failed_crawlers': self.failed_crawlers,
96
- 'success_rate': (self.completed_crawlers / max(1, self.total_crawlers)) * 100,
97
- 'duration_seconds': round(duration, 2),
98
- 'error_count': len(self.error_log)
99
- }
100
-
101
-
102
- class Crawler:
103
- """
104
- 单个爬虫运行实例,管理 Spider 与引擎的生命周期
105
-
106
- 提供功能:
107
- - Spider 生命周期管理(初始化、运行、关闭)
108
- - 引擎组件的协调管理
109
- - 配置合并和验证
110
- - 统计数据收集
111
- - 扩展管理
112
- - 异常处理和清理
113
- """
114
-
115
- def __init__(self, spider_cls: Type[Spider], settings: SettingManager, context: Optional[CrawlerContext] = None):
116
- self.spider_cls = spider_cls
117
- self.spider: Optional[Spider] = None
118
- self.engine: Optional[Engine] = None
119
- self.stats: Optional[StatsCollector] = None
120
- self.subscriber: Optional[Subscriber] = None
121
- self.extension: Optional[ExtensionManager] = None
122
- self.settings: SettingManager = settings.copy()
123
- self.context = context or CrawlerContext()
124
-
125
- # 状态管理
126
- self._closed = False
127
- self._close_lock = asyncio.Lock()
128
- self._start_time = None
129
- self._end_time = None
130
-
131
- # 性能监控
132
- self._performance_metrics = {
133
- 'initialization_time': 0,
134
- 'crawl_duration': 0,
135
- 'memory_peak': 0,
136
- 'request_count': 0,
137
- 'error_count': 0
138
- }
139
-
140
- async def crawl(self):
141
- """
142
- 启动爬虫核心流程
143
-
144
- 包含以下阶段:
145
- 1. 初始化阶段: 创建所有组件
146
- 2. 验证阶段: 检查配置和状态
147
- 3. 运行阶段: 启动爬虫引擎
148
- 4. 清理阶段: 资源释放
149
- """
150
- init_start = time.time()
151
- self._start_time = init_start
152
-
153
- try:
154
- # 更新上下文状态
155
- self.context.increment_active()
156
-
157
- # 阶段 1: 初始化组件
158
- # 调整组件初始化顺序,确保日志输出顺序符合要求
159
- self.subscriber = self._create_subscriber()
160
- self.spider = self._create_spider()
161
- self.engine = self._create_engine()
162
- self.stats = self._create_stats()
163
- # 注意:这里不初始化扩展管理器,让它在引擎中初始化
164
-
165
- # 记录初始化时间
166
- self._performance_metrics['initialization_time'] = time.time() - init_start
167
-
168
- # 阶段 2: 验证状态
169
- self._validate_crawler_state()
170
-
171
- # 阶段 3: 显示运行配置摘要
172
- self._log_runtime_summary()
173
-
174
- # 阶段 4: 启动爬虫
175
- crawl_start = time.time()
176
- await self.engine.start_spider(self.spider)
177
-
178
- # 记录爬取时间
179
- self._performance_metrics['crawl_duration'] = time.time() - crawl_start
180
- self._end_time = time.time()
181
-
182
- # 更新上下文状态
183
- self.context.increment_completed()
184
-
185
- logger.info(f"爬虫 {self.spider.name} 完成,耗时 {self._get_total_duration():.2f}秒")
186
-
187
- except Exception as e:
188
- self._performance_metrics['error_count'] += 1
189
- self.context.increment_failed(str(e))
190
- logger.error(f"爬虫 {getattr(self.spider, 'name', 'Unknown')} 运行失败: {e}", exc_info=True)
191
- raise
192
- finally:
193
- self.context.decrement_active()
194
- # 确保资源清理
195
- await self._ensure_cleanup()
196
-
197
- def _log_runtime_summary(self):
198
- """记录运行时配置摘要"""
199
- # 获取爬虫名称
200
- spider_name = getattr(self.spider, 'name', 'Unknown')
201
-
202
- # 显示简化的运行时信息,避免与项目初始化重复
203
- logger.info(f"🕷️ 开始运行爬虫: {spider_name}")
204
-
205
- # 注意:并发数和下载延迟信息已在其他地方显示,避免重复
206
- # 如果需要显示其他运行时特定信息,可以在这里添加
207
-
208
- def _validate_crawler_state(self):
209
- """
210
- 验证爬虫状态和配置
211
- 确保所有必要组件都已正确初始化
212
- """
213
- if not self.spider:
214
- raise RuntimeError("爬虫实例未初始化")
215
- if not self.engine:
216
- raise RuntimeError("引擎未初始化")
217
- if not self.stats:
218
- raise RuntimeError("统计收集器未初始化")
219
- if not self.subscriber:
220
- raise RuntimeError("事件订阅器未初始化")
221
-
222
- # 检查关键配置
223
- if not self.spider.name:
224
- raise ValueError("爬虫名称不能为空")
225
-
226
- logger.debug(f"爬虫 {self.spider.name} 状态验证通过")
227
-
228
- def _get_total_duration(self) -> float:
229
- """获取总运行时间"""
230
- if self._start_time and self._end_time:
231
- return self._end_time - self._start_time
232
- return 0.0
233
-
234
- async def _ensure_cleanup(self):
235
- """确保资源清理"""
236
- try:
237
- if not self._closed:
238
- await self.close()
239
- except Exception as e:
240
- logger.warning(f"清理资源时发生错误: {e}")
241
-
242
- def get_performance_metrics(self) -> Dict[str, Any]:
243
- """获取性能指标"""
244
- metrics = self._performance_metrics.copy()
245
- metrics['total_duration'] = self._get_total_duration()
246
- if self.stats:
247
- # 添加统计数据
248
- stats_data = getattr(self.stats, 'get_stats', lambda: {})()
249
- metrics.update(stats_data)
250
- return metrics
251
- @staticmethod
252
- def _create_subscriber() -> Subscriber:
253
- """创建事件订阅器"""
254
- return Subscriber()
255
-
256
- def _create_spider(self) -> Spider:
257
- """
258
- 创建并验证爬虫实例(增强版)
259
-
260
- 执行以下验证:
261
- - 爬虫名称必须存在
262
- - start_requests 方法必须可调用
263
- - start_urls 不能是字符串
264
- - parse 方法建议存在
265
- """
266
- spider = self.spider_cls.create_instance(self)
267
-
268
- # 必要属性检查
269
- if not getattr(spider, 'name', None):
270
- raise AttributeError(
271
- f"爬虫类 '{self.spider_cls.__name__}' 必须定义 'name' 属性。\n"
272
- f"示例: name = 'my_spider'"
273
- )
274
-
275
- if not callable(getattr(spider, 'start_requests', None)):
276
- raise AttributeError(
277
- f"爬虫 '{spider.name}' 必须实现可调用的 'start_requests' 方法。\n"
278
- f"示例: def start_requests(self): yield Request(url='...')"
279
- )
280
-
281
- # start_urls 类型检查
282
- start_urls = getattr(spider, 'start_urls', [])
283
- if isinstance(start_urls, str):
284
- raise TypeError(
285
- f"爬虫 '{spider.name}' 的 'start_urls' 必须是列表或元组,不能是字符串。\n"
286
- f"正确写法: start_urls = ['http://example.com']\n"
287
- f"错误写法: start_urls = 'http://example.com'"
288
- )
289
-
290
- # parse 方法检查(警告而非错误)
291
- if not callable(getattr(spider, 'parse', None)):
292
- logger.warning(
293
- f"爬虫 '{spider.name}' 未定义 'parse' 方法。\n"
294
- f"请确保所有 Request 都指定了回调函数,否则响应将被忽略。"
295
- )
296
-
297
- # 设置爬虫配置
298
- self._set_spider(spider)
299
-
300
- logger.debug(f"爬虫 '{spider.name}' 初始化完成")
301
- return spider
302
-
303
- def _create_engine(self) -> Engine:
304
- """创建并初始化引擎"""
305
- engine = Engine(self)
306
- engine.engine_start()
307
- logger.debug(f"引擎初始化完成,爬虫: {getattr(self.spider, 'name', 'Unknown')}")
308
- return engine
309
-
310
- def _create_stats(self) -> StatsCollector:
311
- """创建统计收集器"""
312
- stats = StatsCollector(self)
313
- logger.debug(f"统计收集器初始化完成,爬虫: {getattr(self.spider, 'name', 'Unknown')}")
314
- return stats
315
-
316
- def _create_extension(self) -> ExtensionManager:
317
- """创建扩展管理器"""
318
- # 修改扩展管理器的创建方式,延迟初始化直到需要时
319
- extension = ExtensionManager.create_instance(self)
320
- logger.debug(f"扩展管理器初始化完成,爬虫: {getattr(self.spider, 'name', 'Unknown')}")
321
- return extension
322
-
323
- def _set_spider(self, spider: Spider):
324
- """
325
- 设置爬虫配置和事件订阅
326
- 将爬虫的生命周期事件与订阅器绑定
327
- """
328
- # 订阅爬虫生命周期事件
329
- self.subscriber.subscribe(spider.spider_opened, event=spider_opened)
330
- self.subscriber.subscribe(spider.spider_closed, event=spider_closed)
331
-
332
- # 合并爬虫自定义配置
333
- merge_settings(spider, self.settings)
334
-
335
- logger.debug(f"爬虫 '{spider.name}' 配置合并完成")
336
-
337
- async def close(self, reason='finished') -> None:
338
- """
339
- 关闭爬虫并清理资源(增强版)
340
-
341
- 确保只关闭一次,并处理所有清理操作
342
- """
343
- async with self._close_lock:
344
- if self._closed:
345
- return
346
-
347
- self._closed = True
348
- self._end_time = time.time()
349
-
350
- try:
351
- # 通知爬虫关闭事件
352
- if self.subscriber:
353
- await self.subscriber.notify(spider_closed)
354
-
355
- # 统计数据收集
356
- if self.stats and self.spider:
357
- self.stats.close_spider(spider=self.spider, reason=reason)
358
- # 记录统计数据
359
- try:
360
- from crawlo.commands.stats import record_stats
361
- record_stats(self)
362
- except ImportError:
363
- logger.debug("统计记录模块不存在,跳过统计记录")
364
-
365
- logger.info(
366
- f"爬虫 '{getattr(self.spider, 'name', 'Unknown')}' 已关闭,"
367
- f"原因: {reason},耗时: {self._get_total_duration():.2f}秒"
368
- )
369
-
370
- except Exception as e:
371
- logger.error(f"关闭爬虫时发生错误: {e}", exc_info=True)
372
- finally:
373
- # 确保资源清理
374
- await self._cleanup_resources()
375
-
376
- async def _cleanup_resources(self):
377
- """清理所有资源"""
378
- cleanup_tasks = []
379
-
380
- # 引擎清理
381
- if self.engine:
382
- try:
383
- cleanup_tasks.append(self.engine.close())
384
- except AttributeError:
385
- pass # 引擎没有close方法
386
-
387
- # 扩展清理
388
- if self.extension:
389
- try:
390
- cleanup_tasks.append(self.extension.close())
391
- except AttributeError:
392
- pass
393
-
394
- # 统计收集器清理
395
- if self.stats:
396
- try:
397
- cleanup_tasks.append(self.stats.close())
398
- except AttributeError:
399
- pass
400
-
401
- # 并发执行清理任务
402
- if cleanup_tasks:
403
- await asyncio.gather(*cleanup_tasks, return_exceptions=True)
404
-
405
- logger.debug("资源清理完成")
406
-
407
-
408
- class CrawlerProcess:
409
- """
410
- 爬虫进程管理器
411
-
412
- 支持功能:
413
- - 多爬虫并发调度和资源管理
414
- - 自动模块发现和爬虫注册
415
- - 智能并发控制和负载均衡
416
- - 优雅关闭和信号处理
417
- - 实时状态监控和统计
418
- - 错误恢复和重试机制
419
- - 大规模爬虫优化支持
420
-
421
- 使用示例:
422
- # 基本用法
423
- process = CrawlerProcess()
424
- await process.crawl(MySpider)
425
-
426
- # 多爬虫并发
427
- await process.crawl([Spider1, Spider2, 'spider_name'])
428
-
429
- # 自定义并发数
430
- process = CrawlerProcess(max_concurrency=8)
431
- """
432
-
433
- def __init__(
434
- self,
435
- settings: Optional[SettingManager] = None,
436
- max_concurrency: Optional[int] = None,
437
- spider_modules: Optional[List[str]] = None,
438
- enable_monitoring: bool = True
439
- ):
440
- # 基础配置
441
- self.settings: SettingManager = settings or self._get_default_settings()
442
- self.crawlers: Set[Crawler] = set()
443
- self._active_tasks: Set[asyncio.Task] = set()
444
-
445
- # 上下文管理器
446
- self.context = CrawlerContext()
447
-
448
- # 并发控制配置
449
- self.max_concurrency: int = (
450
- max_concurrency
451
- or self.settings.get('MAX_RUNNING_SPIDERS')
452
- or self.settings.get('CONCURRENCY', 3)
453
- )
454
- self.semaphore = asyncio.Semaphore(self.max_concurrency)
455
-
456
- # 监控配置
457
- self.enable_monitoring = enable_monitoring
458
- self._monitoring_task = None
459
- self._shutdown_event = asyncio.Event()
460
-
461
- # 自动发现并导入爬虫模块
462
- if spider_modules:
463
- self.auto_discover(spider_modules)
464
-
465
- # 使用全局注册表的快照(避免后续导入影响)
466
- self._spider_registry: Dict[str, Type[Spider]] = get_global_spider_registry()
467
-
468
- # 性能监控
469
- self._performance_stats = {
470
- 'total_requests': 0,
471
- 'successful_requests': 0,
472
- 'failed_requests': 0,
473
- 'memory_usage_mb': 0,
474
- 'cpu_usage_percent': 0
475
- }
476
-
477
- # 注册信号量
478
- signal.signal(signal.SIGINT, self._shutdown)
479
- signal.signal(signal.SIGTERM, self._shutdown)
480
-
481
- self._log_startup_info()
482
-
483
- logger.debug(
484
- f"CrawlerProcess 初始化完成\n"
485
- f" - 最大并行爬虫数: {self.max_concurrency}\n"
486
- f" - 已注册爬虫数: {len(self._spider_registry)}\n"
487
- f" - 监控启用: {self.enable_monitoring}"
488
- )
489
-
490
- async def start_monitoring(self):
491
- """启动监控任务"""
492
- if not self.enable_monitoring:
493
- return
494
-
495
- self._monitoring_task = asyncio.create_task(self._monitor_loop())
496
- logger.debug("监控任务已启动")
497
-
498
- async def stop_monitoring(self):
499
- """停止监控任务"""
500
- if self._monitoring_task and not self._monitoring_task.done():
501
- self._monitoring_task.cancel()
502
- try:
503
- await self._monitoring_task
504
- except asyncio.CancelledError:
505
- pass
506
- logger.debug("监控任务已停止")
507
-
508
- async def _monitor_loop(self):
509
- """监控循环,定期收集和报告状态"""
510
- try:
511
- while not self._shutdown_event.is_set():
512
- await self._collect_performance_stats()
513
-
514
- # 每30秒输出一次状态
515
- stats = self.context.get_stats()
516
- if stats['active_crawlers'] > 0:
517
- logger.debug(
518
- f"爬虫状态: 活跃 {stats['active_crawlers']}, "
519
- f"完成 {stats['completed_crawlers']}, "
520
- f"失败 {stats['failed_crawlers']}, "
521
- f"成功率 {stats['success_rate']:.1f}%"
522
- )
523
-
524
- await asyncio.sleep(30) # 30秒间隔
525
-
526
- except asyncio.CancelledError:
527
- logger.debug("监控循环被取消")
528
- except Exception as e:
529
- logger.error(f"监控循环错误: {e}", exc_info=True)
530
-
531
- async def _collect_performance_stats(self):
532
- """收集性能统计数据"""
533
- try:
534
- import psutil
535
- import os
536
-
537
- process = psutil.Process(os.getpid())
538
- memory_info = process.memory_info()
539
-
540
- self._performance_stats.update({
541
- 'memory_usage_mb': round(memory_info.rss / 1024 / 1024, 2),
542
- 'cpu_usage_percent': round(process.cpu_percent(), 2)
543
- })
544
-
545
- except ImportError:
546
- # psutil 不存在时跳过性能监控
547
- pass
548
- except Exception as e:
549
- logger.debug(f"收集性能统计失败: {e}")
550
- @staticmethod
551
- def auto_discover(modules: List[str]):
552
- """
553
- 自动导入模块,触发 Spider 类定义和注册(增强版)
554
-
555
- 支持递归扫描和错误恢复
556
- """
557
- import importlib
558
- import pkgutil
559
-
560
- discovered_count = 0
561
- error_count = 0
562
-
563
- for module_name in modules:
564
- try:
565
- module = importlib.import_module(module_name)
566
-
567
- if hasattr(module, '__path__'):
568
- # 包模块,递归扫描
569
- for _, name, _ in pkgutil.walk_packages(module.__path__, module.__name__ + "."):
570
- try:
571
- importlib.import_module(name)
572
- discovered_count += 1
573
- except Exception as sub_e:
574
- error_count += 1
575
- logger.warning(f"导入子模块 {name} 失败: {sub_e}")
576
- else:
577
- # 单个模块
578
- importlib.import_module(module_name)
579
- discovered_count += 1
580
-
581
- logger.debug(f"已扫描模块: {module_name}")
582
-
583
- except Exception as e:
584
- error_count += 1
585
- logger.error(f"扫描模块 {module_name} 失败: {e}", exc_info=True)
586
-
587
- logger.debug(
588
- f"爬虫注册完成: 成功 {discovered_count} 个,失败 {error_count} 个"
589
- )
590
-
591
- # === 公共只读接口:避免直接访问 _spider_registry ===
592
-
593
- def get_spider_names(self) -> List[str]:
594
- """获取所有已注册的爬虫名称"""
595
- return list(self._spider_registry.keys())
596
-
597
- def get_spider_class(self, name: str) -> Optional[Type[Spider]]:
598
- """根据 name 获取爬虫类"""
599
- return self._spider_registry.get(name)
600
-
601
- def is_spider_registered(self, name: str) -> bool:
602
- """检查某个 name 是否已注册"""
603
- return name in self._spider_registry
604
-
605
- async def crawl(self, spiders: Union[Type[Spider], str, List[Union[Type[Spider], str]]]):
606
- """
607
- 启动一个或多个爬虫
608
-
609
- 增强功能:
610
- - 智能并发控制
611
- - 实时监控和统计
612
- - 错误恢复和重试
613
- - 优雅关闭处理
614
- """
615
- # 阶段 1: 预处理和验证
616
- spider_classes_to_run = self._resolve_spiders_to_run(spiders)
617
- total = len(spider_classes_to_run)
618
-
619
- if total == 0:
620
- raise ValueError("至少需要提供一个爬虫类或名称")
621
-
622
- # 阶段 2: 初始化上下文和监控
623
- for _ in range(total):
624
- self.context.increment_total()
625
-
626
- # 启动监控任务
627
- await self.start_monitoring()
628
-
629
- try:
630
- # 阶段 3: 按类名排序,保证启动顺序可预测
631
- spider_classes_to_run.sort(key=lambda cls: cls.__name__.lower())
632
-
633
- logger.debug(
634
- f"开始启动 {total} 个爬虫\n"
635
- f" - 最大并发数: {self.max_concurrency}\n"
636
- f" - 爬虫列表: {[cls.__name__ for cls in spider_classes_to_run]}"
637
- )
638
-
639
- # 阶段 4: 流式启动所有爬虫任务
640
- tasks = [
641
- asyncio.create_task(
642
- self._run_spider_with_limit(spider_cls, index + 1, total),
643
- name=f"spider-{spider_cls.__name__}-{index+1}"
644
- )
645
- for index, spider_cls in enumerate(spider_classes_to_run)
646
- ]
647
-
648
- # 阶段 5: 等待所有任务完成(失败不中断)
649
- results = await asyncio.gather(*tasks, return_exceptions=True)
650
-
651
- # 阶段 6: 统计异常和结果
652
- failed = [i for i, r in enumerate(results) if isinstance(r, Exception)]
653
- successful = total - len(failed)
654
-
655
- if failed:
656
- failed_spiders = [spider_classes_to_run[i].__name__ for i in failed]
657
- logger.error(
658
- f"爬虫执行结果: 成功 {successful}/{total},失败 {len(failed)}/{total}\n"
659
- f" - 失败爬虫: {failed_spiders}"
660
- )
661
-
662
- # 记录详细错误信息
663
- for i in failed:
664
- error = results[i]
665
- logger.error(f"爬虫 {spider_classes_to_run[i].__name__} 错误详情: {error}")
666
- else:
667
- logger.info(f"所有 {total} 个爬虫均成功完成! 🎉")
668
-
669
- # 返回统计结果
670
- return {
671
- 'total': total,
672
- 'successful': successful,
673
- 'failed': len(failed),
674
- 'success_rate': (successful / total) * 100 if total > 0 else 0,
675
- 'context_stats': self.context.get_stats()
676
- }
677
-
678
- finally:
679
- # 阶段 7: 清理和关闭
680
- await self.stop_monitoring()
681
- await self._cleanup_process()
682
-
683
- async def _cleanup_process(self):
684
- """清理进程资源"""
685
- try:
686
- # 等待所有活跃爬虫完成
687
- if self.crawlers:
688
- close_tasks = [crawler.close() for crawler in self.crawlers]
689
- await asyncio.gather(*close_tasks, return_exceptions=True)
690
- self.crawlers.clear()
691
-
692
- # 清理活跃任务
693
- if self._active_tasks:
694
- for task in list(self._active_tasks):
695
- if not task.done():
696
- task.cancel()
697
- await asyncio.gather(*self._active_tasks, return_exceptions=True)
698
- self._active_tasks.clear()
699
-
700
- logger.debug("进程资源清理完成")
701
-
702
- except Exception as e:
703
- logger.error(f"清理进程资源时发生错误: {e}", exc_info=True)
704
-
705
- def get_process_stats(self) -> Dict[str, Any]:
706
- """获取进程统计信息"""
707
- context_stats = self.context.get_stats()
708
-
709
- return {
710
- 'context': context_stats,
711
- 'performance': self._performance_stats.copy(),
712
- 'crawlers': {
713
- 'total_registered': len(self._spider_registry),
714
- 'active_crawlers': len(self.crawlers),
715
- 'max_concurrency': self.max_concurrency
716
- },
717
- 'registry': {
718
- 'spider_names': list(self._spider_registry.keys()),
719
- 'spider_classes': [cls.__name__ for cls in self._spider_registry.values()]
720
- }
721
- }
722
- def _resolve_spiders_to_run(
723
- self,
724
- spiders_input: Union[Type[Spider], str, List[Union[Type[Spider], str]]]
725
- ) -> List[Type[Spider]]:
726
- """
727
- 解析输入为爬虫类列表
728
-
729
- 支持各种输入格式并验证唯一性
730
- """
731
- inputs = self._normalize_inputs(spiders_input)
732
- seen_spider_names: Set[str] = set()
733
- spider_classes: List[Type[Spider]] = []
734
-
735
- for item in inputs:
736
- try:
737
- spider_cls = self._resolve_spider_class(item)
738
- spider_name = getattr(spider_cls, 'name', None)
739
-
740
- if not spider_name:
741
- raise ValueError(f"爬虫类 {spider_cls.__name__} 缺少 'name' 属性")
742
-
743
- if spider_name in seen_spider_names:
744
- raise ValueError(
745
- f"本次运行中爬虫名称 '{spider_name}' 重复。\n"
746
- f"请确保每个爬虫的 name 属性在本次运行中唯一。"
747
- )
748
-
749
- seen_spider_names.add(spider_name)
750
- spider_classes.append(spider_cls)
751
-
752
- logger.debug(f"解析爬虫成功: {item} -> {spider_cls.__name__} (name='{spider_name}')")
753
-
754
- except Exception as e:
755
- logger.error(f"解析爬虫失败: {item} - {e}")
756
- raise
757
-
758
- return spider_classes
759
-
760
- @staticmethod
761
- def _normalize_inputs(spiders_input) -> List[Union[Type[Spider], str]]:
762
- """
763
- 标准化输入为列表
764
-
765
- 支持更多输入类型并提供更好的错误信息
766
- """
767
- if isinstance(spiders_input, (type, str)):
768
- return [spiders_input]
769
- elif isinstance(spiders_input, (list, tuple, set)):
770
- spider_list = list(spiders_input)
771
- if not spider_list:
772
- raise ValueError("爬虫列表不能为空")
773
- return spider_list
774
- else:
775
- raise TypeError(
776
- f"spiders 参数类型不支持: {type(spiders_input)}\n"
777
- f"支持的类型: Spider类、name字符串,或它们的列表/元组/集合"
778
- )
779
-
780
- def _resolve_spider_class(self, item: Union[Type[Spider], str]) -> Type[Spider]:
781
- """
782
- 解析单个输入项为爬虫类
783
-
784
- 提供更好的错误提示和调试信息
785
- """
786
- if isinstance(item, type) and issubclass(item, Spider):
787
- # 直接是 Spider 类
788
- return item
789
- elif isinstance(item, str):
790
- # 是字符串名称,需要查找注册表
791
- spider_cls = self._spider_registry.get(item)
792
- if not spider_cls:
793
- available_spiders = list(self._spider_registry.keys())
794
- raise ValueError(
795
- f"未找到名为 '{item}' 的爬虫。\n"
796
- f"已注册的爬虫: {available_spiders}\n"
797
- f"请检查爬虫名称是否正确,或者确保爬虫已被正确导入和注册。"
798
- )
799
- return spider_cls
800
- else:
801
- raise TypeError(
802
- f"无效类型 {type(item)}: {item}\n"
803
- f"必须是 Spider 类或字符串 name。\n"
804
- f"示例: MySpider 或 'my_spider'"
805
- )
806
-
807
- async def _run_spider_with_limit(self, spider_cls: Type[Spider], seq: int, total: int):
808
- """
809
- 受信号量限制的爬虫运行函数
810
-
811
- 包含增强的错误处理和监控功能
812
- """
813
- task = asyncio.current_task()
814
- crawler = None
815
-
816
- try:
817
- # 注册任务
818
- if task:
819
- self._active_tasks.add(task)
820
-
821
- # 获取并发许可
822
- await self.semaphore.acquire()
823
-
824
- start_msg = f"[{seq}/{total}] 启动爬虫: {spider_cls.__name__}"
825
- logger.info(start_msg)
826
-
827
- # 创建并运行爬虫
828
- crawler = Crawler(spider_cls, self.settings, self.context)
829
- self.crawlers.add(crawler)
830
-
831
- # 记录启动时间
832
- start_time = time.time()
833
-
834
- # 运行爬虫
835
- await crawler.crawl()
836
-
837
- # 计算运行时间
838
- duration = time.time() - start_time
839
-
840
- end_msg = (
841
- f"[{seq}/{total}] 爬虫完成: {spider_cls.__name__}, "
842
- f"耗时: {duration:.2f}秒"
843
- )
844
- logger.info(end_msg)
845
-
846
- # 记录成功统计
847
- self._performance_stats['successful_requests'] += 1
848
-
849
- except Exception as e:
850
- # 记录失败统计
851
- self._performance_stats['failed_requests'] += 1
852
-
853
- error_msg = f"爬虫 {spider_cls.__name__} 执行失败: {e}"
854
- logger.error(error_msg, exc_info=True)
855
-
856
- # 将错误信息记录到上下文
857
- if hasattr(self, 'context'):
858
- self.context.increment_failed(error_msg)
859
-
860
- raise
861
- finally:
862
- # 清理资源
863
- try:
864
- if crawler and crawler in self.crawlers:
865
- self.crawlers.remove(crawler)
866
-
867
- if task and task in self._active_tasks:
868
- self._active_tasks.remove(task)
869
-
870
- self.semaphore.release()
871
-
872
- except Exception as cleanup_error:
873
- logger.warning(f"清理资源时发生错误: {cleanup_error}")
874
-
875
- def _shutdown(self, _signum, _frame):
876
- """
877
- 优雅关闭信号处理
878
-
879
- 提供更好的关闭体验和资源清理
880
- """
881
- signal_name = {signal.SIGINT: 'SIGINT', signal.SIGTERM: 'SIGTERM'}.get(_signum, str(_signum))
882
- logger.warning(f"收到关闭信号 {signal_name},正在停止所有爬虫...")
883
-
884
- # 设置关闭事件
885
- if hasattr(self, '_shutdown_event'):
886
- self._shutdown_event.set()
887
-
888
- # 停止所有爬虫引擎
889
- for crawler in list(self.crawlers):
890
- if crawler.engine:
891
- crawler.engine.running = False
892
- crawler.engine.normal = False
893
- logger.debug(f"已停止爬虫引擎: {getattr(crawler.spider, 'name', 'Unknown')}")
894
-
895
- # 创建关闭任务
896
- asyncio.create_task(self._wait_for_shutdown())
897
-
898
- logger.info("关闭指令已发送,等待爬虫完成当前任务...")
899
-
900
- async def _wait_for_shutdown(self):
901
- """
902
- 等待所有活跃任务完成
903
-
904
- 提供更好的关闭时间控制和进度反馈
905
- """
906
- try:
907
- # 停止监控任务
908
- await self.stop_monitoring()
909
-
910
- # 等待活跃任务完成
911
- pending = [t for t in self._active_tasks if not t.done()]
912
-
913
- if pending:
914
- logger.info(
915
- f"等待 {len(pending)} 个活跃任务完成..."
916
- f"(最大等待时间: 30秒)"
917
- )
918
-
919
- # 设置超时时间
920
- try:
921
- await asyncio.wait_for(
922
- asyncio.gather(*pending, return_exceptions=True),
923
- timeout=30.0
924
- )
925
- except asyncio.TimeoutError:
926
- logger.warning("部分任务超时,强制取消中...")
927
-
928
- # 强制取消超时任务
929
- for task in pending:
930
- if not task.done():
931
- task.cancel()
932
-
933
- # 等待取消完成
934
- await asyncio.gather(*pending, return_exceptions=True)
935
-
936
- # 最终清理
937
- await self._cleanup_process()
938
-
939
- # 输出最终统计
940
- final_stats = self.context.get_stats()
941
- logger.info(
942
- f"所有爬虫已优雅关闭 👋\n"
943
- f" - 总计爬虫: {final_stats['total_crawlers']}\n"
944
- f" - 成功完成: {final_stats['completed_crawlers']}\n"
945
- f" - 失败数量: {final_stats['failed_crawlers']}\n"
946
- f" - 成功率: {final_stats['success_rate']:.1f}%\n"
947
- f" - 总运行时间: {final_stats['duration_seconds']}秒"
948
- )
949
-
950
- except Exception as e:
951
- logger.error(f"关闭过程中发生错误: {e}", exc_info=True)
952
-
953
- @classmethod
954
- def _get_default_settings(cls) -> SettingManager:
955
- """
956
- 加载默认配置
957
-
958
- 提供更好的错误处理和降级策略
959
- """
960
- try:
961
- settings = get_settings()
962
- logger.debug("成功加载默认配置")
963
- return settings
964
- except Exception as e:
965
- logger.warning(f"无法加载默认配置: {e},使用空配置")
966
- return SettingManager()
967
-
968
- def _log_startup_info(self):
969
- """打印启动信息,包括运行模式和关键配置检查"""
970
- # 获取运行模式
971
- run_mode = self.settings.get('RUN_MODE', 'standalone')
972
-
973
- # 构建启动信息日志
974
- startup_info = [
975
- "🚀 Crawlo 爬虫框架启动"
976
- ]
977
-
978
- # 获取实际的队列类型
979
- queue_type = self.settings.get('QUEUE_TYPE', 'memory')
980
-
981
- # 根据运行模式和队列类型组合显示信息
982
- if run_mode == 'distributed':
983
- startup_info.append(" 运行模式: distributed")
984
- startup_info.append(" 🌐 分布式模式 - 支持多节点协同工作")
985
- # 显示Redis配置
986
- redis_host = self.settings.get('REDIS_HOST', 'localhost')
987
- redis_port = self.settings.get('REDIS_PORT', 6379)
988
- startup_info.append(f" Redis地址: {redis_host}:{redis_port}")
989
- elif run_mode == 'standalone':
990
- if queue_type == 'redis':
991
- startup_info.append(" 运行模式: standalone+redis")
992
- # startup_info.append(" 🌐 分布式模式 - 支持多节点协同工作")
993
- # 显示Redis配置
994
- redis_host = self.settings.get('REDIS_HOST', 'localhost')
995
- redis_port = self.settings.get('REDIS_PORT', 6379)
996
- startup_info.append(f" Redis地址: {redis_host}:{redis_port}")
997
- elif queue_type == 'auto':
998
- startup_info.append(" 运行模式: standalone+auto")
999
- # startup_info.append(" 🤖 自动检测模式 - 智能选择最佳运行方式")
1000
- else: # memory
1001
- startup_info.append(" 运行模式: standalone")
1002
- # startup_info.append(" 🏠 单机模式 - 适用于开发和小规模数据采集")
1003
- else: # auto mode
1004
- if queue_type == 'redis':
1005
- startup_info.append(" 运行模式: auto+redis")
1006
- # startup_info.append(" 🌐 分布式模式 - 支持多节点协同工作")
1007
- # 显示Redis配置
1008
- redis_host = self.settings.get('REDIS_HOST', 'localhost')
1009
- redis_port = self.settings.get('REDIS_PORT', 6379)
1010
- startup_info.append(f" Redis地址: {redis_host}:{redis_port}")
1011
- elif queue_type == 'memory':
1012
- startup_info.append(" 运行模式: auto+memory")
1013
- # startup_info.append(" 🏠 单机模式 - 适用于开发和小规模数据采集")
1014
- else: # auto
1015
- startup_info.append(" 运行模式: auto")
1016
- # startup_info.append(" 🤖 自动检测模式 - 智能选择最佳运行方式")
1017
-
1018
- # 打印启动信息
1019
- for info in startup_info:
1020
- logger.info(info)
1021
-
1022
-
1023
- # === 工具函数 ===
1024
-
1025
- def create_crawler_with_optimizations(
1026
- spider_cls: Type[Spider],
1027
- settings: Optional[SettingManager] = None,
1028
- **optimization_kwargs
1029
- ) -> Crawler:
1030
- """
1031
- 创建优化的爬虫实例
1032
-
1033
- :param spider_cls: 爬虫类
1034
- :param settings: 设置管理器
1035
- :param optimization_kwargs: 优化参数
1036
- :return: 爬虫实例
1037
- """
1038
- if settings is None:
1039
- settings = SettingManager()
1040
-
1041
- # 应用优化配置
1042
- for key, value in optimization_kwargs.items():
1043
- settings.set(key, value)
1044
-
1045
- context = CrawlerContext()
1046
- return Crawler(spider_cls, settings, context)
1047
-
1048
-
1049
- def create_process_with_large_scale_config(
1050
- config_type: str = 'balanced',
1051
- concurrency: int = 16,
1052
- **kwargs
1053
- ) -> CrawlerProcess:
1054
- """
1055
- 创建支持大规模优化的进程管理器
1056
-
1057
- :param config_type: 配置类型 ('conservative', 'balanced', 'aggressive', 'memory_optimized')
1058
- :param concurrency: 并发数
1059
- :param kwargs: 其他参数
1060
- :return: 进程管理器
1061
- """
1062
- try:
1063
- from crawlo.utils.large_scale_config import LargeScaleConfig
1064
-
1065
- # 获取优化配置
1066
- config_methods = {
1067
- 'conservative': LargeScaleConfig.conservative_config,
1068
- 'balanced': LargeScaleConfig.balanced_config,
1069
- 'aggressive': LargeScaleConfig.aggressive_config,
1070
- 'memory_optimized': LargeScaleConfig.memory_optimized_config
1071
- }
1072
-
1073
- if config_type not in config_methods:
1074
- logger.warning(f"未知的配置类型: {config_type},使用默认配置")
1075
- settings = SettingManager()
1076
- else:
1077
- config = config_methods[config_type](concurrency)
1078
- settings = SettingManager()
1079
- settings.update(config)
1080
-
1081
- return CrawlerProcess(
1082
- settings=settings,
1083
- max_concurrency=concurrency,
1084
- **kwargs
1085
- )
1086
-
1087
- except ImportError:
1088
- logger.warning("大规模配置模块不存在,使用默认配置")
1089
- return CrawlerProcess(max_concurrency=concurrency, **kwargs)
1090
-
1091
-
1092
- # === 导出接口 ===
1093
-
1094
- __all__ = [
1095
- 'Crawler',
1096
- 'CrawlerProcess',
1097
- 'CrawlerContext',
1098
- 'create_crawler_with_optimizations',
1099
- 'create_process_with_large_scale_config'
1100
- ]
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ Crawlo Crawler Module
5
+ ====================
6
+ Provides crawler process management and runtime core functionality.
7
+
8
+ Core Components:
9
+ - Crawler: Single crawler runtime instance, managing Spider and engine lifecycle
10
+ - CrawlerProcess: Crawler process manager, supporting multi-crawler concurrent scheduling and resource management
11
+
12
+ Features:
13
+ - Intelligent concurrency control and resource management
14
+ - Graceful shutdown and signal handling
15
+ - Statistics monitoring and performance tracking
16
+ - Automatic module discovery and registration
17
+ - Error recovery and retry mechanism
18
+ - Large-scale crawler optimization support
19
+
20
+ Example Usage:
21
+ # Single crawler run
22
+ crawler = Crawler(MySpider, settings)
23
+ await crawler.crawl()
24
+
25
+ # Multi-crawler concurrent management
26
+ process = CrawlerProcess()
27
+ await process.crawl([Spider1, Spider2])
28
+ """
29
+ from __future__ import annotations
30
+ import asyncio
31
+ import signal
32
+ import time
33
+ import threading
34
+ from typing import Type, Optional, Set, List, Union, Dict, Any
35
+ from .spider import Spider, get_global_spider_registry
36
+ from .core.engine import Engine
37
+ from .utils.log import get_logger
38
+ from .subscriber import Subscriber
39
+ from .extension import ExtensionManager
40
+ from .stats_collector import StatsCollector
41
+ from .event import spider_opened, spider_closed
42
+ from .settings.setting_manager import SettingManager
43
+ from crawlo.project import merge_settings, get_settings
44
+
45
+ # 延迟初始化logger,在需要时通过get_logger获取
46
+ logger = None
47
+
48
+
49
+ def _get_logger():
50
+ """延迟获取logger实例,确保在配置加载后创建"""
51
+ global logger
52
+ if logger is None:
53
+ logger = get_logger(__name__)
54
+ return logger
55
+
56
+
57
+ class CrawlerContext:
58
+ """
59
+ Crawler context manager
60
+ Provides shared state and resource management
61
+ """
62
+
63
+ def __init__(self):
64
+ self.start_time = time.time()
65
+ self.total_crawlers = 0
66
+ self.active_crawlers = 0
67
+ self.completed_crawlers = 0
68
+ self.failed_crawlers = 0
69
+ self.error_log = []
70
+ self._lock = threading.RLock()
71
+
72
+ def increment_total(self):
73
+ with self._lock:
74
+ self.total_crawlers += 1
75
+
76
+ def increment_active(self):
77
+ with self._lock:
78
+ self.active_crawlers += 1
79
+
80
+ def decrement_active(self):
81
+ with self._lock:
82
+ self.active_crawlers -= 1
83
+
84
+ def increment_completed(self):
85
+ with self._lock:
86
+ self.completed_crawlers += 1
87
+
88
+ def increment_failed(self, error: str):
89
+ with self._lock:
90
+ self.failed_crawlers += 1
91
+ self.error_log.append({
92
+ 'timestamp': time.time(),
93
+ 'error': error
94
+ })
95
+
96
+ def get_stats(self) -> Dict[str, Any]:
97
+ with self._lock:
98
+ duration = time.time() - self.start_time
99
+ return {
100
+ 'total_crawlers': self.total_crawlers,
101
+ 'active_crawlers': self.active_crawlers,
102
+ 'completed_crawlers': self.completed_crawlers,
103
+ 'failed_crawlers': self.failed_crawlers,
104
+ 'success_rate': (self.completed_crawlers / max(1, self.total_crawlers)) * 100,
105
+ 'duration_seconds': round(duration, 2),
106
+ 'error_count': len(self.error_log)
107
+ }
108
+
109
+
110
+ class Crawler:
111
+ """
112
+ Single crawler runtime instance, managing Spider and engine lifecycle
113
+
114
+ Provides functionality:
115
+ - Spider lifecycle management (initialization, running, closing)
116
+ - Engine component coordination management
117
+ - Configuration merging and validation
118
+ - Statistics data collection
119
+ - Extension management
120
+ - Exception handling and cleanup
121
+ """
122
+
123
+ def __init__(self, spider_cls: Type[Spider], settings: SettingManager, context: Optional[CrawlerContext] = None):
124
+ self.spider_cls = spider_cls
125
+ self.spider: Optional[Spider] = None
126
+ self.engine: Optional[Engine] = None
127
+ self.stats: Optional[StatsCollector] = None
128
+ self.subscriber: Optional[Subscriber] = None
129
+ self.extension: Optional[ExtensionManager] = None
130
+ self.settings: SettingManager = settings.copy()
131
+ self.context = context or CrawlerContext()
132
+
133
+ # State management
134
+ self._closed = False
135
+ self._close_lock = asyncio.Lock()
136
+ self._start_time = None
137
+ self._end_time = None
138
+
139
+ # Performance monitoring
140
+ self._performance_metrics = {
141
+ 'initialization_time': 0,
142
+ 'crawl_duration': 0,
143
+ 'memory_peak': 0,
144
+ 'request_count': 0,
145
+ 'error_count': 0
146
+ }
147
+
148
+ async def crawl(self):
149
+ """
150
+ Start the crawler core process
151
+
152
+ Includes the following stages:
153
+ 1. Initialization stage: Create all components
154
+ 2. Validation stage: Check configuration and state
155
+ 3. Running stage: Start the crawler engine
156
+ 4. Cleanup stage: Resource release
157
+ """
158
+ init_start = time.time()
159
+ self._start_time = init_start
160
+
161
+ try:
162
+ # Update context status
163
+ self.context.increment_active()
164
+
165
+ # Phase 1: Initialize components
166
+ # Adjust component initialization order to ensure log output order meets requirements
167
+ self.subscriber = self._create_subscriber()
168
+ self.spider = self._create_spider()
169
+ self.engine = self._create_engine()
170
+ self.stats = self._create_stats()
171
+ # Note: Do not initialize extension manager here, let it initialize in the engine
172
+
173
+ # Record initialization time
174
+ self._performance_metrics['initialization_time'] = time.time() - init_start
175
+
176
+ # Phase 2: Validate state
177
+ self._validate_crawler_state()
178
+
179
+ # Phase 3: Display runtime configuration summary
180
+ self._log_runtime_summary()
181
+
182
+ # Phase 4: Start crawler
183
+ crawl_start = time.time()
184
+ await self.engine.start_spider(self.spider)
185
+
186
+ # Record crawl time
187
+ self._performance_metrics['crawl_duration'] = time.time() - crawl_start
188
+ self._end_time = time.time()
189
+
190
+ # Update context status
191
+ self.context.increment_completed()
192
+
193
+ _get_logger().info(f"Spider {self.spider.name} completed, took {self._get_total_duration():.2f} seconds")
194
+
195
+ except Exception as e:
196
+ self._performance_metrics['error_count'] += 1
197
+ self.context.increment_failed(str(e))
198
+ _get_logger().error(f"Spider {getattr(self.spider, 'name', 'Unknown')} failed to run: {e}", exc_info=True)
199
+ raise
200
+ finally:
201
+ self.context.decrement_active()
202
+ # Ensure resource cleanup
203
+ await self._ensure_cleanup()
204
+
205
+ def _log_runtime_summary(self):
206
+ """Log runtime configuration summary"""
207
+ # Get spider name
208
+ spider_name = getattr(self.spider, 'name', 'Unknown')
209
+
210
+ # Ensure spider name is a string and strip leading/trailing whitespace
211
+ if spider_name:
212
+ spider_name = str(spider_name).strip()
213
+ else:
214
+ spider_name = 'Unknown'
215
+
216
+ _get_logger().info(f"Starting running {spider_name}")
217
+
218
+ def _validate_crawler_state(self):
219
+ """
220
+ Validate crawler state and configuration
221
+ Ensure all necessary components are properly initialized
222
+ """
223
+ if not self.spider:
224
+ raise RuntimeError("Spider instance not initialized")
225
+ if not self.engine:
226
+ raise RuntimeError("Engine not initialized")
227
+ if not self.stats:
228
+ raise RuntimeError("Stats collector not initialized")
229
+ if not self.subscriber:
230
+ raise RuntimeError("Event subscriber not initialized")
231
+
232
+ # Check key configuration
233
+ if not self.spider.name:
234
+ raise ValueError("Spider name cannot be empty")
235
+
236
+ _get_logger().debug(f"Spider {self.spider.name} state validation passed")
237
+
238
+ def _get_total_duration(self) -> float:
239
+ """Get total runtime"""
240
+ if self._start_time and self._end_time:
241
+ return self._end_time - self._start_time
242
+ return 0.0
243
+
244
+ async def _ensure_cleanup(self):
245
+ """Ensure resource cleanup"""
246
+ try:
247
+ if not self._closed:
248
+ await self.close()
249
+ except Exception as e:
250
+ _get_logger().warning(f"Error cleaning up resources: {e}")
251
+
252
+ def get_performance_metrics(self) -> Dict[str, Any]:
253
+ """Get performance metrics"""
254
+ metrics = self._performance_metrics.copy()
255
+ metrics['total_duration'] = self._get_total_duration()
256
+ if self.stats:
257
+ # Add statistics data
258
+ stats_data = getattr(self.stats, 'get_stats', lambda: {})()
259
+ metrics.update(stats_data)
260
+ return metrics
261
+
262
+ @staticmethod
263
+ def _create_subscriber() -> Subscriber:
264
+ """Create event subscriber"""
265
+ return Subscriber()
266
+
267
+ def _create_spider(self) -> Spider:
268
+ """
269
+ Create and validate spider instance (enhanced version)
270
+
271
+ Performs the following validations:
272
+ - Spider name must exist
273
+ - start_requests method must be callable
274
+ - start_urls cannot be a string
275
+ - parse method is recommended to exist
276
+ """
277
+ spider = self.spider_cls.create_instance(self)
278
+
279
+ # Required attribute check
280
+ if not getattr(spider, 'name', None):
281
+ raise AttributeError(
282
+ f"Spider class '{self.spider_cls.__name__}' must define 'name' attribute.\n"
283
+ f"Example: name = 'my_spider'"
284
+ )
285
+
286
+ if not callable(getattr(spider, 'start_requests', None)):
287
+ raise AttributeError(
288
+ f"Spider '{spider.name}' must implement a callable 'start_requests' method.\n"
289
+ f"Example: def start_requests(self): yield Request(url='...')"
290
+ )
291
+
292
+ # start_urls type check
293
+ start_urls = getattr(spider, 'start_urls', [])
294
+ if isinstance(start_urls, str):
295
+ raise TypeError(
296
+ f"Spider '{spider.name}' 'start_urls' must be a list or tuple, not a string.\n"
297
+ f"Correct: start_urls = ['http://example.com']\n"
298
+ f"Incorrect: start_urls = 'http://example.com'"
299
+ )
300
+
301
+ # parse method check (warning instead of error)
302
+ if not callable(getattr(spider, 'parse', None)):
303
+ _get_logger().warning(
304
+ f"Spider '{spider.name}' does not define 'parse' method.\n"
305
+ f"Ensure all Requests specify a callback function, otherwise responses will be ignored."
306
+ )
307
+
308
+ # Set spider configuration
309
+ self._set_spider(spider)
310
+
311
+ _get_logger().debug(f"Spider '{spider.name}' initialized successfully")
312
+ return spider
313
+
314
+ def _create_engine(self) -> Engine:
315
+ """Create and initialize engine"""
316
+ engine = Engine(self)
317
+ engine.engine_start()
318
+ _get_logger().debug(f"Engine initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
319
+ return engine
320
+
321
+ def _create_stats(self) -> StatsCollector:
322
+ """Create stats collector"""
323
+ stats = StatsCollector(self)
324
+ _get_logger().debug(f"Stats collector initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
325
+ return stats
326
+
327
+ def _create_extension(self) -> ExtensionManager:
328
+ """Create extension manager"""
329
+ # Modify extension manager creation method, delay initialization until needed
330
+ extension = ExtensionManager.create_instance(self)
331
+ _get_logger().debug(f"Extension manager initialized successfully, spider: {getattr(self.spider, 'name', 'Unknown')}")
332
+ return extension
333
+
334
+ def _set_spider(self, spider: Spider):
335
+ """
336
+ Set spider configuration and event subscription
337
+ Bind spider lifecycle events with subscriber
338
+ """
339
+ # Subscribe to spider lifecycle events
340
+ self.subscriber.subscribe(spider.spider_opened, event=spider_opened)
341
+ self.subscriber.subscribe(spider.spider_closed, event=spider_closed)
342
+
343
+ # Merge spider custom configuration
344
+ merge_settings(spider, self.settings)
345
+
346
+ _get_logger().debug(f"Spider '{spider.name}' configuration merged successfully")
347
+
348
+ async def close(self, reason='finished') -> None:
349
+ """
350
+ Close crawler and clean up resources (enhanced version)
351
+
352
+ Ensure closing only once and handle all cleanup operations
353
+ """
354
+ async with self._close_lock:
355
+ if self._closed:
356
+ return
357
+
358
+ self._closed = True
359
+ self._end_time = time.time()
360
+
361
+ try:
362
+ # Notify spider close event
363
+ if self.subscriber:
364
+ await self.subscriber.notify(spider_closed)
365
+
366
+ # Statistics data collection
367
+ if self.stats and self.spider:
368
+ self.stats.close_spider(spider=self.spider, reason=reason)
369
+ # Record statistics data
370
+ try:
371
+ from crawlo.commands.stats import record_stats
372
+ record_stats(self)
373
+ except ImportError:
374
+ _get_logger().debug("Statistics recording module does not exist, skipping statistics recording")
375
+
376
+ _get_logger().info(
377
+ f"Spider '{getattr(self.spider, 'name', 'Unknown')}' closed, "
378
+ f"reason: {reason}, took: {self._get_total_duration():.2f} seconds"
379
+ )
380
+
381
+ except Exception as e:
382
+ _get_logger().error(f"Error closing crawler: {e}", exc_info=True)
383
+ finally:
384
+ # Ensure resource cleanup
385
+ await self._cleanup_resources()
386
+
387
+ async def _cleanup_resources(self):
388
+ """Clean up all resources"""
389
+ cleanup_tasks = []
390
+
391
+ # Engine cleanup
392
+ if self.engine:
393
+ try:
394
+ cleanup_tasks.append(self.engine.close())
395
+ except AttributeError:
396
+ pass # Engine has no close method
397
+
398
+ # Extension cleanup
399
+ if self.extension:
400
+ try:
401
+ cleanup_tasks.append(self.extension.close())
402
+ except AttributeError:
403
+ pass
404
+
405
+ # Stats collector cleanup
406
+ if self.stats:
407
+ try:
408
+ cleanup_tasks.append(self.stats.close())
409
+ except AttributeError:
410
+ pass
411
+
412
+ # Concurrently execute cleanup tasks
413
+ if cleanup_tasks:
414
+ await asyncio.gather(*cleanup_tasks, return_exceptions=True)
415
+
416
+ _get_logger().debug("Resource cleanup completed")
417
+
418
+
419
+ class CrawlerProcess:
420
+ """
421
+ Crawler process manager
422
+
423
+ Supported features:
424
+ - Multi-crawler concurrent scheduling and resource management
425
+ - Automatic module discovery and spider registration
426
+ - Intelligent concurrency control and load balancing
427
+ - Graceful shutdown and signal handling
428
+ - Real-time status monitoring and statistics
429
+ - Error recovery and retry mechanism
430
+ - Large-scale crawler optimization support
431
+
432
+ Usage example:
433
+ # Basic usage
434
+ process = CrawlerProcess()
435
+ await process.crawl(MySpider)
436
+
437
+ # Multi-crawler concurrency
438
+ await process.crawl([Spider1, Spider2, 'spider_name'])
439
+
440
+ # Custom concurrency
441
+ process = CrawlerProcess(max_concurrency=8)
442
+ """
443
+
444
+ def __init__(
445
+ self,
446
+ settings: Optional[SettingManager] = None,
447
+ max_concurrency: Optional[int] = None,
448
+ spider_modules: Optional[List[str]] = None,
449
+ enable_monitoring: bool = True
450
+ ):
451
+ # Basic configuration
452
+ self.settings: SettingManager = settings or self._get_default_settings()
453
+ self.crawlers: Set[Crawler] = set()
454
+ self._active_tasks: Set[asyncio.Task] = set()
455
+
456
+ # Context manager
457
+ self.context = CrawlerContext()
458
+
459
+ # Concurrency control configuration
460
+ self.max_concurrency: int = (
461
+ max_concurrency
462
+ or self.settings.get('MAX_RUNNING_SPIDERS')
463
+ or self.settings.get('CONCURRENCY', 3)
464
+ )
465
+ self.semaphore = asyncio.Semaphore(self.max_concurrency)
466
+
467
+ # Monitoring configuration
468
+ self.enable_monitoring = enable_monitoring
469
+ self._monitoring_task = None
470
+ self._shutdown_event = asyncio.Event()
471
+
472
+ # Automatically discover and import spider modules
473
+ if spider_modules:
474
+ self.auto_discover(spider_modules)
475
+
476
+ # Use a snapshot of the global registry (so later imports do not affect this run)
477
+ self._spider_registry: Dict[str, Type[Spider]] = get_global_spider_registry()
478
+
479
+ # Performance monitoring
480
+ self._performance_stats = {
481
+ 'total_requests': 0,
482
+ 'successful_requests': 0,
483
+ 'failed_requests': 0,
484
+ 'memory_usage_mb': 0,
485
+ 'cpu_usage_percent': 0
486
+ }
487
+
488
+ # Register signal handlers
489
+ signal.signal(signal.SIGINT, self._shutdown)
490
+ signal.signal(signal.SIGTERM, self._shutdown)
491
+
492
+ self._log_startup_info()
493
+
494
+ logger.debug(
495
+ f"CrawlerProcess initialized successfully\n"
496
+ f" - Max concurrent crawlers: {self.max_concurrency}\n"
497
+ f" - Registered crawlers: {len(self._spider_registry)}\n"
498
+ f" - Monitoring enabled: {self.enable_monitoring}"
499
+ )
500
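For reference, a typical entry point built on this constructor might look like the sketch below; the import path crawlo.crawler and the myproject.spiders module are assumptions and should be adapted to the actual project layout:

    import asyncio
    from crawlo.crawler import CrawlerProcess   # assumed import path

    async def main():
        process = CrawlerProcess(
            spider_modules=["myproject.spiders"],  # hypothetical spider package
            max_concurrency=4,                     # caps concurrently running spiders
            enable_monitoring=True,
        )
        # Run everything that was discovered and registered.
        await process.crawl(process.get_spider_names())

    if __name__ == "__main__":
        asyncio.run(main())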
+
501
+ async def start_monitoring(self):
502
+ """Start monitoring task"""
503
+ if not self.enable_monitoring:
504
+ return
505
+
506
+ self._monitoring_task = asyncio.create_task(self._monitor_loop())
507
+ logger.debug("Monitoring task started")
508
+
509
+ async def stop_monitoring(self):
510
+ """Stop monitoring task"""
511
+ if self._monitoring_task and not self._monitoring_task.done():
512
+ self._monitoring_task.cancel()
513
+ try:
514
+ await self._monitoring_task
515
+ except asyncio.CancelledError:
516
+ pass
517
+ logger.debug("Monitoring task stopped")
518
+
519
+ async def _monitor_loop(self):
520
+ """Monitoring loop, periodically collect and report status"""
521
+ try:
522
+ while not self._shutdown_event.is_set():
523
+ await self._collect_performance_stats()
524
+
525
+ # Output status every 30 seconds
526
+ stats = self.context.get_stats()
527
+ if stats['active_crawlers'] > 0:
528
+ logger.debug(
529
+ f"Crawler status: Active {stats['active_crawlers']}, "
530
+ f"Completed {stats['completed_crawlers']}, "
531
+ f"Failed {stats['failed_crawlers']}, "
532
+ f"Success rate {stats['success_rate']:.1f}%"
533
+ )
534
+
535
+ await asyncio.sleep(30) # 30 second interval
536
+
537
+ except asyncio.CancelledError:
538
+ logger.debug("Monitoring loop cancelled")
539
+ except Exception as e:
540
+ logger.error(f"Monitoring loop error: {e}", exc_info=True)
541
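start_monitoring()/stop_monitoring() follow the usual create_task plus cancel-and-await pattern for background loops. A stripped-down, crawlo-independent illustration:

    import asyncio

    async def monitor_loop(stop_event, interval=30.0):
        # Periodically report status until asked to stop or cancelled.
        while not stop_event.is_set():
            print("status tick")
            await asyncio.sleep(interval)

    async def main():
        stop_event = asyncio.Event()
        task = asyncio.create_task(monitor_loop(stop_event, interval=0.1))
        await asyncio.sleep(0.35)
        task.cancel()                      # mirrors stop_monitoring()
        try:
            await task
        except asyncio.CancelledError:
            pass                           # cancellation is the expected outcome

    asyncio.run(main())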
+
542
+ async def _collect_performance_stats(self):
543
+ """Collect performance statistics data"""
544
+ try:
545
+ import psutil
546
+ import os
547
+
548
+ process = psutil.Process(os.getpid())
549
+ memory_info = process.memory_info()
550
+
551
+ self._performance_stats.update({
552
+ 'memory_usage_mb': round(memory_info.rss / 1024 / 1024, 2),
553
+ 'cpu_usage_percent': round(process.cpu_percent(), 2)
554
+ })
555
+
556
+ except ImportError:
557
+ # Skip performance monitoring when psutil is not available
558
+ pass
559
+ except Exception as e:
560
+ logger.debug(f"Failed to collect performance statistics: {e}")
561
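psutil is treated as optional here. The same rss/cpu_percent collection with an ImportError fallback can be written as a standalone helper, roughly:

    import os

    def collect_process_stats():
        try:
            import psutil
        except ImportError:
            return {}   # psutil not installed: skip performance monitoring
        proc = psutil.Process(os.getpid())
        return {
            "memory_usage_mb": round(proc.memory_info().rss / 1024 / 1024, 2),
            "cpu_usage_percent": round(proc.cpu_percent(), 2),
        }

    print(collect_process_stats())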
+
562
+ @staticmethod
563
+ def auto_discover(modules: List[str]):
564
+ """
565
+ Automatically import the given modules to trigger Spider class definition and registration (enhanced version)
566
+
567
+ Supports recursive scanning and error recovery
568
+ """
569
+ import importlib
570
+ import pkgutil
571
+
572
+ discovered_count = 0
573
+ error_count = 0
574
+
575
+ for module_name in modules:
576
+ try:
577
+ module = importlib.import_module(module_name)
578
+
579
+ if hasattr(module, '__path__'):
580
+ # Package module: scan submodules recursively
581
+ for _, name, _ in pkgutil.walk_packages(module.__path__, module.__name__ + "."):
582
+ try:
583
+ importlib.import_module(name)
584
+ discovered_count += 1
585
+ except Exception as sub_e:
586
+ error_count += 1
587
+ logger.warning(f"Failed to import submodule {name}: {sub_e}")
588
+ else:
589
+ # Single module
590
+ importlib.import_module(module_name)
591
+ discovered_count += 1
592
+
593
+ logger.debug(f"Module scanned: {module_name}")
594
+
595
+ except Exception as e:
596
+ error_count += 1
597
+ logger.error(f"Failed to scan module {module_name}: {e}", exc_info=True)
598
+
599
+ logger.debug(
600
+ f"Spider module scan completed: {discovered_count} modules imported, {error_count} failed"
601
+ )
602
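auto_discover() leans on importlib and pkgutil.walk_packages so that importing every submodule lets Spider subclasses register themselves as a side effect of import. The core idea, stripped of the success/error counters (the example package name is hypothetical):

    import importlib
    import pkgutil

    def import_all_submodules(package_name):
        """Import a package and every submodule so Spider classes can self-register."""
        imported = [package_name]
        pkg = importlib.import_module(package_name)
        if hasattr(pkg, "__path__"):       # only packages carry __path__
            for _, name, _ in pkgutil.walk_packages(pkg.__path__, pkg.__name__ + "."):
                importlib.import_module(name)
                imported.append(name)
        return imported

    # e.g. import_all_submodules("myproject.spiders")   # hypothetical package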
+
603
+ # === Public read-only interface: Avoid direct access to _spider_registry ===
604
+
605
+ def get_spider_names(self) -> List[str]:
606
+ """Get all registered spider names"""
607
+ return list(self._spider_registry.keys())
608
+
609
+ def get_spider_class(self, name: str) -> Optional[Type[Spider]]:
610
+ """Get spider class by name"""
611
+ return self._spider_registry.get(name)
612
+
613
+ def is_spider_registered(self, name: str) -> bool:
614
+ """Check if a name is registered"""
615
+ return name in self._spider_registry
616
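Together with auto discovery, these read-only accessors make it easy to inspect what was registered before starting a run; for example (import path, package and spider names are assumptions):

    from crawlo.crawler import CrawlerProcess   # assumed import path

    process = CrawlerProcess(spider_modules=["myproject.spiders"])  # hypothetical package

    print(process.get_spider_names())                # every registered spider name
    if process.is_spider_registered("my_spider"):    # illustrative name
        print(process.get_spider_class("my_spider"))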
+
617
+ async def crawl(self, spiders: Union[Type[Spider], str, List[Union[Type[Spider], str]]]):
618
+ """
619
+ Start one or more crawlers
620
+
621
+ Enhanced features:
622
+ - Intelligent concurrency control
623
+ - Real-time monitoring and statistics
624
+ - Error recovery and retry
625
+ - Graceful shutdown handling
626
+ """
627
+ # Phase 1: Preprocessing and validation
628
+ spider_classes_to_run = self._resolve_spiders_to_run(spiders)
629
+ total = len(spider_classes_to_run)
630
+
631
+ if total == 0:
632
+ raise ValueError("At least one spider class or name must be provided")
633
+
634
+ # Phase 2: Initialize context and monitoring
635
+ for _ in range(total):
636
+ self.context.increment_total()
637
+
638
+ # Start monitoring task
639
+ await self.start_monitoring()
640
+
641
+ try:
642
+ # Phase 3: Sort by class name to ensure predictable startup order
643
+ spider_classes_to_run.sort(key=lambda cls: cls.__name__.lower())
644
+
645
+ logger.debug(
646
+ f"Starting {total} crawlers\n"
647
+ f" - Max concurrency: {self.max_concurrency}\n"
648
+ f" - Spider list: {[cls.__name__ for cls in spider_classes_to_run]}"
649
+ )
650
+
651
+ # Phase 4: Launch all crawler tasks concurrently
652
+ tasks = [
653
+ asyncio.create_task(
654
+ self._run_spider_with_limit(spider_cls, index + 1, total),
655
+ name=f"spider-{spider_cls.__name__}-{index + 1}"
656
+ )
657
+ for index, spider_cls in enumerate(spider_classes_to_run)
658
+ ]
659
+
660
+ # Phase 5: Wait for all tasks to complete (failures do not interrupt)
661
+ results = await asyncio.gather(*tasks, return_exceptions=True)
662
+
663
+ # Phase 6: Tally results and exceptions
664
+ failed = [i for i, r in enumerate(results) if isinstance(r, Exception)]
665
+ successful = total - len(failed)
666
+
667
+ if failed:
668
+ failed_spiders = [spider_classes_to_run[i].__name__ for i in failed]
669
+ logger.error(
670
+ f"Crawler execution result: {successful}/{total} succeeded, {len(failed)}/{total} failed\n"
671
+ f" - Failed crawlers: {failed_spiders}"
672
+ )
673
+
674
+ # Record detailed error information
675
+ for i in failed:
676
+ error = results[i]
677
+ logger.error(f"Spider {spider_classes_to_run[i].__name__} error details: {error}")
678
+ else:
679
+ logger.info(f"All {total} crawlers completed successfully!")
680
+
681
+ # Return statistics results
682
+ return {
683
+ 'total': total,
684
+ 'successful': successful,
685
+ 'failed': len(failed),
686
+ 'success_rate': (successful / total) * 100 if total > 0 else 0,
687
+ 'context_stats': self.context.get_stats()
688
+ }
689
+
690
+ finally:
691
+ # Phase 7: Cleanup and shutdown
692
+ await self.stop_monitoring()
693
+ await self._cleanup_process()
694
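Since crawl() returns a summary dict with 'total', 'successful', 'failed', 'success_rate' and 'context_stats', a driver script can react to partial failures. A rough sketch, with the import path and spider names assumed:

    import asyncio
    from crawlo.crawler import CrawlerProcess   # assumed import path
    from myproject.spiders import NewsSpider    # hypothetical Spider subclass

    async def main():
        process = CrawlerProcess(max_concurrency=2)
        # Classes and registered name strings can be mixed in one call.
        summary = await process.crawl([NewsSpider, "other_spider"])
        print(f"{summary['successful']}/{summary['total']} finished, "
              f"success rate {summary['success_rate']:.1f}%")
        if summary["failed"]:
            raise SystemExit(1)   # surface partial failures to the shell

    asyncio.run(main())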
+
695
+ async def _cleanup_process(self):
696
+ """Clean up process resources"""
697
+ try:
698
+ # Wait for all active crawlers to complete
699
+ if self.crawlers:
700
+ close_tasks = [crawler.close() for crawler in self.crawlers]
701
+ await asyncio.gather(*close_tasks, return_exceptions=True)
702
+ self.crawlers.clear()
703
+
704
+ # Clean up active tasks
705
+ if self._active_tasks:
706
+ for task in list(self._active_tasks):
707
+ if not task.done():
708
+ task.cancel()
709
+ await asyncio.gather(*self._active_tasks, return_exceptions=True)
710
+ self._active_tasks.clear()
711
+
712
+ logger.debug("Process resources cleanup completed")
713
+
714
+ except Exception as e:
715
+ logger.error(f"Error cleaning up process resources: {e}", exc_info=True)
716
+
717
+ def get_process_stats(self) -> Dict[str, Any]:
718
+ """Get process statistics information"""
719
+ context_stats = self.context.get_stats()
720
+
721
+ return {
722
+ 'context': context_stats,
723
+ 'performance': self._performance_stats.copy(),
724
+ 'crawlers': {
725
+ 'total_registered': len(self._spider_registry),
726
+ 'active_crawlers': len(self.crawlers),
727
+ 'max_concurrency': self.max_concurrency
728
+ },
729
+ 'registry': {
730
+ 'spider_names': list(self._spider_registry.keys()),
731
+ 'spider_classes': [cls.__name__ for cls in self._spider_registry.values()]
732
+ }
733
+ }
734
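Continuing the previous sketch, the nested layout of get_process_stats() ('context', 'performance', 'crawlers', 'registry') is convenient for periodic logging or a health endpoint:

    stats = process.get_process_stats()                # `process` from the sketch above
    print("active crawlers:", stats["crawlers"]["active_crawlers"])
    print("memory (MB):", stats["performance"]["memory_usage_mb"])
    print("registered spiders:", stats["registry"]["spider_names"])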
+
735
+ def _resolve_spiders_to_run(
736
+ self,
737
+ spiders_input: Union[Type[Spider], str, List[Union[Type[Spider], str]]]
738
+ ) -> List[Type[Spider]]:
739
+ """
740
+ Resolve input to spider class list
741
+
742
+ Supports various input formats and validates uniqueness
743
+ """
744
+ inputs = self._normalize_inputs(spiders_input)
745
+ seen_spider_names: Set[str] = set()
746
+ spider_classes: List[Type[Spider]] = []
747
+
748
+ for item in inputs:
749
+ try:
750
+ spider_cls = self._resolve_spider_class(item)
751
+ spider_name = getattr(spider_cls, 'name', None)
752
+
753
+ if not spider_name:
754
+ raise ValueError(f"Spider class {spider_cls.__name__} missing 'name' attribute")
755
+
756
+ if spider_name in seen_spider_names:
757
+ raise ValueError(
758
+ f"Duplicate spider name '{spider_name}' in this run.\n"
759
+ f"Ensure each spider's 'name' attribute is unique."
760
+ )
761
+
762
+ seen_spider_names.add(spider_name)
763
+ spider_classes.append(spider_cls)
764
+
765
+ logger.debug(f"Spider resolved successfully: {item} -> {spider_cls.__name__} (name='{spider_name}')")
766
+
767
+ except Exception as e:
768
+ logger.error(f"Failed to resolve spider: {item} - {e}")
769
+ raise
770
+
771
+ return spider_classes
772
+
773
+ @staticmethod
774
+ def _normalize_inputs(spiders_input) -> List[Union[Type[Spider], str]]:
775
+ """
776
+ Normalize input to list
777
+
778
+ Accepts a single Spider class or name string, or a list/tuple/set of them
779
+ """
780
+ if isinstance(spiders_input, (type, str)):
781
+ return [spiders_input]
782
+ elif isinstance(spiders_input, (list, tuple, set)):
783
+ spider_list = list(spiders_input)
784
+ if not spider_list:
785
+ raise ValueError("Spider list cannot be empty")
786
+ return spider_list
787
+ else:
788
+ raise TypeError(
789
+ f"Unsupported spiders parameter type: {type(spiders_input)}\n"
790
+ f"Supported types: Spider class, name string, or their list/tuple/set"
791
+ )
792
+
793
+ def _resolve_spider_class(self, item: Union[Type[Spider], str]) -> Type[Spider]:
794
+ """
795
+ Resolve single input item to spider class
796
+
797
+ Raises informative errors when an item cannot be resolved
798
+ """
799
+ if isinstance(item, type) and issubclass(item, Spider):
800
+ # Direct Spider class
801
+ return item
802
+ elif isinstance(item, str):
803
+ # String name, need to look up registry
804
+ spider_cls = self._spider_registry.get(item)
805
+ if not spider_cls:
806
+ available_spiders = list(self._spider_registry.keys())
807
+ raise ValueError(
808
+ f"Spider named '{item}' not found.\n"
809
+ f"Registered spiders: {available_spiders}\n"
810
+ f"Please check if the spider name is correct, or ensure the spider has been properly imported and registered."
811
+ )
812
+ return spider_cls
813
+ else:
814
+ raise TypeError(
815
+ f"Invalid type {type(item)}: {item}\n"
816
+ f"Must be Spider class or string name.\n"
817
+ f"Example: MySpider or 'my_spider'"
818
+ )
819
+
820
+ async def _run_spider_with_limit(self, spider_cls: Type[Spider], seq: int, total: int):
821
+ """
822
+ Run a single spider while holding the concurrency semaphore
823
+
824
+ Includes enhanced error handling and monitoring functionality
825
+ """
826
+ task = asyncio.current_task()
827
+ crawler = None
828
+
829
+ try:
830
+ # Register task
831
+ if task:
832
+ self._active_tasks.add(task)
833
+
834
+ # Acquire concurrency permit
835
+ await self.semaphore.acquire()
836
+
837
+ # start_msg = f"[{seq}/{total}] Initializing spider: {spider_cls.__name__}"
838
+ # logger.info(start_msg)
839
+
840
+ # Create and run crawler
841
+ crawler = Crawler(spider_cls, self.settings, self.context)
842
+ self.crawlers.add(crawler)
843
+
844
+ # Record start time
845
+ start_time = time.time()
846
+
847
+ # Run crawler
848
+ await crawler.crawl()
849
+
850
+ # Calculate runtime
851
+ duration = time.time() - start_time
852
+
853
+ end_msg = (
854
+ f"[{seq}/{total}] Crawler completed: {spider_cls.__name__}, "
855
+ f"took: {duration:.2f} seconds"
856
+ )
857
+ logger.info(end_msg)
858
+
859
+ # Record success statistics
860
+ self._performance_stats['successful_requests'] += 1
861
+
862
+ except Exception as e:
863
+ # Record failure statistics
864
+ self._performance_stats['failed_requests'] += 1
865
+
866
+ error_msg = f"Spider {spider_cls.__name__} execution failed: {e}"
867
+ logger.error(error_msg, exc_info=True)
868
+
869
+ # Record error information to context
870
+ if hasattr(self, 'context'):
871
+ self.context.increment_failed(error_msg)
872
+
873
+ raise
874
+ finally:
875
+ # Clean up resources
876
+ try:
877
+ if crawler and crawler in self.crawlers:
878
+ self.crawlers.remove(crawler)
879
+
880
+ if task and task in self._active_tasks:
881
+ self._active_tasks.remove(task)
882
+
883
+ self.semaphore.release()
884
+
885
+ except Exception as cleanup_error:
886
+ logger.warning(f"Error cleaning up resources: {cleanup_error}")
887
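_run_spider_with_limit() is the standard bounded-concurrency idiom: acquire a semaphore slot, do the work, release in finally. In isolation the pattern looks like this (pure asyncio, no crawlo types):

    import asyncio

    async def run_limited(sem, job_id):
        await sem.acquire()                 # wait for a free slot
        try:
            await asyncio.sleep(0.1)        # stand-in for crawler.crawl()
            print(f"job {job_id} done")
        finally:
            sem.release()                   # always free the slot

    async def main():
        sem = asyncio.Semaphore(3)          # at most 3 jobs at once
        await asyncio.gather(*(run_limited(sem, i) for i in range(8)))

    asyncio.run(main())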
+
888
+ def _shutdown(self, _signum, _frame):
889
+ """
890
+ Graceful shutdown signal handling
891
+
892
+ Stops all crawler engines and schedules the asynchronous shutdown wait
893
+ """
894
+ signal_name = {signal.SIGINT: 'SIGINT', signal.SIGTERM: 'SIGTERM'}.get(_signum, str(_signum))
895
+ logger.warning(f"Received shutdown signal {signal_name}, stopping all crawlers...")
896
+
897
+ # Set shutdown event
898
+ if hasattr(self, '_shutdown_event'):
899
+ self._shutdown_event.set()
900
+
901
+ # Stop all crawler engines
902
+ for crawler in list(self.crawlers):
903
+ if crawler.engine:
904
+ crawler.engine.running = False
905
+ crawler.engine.normal = False
906
+ logger.debug(f"Crawler engine stopped: {getattr(crawler.spider, 'name', 'Unknown')}")
907
+
908
+ # Create shutdown task
909
+ asyncio.create_task(self._wait_for_shutdown())
910
+
911
+ logger.info("Shutdown command sent, waiting for crawlers to complete current tasks...")
912
+
913
+ async def _wait_for_shutdown(self):
914
+ """
915
+ Wait for all active tasks to complete
916
+
917
+ Waits up to 30 seconds for active tasks, then force-cancels any stragglers
918
+ """
919
+ try:
920
+ # Stop monitoring task
921
+ await self.stop_monitoring()
922
+
923
+ # Wait for active tasks to complete
924
+ pending = [t for t in self._active_tasks if not t.done()]
925
+
926
+ if pending:
927
+ logger.info(
928
+ f"Waiting for {len(pending)} active tasks to complete..."
929
+ f" (maximum wait time: 30 seconds)"
930
+ )
931
+
932
+ # Set timeout
933
+ try:
934
+ await asyncio.wait_for(
935
+ asyncio.gather(*pending, return_exceptions=True),
936
+ timeout=30.0
937
+ )
938
+ except asyncio.TimeoutError:
939
+ logger.warning("Some tasks timed out, forcing cancellation...")
940
+
941
+ # Force cancel timed out tasks
942
+ for task in pending:
943
+ if not task.done():
944
+ task.cancel()
945
+
946
+ # Wait for cancellation to complete
947
+ await asyncio.gather(*pending, return_exceptions=True)
948
+
949
+ # Final cleanup
950
+ await self._cleanup_process()
951
+
952
+ # Output final statistics
953
+ final_stats = self.context.get_stats()
954
+ logger.info(
955
+ f"All crawlers gracefully shut down 👋\n"
956
+ f" - Total crawlers: {final_stats['total_crawlers']}\n"
957
+ f" - Successfully completed: {final_stats['completed_crawlers']}\n"
958
+ f" - Failed: {final_stats['failed_crawlers']}\n"
959
+ f" - Success rate: {final_stats['success_rate']:.1f}%\n"
960
+ f" - Total runtime: {final_stats['duration_seconds']} seconds"
961
+ )
962
+
963
+ except Exception as e:
964
+ logger.error(f"Error during shutdown process: {e}", exc_info=True)
965
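_wait_for_shutdown() bounds the wait with asyncio.wait_for and force-cancels whatever is still pending after the timeout. The same timeout-then-cancel idiom as a standalone helper:

    import asyncio

    async def drain(pending, timeout=30.0):
        """Wait for tasks, then force-cancel anything still running after `timeout`."""
        if not pending:
            return
        try:
            await asyncio.wait_for(asyncio.gather(*pending, return_exceptions=True), timeout)
        except asyncio.TimeoutError:
            for task in pending:            # force-cancel anything still running
                if not task.done():
                    task.cancel()
            await asyncio.gather(*pending, return_exceptions=True)

    async def main():
        tasks = [asyncio.create_task(asyncio.sleep(t)) for t in (0.05, 10)]
        await drain(tasks, timeout=0.2)     # the 10-second task is cancelled

    asyncio.run(main())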
+
966
+ @classmethod
967
+ def _get_default_settings(cls) -> SettingManager:
968
+ """
969
+ Load default configuration
970
+
971
+ Falls back to an empty SettingManager if the default configuration cannot be loaded
972
+ """
973
+ try:
974
+ settings = get_settings()
975
+ _get_logger().debug("Default configuration loaded successfully")
976
+ return settings
977
+ except Exception as e:
978
+ _get_logger().warning(f"Unable to load default configuration: {e}, using empty configuration")
979
+ return SettingManager()
980
+
981
+ def _log_startup_info(self):
982
+ """Print startup information, including run mode and key configuration checks"""
983
+ # Get run mode
984
+ run_mode = self.settings.get('RUN_MODE', 'standalone')
985
+
986
+ # Get version number
987
+ version = self.settings.get('VERSION', '1.0.0')
988
+ if not version or version == 'None':
989
+ version = '1.0.0'
990
+
991
+ # Build startup info log
992
+ startup_info = [
993
+ f"Crawlo Framework Started v{version}"
994
+ ]
995
+
996
+ # Get actual queue type
997
+ queue_type = self.settings.get('QUEUE_TYPE', 'memory')
998
+
999
+ # Display information based on run mode and queue type combination
1000
+ if run_mode == 'distributed':
1001
+ startup_info.append("Run Mode: distributed")
1002
+ startup_info.append("Distributed Mode - Multi-node collaboration supported")
1003
+ # Show Redis configuration
1004
+ redis_host = self.settings.get('REDIS_HOST', 'localhost')
1005
+ redis_port = self.settings.get('REDIS_PORT', 6379)
1006
+ startup_info.append(f"Redis Address: {redis_host}:{redis_port}")
1007
+ elif run_mode == 'standalone':
1008
+ if queue_type == 'redis':
1009
+ startup_info.append("Run Mode: standalone+redis")
1010
+ # Show Redis configuration
1011
+ redis_host = self.settings.get('REDIS_HOST', 'localhost')
1012
+ redis_port = self.settings.get('REDIS_PORT', 6379)
1013
+ startup_info.append(f"Redis Address: {redis_host}:{redis_port}")
1014
+ elif queue_type == 'auto':
1015
+ startup_info.append("Run Mode: standalone+auto")
1016
+ else: # memory
1017
+ startup_info.append("Run Mode: standalone")
1018
+ else:
1019
+ startup_info.append(f"Run Mode: {run_mode}")
1020
+
1021
+ # Print startup information
1022
+ for info in startup_info:
1023
+ logger.info(info)
1024
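_log_startup_info() keys its output off RUN_MODE, QUEUE_TYPE, VERSION and the Redis settings. As a rough illustration, a distributed run would carry settings along these lines (values are placeholders and the settings-file layout is project specific):

    # illustrative settings values only
    RUN_MODE = "distributed"        # or "standalone"
    QUEUE_TYPE = "redis"            # "memory", "redis" or "auto"
    REDIS_HOST = "127.0.0.1"
    REDIS_PORT = 6379
    VERSION = "1.0.0"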
+
1025
+
1026
+ # === Utility functions ===
1027
+
1028
+ def create_crawler_with_optimizations(
1029
+ spider_cls: Type[Spider],
1030
+ settings: Optional[SettingManager] = None,
1031
+ **optimization_kwargs
1032
+ ) -> Crawler:
1033
+ """
1034
+ Create an optimized crawler instance
1035
+
1036
+ :param spider_cls: Spider class
1037
+ :param settings: Settings manager
1038
+ :param optimization_kwargs: Optimization parameters
1039
+ :return: Crawler instance
1040
+ """
1041
+ if settings is None:
1042
+ settings = SettingManager()
1043
+
1044
+ # Apply optimization configuration
1045
+ for key, value in optimization_kwargs.items():
1046
+ settings.set(key, value)
1047
+
1048
+ context = CrawlerContext()
1049
+ return Crawler(spider_cls, settings, context)
1050
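Because optimization_kwargs are written straight into the SettingManager, any setting can be overridden per crawler. A hedged usage sketch (NewsSpider and DOWNLOAD_DELAY are assumptions; CONCURRENCY is an existing setting key used in this module):

    async def run_one():
        crawler = create_crawler_with_optimizations(
            NewsSpider,            # hypothetical Spider subclass
            CONCURRENCY=8,         # existing setting key used in this module
            DOWNLOAD_DELAY=0.5,    # assumed setting name, shown only as an example
        )
        await crawler.crawl()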
+
1051
+
1052
+ def create_process_with_large_scale_config(
1053
+ config_type: str = 'balanced',
1054
+ concurrency: int = 16,
1055
+ **kwargs
1056
+ ) -> CrawlerProcess:
1057
+ """
1058
+ Create a process manager that supports large-scale optimization
1059
+
1060
+ :param config_type: Configuration type ('conservative', 'balanced', 'aggressive', 'memory_optimized')
1061
+ :param concurrency: Concurrency count
1062
+ :param kwargs: Other parameters
1063
+ :return: Process manager
1064
+ """
1065
+ try:
1066
+ from crawlo.utils.large_scale_config import LargeScaleConfig
1067
+
1068
+ # Get optimization configuration
1069
+ config_methods = {
1070
+ 'conservative': LargeScaleConfig.conservative_config,
1071
+ 'balanced': LargeScaleConfig.balanced_config,
1072
+ 'aggressive': LargeScaleConfig.aggressive_config,
1073
+ 'memory_optimized': LargeScaleConfig.memory_optimized_config
1074
+ }
1075
+
1076
+ if config_type not in config_methods:
1077
+ logger.warning(f"Unknown configuration type: {config_type}, using default configuration")
1078
+ settings = SettingManager()
1079
+ else:
1080
+ config = config_methods[config_type](concurrency)
1081
+ settings = SettingManager()
1082
+ settings.update(config)
1083
+
1084
+ return CrawlerProcess(
1085
+ settings=settings,
1086
+ max_concurrency=concurrency,
1087
+ **kwargs
1088
+ )
1089
+
1090
+ except ImportError:
1091
+ logger.warning("Large-scale configuration module not available, using default configuration")
1092
+ return CrawlerProcess(max_concurrency=concurrency, **kwargs)
1093
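A possible way to use the helper end to end; 'balanced' and concurrency=16 follow the signature above, while the spider name is illustrative:

    import asyncio

    async def main():
        process = create_process_with_large_scale_config("balanced", concurrency=16)
        await process.crawl("my_spider")                 # illustrative registered name
        print(process.get_process_stats()["context"])    # final context statistics

    asyncio.run(main())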
+
1094
+
1095
+ # === Exported interfaces ===
1096
+
1097
+ __all__ = [
1098
+ 'Crawler',
1099
+ 'CrawlerProcess',
1100
+ 'CrawlerContext',
1101
+ 'create_crawler_with_optimizations',
1102
+ 'create_process_with_large_scale_config'
1103
+ ]