crawlo 1.3.6__py3-none-any.whl → 1.3.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
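As a rough way to reproduce such a comparison locally, the sketch below downloads both wheels with pip and prints a unified diff of one file. The package name and file path come from this page; everything else (directory names, the helper functions) is illustrative.

import difflib
import subprocess
import zipfile
from pathlib import Path


def fetch_wheel(version: str, dest: Path) -> Path:
    # Download the wheel for a single version, without its dependencies.
    dest.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        ["pip", "download", f"crawlo=={version}", "--no-deps",
         "--only-binary", ":all:", "-d", str(dest)],
        check=True,
    )
    return next(dest.glob("crawlo-*.whl"))


def read_member(wheel: Path, member: str) -> list[str]:
    # A wheel is a zip archive; read one file out of it as text lines.
    with zipfile.ZipFile(wheel) as zf:
        return zf.read(member).decode("utf-8").splitlines(keepends=True)


old = read_member(fetch_wheel("1.3.6", Path("wheels/old")), "crawlo/crawler.py")
new = read_member(fetch_wheel("1.3.7", Path("wheels/new")), "crawlo/crawler.py")
print("".join(difflib.unified_diff(old, new, "1.3.6/crawler.py", "1.3.7/crawler.py")))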

Potentially problematic release: this version of crawlo might be problematic.
Files changed (291)
  1. crawlo/__init__.py +87 -87
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +341 -341
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +45 -45
  16. crawlo/core/engine.py +439 -439
  17. crawlo/core/processor.py +40 -40
  18. crawlo/core/scheduler.py +257 -257
  19. crawlo/crawler.py +638 -638
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +273 -273
  23. crawlo/downloader/aiohttp_downloader.py +228 -228
  24. crawlo/downloader/cffi_downloader.py +245 -245
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +39 -39
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +57 -57
  34. crawlo/extension/log_stats.py +81 -81
  35. crawlo/extension/logging_extension.py +61 -61
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +103 -103
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/filters/__init__.py +154 -154
  44. crawlo/filters/aioredis_filter.py +257 -257
  45. crawlo/filters/memory_filter.py +269 -269
  46. crawlo/framework.py +292 -292
  47. crawlo/initialization/__init__.py +39 -39
  48. crawlo/initialization/built_in.py +425 -425
  49. crawlo/initialization/context.py +141 -141
  50. crawlo/initialization/core.py +193 -193
  51. crawlo/initialization/phases.py +148 -148
  52. crawlo/initialization/registry.py +145 -145
  53. crawlo/items/__init__.py +23 -23
  54. crawlo/items/base.py +23 -23
  55. crawlo/items/fields.py +52 -52
  56. crawlo/items/items.py +104 -104
  57. crawlo/logging/__init__.py +37 -37
  58. crawlo/logging/config.py +96 -96
  59. crawlo/logging/factory.py +128 -128
  60. crawlo/logging/manager.py +111 -111
  61. crawlo/middleware/__init__.py +21 -21
  62. crawlo/middleware/default_header.py +132 -132
  63. crawlo/middleware/download_delay.py +104 -104
  64. crawlo/middleware/middleware_manager.py +135 -135
  65. crawlo/middleware/offsite.py +123 -123
  66. crawlo/middleware/proxy.py +386 -386
  67. crawlo/middleware/request_ignore.py +86 -86
  68. crawlo/middleware/response_code.py +163 -163
  69. crawlo/middleware/response_filter.py +136 -136
  70. crawlo/middleware/retry.py +124 -124
  71. crawlo/middleware/simple_proxy.py +65 -65
  72. crawlo/mode_manager.py +212 -212
  73. crawlo/network/__init__.py +21 -21
  74. crawlo/network/request.py +379 -379
  75. crawlo/network/response.py +359 -359
  76. crawlo/pipelines/__init__.py +21 -21
  77. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  78. crawlo/pipelines/console_pipeline.py +39 -39
  79. crawlo/pipelines/csv_pipeline.py +316 -316
  80. crawlo/pipelines/database_dedup_pipeline.py +222 -222
  81. crawlo/pipelines/json_pipeline.py +218 -218
  82. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  83. crawlo/pipelines/mongo_pipeline.py +131 -131
  84. crawlo/pipelines/mysql_pipeline.py +325 -325
  85. crawlo/pipelines/pipeline_manager.py +76 -76
  86. crawlo/pipelines/redis_dedup_pipeline.py +166 -166
  87. crawlo/project.py +327 -327
  88. crawlo/queue/pqueue.py +42 -42
  89. crawlo/queue/queue_manager.py +522 -503
  90. crawlo/queue/redis_priority_queue.py +367 -326
  91. crawlo/settings/__init__.py +7 -7
  92. crawlo/settings/default_settings.py +321 -321
  93. crawlo/settings/setting_manager.py +214 -214
  94. crawlo/spider/__init__.py +657 -657
  95. crawlo/stats_collector.py +73 -73
  96. crawlo/subscriber.py +129 -129
  97. crawlo/task_manager.py +138 -138
  98. crawlo/templates/crawlo.cfg.tmpl +10 -10
  99. crawlo/templates/project/__init__.py.tmpl +3 -3
  100. crawlo/templates/project/items.py.tmpl +17 -17
  101. crawlo/templates/project/middlewares.py.tmpl +118 -118
  102. crawlo/templates/project/pipelines.py.tmpl +96 -96
  103. crawlo/templates/project/settings.py.tmpl +170 -167
  104. crawlo/templates/project/settings_distributed.py.tmpl +169 -166
  105. crawlo/templates/project/settings_gentle.py.tmpl +166 -166
  106. crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
  107. crawlo/templates/project/settings_minimal.py.tmpl +65 -65
  108. crawlo/templates/project/settings_simple.py.tmpl +164 -164
  109. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  110. crawlo/templates/run.py.tmpl +34 -34
  111. crawlo/templates/spider/spider.py.tmpl +143 -143
  112. crawlo/templates/spiders_init.py.tmpl +9 -9
  113. crawlo/tools/__init__.py +200 -200
  114. crawlo/tools/anti_crawler.py +268 -268
  115. crawlo/tools/authenticated_proxy.py +240 -240
  116. crawlo/tools/data_formatter.py +225 -225
  117. crawlo/tools/data_validator.py +180 -180
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +388 -388
  120. crawlo/tools/encoding_converter.py +127 -127
  121. crawlo/tools/network_diagnostic.py +364 -364
  122. crawlo/tools/request_tools.py +82 -82
  123. crawlo/tools/retry_mechanism.py +224 -224
  124. crawlo/tools/scenario_adapter.py +262 -262
  125. crawlo/tools/text_cleaner.py +232 -232
  126. crawlo/utils/__init__.py +34 -34
  127. crawlo/utils/batch_processor.py +259 -259
  128. crawlo/utils/class_loader.py +25 -25
  129. crawlo/utils/controlled_spider_mixin.py +439 -439
  130. crawlo/utils/db_helper.py +343 -343
  131. crawlo/utils/enhanced_error_handler.py +356 -356
  132. crawlo/utils/env_config.py +142 -142
  133. crawlo/utils/error_handler.py +165 -165
  134. crawlo/utils/func_tools.py +82 -82
  135. crawlo/utils/large_scale_config.py +286 -286
  136. crawlo/utils/large_scale_helper.py +344 -344
  137. crawlo/utils/log.py +79 -79
  138. crawlo/utils/performance_monitor.py +285 -285
  139. crawlo/utils/queue_helper.py +175 -175
  140. crawlo/utils/redis_connection_pool.py +388 -388
  141. crawlo/utils/redis_key_validator.py +198 -198
  142. crawlo/utils/request.py +267 -267
  143. crawlo/utils/request_serializer.py +225 -225
  144. crawlo/utils/spider_loader.py +61 -61
  145. crawlo/utils/system.py +11 -11
  146. crawlo/utils/tools.py +4 -4
  147. crawlo/utils/url.py +39 -39
  148. {crawlo-1.3.6.dist-info → crawlo-1.3.7.dist-info}/METADATA +1199 -1126
  149. crawlo-1.3.7.dist-info/RECORD +292 -0
  150. examples/__init__.py +7 -7
  151. tests/__init__.py +7 -7
  152. tests/advanced_tools_example.py +275 -275
  153. tests/authenticated_proxy_example.py +106 -106
  154. tests/baidu_performance_test.py +108 -108
  155. tests/baidu_test.py +59 -59
  156. tests/cleaners_example.py +160 -160
  157. tests/comprehensive_framework_test.py +212 -212
  158. tests/comprehensive_test.py +81 -81
  159. tests/comprehensive_testing_summary.md +186 -186
  160. tests/config_validation_demo.py +142 -142
  161. tests/controlled_spider_example.py +205 -205
  162. tests/date_tools_example.py +180 -180
  163. tests/debug_configure.py +69 -69
  164. tests/debug_framework_logger.py +84 -84
  165. tests/debug_log_config.py +126 -126
  166. tests/debug_log_levels.py +63 -63
  167. tests/debug_pipelines.py +66 -66
  168. tests/detailed_log_test.py +233 -233
  169. tests/distributed_test.py +66 -66
  170. tests/distributed_test_debug.py +76 -76
  171. tests/dynamic_loading_example.py +523 -523
  172. tests/dynamic_loading_test.py +104 -104
  173. tests/env_config_example.py +133 -133
  174. tests/error_handling_example.py +171 -171
  175. tests/final_comprehensive_test.py +151 -151
  176. tests/final_log_test.py +260 -260
  177. tests/final_validation_test.py +182 -182
  178. tests/fix_log_test.py +142 -142
  179. tests/framework_performance_test.py +202 -202
  180. tests/log_buffering_test.py +111 -111
  181. tests/log_generation_timing_test.py +153 -153
  182. tests/optimized_performance_test.py +211 -211
  183. tests/performance_comparison.py +245 -245
  184. tests/queue_blocking_test.py +113 -113
  185. tests/queue_test.py +89 -89
  186. tests/redis_key_validation_demo.py +130 -130
  187. tests/request_params_example.py +150 -150
  188. tests/response_improvements_example.py +144 -144
  189. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  190. tests/scrapy_comparison/scrapy_test.py +133 -133
  191. tests/simple_command_test.py +119 -119
  192. tests/simple_crawlo_test.py +127 -127
  193. tests/simple_log_test.py +57 -57
  194. tests/simple_log_test2.py +137 -137
  195. tests/simple_optimization_test.py +128 -128
  196. tests/simple_queue_type_test.py +41 -41
  197. tests/simple_spider_test.py +49 -49
  198. tests/simple_test.py +47 -47
  199. tests/spider_log_timing_test.py +177 -177
  200. tests/test_advanced_tools.py +148 -148
  201. tests/test_all_commands.py +230 -230
  202. tests/test_all_redis_key_configs.py +145 -145
  203. tests/test_authenticated_proxy.py +141 -141
  204. tests/test_batch_processor.py +178 -178
  205. tests/test_cleaners.py +54 -54
  206. tests/test_component_factory.py +174 -174
  207. tests/test_comprehensive.py +146 -146
  208. tests/test_config_consistency.py +80 -80
  209. tests/test_config_merge.py +152 -152
  210. tests/test_config_validator.py +182 -182
  211. tests/test_controlled_spider_mixin.py +79 -79
  212. tests/test_crawlo_proxy_integration.py +108 -108
  213. tests/test_date_tools.py +123 -123
  214. tests/test_default_header_middleware.py +158 -158
  215. tests/test_distributed.py +65 -65
  216. tests/test_double_crawlo_fix.py +204 -207
  217. tests/test_double_crawlo_fix_simple.py +124 -124
  218. tests/test_download_delay_middleware.py +221 -221
  219. tests/test_downloader_proxy_compatibility.py +268 -268
  220. tests/test_dynamic_downloaders_proxy.py +124 -124
  221. tests/test_dynamic_proxy.py +92 -92
  222. tests/test_dynamic_proxy_config.py +146 -146
  223. tests/test_dynamic_proxy_real.py +109 -109
  224. tests/test_edge_cases.py +303 -303
  225. tests/test_enhanced_error_handler.py +270 -270
  226. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  227. tests/test_env_config.py +121 -121
  228. tests/test_error_handler_compatibility.py +112 -112
  229. tests/test_factories.py +252 -252
  230. tests/test_final_validation.py +153 -153
  231. tests/test_framework_env_usage.py +103 -103
  232. tests/test_framework_logger.py +66 -66
  233. tests/test_framework_startup.py +64 -64
  234. tests/test_get_component_logger.py +83 -83
  235. tests/test_integration.py +169 -169
  236. tests/test_item_dedup_redis_key.py +122 -122
  237. tests/test_large_scale_config.py +112 -112
  238. tests/test_large_scale_helper.py +235 -235
  239. tests/test_logging_system.py +282 -282
  240. tests/test_mode_change.py +72 -72
  241. tests/test_mode_consistency.py +51 -51
  242. tests/test_offsite_middleware.py +221 -221
  243. tests/test_parsel.py +29 -29
  244. tests/test_performance.py +327 -327
  245. tests/test_performance_monitor.py +115 -115
  246. tests/test_proxy_api.py +264 -264
  247. tests/test_proxy_health_check.py +32 -32
  248. tests/test_proxy_middleware.py +121 -121
  249. tests/test_proxy_middleware_enhanced.py +216 -216
  250. tests/test_proxy_middleware_integration.py +136 -136
  251. tests/test_proxy_middleware_refactored.py +184 -184
  252. tests/test_proxy_providers.py +56 -56
  253. tests/test_proxy_stats.py +19 -19
  254. tests/test_proxy_strategies.py +59 -59
  255. tests/test_queue_empty_check.py +41 -41
  256. tests/test_queue_manager_double_crawlo.py +173 -173
  257. tests/test_queue_manager_redis_key.py +179 -176
  258. tests/test_queue_naming.py +155 -0
  259. tests/test_queue_type.py +106 -106
  260. tests/test_random_user_agent.py +72 -72
  261. tests/test_real_scenario_proxy.py +195 -195
  262. tests/test_redis_config.py +28 -28
  263. tests/test_redis_connection_pool.py +294 -294
  264. tests/test_redis_key_naming.py +181 -181
  265. tests/test_redis_key_validator.py +123 -123
  266. tests/test_redis_queue.py +224 -224
  267. tests/test_redis_queue_name_fix.py +176 -0
  268. tests/test_request_ignore_middleware.py +182 -182
  269. tests/test_request_params.py +111 -111
  270. tests/test_request_serialization.py +70 -70
  271. tests/test_response_code_middleware.py +349 -349
  272. tests/test_response_filter_middleware.py +427 -427
  273. tests/test_response_improvements.py +152 -152
  274. tests/test_retry_middleware.py +241 -241
  275. tests/test_scheduler.py +252 -252
  276. tests/test_scheduler_config_update.py +133 -133
  277. tests/test_simple_response.py +61 -61
  278. tests/test_telecom_spider_redis_key.py +205 -205
  279. tests/test_template_content.py +87 -87
  280. tests/test_template_redis_key.py +134 -134
  281. tests/test_tools.py +159 -159
  282. tests/test_user_agents.py +96 -96
  283. tests/tools_example.py +260 -260
  284. tests/untested_features_report.md +138 -138
  285. tests/verify_debug.py +51 -51
  286. tests/verify_distributed.py +117 -117
  287. tests/verify_log_fix.py +111 -111
  288. crawlo-1.3.6.dist-info/RECORD +0 -290
  289. {crawlo-1.3.6.dist-info → crawlo-1.3.7.dist-info}/WHEEL +0 -0
  290. {crawlo-1.3.6.dist-info → crawlo-1.3.7.dist-info}/entry_points.txt +0 -0
  291. {crawlo-1.3.6.dist-info → crawlo-1.3.7.dist-info}/top_level.txt +0 -0
crawlo/crawler.py CHANGED
@@ -1,639 +1,639 @@
This hunk removes and re-adds every line of the file; the removed and re-added content shown in the diff is identical. The file content (old and new) is:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Refactored Crawler System
=========================

Design principles:
1. Single responsibility - each class has one clearly defined job
2. Dependency injection - components are created via factories, which makes testing easier
3. State management - clear state transitions and lifecycle
4. Error handling - graceful error handling and recovery mechanisms
"""

import asyncio
import time
from contextlib import asynccontextmanager
from dataclasses import dataclass
from enum import Enum
from typing import Optional, Type, Dict, Any, List

from crawlo.factories import get_component_registry
from crawlo.initialization import initialize_framework, is_framework_ready
from crawlo.logging import get_logger


class CrawlerState(Enum):
    """Crawler state enum"""
    CREATED = "created"
    INITIALIZING = "initializing"
    READY = "ready"
    RUNNING = "running"
    CLOSING = "closing"
    CLOSED = "closed"
    ERROR = "error"


@dataclass
class CrawlerMetrics:
    """Crawler performance metrics"""
    start_time: Optional[float] = None
    end_time: Optional[float] = None
    initialization_duration: float = 0.0
    crawl_duration: float = 0.0
    request_count: int = 0
    success_count: int = 0
    error_count: int = 0

    def get_total_duration(self) -> float:
        if self.start_time and self.end_time:
            return self.end_time - self.start_time
        return 0.0

    def get_success_rate(self) -> float:
        total = self.success_count + self.error_count
        return (self.success_count / total * 100) if total > 0 else 0.0


class ModernCrawler:
    """
    Modern Crawler implementation

    Features:
    1. Clear state management
    2. Dependency injection
    3. Component-based architecture
    4. Robust error handling
    """

    def __init__(self, spider_cls: Type, settings=None):
        self._spider_cls = spider_cls
        self._settings = settings
        self._state = CrawlerState.CREATED
        self._state_lock = asyncio.Lock()

        # Components
        self._spider = None
        self._engine = None
        self._stats = None
        self._subscriber = None
        self._extension = None

        # Metrics
        self._metrics = CrawlerMetrics()

        # Logging
        self._logger = get_logger(f'crawler.{spider_cls.__name__ if spider_cls else "unknown"}')

        # Make sure the framework is initialized
        self._ensure_framework_ready()

    def _ensure_framework_ready(self):
        """Make sure the framework is ready"""
        if not is_framework_ready():
            try:
                self._settings = initialize_framework(self._settings)
                self._logger.debug("Framework initialized successfully")
            except Exception as e:
                self._logger.warning(f"Framework initialization failed: {e}")
                # Fall back to a degraded strategy
                if not self._settings:
                    from crawlo.settings.setting_manager import SettingManager
                    self._settings = SettingManager()

        # Make sure settings is a SettingManager instance
        if isinstance(self._settings, dict):
            from crawlo.settings.setting_manager import SettingManager
            settings_manager = SettingManager()
            settings_manager.update_attributes(self._settings)
            self._settings = settings_manager

    @property
    def state(self) -> CrawlerState:
        """Get the current state"""
        return self._state

    @property
    def spider(self):
        """Get the Spider instance"""
        return self._spider

    @property
    def stats(self):
        """Get the Stats instance (backward compatible)"""
        return self._stats

    @property
    def metrics(self) -> CrawlerMetrics:
        """Get the performance metrics"""
        return self._metrics

    @property
    def settings(self):
        """Get the settings"""
        return self._settings

    @property
    def engine(self):
        """Get the Engine instance (backward compatible)"""
        return self._engine

    @property
    def subscriber(self):
        """Get the Subscriber instance (backward compatible)"""
        return self._subscriber

    @property
    def extension(self):
        """Get the Extension instance (backward compatible)"""
        return self._extension

    @extension.setter
    def extension(self, value):
        """Set the Extension instance (backward compatible)"""
        self._extension = value

    def _create_extension(self):
        """Create the extension manager (backward compatible)"""
        if self._extension is None:
            try:
                registry = get_component_registry()
                self._extension = registry.create('extension_manager', crawler=self)
            except Exception as e:
                self._logger.warning(f"Failed to create extension manager: {e}")
        return self._extension

    async def close(self):
        """Close the crawler (backward compatible)"""
        await self._cleanup()

    async def crawl(self):
        """Run the crawl task"""
        async with self._lifecycle_manager():
            await self._initialize_components()
            await self._run_crawler()

    @asynccontextmanager
    async def _lifecycle_manager(self):
        """Lifecycle management"""
        self._metrics.start_time = time.time()

        try:
            yield
        except Exception as e:
            await self._handle_error(e)
            raise
        finally:
            await self._cleanup()
            self._metrics.end_time = time.time()

    async def _initialize_components(self):
        """Initialize components"""
        async with self._state_lock:
            if self._state != CrawlerState.CREATED:
                raise RuntimeError(f"Cannot initialize from state {self._state}")

            self._state = CrawlerState.INITIALIZING

        init_start = time.time()

        try:
            # Create components via the component factory
            registry = get_component_registry()

            # Create the Subscriber (no dependencies)
            self._subscriber = registry.create('subscriber')

            # Create the Spider
            self._spider = self._create_spider()

            # Create the Engine (requires the crawler argument)
            self._engine = registry.create('engine', crawler=self)

            # Create Stats (requires the crawler argument)
            self._stats = registry.create('stats', crawler=self)

            # Create the Extension Manager (optional, requires the crawler argument)
            try:
                self._extension = registry.create('extension_manager', crawler=self)
            except Exception as e:
                self._logger.warning(f"Failed to create extension manager: {e}")

            self._metrics.initialization_duration = time.time() - init_start

            async with self._state_lock:
                self._state = CrawlerState.READY

            self._logger.debug(f"Crawler components initialized successfully in {self._metrics.initialization_duration:.2f}s")

        except Exception as e:
            async with self._state_lock:
                self._state = CrawlerState.ERROR
            raise RuntimeError(f"Component initialization failed: {e}")

    def _create_spider(self):
        """Create the Spider instance"""
        if not self._spider_cls:
            raise ValueError("Spider class not provided")

        # Validate the Spider class
        if not hasattr(self._spider_cls, 'name'):
            raise ValueError("Spider class must have 'name' attribute")

        # Create the Spider instance
        spider = self._spider_cls()

        # Set the crawler reference
        if hasattr(spider, 'crawler'):
            spider.crawler = self

        return spider

    async def _run_crawler(self):
        """Run the crawler engine"""
        async with self._state_lock:
            if self._state != CrawlerState.READY:
                raise RuntimeError(f"Cannot run from state {self._state}")

            self._state = CrawlerState.RUNNING

        crawl_start = time.time()

        try:
            # Start the engine
            if self._engine:
                await self._engine.start_spider(self._spider)
            else:
                raise RuntimeError("Engine not initialized")

            self._metrics.crawl_duration = time.time() - crawl_start

            self._logger.info(f"Crawler completed successfully in {self._metrics.crawl_duration:.2f}s")

        except Exception as e:
            self._metrics.crawl_duration = time.time() - crawl_start
            raise RuntimeError(f"Crawler execution failed: {e}")

    async def _handle_error(self, error: Exception):
        """Handle an error"""
        async with self._state_lock:
            self._state = CrawlerState.ERROR

        self._metrics.error_count += 1
        self._logger.error(f"Crawler error: {error}", exc_info=True)

        # Error-recovery logic could be added here

    async def _cleanup(self):
        """Clean up resources"""
        async with self._state_lock:
            if self._state not in [CrawlerState.CLOSING, CrawlerState.CLOSED]:
                self._state = CrawlerState.CLOSING

        try:
            # Close each component
            if self._engine and hasattr(self._engine, 'close'):
                try:
                    await self._engine.close()
                except Exception as e:
                    self._logger.warning(f"Engine cleanup failed: {e}")

            # Call the Spider's spider_closed method
            if self._spider:
                try:
                    if asyncio.iscoroutinefunction(self._spider.spider_closed):
                        await self._spider.spider_closed()
                    else:
                        self._spider.spider_closed()
                except Exception as e:
                    self._logger.warning(f"Spider cleanup failed: {e}")

            if self._stats and hasattr(self._stats, 'close'):
                try:
                    close_result = self._stats.close()
                    if asyncio.iscoroutine(close_result):
                        await close_result
                except Exception as e:
                    self._logger.warning(f"Stats cleanup failed: {e}")

            async with self._state_lock:
                self._state = CrawlerState.CLOSED

            self._logger.debug("Crawler cleanup completed")

        except Exception as e:
            self._logger.error(f"Cleanup error: {e}")


class CrawlerProcess:
    """
    Crawler process manager - manages the execution of multiple crawlers

    Simplified version focused on core functionality
    """

    def __init__(self, settings=None, max_concurrency: int = 3, spider_modules=None):
        self._settings = settings or initialize_framework()
        self._max_concurrency = max_concurrency
        self._crawlers: List[ModernCrawler] = []
        self._semaphore = asyncio.Semaphore(max_concurrency)
        self._logger = get_logger('crawler.process')
        self._spider_modules = spider_modules  # Keep spider_modules for later lookups

        # If spider_modules was provided, auto-register the spiders in those modules
        if spider_modules:
            self._register_spider_modules(spider_modules)

        # Metrics
        self._start_time: Optional[float] = None
        self._end_time: Optional[float] = None

    def _register_spider_modules(self, spider_modules):
        """Register spider modules"""
        try:
            from crawlo.spider import get_global_spider_registry
            registry = get_global_spider_registry()

            self._logger.debug(f"Registering spider modules: {spider_modules}")

            initial_spider_count = len(registry)

            for module_path in spider_modules:
                try:
                    # Import the module
                    __import__(module_path)
                    self._logger.debug(f"Successfully imported spider module: {module_path}")
                except ImportError as e:
                    self._logger.warning(f"Failed to import spider module {module_path}: {e}")
                    # If the import fails, try auto-discovery
                    self._auto_discover_spider_modules([module_path])

            # Check which spiders are now in the registry
            spider_names = list(registry.keys())
            self._logger.debug(f"Registered spiders after import: {spider_names}")

            # If importing the modules registered no new spiders, try auto-discovery
            final_spider_count = len(registry)
            if final_spider_count == initial_spider_count:
                self._logger.debug("No new spiders registered after importing modules, attempting auto-discovery")
                self._auto_discover_spider_modules(spider_modules)
                spider_names = list(registry.keys())
                self._logger.debug(f"Registered spiders after auto-discovery: {spider_names}")
        except Exception as e:
            self._logger.warning(f"Error registering spider modules: {e}")

    def _auto_discover_spider_modules(self, spider_modules):
        """
        Auto-discover and import all spiders in the given spider modules.
        Scans every Python file under the specified module directories and imports it.
        """
        try:
            from crawlo.spider import get_global_spider_registry
            import importlib
            from pathlib import Path
            import sys

            registry = get_global_spider_registry()
            initial_spider_count = len(registry)

            for module_path in spider_modules:
                try:
                    # Convert the module path to a filesystem path,
                    # e.g. ofweek_standalone.spiders -> ofweek_standalone/spiders
                    package_parts = module_path.split('.')
                    if len(package_parts) < 2:
                        continue

                    # Find the project root directory
                    project_root = None
                    for path in sys.path:
                        if path and Path(path).exists():
                            possible_module_path = Path(path) / package_parts[0]
                            if possible_module_path.exists():
                                project_root = path
                                break

                    if not project_root:
                        # Fall back to the current working directory
                        project_root = str(Path.cwd())

                    # Build the module directory path
                    module_dir = Path(project_root)
                    for part in package_parts:
                        module_dir = module_dir / part

                    # If the directory exists, scan its Python files
                    if module_dir.exists() and module_dir.is_dir():
                        # Import every Python file in the directory (except __init__.py)
                        for py_file in module_dir.glob("*.py"):
                            if py_file.name.startswith('_'):
                                continue

                            # Build the module name
                            module_name = py_file.stem  # file name without extension
                            full_module_path = f"{module_path}.{module_name}"

                            try:
                                # Import the module to trigger Spider registration
                                importlib.import_module(full_module_path)
                            except ImportError as e:
                                self._logger.warning(f"Failed to auto-import spider module {full_module_path}: {e}")
                except Exception as e:
                    self._logger.warning(f"Error during auto-discovery for module {module_path}: {e}")

            # Check whether any new spiders were registered
            final_spider_count = len(registry)
            if final_spider_count > initial_spider_count:
                new_spiders = list(registry.keys())
                self._logger.info(f"Auto-discovered {final_spider_count - initial_spider_count} new spiders: {new_spiders}")

        except Exception as e:
            self._logger.warning(f"Error during auto-discovery of spider modules: {e}")

    def is_spider_registered(self, name: str) -> bool:
        """Check whether a spider is registered"""
        from crawlo.spider import get_global_spider_registry
        registry = get_global_spider_registry()
        return name in registry

    def get_spider_class(self, name: str):
        """Get a spider class by name"""
        from crawlo.spider import get_global_spider_registry
        registry = get_global_spider_registry()
        return registry.get(name)

    def get_spider_names(self):
        """Get the names of all registered spiders"""
        from crawlo.spider import get_global_spider_registry
        registry = get_global_spider_registry()
        return list(registry.keys())

    async def crawl(self, spider_cls_or_name, settings=None):
        """Run a single spider"""
        spider_cls = self._resolve_spider_class(spider_cls_or_name)

        # Log the name of the spider being started (required by the logging spec)
        from crawlo.logging import get_logger
        logger = get_logger('crawlo.framework')
        logger.info(f"Starting spider: {spider_cls.name}")

        merged_settings = self._merge_settings(settings)
        crawler = ModernCrawler(spider_cls, merged_settings)

        async with self._semaphore:
            await crawler.crawl()

        return crawler

    async def crawl_multiple(self, spider_classes_or_names, settings=None):
        """Run multiple spiders"""
        self._start_time = time.time()

        try:
            spider_classes = []
            for cls_or_name in spider_classes_or_names:
                spider_cls = self._resolve_spider_class(cls_or_name)
                spider_classes.append(spider_cls)

            # Log the names of the spiders being started (required by the logging spec)
            spider_names = [cls.name for cls in spider_classes]
            from crawlo.logging import get_logger
            logger = get_logger('crawlo.framework')
            if len(spider_names) == 1:
                logger.info(f"Starting spider: {spider_names[0]}")
            else:
                logger.info(f"Starting spiders: {', '.join(spider_names)}")

            tasks = []
            for spider_cls in spider_classes:
                merged_settings = self._merge_settings(settings)
                crawler = ModernCrawler(spider_cls, merged_settings)
                self._crawlers.append(crawler)

                task = asyncio.create_task(self._run_with_semaphore(crawler))
                tasks.append(task)

            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Summarize the results
            successful = sum(1 for r in results if not isinstance(r, Exception))
            failed = len(results) - successful

            self._logger.info(f"Crawl completed: {successful} successful, {failed} failed")

            return results

        finally:
            self._end_time = time.time()
            if self._start_time:
                duration = self._end_time - self._start_time
                self._logger.info(f"Total execution time: {duration:.2f}s")

    async def _run_with_semaphore(self, crawler: ModernCrawler):
        """Run a crawler under semaphore control"""
        async with self._semaphore:
            await crawler.crawl()
            return crawler

    def _resolve_spider_class(self, spider_cls_or_name):
        """Resolve the Spider class"""
        if isinstance(spider_cls_or_name, str):
            # Look the name up in the registry
            try:
                from crawlo.spider import get_global_spider_registry
                registry = get_global_spider_registry()
                if spider_cls_or_name in registry:
                    return registry[spider_cls_or_name]
                else:
                    # If it is not in the registry, import the spider_modules to
                    # trigger registration, then check the registry again
                    if hasattr(self, '_spider_modules') and self._spider_modules:
                        for module_path in self._spider_modules:
                            try:
                                # Import the module to trigger spider registration
                                __import__(module_path)
                            except ImportError:
                                pass  # Ignore import errors

                    # Check the registry again
                    if spider_cls_or_name in registry:
                        return registry[spider_cls_or_name]

                    # If still not found, try auto-discovery
                    if hasattr(self, '_spider_modules') and self._spider_modules:
                        self._auto_discover_spider_modules(self._spider_modules)
                        if spider_cls_or_name in registry:
                            return registry[spider_cls_or_name]

                    # If still not found, try importing the module directly
                    try:
                        # Assume the format module.SpiderClass
                        if '.' in spider_cls_or_name:
                            module_path, class_name = spider_cls_or_name.rsplit('.', 1)
                            module = __import__(module_path, fromlist=[class_name])
                            spider_class = getattr(module, class_name)
                            # Register it in the global registry
                            registry[spider_class.name] = spider_class
                            return spider_class
                        else:
                            # Try to find it within spider_modules
                            if hasattr(self, '_spider_modules') and self._spider_modules:
                                for module_path in self._spider_modules:
                                    try:
                                        # Build the full module path
                                        full_module_path = f"{module_path}.{spider_cls_or_name}"
                                        module = __import__(full_module_path, fromlist=[spider_cls_or_name])
                                        # Look for Spider subclasses defined in the module
                                        for attr_name in dir(module):
                                            attr_value = getattr(module, attr_name)
                                            if (isinstance(attr_value, type) and
                                                    issubclass(attr_value, registry.__class__.__bases__[0]) and
                                                    hasattr(attr_value, 'name') and
                                                    attr_value.name == spider_cls_or_name):
                                                # Register it in the global registry
                                                registry[spider_cls_or_name] = attr_value
                                                return attr_value
                                    except ImportError:
                                        continue
                            raise ValueError(f"Spider '{spider_cls_or_name}' not found in registry")
                    except (ImportError, AttributeError):
                        raise ValueError(f"Spider '{spider_cls_or_name}' not found in registry")
            except ImportError:
                raise ValueError(f"Cannot resolve spider name '{spider_cls_or_name}'")
        else:
            return spider_cls_or_name

    def _merge_settings(self, additional_settings):
        """Merge settings"""
        if not additional_settings:
            return self._settings

        # More sophisticated merge logic could be implemented here
        from crawlo.settings.setting_manager import SettingManager
        merged = SettingManager()

        # Copy the base settings
        if self._settings:
            merged.update_attributes(self._settings.__dict__)

        # Apply the additional settings
        merged.update_attributes(additional_settings)

        return merged

    def get_metrics(self) -> Dict[str, Any]:
        """Get aggregate metrics"""
        total_duration = 0.0
        if self._start_time and self._end_time:
            total_duration = self._end_time - self._start_time

        crawler_metrics = [crawler.metrics for crawler in self._crawlers]

        return {
            'total_duration': total_duration,
            'crawler_count': len(self._crawlers),
            'total_requests': sum(m.request_count for m in crawler_metrics),
            'total_success': sum(m.success_count for m in crawler_metrics),
            'total_errors': sum(m.error_count for m in crawler_metrics),
            'average_success_rate': sum(m.get_success_rate() for m in crawler_metrics) / len(crawler_metrics) if crawler_metrics else 0.0
        }
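For orientation, here is a minimal usage sketch of the CrawlerProcess API defined above. It is not taken from the package: the crawlo.spider.Spider base-class import and the QuotesSpider class are assumptions made for illustration, while CrawlerProcess(max_concurrency=...), crawl(), and get_metrics() follow the signatures shown in this file.

import asyncio

from crawlo.crawler import CrawlerProcess
from crawlo.spider import Spider  # assumed base class; not confirmed by this diff


class QuotesSpider(Spider):  # hypothetical spider for illustration
    # ModernCrawler._create_spider() requires a 'name' class attribute
    name = "quotes_spider"


async def main():
    # Limit concurrent crawlers to 2; spiders can also be passed by their
    # registered name once their modules are imported via spider_modules.
    process = CrawlerProcess(max_concurrency=2)
    await process.crawl(QuotesSpider)
    print(process.get_metrics())


if __name__ == "__main__":
    asyncio.run(main())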