crawlo-1.4.7-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic.

Files changed (348)
  1. crawlo/__init__.py +90 -90
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +186 -186
  7. crawlo/commands/help.py +140 -140
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +379 -379
  10. crawlo/commands/startproject.py +460 -460
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +320 -320
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +52 -52
  16. crawlo/core/engine.py +451 -451
  17. crawlo/core/processor.py +47 -47
  18. crawlo/core/scheduler.py +290 -290
  19. crawlo/crawler.py +698 -698
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +280 -280
  23. crawlo/downloader/aiohttp_downloader.py +233 -233
  24. crawlo/downloader/cffi_downloader.py +250 -250
  25. crawlo/downloader/httpx_downloader.py +265 -265
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +425 -425
  28. crawlo/downloader/selenium_downloader.py +486 -486
  29. crawlo/event.py +45 -45
  30. crawlo/exceptions.py +214 -214
  31. crawlo/extension/__init__.py +64 -64
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +94 -94
  34. crawlo/extension/log_stats.py +70 -70
  35. crawlo/extension/logging_extension.py +53 -53
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +104 -104
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/factories/utils.py +134 -134
  44. crawlo/filters/__init__.py +170 -170
  45. crawlo/filters/aioredis_filter.py +347 -347
  46. crawlo/filters/memory_filter.py +261 -261
  47. crawlo/framework.py +306 -306
  48. crawlo/initialization/__init__.py +44 -44
  49. crawlo/initialization/built_in.py +391 -391
  50. crawlo/initialization/context.py +141 -141
  51. crawlo/initialization/core.py +240 -240
  52. crawlo/initialization/phases.py +229 -229
  53. crawlo/initialization/registry.py +143 -143
  54. crawlo/initialization/utils.py +48 -48
  55. crawlo/interfaces.py +23 -23
  56. crawlo/items/__init__.py +23 -23
  57. crawlo/items/base.py +23 -23
  58. crawlo/items/fields.py +52 -52
  59. crawlo/items/items.py +104 -104
  60. crawlo/logging/__init__.py +42 -42
  61. crawlo/logging/config.py +280 -276
  62. crawlo/logging/factory.py +175 -175
  63. crawlo/logging/manager.py +104 -104
  64. crawlo/middleware/__init__.py +87 -87
  65. crawlo/middleware/default_header.py +132 -132
  66. crawlo/middleware/download_delay.py +104 -104
  67. crawlo/middleware/middleware_manager.py +142 -142
  68. crawlo/middleware/offsite.py +123 -123
  69. crawlo/middleware/proxy.py +209 -209
  70. crawlo/middleware/request_ignore.py +86 -86
  71. crawlo/middleware/response_code.py +150 -150
  72. crawlo/middleware/response_filter.py +136 -136
  73. crawlo/middleware/retry.py +124 -124
  74. crawlo/mode_manager.py +287 -287
  75. crawlo/network/__init__.py +21 -21
  76. crawlo/network/request.py +408 -376
  77. crawlo/network/response.py +598 -569
  78. crawlo/pipelines/__init__.py +52 -52
  79. crawlo/pipelines/base_pipeline.py +452 -452
  80. crawlo/pipelines/bloom_dedup_pipeline.py +145 -146
  81. crawlo/pipelines/console_pipeline.py +39 -39
  82. crawlo/pipelines/csv_pipeline.py +316 -316
  83. crawlo/pipelines/database_dedup_pipeline.py +196 -197
  84. crawlo/pipelines/json_pipeline.py +218 -218
  85. crawlo/pipelines/memory_dedup_pipeline.py +104 -105
  86. crawlo/pipelines/mongo_pipeline.py +140 -139
  87. crawlo/pipelines/mysql_pipeline.py +468 -469
  88. crawlo/pipelines/pipeline_manager.py +100 -100
  89. crawlo/pipelines/redis_dedup_pipeline.py +155 -155
  90. crawlo/project.py +347 -347
  91. crawlo/queue/__init__.py +9 -9
  92. crawlo/queue/pqueue.py +38 -38
  93. crawlo/queue/queue_manager.py +591 -591
  94. crawlo/queue/redis_priority_queue.py +518 -518
  95. crawlo/settings/__init__.py +7 -7
  96. crawlo/settings/default_settings.py +287 -284
  97. crawlo/settings/setting_manager.py +219 -219
  98. crawlo/spider/__init__.py +658 -657
  99. crawlo/stats_collector.py +81 -81
  100. crawlo/subscriber.py +129 -129
  101. crawlo/task_manager.py +138 -138
  102. crawlo/templates/crawlo.cfg.tmpl +10 -10
  103. crawlo/templates/project/__init__.py.tmpl +1 -1
  104. crawlo/templates/project/items.py.tmpl +13 -13
  105. crawlo/templates/project/middlewares.py.tmpl +38 -38
  106. crawlo/templates/project/pipelines.py.tmpl +35 -35
  107. crawlo/templates/project/settings.py.tmpl +113 -109
  108. crawlo/templates/project/settings_distributed.py.tmpl +160 -156
  109. crawlo/templates/project/settings_gentle.py.tmpl +174 -170
  110. crawlo/templates/project/settings_high_performance.py.tmpl +175 -171
  111. crawlo/templates/project/settings_minimal.py.tmpl +102 -98
  112. crawlo/templates/project/settings_simple.py.tmpl +172 -168
  113. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  114. crawlo/templates/run.py.tmpl +23 -23
  115. crawlo/templates/spider/spider.py.tmpl +32 -32
  116. crawlo/templates/spiders_init.py.tmpl +4 -4
  117. crawlo/tools/__init__.py +86 -86
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +384 -384
  120. crawlo/tools/scenario_adapter.py +262 -262
  121. crawlo/tools/text_cleaner.py +232 -232
  122. crawlo/utils/__init__.py +74 -50
  123. crawlo/utils/batch_processor.py +276 -276
  124. crawlo/utils/config_manager.py +442 -442
  125. crawlo/utils/controlled_spider_mixin.py +439 -439
  126. crawlo/utils/db_helper.py +250 -250
  127. crawlo/utils/encoding_helper.py +190 -0
  128. crawlo/utils/error_handler.py +410 -410
  129. crawlo/utils/fingerprint.py +121 -121
  130. crawlo/utils/func_tools.py +82 -82
  131. crawlo/utils/large_scale_helper.py +344 -344
  132. crawlo/utils/leak_detector.py +335 -335
  133. crawlo/utils/misc.py +81 -81
  134. crawlo/utils/mongo_connection_pool.py +157 -157
  135. crawlo/utils/mysql_connection_pool.py +197 -197
  136. crawlo/utils/performance_monitor.py +285 -285
  137. crawlo/utils/queue_helper.py +175 -175
  138. crawlo/utils/redis_checker.py +90 -90
  139. crawlo/utils/redis_connection_pool.py +578 -578
  140. crawlo/utils/redis_key_validator.py +198 -198
  141. crawlo/utils/request.py +278 -278
  142. crawlo/utils/request_serializer.py +225 -225
  143. crawlo/utils/resource_manager.py +337 -337
  144. crawlo/utils/response_helper.py +113 -0
  145. crawlo/utils/selector_helper.py +138 -137
  146. crawlo/utils/singleton.py +69 -69
  147. crawlo/utils/spider_loader.py +201 -201
  148. crawlo/utils/text_helper.py +94 -94
  149. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/METADATA +831 -689
  150. crawlo-1.4.8.dist-info/RECORD +347 -0
  151. examples/__init__.py +7 -7
  152. tests/__init__.py +7 -7
  153. tests/advanced_tools_example.py +217 -217
  154. tests/authenticated_proxy_example.py +110 -110
  155. tests/baidu_performance_test.py +108 -108
  156. tests/baidu_test.py +59 -59
  157. tests/bug_check_test.py +250 -250
  158. tests/cleaners_example.py +160 -160
  159. tests/comprehensive_framework_test.py +212 -212
  160. tests/comprehensive_test.py +81 -81
  161. tests/comprehensive_testing_summary.md +186 -186
  162. tests/config_validation_demo.py +142 -142
  163. tests/controlled_spider_example.py +205 -205
  164. tests/date_tools_example.py +180 -180
  165. tests/debug_configure.py +69 -69
  166. tests/debug_framework_logger.py +84 -84
  167. tests/debug_log_config.py +126 -126
  168. tests/debug_log_levels.py +63 -63
  169. tests/debug_pipelines.py +66 -66
  170. tests/detailed_log_test.py +233 -233
  171. tests/direct_selector_helper_test.py +96 -96
  172. tests/distributed_dedup_test.py +467 -467
  173. tests/distributed_test.py +66 -66
  174. tests/distributed_test_debug.py +76 -76
  175. tests/dynamic_loading_example.py +523 -523
  176. tests/dynamic_loading_test.py +104 -104
  177. tests/error_handling_example.py +171 -171
  178. tests/explain_mysql_update_behavior.py +76 -76
  179. tests/final_comprehensive_test.py +151 -151
  180. tests/final_log_test.py +260 -260
  181. tests/final_validation_test.py +182 -182
  182. tests/fix_log_test.py +142 -142
  183. tests/framework_performance_test.py +202 -202
  184. tests/log_buffering_test.py +111 -111
  185. tests/log_generation_timing_test.py +153 -153
  186. tests/monitor_redis_dedup.sh +72 -72
  187. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
  188. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
  189. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
  190. tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
  191. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  192. tests/ofweek_scrapy/scrapy.cfg +11 -11
  193. tests/optimized_performance_test.py +211 -211
  194. tests/performance_comparison.py +244 -244
  195. tests/queue_blocking_test.py +113 -113
  196. tests/queue_test.py +89 -89
  197. tests/redis_key_validation_demo.py +130 -130
  198. tests/request_params_example.py +150 -150
  199. tests/response_improvements_example.py +144 -144
  200. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  201. tests/scrapy_comparison/scrapy_test.py +133 -133
  202. tests/simple_cli_test.py +54 -54
  203. tests/simple_command_test.py +119 -119
  204. tests/simple_crawlo_test.py +126 -126
  205. tests/simple_follow_test.py +38 -38
  206. tests/simple_log_test2.py +137 -137
  207. tests/simple_optimization_test.py +128 -128
  208. tests/simple_queue_type_test.py +41 -41
  209. tests/simple_response_selector_test.py +94 -94
  210. tests/simple_selector_helper_test.py +154 -154
  211. tests/simple_selector_test.py +207 -207
  212. tests/simple_spider_test.py +49 -49
  213. tests/simple_url_test.py +73 -73
  214. tests/simulate_mysql_update_test.py +139 -139
  215. tests/spider_log_timing_test.py +177 -177
  216. tests/test_advanced_tools.py +148 -148
  217. tests/test_all_commands.py +230 -230
  218. tests/test_all_pipeline_fingerprints.py +133 -133
  219. tests/test_all_redis_key_configs.py +145 -145
  220. tests/test_asyncmy_usage.py +56 -56
  221. tests/test_batch_processor.py +178 -178
  222. tests/test_cleaners.py +54 -54
  223. tests/test_cli_arguments.py +118 -118
  224. tests/test_component_factory.py +174 -174
  225. tests/test_config_consistency.py +80 -80
  226. tests/test_config_merge.py +152 -152
  227. tests/test_config_validator.py +182 -182
  228. tests/test_controlled_spider_mixin.py +79 -79
  229. tests/test_crawler_process_import.py +38 -38
  230. tests/test_crawler_process_spider_modules.py +47 -47
  231. tests/test_crawlo_proxy_integration.py +114 -114
  232. tests/test_date_tools.py +123 -123
  233. tests/test_dedup_fix.py +220 -220
  234. tests/test_dedup_pipeline_consistency.py +124 -124
  235. tests/test_default_header_middleware.py +313 -313
  236. tests/test_distributed.py +65 -65
  237. tests/test_double_crawlo_fix.py +204 -204
  238. tests/test_double_crawlo_fix_simple.py +124 -124
  239. tests/test_download_delay_middleware.py +221 -221
  240. tests/test_downloader_proxy_compatibility.py +272 -272
  241. tests/test_edge_cases.py +305 -305
  242. tests/test_encoding_core.py +56 -56
  243. tests/test_encoding_detection.py +126 -126
  244. tests/test_enhanced_error_handler.py +270 -270
  245. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  246. tests/test_error_handler_compatibility.py +112 -112
  247. tests/test_factories.py +252 -252
  248. tests/test_factory_compatibility.py +196 -196
  249. tests/test_final_validation.py +153 -153
  250. tests/test_fingerprint_consistency.py +135 -135
  251. tests/test_fingerprint_simple.py +51 -51
  252. tests/test_get_component_logger.py +83 -83
  253. tests/test_hash_performance.py +99 -99
  254. tests/test_integration.py +169 -169
  255. tests/test_item_dedup_redis_key.py +122 -122
  256. tests/test_large_scale_helper.py +235 -235
  257. tests/test_logging_enhancements.py +374 -374
  258. tests/test_logging_final.py +184 -184
  259. tests/test_logging_integration.py +312 -312
  260. tests/test_logging_system.py +282 -282
  261. tests/test_middleware_debug.py +141 -141
  262. tests/test_mode_consistency.py +51 -51
  263. tests/test_multi_directory.py +67 -67
  264. tests/test_multiple_spider_modules.py +80 -80
  265. tests/test_mysql_pipeline_config.py +164 -164
  266. tests/test_mysql_pipeline_error.py +98 -98
  267. tests/test_mysql_pipeline_init_log.py +82 -82
  268. tests/test_mysql_pipeline_integration.py +132 -132
  269. tests/test_mysql_pipeline_refactor.py +143 -143
  270. tests/test_mysql_pipeline_refactor_simple.py +85 -85
  271. tests/test_mysql_pipeline_robustness.py +195 -195
  272. tests/test_mysql_pipeline_types.py +88 -88
  273. tests/test_mysql_update_columns.py +93 -93
  274. tests/test_offsite_middleware.py +244 -244
  275. tests/test_offsite_middleware_simple.py +203 -203
  276. tests/test_optimized_selector_naming.py +100 -100
  277. tests/test_parsel.py +29 -29
  278. tests/test_performance.py +327 -327
  279. tests/test_performance_monitor.py +115 -115
  280. tests/test_pipeline_fingerprint_consistency.py +86 -86
  281. tests/test_priority_behavior.py +211 -211
  282. tests/test_priority_consistency.py +151 -151
  283. tests/test_priority_consistency_fixed.py +249 -249
  284. tests/test_proxy_health_check.py +32 -32
  285. tests/test_proxy_middleware.py +217 -217
  286. tests/test_proxy_middleware_enhanced.py +212 -212
  287. tests/test_proxy_middleware_integration.py +142 -142
  288. tests/test_proxy_middleware_refactored.py +207 -207
  289. tests/test_proxy_only.py +83 -83
  290. tests/test_proxy_providers.py +56 -56
  291. tests/test_proxy_stats.py +19 -19
  292. tests/test_proxy_strategies.py +59 -59
  293. tests/test_proxy_with_downloader.py +152 -152
  294. tests/test_queue_empty_check.py +41 -41
  295. tests/test_queue_manager_double_crawlo.py +173 -173
  296. tests/test_queue_manager_redis_key.py +179 -179
  297. tests/test_queue_naming.py +154 -154
  298. tests/test_queue_type.py +106 -106
  299. tests/test_queue_type_redis_config_consistency.py +130 -130
  300. tests/test_random_headers_default.py +322 -322
  301. tests/test_random_headers_necessity.py +308 -308
  302. tests/test_random_user_agent.py +72 -72
  303. tests/test_redis_config.py +28 -28
  304. tests/test_redis_connection_pool.py +294 -294
  305. tests/test_redis_key_naming.py +181 -181
  306. tests/test_redis_key_validator.py +123 -123
  307. tests/test_redis_queue.py +224 -224
  308. tests/test_redis_queue_name_fix.py +175 -175
  309. tests/test_redis_queue_type_fallback.py +129 -129
  310. tests/test_request_ignore_middleware.py +182 -182
  311. tests/test_request_params.py +111 -111
  312. tests/test_request_serialization.py +70 -70
  313. tests/test_response_code_middleware.py +349 -349
  314. tests/test_response_filter_middleware.py +427 -427
  315. tests/test_response_follow.py +104 -104
  316. tests/test_response_improvements.py +152 -152
  317. tests/test_response_selector_methods.py +92 -92
  318. tests/test_response_url_methods.py +70 -70
  319. tests/test_response_urljoin.py +86 -86
  320. tests/test_retry_middleware.py +333 -333
  321. tests/test_retry_middleware_realistic.py +273 -273
  322. tests/test_scheduler.py +252 -252
  323. tests/test_scheduler_config_update.py +133 -133
  324. tests/test_scrapy_style_encoding.py +112 -112
  325. tests/test_selector_helper.py +100 -100
  326. tests/test_selector_optimizations.py +146 -146
  327. tests/test_simple_response.py +61 -61
  328. tests/test_spider_loader.py +49 -49
  329. tests/test_spider_loader_comprehensive.py +69 -69
  330. tests/test_spider_modules.py +84 -84
  331. tests/test_spiders/test_spider.py +9 -9
  332. tests/test_telecom_spider_redis_key.py +205 -205
  333. tests/test_template_content.py +87 -87
  334. tests/test_template_redis_key.py +134 -134
  335. tests/test_tools.py +159 -159
  336. tests/test_user_agent_randomness.py +176 -176
  337. tests/test_user_agents.py +96 -96
  338. tests/untested_features_report.md +138 -138
  339. tests/verify_debug.py +51 -51
  340. tests/verify_distributed.py +117 -117
  341. tests/verify_log_fix.py +111 -111
  342. tests/verify_mysql_warnings.py +109 -109
  343. crawlo/utils/log.py +0 -80
  344. crawlo/utils/url_utils.py +0 -40
  345. crawlo-1.4.7.dist-info/RECORD +0 -347
  346. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  347. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  348. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
crawlo/crawler.py CHANGED
@@ -1,699 +1,699 @@
Note: this hunk replaces the whole file; the removed (-) and re-added (+) lines are textually identical (likely a whitespace or line-ending normalization), so the file content is reproduced once below.

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Crawler系统
==========

核心组件:
- Crawler: 爬虫核心控制器,负责单个爬虫的生命周期管理
- CrawlerProcess: 爬虫进程管理器,支持单个/多个爬虫运行

设计原则:
1. 单一职责 - 每个类只负责一个明确的功能
2. 依赖注入 - 通过工厂创建组件,便于测试
3. 状态管理 - 清晰的状态转换和生命周期
4. 错误处理 - 优雅的错误处理和恢复机制
5. 资源管理 - 统一的资源注册和清理机制
"""

import asyncio
import time
from enum import Enum
from dataclasses import dataclass
from contextlib import asynccontextmanager
from typing import Optional, Type, Dict, Any, List

from crawlo.logging import get_logger
from crawlo.factories import get_component_registry
from crawlo.initialization import initialize_framework, is_framework_ready
from crawlo.utils.resource_manager import ResourceManager, ResourceType


class CrawlerState(Enum):
    """Crawler状态枚举"""
    CREATED = "created"
    INITIALIZING = "initializing"
    READY = "ready"
    RUNNING = "running"
    CLOSING = "closing"
    CLOSED = "closed"
    ERROR = "error"


@dataclass
class CrawlerMetrics:
    """Crawler性能指标"""
    start_time: Optional[float] = None
    end_time: Optional[float] = None
    initialization_duration: float = 0.0
    crawl_duration: float = 0.0
    request_count: int = 0
    success_count: int = 0
    error_count: int = 0

    def get_total_duration(self) -> float:
        if self.start_time and self.end_time:
            return self.end_time - self.start_time
        return 0.0

    def get_success_rate(self) -> float:
        total = self.success_count + self.error_count
        return (self.success_count / total * 100) if total > 0 else 0.0


class Crawler:
    """
    爬虫核心控制器

    特点:
    1. 清晰的状态管理
    2. 依赖注入
    3. 组件化架构
    4. 完善的错误处理
    5. 统一的资源管理
    """

    def __init__(self, spider_cls: Type, settings=None):
        self._spider_cls = spider_cls
        self._settings = settings
        self._state = CrawlerState.CREATED
        self._state_lock = asyncio.Lock()

        # 组件
        self._spider = None
        self._engine = None
        self._stats = None
        self._subscriber = None
        self._extension = None

        # 指标
        self._metrics = CrawlerMetrics()

        # 资源管理器
        self._resource_manager = ResourceManager(name=f"crawler.{spider_cls.__name__ if spider_cls else 'unknown'}")

        # 日志
        self._logger = get_logger(f'crawler.{spider_cls.__name__ if spider_cls else "unknown"}')

        # 确保框架已初始化
        self._ensure_framework_ready()

    def _ensure_framework_ready(self):
        """确保框架已准备就绪"""
        if not is_framework_ready():
            try:
                self._settings = initialize_framework(self._settings)
                self._logger.debug("Framework initialized successfully")
            except Exception as e:
                self._logger.warning(f"Framework initialization failed: {e}")
                # 使用降级策略
                if not self._settings:
                    from crawlo.settings.setting_manager import SettingManager
                    self._settings = SettingManager()

        # 确保是SettingManager实例
        if isinstance(self._settings, dict):
            from crawlo.settings.setting_manager import SettingManager
            settings_manager = SettingManager()
            settings_manager.update_attributes(self._settings)
            self._settings = settings_manager

    @property
    def state(self) -> CrawlerState:
        """获取当前状态"""
        return self._state

    @property
    def spider(self):
        """获取Spider实例"""
        return self._spider

    @property
    def stats(self):
        """获取Stats实例(向后兼容)"""
        return self._stats

    @property
    def metrics(self) -> CrawlerMetrics:
        """获取性能指标"""
        return self._metrics

    @property
    def settings(self):
        """获取配置"""
        return self._settings

    @property
    def engine(self):
        """获取Engine实例(向后兼容)"""
        return self._engine

    @property
    def subscriber(self):
        """获取Subscriber实例(向后兼容)"""
        return self._subscriber

    @property
    def extension(self):
        """获取Extension实例(向后兼容)"""
        return self._extension

    @extension.setter
    def extension(self, value):
        """设置Extension实例(向后兼容)"""
        self._extension = value

    def _create_extension(self):
        """创建Extension管理器(向后兼容)"""
        if self._extension is None:
            try:
                registry = get_component_registry()
                self._extension = registry.create('extension_manager', crawler=self)
            except Exception as e:
                self._logger.warning(f"Failed to create extension manager: {e}")
        return self._extension

    async def close(self):
        """关闭爹虫(向后兼容)"""
        await self._cleanup()

    async def crawl(self):
        """执行爬取任务"""
        async with self._lifecycle_manager():
            await self._initialize_components()
            await self._run_crawler()

    @asynccontextmanager
    async def _lifecycle_manager(self):
        """生命周期管理"""
        self._metrics.start_time = time.time()

        try:
            yield
        except Exception as e:
            await self._handle_error(e)
            raise
        finally:
            await self._cleanup()
            self._metrics.end_time = time.time()

    async def _initialize_components(self):
        """初始化组件"""
        async with self._state_lock:
            if self._state != CrawlerState.CREATED:
                raise RuntimeError(f"Cannot initialize from state {self._state}")

            self._state = CrawlerState.INITIALIZING

        init_start = time.time()

        try:
            # 使用组件工厂创建组件
            registry = get_component_registry()

            # 创建Subscriber(无依赖)
            self._subscriber = registry.create('subscriber')

            # 创建Spider
            self._spider = self._create_spider()

            # 创建Engine(需要crawler参数)
            self._engine = registry.create('engine', crawler=self)
            # 注册Engine到资源管理器
            if self._engine and hasattr(self._engine, 'close'):
                self._resource_manager.register(
                    self._engine,
                    lambda e: e.close() if hasattr(e, 'close') else None,
                    ResourceType.OTHER,
                    name="engine"
                )

            # 创建Stats(需要crawler参数)
            self._stats = registry.create('stats', crawler=self)

            # 创建Extension Manager (可选,需要crawler参数)
            try:
                self._extension = registry.create('extension_manager', crawler=self)
            except Exception as e:
                self._logger.warning(f"Failed to create extension manager: {e}")

            self._metrics.initialization_duration = time.time() - init_start

            async with self._state_lock:
                self._state = CrawlerState.READY

            self._logger.debug(f"Crawler components initialized successfully in {self._metrics.initialization_duration:.2f}s")

        except Exception as e:
            async with self._state_lock:
                self._state = CrawlerState.ERROR
            raise RuntimeError(f"Component initialization failed: {e}")

    def _create_spider(self):
        """创建Spider实例"""
        if not self._spider_cls:
            raise ValueError("Spider class not provided")

        # 检查Spider类的有效性
        if not hasattr(self._spider_cls, 'name'):
            raise ValueError("Spider class must have 'name' attribute")

        # 创建Spider实例
        spider = self._spider_cls()

        # 设置crawler引用
        if hasattr(spider, 'crawler'):
            spider.crawler = self

        return spider

    async def _run_crawler(self):
        """运行爬虫引擎"""
        async with self._state_lock:
            if self._state != CrawlerState.READY:
                raise RuntimeError(f"Cannot run from state {self._state}")

            self._state = CrawlerState.RUNNING

        crawl_start = time.time()

        try:
            # 启动引擎
            if self._engine:
                await self._engine.start_spider(self._spider)
            else:
                raise RuntimeError("Engine not initialized")

            self._metrics.crawl_duration = time.time() - crawl_start

            self._logger.info(f"Crawler completed successfully in {self._metrics.crawl_duration:.2f}s")

        except Exception as e:
            self._metrics.crawl_duration = time.time() - crawl_start
            raise RuntimeError(f"Crawler execution failed: {e}")

    async def _handle_error(self, error: Exception):
        """处理错误"""
        async with self._state_lock:
            self._state = CrawlerState.ERROR

        self._metrics.error_count += 1
        self._logger.error(f"Crawler error: {error}", exc_info=True)

        # 这里可以添加错误恢复逻辑

    async def _cleanup(self):
        """清理资源"""
        async with self._state_lock:
            if self._state not in [CrawlerState.CLOSING, CrawlerState.CLOSED]:
                self._state = CrawlerState.CLOSING

        try:
            # 使用资源管理器统一清理
            self._logger.debug("开始清理Crawler资源...")
            cleanup_result = await self._resource_manager.cleanup_all()
            self._logger.debug(
                f"资源清理完成: {cleanup_result['success']}成功, "
                f"{cleanup_result['errors']}失败, 耗时{cleanup_result['duration']:.2f}s"
            )

            # 关闭各个组件(继续兼容旧逻辑)
            if self._engine and hasattr(self._engine, 'close'):
                try:
                    await self._engine.close()
                except Exception as e:
                    self._logger.warning(f"Engine cleanup failed: {e}")

            # 调用Spider的spider_closed方法
            if self._spider:
                try:
                    if asyncio.iscoroutinefunction(self._spider.spider_closed):
                        await self._spider.spider_closed()
                    else:
                        self._spider.spider_closed()
                except Exception as e:
                    self._logger.warning(f"Spider cleanup failed: {e}")

            # 调用StatsCollector的close_spider方法,设置reason和spider_name
            if self._stats and hasattr(self._stats, 'close_spider'):
                try:
                    # 使用默认的'finished'作为reason
                    self._stats.close_spider(self._spider, reason='finished')
                except Exception as e:
                    self._logger.warning(f"Stats close_spider failed: {e}")

            # 触发spider_closed事件,通知所有订阅者(包括扩展)
            # 传递reason参数,这里使用默认的'finished'作为reason
            if self.subscriber:
                from crawlo.event import CrawlerEvent
                await self.subscriber.notify(CrawlerEvent.SPIDER_CLOSED, reason='finished')

            if self._stats and hasattr(self._stats, 'close'):
                try:
                    close_result = self._stats.close()
                    if asyncio.iscoroutine(close_result):
                        await close_result
                except Exception as e:
                    self._logger.warning(f"Stats cleanup failed: {e}")

            async with self._state_lock:
                self._state = CrawlerState.CLOSED

            self._logger.debug("Crawler cleanup completed")

        except Exception as e:
            self._logger.error(f"Cleanup error: {e}")


class CrawlerProcess:
    """
    Crawler进程管理器 - 管理多个Crawler的执行

    简化版本,专注于核心功能
    """

    def __init__(self, settings=None, max_concurrency: int = 3, spider_modules=None):
        # 初始化框架配置
        self._settings = settings or initialize_framework()
        self._max_concurrency = max_concurrency
        self._crawlers: List[Crawler] = []
        self._semaphore = asyncio.Semaphore(max_concurrency)
        self._logger = get_logger('crawler.process')

        # 如果没有显式提供spider_modules,则从settings中获取
        if spider_modules is None and self._settings:
            spider_modules = self._settings.get('SPIDER_MODULES', [])
            self._logger.debug(f"从settings中获取SPIDER_MODULES: {spider_modules}")

        self._spider_modules = spider_modules or []  # 保存spider_modules

        # 如果提供了spider_modules,自动注册这些模块中的爬虫
        if self._spider_modules:
            self._register_spider_modules(self._spider_modules)

        # 指标
        self._start_time: Optional[float] = None
        self._end_time: Optional[float] = None

    def _register_spider_modules(self, spider_modules):
        """注册爬虫模块"""
        try:
            from crawlo.spider import get_global_spider_registry
            registry = get_global_spider_registry()

            self._logger.debug(f"Registering spider modules: {spider_modules}")

            initial_spider_count = len(registry)

            for module_path in spider_modules:
                try:
                    # 导入模块
                    __import__(module_path)
                    self._logger.debug(f"Successfully imported spider module: {module_path}")
                except ImportError as e:
                    self._logger.warning(f"Failed to import spider module {module_path}: {e}")
                    # 如果导入失败,尝试自动发现
                    self._auto_discover_spider_modules([module_path])

            # 检查注册表中的爬虫
            spider_names = list(registry.keys())
            self._logger.debug(f"Registered spiders after import: {spider_names}")

            # 如果导入模块后没有新的爬虫被注册,则尝试自动发现
            final_spider_count = len(registry)
            if final_spider_count == initial_spider_count:
                self._logger.debug("No new spiders registered after importing modules, attempting auto-discovery")
                self._auto_discover_spider_modules(spider_modules)
                spider_names = list(registry.keys())
                self._logger.debug(f"Registered spiders after auto-discovery: {spider_names}")
        except Exception as e:
            self._logger.warning(f"Error registering spider modules: {e}")

    def _auto_discover_spider_modules(self, spider_modules):
        """
        自动发现并导入爬虫模块中的所有爬虫
        这个方法会扫描指定模块目录下的所有Python文件并自动导入
        """
        try:
            from crawlo.spider import get_global_spider_registry
            import importlib
            from pathlib import Path
            import sys

            registry = get_global_spider_registry()
            initial_spider_count = len(registry)

            for module_path in spider_modules:
                try:
                    # 将模块路径转换为文件系统路径
                    # 例如: ofweek_standalone.spiders -> ofweek_standalone/spiders
                    package_parts = module_path.split('.')
                    if len(package_parts) < 2:
                        continue

                    # 获取项目根目录
                    project_root = None
                    for path in sys.path:
                        if path and Path(path).exists():
                            possible_module_path = Path(path) / package_parts[0]
                            if possible_module_path.exists():
                                project_root = path
                                break

                    if not project_root:
                        # 尝试使用当前工作目录
                        project_root = str(Path.cwd())

                    # 构建模块目录路径
                    module_dir = Path(project_root)
                    for part in package_parts:
                        module_dir = module_dir / part

                    # 如果目录存在,扫描其中的Python文件
                    if module_dir.exists() and module_dir.is_dir():
                        # 导入目录下的所有Python文件(除了__init__.py)
                        for py_file in module_dir.glob("*.py"):
                            if py_file.name.startswith('_'):
                                continue

                            # 构造模块名
                            module_name = py_file.stem  # 文件名(不含扩展名)
                            full_module_path = f"{module_path}.{module_name}"

                            try:
                                # 导入模块以触发Spider注册
                                importlib.import_module(full_module_path)
                            except ImportError as e:
                                self._logger.warning(f"Failed to auto-import spider module {full_module_path}: {e}")
                except Exception as e:
                    self._logger.warning(f"Error during auto-discovery for module {module_path}: {e}")

            # 检查是否有新的爬虫被注册
            final_spider_count = len(registry)
            if final_spider_count > initial_spider_count:
                new_spiders = list(registry.keys())
                self._logger.info(f"Auto-discovered {final_spider_count - initial_spider_count} new spiders: {new_spiders}")

        except Exception as e:
            self._logger.warning(f"Error during auto-discovery of spider modules: {e}")

    def is_spider_registered(self, name: str) -> bool:
        """检查爬虫是否已注册"""
        from crawlo.spider import get_global_spider_registry
        registry = get_global_spider_registry()
        return name in registry

    def get_spider_class(self, name: str):
        """获取爬虫类"""
        from crawlo.spider import get_global_spider_registry
        registry = get_global_spider_registry()
        return registry.get(name)

    def get_spider_names(self):
        """获取所有注册的爬虫名称"""
        from crawlo.spider import get_global_spider_registry
        registry = get_global_spider_registry()
        return list(registry.keys())

    async def crawl(self, spider_cls_or_name, settings=None):
        """运行单个爬虫"""
        spider_cls = self._resolve_spider_class(spider_cls_or_name)

        # 记录启动的爬虫名称(符合规范要求)
        from crawlo.logging import get_logger
        logger = get_logger('crawlo.framework')
        logger.info(f"Starting spider: {spider_cls.name}")

        merged_settings = self._merge_settings(settings)
        crawler = Crawler(spider_cls, merged_settings)

        async with self._semaphore:
            await crawler.crawl()

        return crawler

    async def crawl_multiple(self, spider_classes_or_names, settings=None):
        """运行多个爬虫"""
        self._start_time = time.time()

        try:
            spider_classes = []
            for cls_or_name in spider_classes_or_names:
                spider_cls = self._resolve_spider_class(cls_or_name)
                spider_classes.append(spider_cls)

            # 记录启动的爬虫名称(符合规范要求)
            spider_names = [cls.name for cls in spider_classes]
            from crawlo.logging import get_logger
            logger = get_logger('crawlo.framework')
            if len(spider_names) == 1:
                logger.info(f"Starting spider: {spider_names[0]}")
            else:
                logger.info(f"Starting spiders: {', '.join(spider_names)}")

            tasks = []
            for spider_cls in spider_classes:
                merged_settings = self._merge_settings(settings)
                crawler = Crawler(spider_cls, merged_settings)
                self._crawlers.append(crawler)

                task = asyncio.create_task(self._run_with_semaphore(crawler))
                tasks.append(task)

            results = await asyncio.gather(*tasks, return_exceptions=True)

            # 处理结果
            successful = sum(1 for r in results if not isinstance(r, Exception))
            failed = len(results) - successful

            self._logger.info(f"Crawl completed: {successful} successful, {failed} failed")

            return results

        finally:
            # 清理所有crawler,防止资源累积
            self._logger.debug(f"Cleaning up {len(self._crawlers)} crawler(s)...")
            for crawler in self._crawlers:
                try:
                    # 确保每个crawler都被清理
                    if hasattr(crawler, '_resource_manager'):
                        await crawler._resource_manager.cleanup_all()
                except Exception as e:
                    self._logger.warning(f"Failed to cleanup crawler: {e}")

            # 清空crawlers列表,释放引用
            self._crawlers.clear()

            self._end_time = time.time()
            if self._start_time:
                duration = self._end_time - self._start_time
                self._logger.info(f"Total execution time: {duration:.2f}s")

    async def _run_with_semaphore(self, crawler: Crawler):
        """在信号量控制下运行爬虫"""
        async with self._semaphore:
            await crawler.crawl()
            return crawler

    def _resolve_spider_class(self, spider_cls_or_name):
        """解析Spider类"""
        if isinstance(spider_cls_or_name, str):
            # 从注册表中查找
            try:
                from crawlo.spider import get_global_spider_registry
                registry = get_global_spider_registry()
                if spider_cls_or_name in registry:
                    return registry[spider_cls_or_name]
                else:
                    # 如果在注册表中找不到,尝试通过spider_modules导入所有模块来触发注册
                    # 然后再次检查注册表
                    if hasattr(self, '_spider_modules') and self._spider_modules:
                        for module_path in self._spider_modules:
                            try:
                                # 导入模块来触发爬虫注册
                                __import__(module_path)
                            except ImportError:
                                pass  # 忽略导入错误

                        # 再次检查注册表
                        if spider_cls_or_name in registry:
                            return registry[spider_cls_or_name]

                    # 如果仍然找不到,尝试自动发现模式
                    if hasattr(self, '_spider_modules') and self._spider_modules:
                        self._auto_discover_spider_modules(self._spider_modules)
                        if spider_cls_or_name in registry:
                            return registry[spider_cls_or_name]

                    # 如果仍然找不到,尝试直接导入模块
                    try:
                        # 假设格式为 module.SpiderClass
                        if '.' in spider_cls_or_name:
                            module_path, class_name = spider_cls_or_name.rsplit('.', 1)
                            module = __import__(module_path, fromlist=[class_name])
                            spider_class = getattr(module, class_name)
                            # 注册到全局注册表
                            registry[spider_class.name] = spider_class
                            return spider_class
                        else:
                            # 尝试在spider_modules中查找
                            if hasattr(self, '_spider_modules') and self._spider_modules:
                                for module_path in self._spider_modules:
                                    try:
                                        # 构造完整的模块路径
                                        full_module_path = f"{module_path}.{spider_cls_or_name}"
                                        module = __import__(full_module_path, fromlist=[spider_cls_or_name])
                                        # 获取模块中的Spider子类
                                        for attr_name in dir(module):
                                            attr_value = getattr(module, attr_name)
                                            if (isinstance(attr_value, type) and
                                                    issubclass(attr_value, registry.__class__.__bases__[0]) and
                                                    hasattr(attr_value, 'name') and
                                                    attr_value.name == spider_cls_or_name):
                                                # 注册到全局注册表
                                                registry[spider_cls_or_name] = attr_value
                                                return attr_value
                                    except ImportError:
                                        continue
                            raise ValueError(f"Spider '{spider_cls_or_name}' not found in registry")
                    except (ImportError, AttributeError):
                        raise ValueError(f"Spider '{spider_cls_or_name}' not found in registry")
            except ImportError:
                raise ValueError(f"Cannot resolve spider name '{spider_cls_or_name}'")
        else:
            return spider_cls_or_name

    def _merge_settings(self, additional_settings):
        """合并配置"""
        if not additional_settings:
            return self._settings

        # 这里可以实现更复杂的配置合并逻辑
        from crawlo.settings.setting_manager import SettingManager
        merged = SettingManager()

        # 复制基础配置
        if self._settings:
            merged.update_attributes(self._settings.__dict__)

        # 应用额外配置
        merged.update_attributes(additional_settings)

        return merged

    def get_metrics(self) -> Dict[str, Any]:
        """获取整体指标"""
        total_duration = 0.0
        if self._start_time and self._end_time:
            total_duration = self._end_time - self._start_time

        crawler_metrics = [crawler.metrics for crawler in self._crawlers]

        return {
            'total_duration': total_duration,
            'crawler_count': len(self._crawlers),
            'total_requests': sum(m.request_count for m in crawler_metrics),
            'total_success': sum(m.success_count for m in crawler_metrics),
            'total_errors': sum(m.error_count for m in crawler_metrics),
            'average_success_rate': sum(m.get_success_rate() for m in crawler_metrics) / len(crawler_metrics) if crawler_metrics else 0.0
        }
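For orientation, the public surface shown in this hunk (CrawlerProcess.crawl, crawl_multiple and get_metrics) suggests a usage pattern like the minimal sketch below. It is based only on the signatures visible in the diff; the spider module path "myproject.spiders" and the spider names are hypothetical placeholders, not part of the package.

import asyncio

from crawlo.crawler import CrawlerProcess


async def main():
    # spider_modules is optional; when omitted, the process falls back to
    # SPIDER_MODULES from the framework settings (hypothetical module path here).
    process = CrawlerProcess(max_concurrency=3, spider_modules=["myproject.spiders"])

    # Run a single spider by its registered name (a spider class works too).
    crawler = await process.crawl("example_spider")  # hypothetical spider name
    print(crawler.metrics.get_total_duration())

    # Run several spiders concurrently, bounded by max_concurrency.
    await process.crawl_multiple(["spider_a", "spider_b"])  # hypothetical names
    print(process.get_metrics())


if __name__ == "__main__":
    asyncio.run(main())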