crawlo 1.4.5__py3-none-any.whl → 1.4.7__py3-none-any.whl

This diff reflects the content of publicly released package versions as published to one of the supported registries, and is provided for informational purposes only.

Potentially problematic release: this version of crawlo might be problematic.

Files changed (375)
  1. crawlo/__init__.py +90 -89
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +186 -186
  7. crawlo/commands/help.py +140 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +379 -341
  10. crawlo/commands/startproject.py +460 -460
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +320 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +52 -52
  16. crawlo/core/engine.py +451 -438
  17. crawlo/core/processor.py +47 -47
  18. crawlo/core/scheduler.py +290 -291
  19. crawlo/crawler.py +698 -657
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +280 -276
  23. crawlo/downloader/aiohttp_downloader.py +233 -233
  24. crawlo/downloader/cffi_downloader.py +250 -245
  25. crawlo/downloader/httpx_downloader.py +265 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +425 -402
  28. crawlo/downloader/selenium_downloader.py +486 -472
  29. crawlo/event.py +45 -11
  30. crawlo/exceptions.py +215 -82
  31. crawlo/extension/__init__.py +65 -64
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +94 -94
  34. crawlo/extension/log_stats.py +70 -70
  35. crawlo/extension/logging_extension.py +53 -61
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +104 -103
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/factories/utils.py +135 -0
  44. crawlo/filters/__init__.py +170 -153
  45. crawlo/filters/aioredis_filter.py +348 -264
  46. crawlo/filters/memory_filter.py +261 -276
  47. crawlo/framework.py +306 -292
  48. crawlo/initialization/__init__.py +44 -44
  49. crawlo/initialization/built_in.py +391 -434
  50. crawlo/initialization/context.py +141 -141
  51. crawlo/initialization/core.py +240 -194
  52. crawlo/initialization/phases.py +230 -149
  53. crawlo/initialization/registry.py +143 -145
  54. crawlo/initialization/utils.py +49 -0
  55. crawlo/interfaces.py +23 -23
  56. crawlo/items/__init__.py +23 -23
  57. crawlo/items/base.py +23 -23
  58. crawlo/items/fields.py +52 -52
  59. crawlo/items/items.py +104 -104
  60. crawlo/logging/__init__.py +42 -46
  61. crawlo/logging/config.py +277 -197
  62. crawlo/logging/factory.py +175 -171
  63. crawlo/logging/manager.py +104 -112
  64. crawlo/middleware/__init__.py +87 -24
  65. crawlo/middleware/default_header.py +132 -132
  66. crawlo/middleware/download_delay.py +104 -104
  67. crawlo/middleware/middleware_manager.py +142 -142
  68. crawlo/middleware/offsite.py +123 -123
  69. crawlo/middleware/proxy.py +209 -386
  70. crawlo/middleware/request_ignore.py +86 -86
  71. crawlo/middleware/response_code.py +150 -150
  72. crawlo/middleware/response_filter.py +136 -136
  73. crawlo/middleware/retry.py +124 -124
  74. crawlo/mode_manager.py +287 -253
  75. crawlo/network/__init__.py +21 -21
  76. crawlo/network/request.py +375 -379
  77. crawlo/network/response.py +569 -664
  78. crawlo/pipelines/__init__.py +53 -22
  79. crawlo/pipelines/base_pipeline.py +452 -0
  80. crawlo/pipelines/bloom_dedup_pipeline.py +146 -146
  81. crawlo/pipelines/console_pipeline.py +39 -39
  82. crawlo/pipelines/csv_pipeline.py +316 -316
  83. crawlo/pipelines/database_dedup_pipeline.py +197 -197
  84. crawlo/pipelines/json_pipeline.py +218 -218
  85. crawlo/pipelines/memory_dedup_pipeline.py +105 -105
  86. crawlo/pipelines/mongo_pipeline.py +140 -132
  87. crawlo/pipelines/mysql_pipeline.py +470 -326
  88. crawlo/pipelines/pipeline_manager.py +100 -100
  89. crawlo/pipelines/redis_dedup_pipeline.py +155 -156
  90. crawlo/project.py +347 -347
  91. crawlo/queue/__init__.py +10 -0
  92. crawlo/queue/pqueue.py +38 -38
  93. crawlo/queue/queue_manager.py +591 -525
  94. crawlo/queue/redis_priority_queue.py +519 -370
  95. crawlo/settings/__init__.py +7 -7
  96. crawlo/settings/default_settings.py +285 -270
  97. crawlo/settings/setting_manager.py +219 -219
  98. crawlo/spider/__init__.py +657 -657
  99. crawlo/stats_collector.py +82 -73
  100. crawlo/subscriber.py +129 -129
  101. crawlo/task_manager.py +138 -138
  102. crawlo/templates/crawlo.cfg.tmpl +10 -10
  103. crawlo/templates/project/__init__.py.tmpl +2 -4
  104. crawlo/templates/project/items.py.tmpl +13 -17
  105. crawlo/templates/project/middlewares.py.tmpl +38 -38
  106. crawlo/templates/project/pipelines.py.tmpl +35 -36
  107. crawlo/templates/project/settings.py.tmpl +110 -157
  108. crawlo/templates/project/settings_distributed.py.tmpl +156 -161
  109. crawlo/templates/project/settings_gentle.py.tmpl +170 -171
  110. crawlo/templates/project/settings_high_performance.py.tmpl +171 -172
  111. crawlo/templates/project/settings_minimal.py.tmpl +99 -77
  112. crawlo/templates/project/settings_simple.py.tmpl +168 -169
  113. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  114. crawlo/templates/run.py.tmpl +23 -30
  115. crawlo/templates/spider/spider.py.tmpl +33 -144
  116. crawlo/templates/spiders_init.py.tmpl +5 -10
  117. crawlo/tools/__init__.py +86 -189
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +384 -384
  120. crawlo/tools/scenario_adapter.py +262 -262
  121. crawlo/tools/text_cleaner.py +232 -232
  122. crawlo/utils/__init__.py +50 -50
  123. crawlo/utils/batch_processor.py +276 -259
  124. crawlo/utils/config_manager.py +442 -0
  125. crawlo/utils/controlled_spider_mixin.py +439 -439
  126. crawlo/utils/db_helper.py +250 -244
  127. crawlo/utils/error_handler.py +410 -410
  128. crawlo/utils/fingerprint.py +121 -121
  129. crawlo/utils/func_tools.py +82 -82
  130. crawlo/utils/large_scale_helper.py +344 -344
  131. crawlo/utils/leak_detector.py +335 -0
  132. crawlo/utils/log.py +79 -79
  133. crawlo/utils/misc.py +81 -81
  134. crawlo/utils/mongo_connection_pool.py +157 -0
  135. crawlo/utils/mysql_connection_pool.py +197 -0
  136. crawlo/utils/performance_monitor.py +285 -285
  137. crawlo/utils/queue_helper.py +175 -175
  138. crawlo/utils/redis_checker.py +91 -0
  139. crawlo/utils/redis_connection_pool.py +578 -388
  140. crawlo/utils/redis_key_validator.py +198 -198
  141. crawlo/utils/request.py +278 -256
  142. crawlo/utils/request_serializer.py +225 -225
  143. crawlo/utils/resource_manager.py +337 -0
  144. crawlo/utils/selector_helper.py +137 -137
  145. crawlo/utils/singleton.py +70 -0
  146. crawlo/utils/spider_loader.py +201 -201
  147. crawlo/utils/text_helper.py +94 -94
  148. crawlo/utils/{url.py → url_utils.py} +39 -39
  149. crawlo-1.4.7.dist-info/METADATA +689 -0
  150. crawlo-1.4.7.dist-info/RECORD +347 -0
  151. examples/__init__.py +7 -7
  152. tests/__init__.py +7 -7
  153. tests/advanced_tools_example.py +217 -275
  154. tests/authenticated_proxy_example.py +110 -106
  155. tests/baidu_performance_test.py +108 -108
  156. tests/baidu_test.py +59 -59
  157. tests/bug_check_test.py +250 -250
  158. tests/cleaners_example.py +160 -160
  159. tests/comprehensive_framework_test.py +212 -212
  160. tests/comprehensive_test.py +81 -81
  161. tests/comprehensive_testing_summary.md +186 -186
  162. tests/config_validation_demo.py +142 -142
  163. tests/controlled_spider_example.py +205 -205
  164. tests/date_tools_example.py +180 -180
  165. tests/debug_configure.py +69 -69
  166. tests/debug_framework_logger.py +84 -84
  167. tests/debug_log_config.py +126 -126
  168. tests/debug_log_levels.py +63 -63
  169. tests/debug_pipelines.py +66 -66
  170. tests/detailed_log_test.py +233 -233
  171. tests/direct_selector_helper_test.py +96 -96
  172. tests/distributed_dedup_test.py +467 -0
  173. tests/distributed_test.py +66 -66
  174. tests/distributed_test_debug.py +76 -76
  175. tests/dynamic_loading_example.py +523 -523
  176. tests/dynamic_loading_test.py +104 -104
  177. tests/error_handling_example.py +171 -171
  178. tests/explain_mysql_update_behavior.py +77 -0
  179. tests/final_comprehensive_test.py +151 -151
  180. tests/final_log_test.py +260 -260
  181. tests/final_validation_test.py +182 -182
  182. tests/fix_log_test.py +142 -142
  183. tests/framework_performance_test.py +202 -202
  184. tests/log_buffering_test.py +111 -111
  185. tests/log_generation_timing_test.py +153 -153
  186. tests/monitor_redis_dedup.sh +72 -0
  187. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
  188. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
  189. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
  190. tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
  191. tests/ofweek_scrapy/scrapy.cfg +11 -11
  192. tests/optimized_performance_test.py +211 -211
  193. tests/performance_comparison.py +244 -244
  194. tests/queue_blocking_test.py +113 -113
  195. tests/queue_test.py +89 -89
  196. tests/redis_key_validation_demo.py +130 -130
  197. tests/request_params_example.py +150 -150
  198. tests/response_improvements_example.py +144 -144
  199. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  200. tests/scrapy_comparison/scrapy_test.py +133 -133
  201. tests/simple_cli_test.py +55 -0
  202. tests/simple_command_test.py +119 -119
  203. tests/simple_crawlo_test.py +126 -126
  204. tests/simple_follow_test.py +38 -38
  205. tests/simple_log_test2.py +137 -137
  206. tests/simple_optimization_test.py +128 -128
  207. tests/simple_queue_type_test.py +41 -41
  208. tests/simple_response_selector_test.py +94 -94
  209. tests/simple_selector_helper_test.py +154 -154
  210. tests/simple_selector_test.py +207 -207
  211. tests/simple_spider_test.py +49 -49
  212. tests/simple_url_test.py +73 -73
  213. tests/simulate_mysql_update_test.py +140 -0
  214. tests/spider_log_timing_test.py +177 -177
  215. tests/test_advanced_tools.py +148 -148
  216. tests/test_all_commands.py +230 -230
  217. tests/test_all_pipeline_fingerprints.py +133 -133
  218. tests/test_all_redis_key_configs.py +145 -145
  219. tests/test_asyncmy_usage.py +57 -0
  220. tests/test_batch_processor.py +178 -178
  221. tests/test_cleaners.py +54 -54
  222. tests/test_cli_arguments.py +119 -0
  223. tests/test_component_factory.py +174 -174
  224. tests/test_config_consistency.py +80 -80
  225. tests/test_config_merge.py +152 -152
  226. tests/test_config_validator.py +182 -182
  227. tests/test_controlled_spider_mixin.py +79 -79
  228. tests/test_crawler_process_import.py +38 -38
  229. tests/test_crawler_process_spider_modules.py +47 -47
  230. tests/test_crawlo_proxy_integration.py +114 -108
  231. tests/test_date_tools.py +123 -123
  232. tests/test_dedup_fix.py +220 -220
  233. tests/test_dedup_pipeline_consistency.py +124 -124
  234. tests/test_default_header_middleware.py +313 -313
  235. tests/test_distributed.py +65 -65
  236. tests/test_double_crawlo_fix.py +204 -204
  237. tests/test_double_crawlo_fix_simple.py +124 -124
  238. tests/test_download_delay_middleware.py +221 -221
  239. tests/test_downloader_proxy_compatibility.py +272 -268
  240. tests/test_edge_cases.py +305 -305
  241. tests/test_encoding_core.py +56 -56
  242. tests/test_encoding_detection.py +126 -126
  243. tests/test_enhanced_error_handler.py +270 -270
  244. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  245. tests/test_error_handler_compatibility.py +112 -112
  246. tests/test_factories.py +252 -252
  247. tests/test_factory_compatibility.py +196 -196
  248. tests/test_final_validation.py +153 -153
  249. tests/test_fingerprint_consistency.py +135 -135
  250. tests/test_fingerprint_simple.py +51 -51
  251. tests/test_get_component_logger.py +83 -83
  252. tests/test_hash_performance.py +99 -99
  253. tests/test_integration.py +169 -169
  254. tests/test_item_dedup_redis_key.py +122 -122
  255. tests/test_large_scale_helper.py +235 -235
  256. tests/test_logging_enhancements.py +374 -374
  257. tests/test_logging_final.py +184 -184
  258. tests/test_logging_integration.py +312 -312
  259. tests/test_logging_system.py +282 -282
  260. tests/test_middleware_debug.py +141 -141
  261. tests/test_mode_consistency.py +51 -51
  262. tests/test_multi_directory.py +67 -67
  263. tests/test_multiple_spider_modules.py +80 -80
  264. tests/test_mysql_pipeline_config.py +165 -0
  265. tests/test_mysql_pipeline_error.py +99 -0
  266. tests/test_mysql_pipeline_init_log.py +83 -0
  267. tests/test_mysql_pipeline_integration.py +133 -0
  268. tests/test_mysql_pipeline_refactor.py +144 -0
  269. tests/test_mysql_pipeline_refactor_simple.py +86 -0
  270. tests/test_mysql_pipeline_robustness.py +196 -0
  271. tests/test_mysql_pipeline_types.py +89 -0
  272. tests/test_mysql_update_columns.py +94 -0
  273. tests/test_offsite_middleware.py +244 -244
  274. tests/test_offsite_middleware_simple.py +203 -203
  275. tests/test_optimized_selector_naming.py +100 -100
  276. tests/test_parsel.py +29 -29
  277. tests/test_performance.py +327 -327
  278. tests/test_performance_monitor.py +115 -115
  279. tests/test_pipeline_fingerprint_consistency.py +86 -86
  280. tests/test_priority_behavior.py +211 -211
  281. tests/test_priority_consistency.py +151 -151
  282. tests/test_priority_consistency_fixed.py +249 -249
  283. tests/test_proxy_health_check.py +32 -32
  284. tests/test_proxy_middleware.py +217 -121
  285. tests/test_proxy_middleware_enhanced.py +212 -216
  286. tests/test_proxy_middleware_integration.py +142 -137
  287. tests/test_proxy_middleware_refactored.py +207 -184
  288. tests/test_proxy_only.py +84 -0
  289. tests/test_proxy_providers.py +56 -56
  290. tests/test_proxy_stats.py +19 -19
  291. tests/test_proxy_strategies.py +59 -59
  292. tests/test_proxy_with_downloader.py +153 -0
  293. tests/test_queue_empty_check.py +41 -41
  294. tests/test_queue_manager_double_crawlo.py +173 -173
  295. tests/test_queue_manager_redis_key.py +179 -179
  296. tests/test_queue_naming.py +154 -154
  297. tests/test_queue_type.py +106 -106
  298. tests/test_queue_type_redis_config_consistency.py +130 -130
  299. tests/test_random_headers_default.py +322 -322
  300. tests/test_random_headers_necessity.py +308 -308
  301. tests/test_random_user_agent.py +72 -72
  302. tests/test_redis_config.py +28 -28
  303. tests/test_redis_connection_pool.py +294 -294
  304. tests/test_redis_key_naming.py +181 -181
  305. tests/test_redis_key_validator.py +123 -123
  306. tests/test_redis_queue.py +224 -224
  307. tests/test_redis_queue_name_fix.py +175 -175
  308. tests/test_redis_queue_type_fallback.py +129 -129
  309. tests/test_request_ignore_middleware.py +182 -182
  310. tests/test_request_params.py +111 -111
  311. tests/test_request_serialization.py +70 -70
  312. tests/test_response_code_middleware.py +349 -349
  313. tests/test_response_filter_middleware.py +427 -427
  314. tests/test_response_follow.py +104 -104
  315. tests/test_response_improvements.py +152 -152
  316. tests/test_response_selector_methods.py +92 -92
  317. tests/test_response_url_methods.py +70 -70
  318. tests/test_response_urljoin.py +86 -86
  319. tests/test_retry_middleware.py +333 -333
  320. tests/test_retry_middleware_realistic.py +273 -273
  321. tests/test_scheduler.py +252 -252
  322. tests/test_scheduler_config_update.py +133 -133
  323. tests/test_scrapy_style_encoding.py +112 -112
  324. tests/test_selector_helper.py +100 -100
  325. tests/test_selector_optimizations.py +146 -146
  326. tests/test_simple_response.py +61 -61
  327. tests/test_spider_loader.py +49 -49
  328. tests/test_spider_loader_comprehensive.py +69 -69
  329. tests/test_spider_modules.py +84 -84
  330. tests/test_spiders/test_spider.py +9 -9
  331. tests/test_telecom_spider_redis_key.py +205 -205
  332. tests/test_template_content.py +87 -87
  333. tests/test_template_redis_key.py +134 -134
  334. tests/test_tools.py +159 -159
  335. tests/test_user_agent_randomness.py +176 -176
  336. tests/test_user_agents.py +96 -96
  337. tests/untested_features_report.md +138 -138
  338. tests/verify_debug.py +51 -51
  339. tests/verify_distributed.py +117 -117
  340. tests/verify_log_fix.py +111 -111
  341. tests/verify_mysql_warnings.py +110 -0
  342. crawlo/logging/async_handler.py +0 -181
  343. crawlo/logging/monitor.py +0 -153
  344. crawlo/logging/sampler.py +0 -167
  345. crawlo/middleware/simple_proxy.py +0 -65
  346. crawlo/tools/authenticated_proxy.py +0 -241
  347. crawlo/tools/data_formatter.py +0 -226
  348. crawlo/tools/data_validator.py +0 -181
  349. crawlo/tools/encoding_converter.py +0 -127
  350. crawlo/tools/network_diagnostic.py +0 -365
  351. crawlo/tools/request_tools.py +0 -83
  352. crawlo/tools/retry_mechanism.py +0 -224
  353. crawlo/utils/env_config.py +0 -143
  354. crawlo/utils/large_scale_config.py +0 -287
  355. crawlo/utils/system.py +0 -11
  356. crawlo/utils/tools.py +0 -5
  357. crawlo-1.4.5.dist-info/METADATA +0 -329
  358. crawlo-1.4.5.dist-info/RECORD +0 -347
  359. tests/env_config_example.py +0 -134
  360. tests/ofweek_scrapy/ofweek_scrapy/spiders/ofweek_spider.py +0 -162
  361. tests/test_authenticated_proxy.py +0 -142
  362. tests/test_comprehensive.py +0 -147
  363. tests/test_dynamic_downloaders_proxy.py +0 -125
  364. tests/test_dynamic_proxy.py +0 -93
  365. tests/test_dynamic_proxy_config.py +0 -147
  366. tests/test_dynamic_proxy_real.py +0 -110
  367. tests/test_env_config.py +0 -122
  368. tests/test_framework_env_usage.py +0 -104
  369. tests/test_large_scale_config.py +0 -113
  370. tests/test_proxy_api.py +0 -265
  371. tests/test_real_scenario_proxy.py +0 -196
  372. tests/tools_example.py +0 -261
  373. {crawlo-1.4.5.dist-info → crawlo-1.4.7.dist-info}/WHEEL +0 -0
  374. {crawlo-1.4.5.dist-info → crawlo-1.4.7.dist-info}/entry_points.txt +0 -0
  375. {crawlo-1.4.5.dist-info → crawlo-1.4.7.dist-info}/top_level.txt +0 -0
crawlo/crawler.py CHANGED
@@ -1,658 +1,699 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- 重构后的Crawler系统
5
- ==================
6
-
7
- 设计原则:
8
- 1. 单一职责 - 每个类只负责一个明确的功能
9
- 2. 依赖注入 - 通过工厂创建组件,便于测试
10
- 3. 状态管理 - 清晰的状态转换和生命周期
11
- 4. 错误处理 - 优雅的错误处理和恢复机制
12
- """
13
-
14
- import asyncio
15
- import time
16
- from enum import Enum
17
- from dataclasses import dataclass
18
- from contextlib import asynccontextmanager
19
- from typing import Optional, Type, Dict, Any, List
20
-
21
- from crawlo.logging import get_logger
22
- from crawlo.factories import get_component_registry
23
- from crawlo.initialization import initialize_framework, is_framework_ready
24
-
25
-
26
- class CrawlerState(Enum):
27
- """Crawler状态枚举"""
28
- CREATED = "created"
29
- INITIALIZING = "initializing"
30
- READY = "ready"
31
- RUNNING = "running"
32
- CLOSING = "closing"
33
- CLOSED = "closed"
34
- ERROR = "error"
35
-
36
-
37
- @dataclass
38
- class CrawlerMetrics:
39
- """Crawler性能指标"""
40
- start_time: Optional[float] = None
41
- end_time: Optional[float] = None
42
- initialization_duration: float = 0.0
43
- crawl_duration: float = 0.0
44
- request_count: int = 0
45
- success_count: int = 0
46
- error_count: int = 0
47
-
48
- def get_total_duration(self) -> float:
49
- if self.start_time and self.end_time:
50
- return self.end_time - self.start_time
51
- return 0.0
52
-
53
- def get_success_rate(self) -> float:
54
- total = self.success_count + self.error_count
55
- return (self.success_count / total * 100) if total > 0 else 0.0
56
-
57
-
58
- class ModernCrawler:
59
- """
60
- 现代化的Crawler实现
61
-
62
- 特点:
63
- 1. 清晰的状态管理
64
- 2. 依赖注入
65
- 3. 组件化架构
66
- 4. 完善的错误处理
67
- """
68
-
69
- def __init__(self, spider_cls: Type, settings=None):
70
- self._spider_cls = spider_cls
71
- self._settings = settings
72
- self._state = CrawlerState.CREATED
73
- self._state_lock = asyncio.Lock()
74
-
75
- # 组件
76
- self._spider = None
77
- self._engine = None
78
- self._stats = None
79
- self._subscriber = None
80
- self._extension = None
81
-
82
- # 指标
83
- self._metrics = CrawlerMetrics()
84
-
85
- # 日志
86
- self._logger = get_logger(f'crawler.{spider_cls.__name__ if spider_cls else "unknown"}')
87
-
88
- # 确保框架已初始化
89
- self._ensure_framework_ready()
90
-
91
- def _ensure_framework_ready(self):
92
- """确保框架已准备就绪"""
93
- if not is_framework_ready():
94
- try:
95
- self._settings = initialize_framework(self._settings)
96
- self._logger.debug("Framework initialized successfully")
97
- except Exception as e:
98
- self._logger.warning(f"Framework initialization failed: {e}")
99
- # 使用降级策略
100
- if not self._settings:
101
- from crawlo.settings.setting_manager import SettingManager
102
- self._settings = SettingManager()
103
-
104
- # 确保是SettingManager实例
105
- if isinstance(self._settings, dict):
106
- from crawlo.settings.setting_manager import SettingManager
107
- settings_manager = SettingManager()
108
- settings_manager.update_attributes(self._settings)
109
- self._settings = settings_manager
110
-
111
- @property
112
- def state(self) -> CrawlerState:
113
- """获取当前状态"""
114
- return self._state
115
-
116
- @property
117
- def spider(self):
118
- """获取Spider实例"""
119
- return self._spider
120
-
121
- @property
122
- def stats(self):
123
- """获取Stats实例(向后兼容)"""
124
- return self._stats
125
-
126
- @property
127
- def metrics(self) -> CrawlerMetrics:
128
- """获取性能指标"""
129
- return self._metrics
130
-
131
- @property
132
- def settings(self):
133
- """获取配置"""
134
- return self._settings
135
-
136
- @property
137
- def engine(self):
138
- """获取Engine实例(向后兼容)"""
139
- return self._engine
140
-
141
- @property
142
- def subscriber(self):
143
- """获取Subscriber实例(向后兼容)"""
144
- return self._subscriber
145
-
146
- @property
147
- def extension(self):
148
- """获取Extension实例(向后兼容)"""
149
- return self._extension
150
-
151
- @extension.setter
152
- def extension(self, value):
153
- """设置Extension实例(向后兼容)"""
154
- self._extension = value
155
-
156
- def _create_extension(self):
157
- """创建Extension管理器(向后兼容)"""
158
- if self._extension is None:
159
- try:
160
- registry = get_component_registry()
161
- self._extension = registry.create('extension_manager', crawler=self)
162
- except Exception as e:
163
- self._logger.warning(f"Failed to create extension manager: {e}")
164
- return self._extension
165
-
166
- async def close(self):
167
- """关闭爹虫(向后兼容)"""
168
- await self._cleanup()
169
-
170
- async def crawl(self):
171
- """执行爬取任务"""
172
- async with self._lifecycle_manager():
173
- await self._initialize_components()
174
- await self._run_crawler()
175
-
176
- @asynccontextmanager
177
- async def _lifecycle_manager(self):
178
- """生命周期管理"""
179
- self._metrics.start_time = time.time()
180
-
181
- try:
182
- yield
183
- except Exception as e:
184
- await self._handle_error(e)
185
- raise
186
- finally:
187
- await self._cleanup()
188
- self._metrics.end_time = time.time()
189
-
190
- async def _initialize_components(self):
191
- """初始化组件"""
192
- async with self._state_lock:
193
- if self._state != CrawlerState.CREATED:
194
- raise RuntimeError(f"Cannot initialize from state {self._state}")
195
-
196
- self._state = CrawlerState.INITIALIZING
197
-
198
- init_start = time.time()
199
-
200
- try:
201
- # 使用组件工厂创建组件
202
- registry = get_component_registry()
203
-
204
- # 创建Subscriber(无依赖)
205
- self._subscriber = registry.create('subscriber')
206
-
207
- # 创建Spider
208
- self._spider = self._create_spider()
209
-
210
- # 创建Engine(需要crawler参数)
211
- self._engine = registry.create('engine', crawler=self)
212
-
213
- # 创建Stats(需要crawler参数)
214
- self._stats = registry.create('stats', crawler=self)
215
-
216
- # 创建Extension Manager (可选,需要crawler参数)
217
- try:
218
- self._extension = registry.create('extension_manager', crawler=self)
219
- except Exception as e:
220
- self._logger.warning(f"Failed to create extension manager: {e}")
221
-
222
- self._metrics.initialization_duration = time.time() - init_start
223
-
224
- async with self._state_lock:
225
- self._state = CrawlerState.READY
226
-
227
- self._logger.debug(f"Crawler components initialized successfully in {self._metrics.initialization_duration:.2f}s")
228
-
229
- except Exception as e:
230
- async with self._state_lock:
231
- self._state = CrawlerState.ERROR
232
- raise RuntimeError(f"Component initialization failed: {e}")
233
-
234
- def _create_spider(self):
235
- """创建Spider实例"""
236
- if not self._spider_cls:
237
- raise ValueError("Spider class not provided")
238
-
239
- # 检查Spider类的有效性
240
- if not hasattr(self._spider_cls, 'name'):
241
- raise ValueError("Spider class must have 'name' attribute")
242
-
243
- # 创建Spider实例
244
- spider = self._spider_cls()
245
-
246
- # 设置crawler引用
247
- if hasattr(spider, 'crawler'):
248
- spider.crawler = self
249
-
250
- return spider
251
-
252
- async def _run_crawler(self):
253
- """运行爬虫引擎"""
254
- async with self._state_lock:
255
- if self._state != CrawlerState.READY:
256
- raise RuntimeError(f"Cannot run from state {self._state}")
257
-
258
- self._state = CrawlerState.RUNNING
259
-
260
- crawl_start = time.time()
261
-
262
- try:
263
- # 启动引擎
264
- if self._engine:
265
- await self._engine.start_spider(self._spider)
266
- else:
267
- raise RuntimeError("Engine not initialized")
268
-
269
- self._metrics.crawl_duration = time.time() - crawl_start
270
-
271
- self._logger.info(f"Crawler completed successfully in {self._metrics.crawl_duration:.2f}s")
272
-
273
- except Exception as e:
274
- self._metrics.crawl_duration = time.time() - crawl_start
275
- raise RuntimeError(f"Crawler execution failed: {e}")
276
-
277
- async def _handle_error(self, error: Exception):
278
- """处理错误"""
279
- async with self._state_lock:
280
- self._state = CrawlerState.ERROR
281
-
282
- self._metrics.error_count += 1
283
- self._logger.error(f"Crawler error: {error}", exc_info=True)
284
-
285
- # 这里可以添加错误恢复逻辑
286
-
287
- async def _cleanup(self):
288
- """清理资源"""
289
- async with self._state_lock:
290
- if self._state not in [CrawlerState.CLOSING, CrawlerState.CLOSED]:
291
- self._state = CrawlerState.CLOSING
292
-
293
- try:
294
- # 关闭各个组件
295
- if self._engine and hasattr(self._engine, 'close'):
296
- try:
297
- await self._engine.close()
298
- except Exception as e:
299
- self._logger.warning(f"Engine cleanup failed: {e}")
300
-
301
- # 调用Spider的spider_closed方法
302
- if self._spider:
303
- try:
304
- if asyncio.iscoroutinefunction(self._spider.spider_closed):
305
- await self._spider.spider_closed()
306
- else:
307
- self._spider.spider_closed()
308
- except Exception as e:
309
- self._logger.warning(f"Spider cleanup failed: {e}")
310
-
311
- # 调用StatsCollector的close_spider方法,设置reason和spider_name
312
- if self._stats and hasattr(self._stats, 'close_spider'):
313
- try:
314
- # 使用默认的'finished'作为reason
315
- self._stats.close_spider(self._spider, reason='finished')
316
- except Exception as e:
317
- self._logger.warning(f"Stats close_spider failed: {e}")
318
-
319
- # 触发spider_closed事件,通知所有订阅者(包括扩展)
320
- # 传递reason参数,这里使用默认的'finished'作为reason
321
- await self.subscriber.notify("spider_closed", reason='finished')
322
-
323
- if self._stats and hasattr(self._stats, 'close'):
324
- try:
325
- close_result = self._stats.close()
326
- if asyncio.iscoroutine(close_result):
327
- await close_result
328
- except Exception as e:
329
- self._logger.warning(f"Stats cleanup failed: {e}")
330
-
331
- async with self._state_lock:
332
- self._state = CrawlerState.CLOSED
333
-
334
- self._logger.debug("Crawler cleanup completed")
335
-
336
- except Exception as e:
337
- self._logger.error(f"Cleanup error: {e}")
338
-
339
-
340
- class CrawlerProcess:
341
- """
342
- Crawler进程管理器 - 管理多个Crawler的执行
343
-
344
- 简化版本,专注于核心功能
345
- """
346
-
347
- def __init__(self, settings=None, max_concurrency: int = 3, spider_modules=None):
348
- # 初始化框架配置
349
- self._settings = settings or initialize_framework()
350
- self._max_concurrency = max_concurrency
351
- self._crawlers: List[ModernCrawler] = []
352
- self._semaphore = asyncio.Semaphore(max_concurrency)
353
- self._logger = get_logger('crawler.process')
354
-
355
- # 如果没有显式提供spider_modules,则从settings中获取
356
- if spider_modules is None and self._settings:
357
- spider_modules = self._settings.get('SPIDER_MODULES', [])
358
- self._logger.debug(f"从settings中获取SPIDER_MODULES: {spider_modules}")
359
-
360
- self._spider_modules = spider_modules or [] # 保存spider_modules
361
-
362
- # 如果提供了spider_modules,自动注册这些模块中的爬虫
363
- if self._spider_modules:
364
- self._register_spider_modules(self._spider_modules)
365
-
366
- # 指标
367
- self._start_time: Optional[float] = None
368
- self._end_time: Optional[float] = None
369
-
370
- def _register_spider_modules(self, spider_modules):
371
- """注册爬虫模块"""
372
- try:
373
- from crawlo.spider import get_global_spider_registry
374
- registry = get_global_spider_registry()
375
-
376
- self._logger.debug(f"Registering spider modules: {spider_modules}")
377
-
378
- initial_spider_count = len(registry)
379
-
380
- for module_path in spider_modules:
381
- try:
382
- # 导入模块
383
- __import__(module_path)
384
- self._logger.debug(f"Successfully imported spider module: {module_path}")
385
- except ImportError as e:
386
- self._logger.warning(f"Failed to import spider module {module_path}: {e}")
387
- # 如果导入失败,尝试自动发现
388
- self._auto_discover_spider_modules([module_path])
389
-
390
- # 检查注册表中的爬虫
391
- spider_names = list(registry.keys())
392
- self._logger.debug(f"Registered spiders after import: {spider_names}")
393
-
394
- # 如果导入模块后没有新的爬虫被注册,则尝试自动发现
395
- final_spider_count = len(registry)
396
- if final_spider_count == initial_spider_count:
397
- self._logger.debug("No new spiders registered after importing modules, attempting auto-discovery")
398
- self._auto_discover_spider_modules(spider_modules)
399
- spider_names = list(registry.keys())
400
- self._logger.debug(f"Registered spiders after auto-discovery: {spider_names}")
401
- except Exception as e:
402
- self._logger.warning(f"Error registering spider modules: {e}")
403
-
404
- def _auto_discover_spider_modules(self, spider_modules):
405
- """
406
- 自动发现并导入爬虫模块中的所有爬虫
407
- 这个方法会扫描指定模块目录下的所有Python文件并自动导入
408
- """
409
- try:
410
- from crawlo.spider import get_global_spider_registry
411
- import importlib
412
- from pathlib import Path
413
- import sys
414
-
415
- registry = get_global_spider_registry()
416
- initial_spider_count = len(registry)
417
-
418
- for module_path in spider_modules:
419
- try:
420
- # 将模块路径转换为文件系统路径
421
- # 例如: ofweek_standalone.spiders -> ofweek_standalone/spiders
422
- package_parts = module_path.split('.')
423
- if len(package_parts) < 2:
424
- continue
425
-
426
- # 获取项目根目录
427
- project_root = None
428
- for path in sys.path:
429
- if path and Path(path).exists():
430
- possible_module_path = Path(path) / package_parts[0]
431
- if possible_module_path.exists():
432
- project_root = path
433
- break
434
-
435
- if not project_root:
436
- # 尝试使用当前工作目录
437
- project_root = str(Path.cwd())
438
-
439
- # 构建模块目录路径
440
- module_dir = Path(project_root)
441
- for part in package_parts:
442
- module_dir = module_dir / part
443
-
444
- # 如果目录存在,扫描其中的Python文件
445
- if module_dir.exists() and module_dir.is_dir():
446
- # 导入目录下的所有Python文件(除了__init__.py)
447
- for py_file in module_dir.glob("*.py"):
448
- if py_file.name.startswith('_'):
449
- continue
450
-
451
- # 构造模块名
452
- module_name = py_file.stem # 文件名(不含扩展名)
453
- full_module_path = f"{module_path}.{module_name}"
454
-
455
- try:
456
- # 导入模块以触发Spider注册
457
- importlib.import_module(full_module_path)
458
- except ImportError as e:
459
- self._logger.warning(f"Failed to auto-import spider module {full_module_path}: {e}")
460
- except Exception as e:
461
- self._logger.warning(f"Error during auto-discovery for module {module_path}: {e}")
462
-
463
- # 检查是否有新的爬虫被注册
464
- final_spider_count = len(registry)
465
- if final_spider_count > initial_spider_count:
466
- new_spiders = list(registry.keys())
467
- self._logger.info(f"Auto-discovered {final_spider_count - initial_spider_count} new spiders: {new_spiders}")
468
-
469
- except Exception as e:
470
- self._logger.warning(f"Error during auto-discovery of spider modules: {e}")
471
-
472
- def is_spider_registered(self, name: str) -> bool:
473
- """检查爬虫是否已注册"""
474
- from crawlo.spider import get_global_spider_registry
475
- registry = get_global_spider_registry()
476
- return name in registry
477
-
478
- def get_spider_class(self, name: str):
479
- """获取爬虫类"""
480
- from crawlo.spider import get_global_spider_registry
481
- registry = get_global_spider_registry()
482
- return registry.get(name)
483
-
484
- def get_spider_names(self):
485
- """获取所有注册的爬虫名称"""
486
- from crawlo.spider import get_global_spider_registry
487
- registry = get_global_spider_registry()
488
- return list(registry.keys())
489
-
490
- async def crawl(self, spider_cls_or_name, settings=None):
491
- """运行单个爬虫"""
492
- spider_cls = self._resolve_spider_class(spider_cls_or_name)
493
-
494
- # 记录启动的爬虫名称(符合规范要求)
495
- from crawlo.logging import get_logger
496
- logger = get_logger('crawlo.framework')
497
- logger.info(f"Starting spider: {spider_cls.name}")
498
-
499
- merged_settings = self._merge_settings(settings)
500
- crawler = ModernCrawler(spider_cls, merged_settings)
501
-
502
- async with self._semaphore:
503
- await crawler.crawl()
504
-
505
- return crawler
506
-
507
- async def crawl_multiple(self, spider_classes_or_names, settings=None):
508
- """运行多个爬虫"""
509
- self._start_time = time.time()
510
-
511
- try:
512
- spider_classes = []
513
- for cls_or_name in spider_classes_or_names:
514
- spider_cls = self._resolve_spider_class(cls_or_name)
515
- spider_classes.append(spider_cls)
516
-
517
- # 记录启动的爬虫名称(符合规范要求)
518
- spider_names = [cls.name for cls in spider_classes]
519
- from crawlo.logging import get_logger
520
- logger = get_logger('crawlo.framework')
521
- if len(spider_names) == 1:
522
- logger.info(f"Starting spider: {spider_names[0]}")
523
- else:
524
- logger.info(f"Starting spiders: {', '.join(spider_names)}")
525
-
526
- tasks = []
527
- for spider_cls in spider_classes:
528
- merged_settings = self._merge_settings(settings)
529
- crawler = ModernCrawler(spider_cls, merged_settings)
530
- self._crawlers.append(crawler)
531
-
532
- task = asyncio.create_task(self._run_with_semaphore(crawler))
533
- tasks.append(task)
534
-
535
- results = await asyncio.gather(*tasks, return_exceptions=True)
536
-
537
- # 处理结果
538
- successful = sum(1 for r in results if not isinstance(r, Exception))
539
- failed = len(results) - successful
540
-
541
- self._logger.info(f"Crawl completed: {successful} successful, {failed} failed")
542
-
543
- return results
544
-
545
- finally:
546
- self._end_time = time.time()
547
- if self._start_time:
548
- duration = self._end_time - self._start_time
549
- self._logger.info(f"Total execution time: {duration:.2f}s")
550
-
551
- async def _run_with_semaphore(self, crawler: ModernCrawler):
552
- """在信号量控制下运行爬虫"""
553
- async with self._semaphore:
554
- await crawler.crawl()
555
- return crawler
556
-
557
- def _resolve_spider_class(self, spider_cls_or_name):
558
- """解析Spider类"""
559
- if isinstance(spider_cls_or_name, str):
560
- # 从注册表中查找
561
- try:
562
- from crawlo.spider import get_global_spider_registry
563
- registry = get_global_spider_registry()
564
- if spider_cls_or_name in registry:
565
- return registry[spider_cls_or_name]
566
- else:
567
- # 如果在注册表中找不到,尝试通过spider_modules导入所有模块来触发注册
568
- # 然后再次检查注册表
569
- if hasattr(self, '_spider_modules') and self._spider_modules:
570
- for module_path in self._spider_modules:
571
- try:
572
- # 导入模块来触发爬虫注册
573
- __import__(module_path)
574
- except ImportError:
575
- pass # 忽略导入错误
576
-
577
- # 再次检查注册表
578
- if spider_cls_or_name in registry:
579
- return registry[spider_cls_or_name]
580
-
581
- # 如果仍然找不到,尝试自动发现模式
582
- if hasattr(self, '_spider_modules') and self._spider_modules:
583
- self._auto_discover_spider_modules(self._spider_modules)
584
- if spider_cls_or_name in registry:
585
- return registry[spider_cls_or_name]
586
-
587
- # 如果仍然找不到,尝试直接导入模块
588
- try:
589
- # 假设格式为 module.SpiderClass
590
- if '.' in spider_cls_or_name:
591
- module_path, class_name = spider_cls_or_name.rsplit('.', 1)
592
- module = __import__(module_path, fromlist=[class_name])
593
- spider_class = getattr(module, class_name)
594
- # 注册到全局注册表
595
- registry[spider_class.name] = spider_class
596
- return spider_class
597
- else:
598
- # 尝试在spider_modules中查找
599
- if hasattr(self, '_spider_modules') and self._spider_modules:
600
- for module_path in self._spider_modules:
601
- try:
602
- # 构造完整的模块路径
603
- full_module_path = f"{module_path}.{spider_cls_or_name}"
604
- module = __import__(full_module_path, fromlist=[spider_cls_or_name])
605
- # 获取模块中的Spider子类
606
- for attr_name in dir(module):
607
- attr_value = getattr(module, attr_name)
608
- if (isinstance(attr_value, type) and
609
- issubclass(attr_value, registry.__class__.__bases__[0]) and
610
- hasattr(attr_value, 'name') and
611
- attr_value.name == spider_cls_or_name):
612
- # 注册到全局注册表
613
- registry[spider_cls_or_name] = attr_value
614
- return attr_value
615
- except ImportError:
616
- continue
617
- raise ValueError(f"Spider '{spider_cls_or_name}' not found in registry")
618
- except (ImportError, AttributeError):
619
- raise ValueError(f"Spider '{spider_cls_or_name}' not found in registry")
620
- except ImportError:
621
- raise ValueError(f"Cannot resolve spider name '{spider_cls_or_name}'")
622
- else:
623
- return spider_cls_or_name
624
-
625
- def _merge_settings(self, additional_settings):
626
- """合并配置"""
627
- if not additional_settings:
628
- return self._settings
629
-
630
- # 这里可以实现更复杂的配置合并逻辑
631
- from crawlo.settings.setting_manager import SettingManager
632
- merged = SettingManager()
633
-
634
- # 复制基础配置
635
- if self._settings:
636
- merged.update_attributes(self._settings.__dict__)
637
-
638
- # 应用额外配置
639
- merged.update_attributes(additional_settings)
640
-
641
- return merged
642
-
643
- def get_metrics(self) -> Dict[str, Any]:
644
- """获取整体指标"""
645
- total_duration = 0.0
646
- if self._start_time and self._end_time:
647
- total_duration = self._end_time - self._start_time
648
-
649
- crawler_metrics = [crawler.metrics for crawler in self._crawlers]
650
-
651
- return {
652
- 'total_duration': total_duration,
653
- 'crawler_count': len(self._crawlers),
654
- 'total_requests': sum(m.request_count for m in crawler_metrics),
655
- 'total_success': sum(m.success_count for m in crawler_metrics),
656
- 'total_errors': sum(m.error_count for m in crawler_metrics),
657
- 'average_success_rate': sum(m.get_success_rate() for m in crawler_metrics) / len(crawler_metrics) if crawler_metrics else 0.0
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ Crawler系统
5
+ ==========
6
+
7
+ 核心组件:
8
+ - Crawler: 爬虫核心控制器,负责单个爬虫的生命周期管理
9
+ - CrawlerProcess: 爬虫进程管理器,支持单个/多个爬虫运行
10
+
11
+ 设计原则:
12
+ 1. 单一职责 - 每个类只负责一个明确的功能
13
+ 2. 依赖注入 - 通过工厂创建组件,便于测试
14
+ 3. 状态管理 - 清晰的状态转换和生命周期
15
+ 4. 错误处理 - 优雅的错误处理和恢复机制
16
+ 5. 资源管理 - 统一的资源注册和清理机制
17
+ """
18
+
19
+ import asyncio
20
+ import time
21
+ from enum import Enum
22
+ from dataclasses import dataclass
23
+ from contextlib import asynccontextmanager
24
+ from typing import Optional, Type, Dict, Any, List
25
+
26
+ from crawlo.logging import get_logger
27
+ from crawlo.factories import get_component_registry
28
+ from crawlo.initialization import initialize_framework, is_framework_ready
29
+ from crawlo.utils.resource_manager import ResourceManager, ResourceType
30
+
31
+
32
+ class CrawlerState(Enum):
33
+ """Crawler状态枚举"""
34
+ CREATED = "created"
35
+ INITIALIZING = "initializing"
36
+ READY = "ready"
37
+ RUNNING = "running"
38
+ CLOSING = "closing"
39
+ CLOSED = "closed"
40
+ ERROR = "error"
41
+
42
+
43
+ @dataclass
44
+ class CrawlerMetrics:
45
+ """Crawler性能指标"""
46
+ start_time: Optional[float] = None
47
+ end_time: Optional[float] = None
48
+ initialization_duration: float = 0.0
49
+ crawl_duration: float = 0.0
50
+ request_count: int = 0
51
+ success_count: int = 0
52
+ error_count: int = 0
53
+
54
+ def get_total_duration(self) -> float:
55
+ if self.start_time and self.end_time:
56
+ return self.end_time - self.start_time
57
+ return 0.0
58
+
59
+ def get_success_rate(self) -> float:
60
+ total = self.success_count + self.error_count
61
+ return (self.success_count / total * 100) if total > 0 else 0.0
62
+
63
+
64
+ class Crawler:
65
+ """
66
+ 爬虫核心控制器
67
+
68
+ 特点:
69
+ 1. 清晰的状态管理
70
+ 2. 依赖注入
71
+ 3. 组件化架构
72
+ 4. 完善的错误处理
73
+ 5. 统一的资源管理
74
+ """
75
+
76
+ def __init__(self, spider_cls: Type, settings=None):
77
+ self._spider_cls = spider_cls
78
+ self._settings = settings
79
+ self._state = CrawlerState.CREATED
80
+ self._state_lock = asyncio.Lock()
81
+
82
+ # 组件
83
+ self._spider = None
84
+ self._engine = None
85
+ self._stats = None
86
+ self._subscriber = None
87
+ self._extension = None
88
+
89
+ # 指标
90
+ self._metrics = CrawlerMetrics()
91
+
92
+ # 资源管理器
93
+ self._resource_manager = ResourceManager(name=f"crawler.{spider_cls.__name__ if spider_cls else 'unknown'}")
94
+
95
+ # 日志
96
+ self._logger = get_logger(f'crawler.{spider_cls.__name__ if spider_cls else "unknown"}')
97
+
98
+ # 确保框架已初始化
99
+ self._ensure_framework_ready()
100
+
101
+ def _ensure_framework_ready(self):
102
+ """确保框架已准备就绪"""
103
+ if not is_framework_ready():
104
+ try:
105
+ self._settings = initialize_framework(self._settings)
106
+ self._logger.debug("Framework initialized successfully")
107
+ except Exception as e:
108
+ self._logger.warning(f"Framework initialization failed: {e}")
109
+ # 使用降级策略
110
+ if not self._settings:
111
+ from crawlo.settings.setting_manager import SettingManager
112
+ self._settings = SettingManager()
113
+
114
+ # 确保是SettingManager实例
115
+ if isinstance(self._settings, dict):
116
+ from crawlo.settings.setting_manager import SettingManager
117
+ settings_manager = SettingManager()
118
+ settings_manager.update_attributes(self._settings)
119
+ self._settings = settings_manager
120
+
121
+ @property
122
+ def state(self) -> CrawlerState:
123
+ """获取当前状态"""
124
+ return self._state
125
+
126
+ @property
127
+ def spider(self):
128
+ """获取Spider实例"""
129
+ return self._spider
130
+
131
+ @property
132
+ def stats(self):
133
+ """获取Stats实例(向后兼容)"""
134
+ return self._stats
135
+
136
+ @property
137
+ def metrics(self) -> CrawlerMetrics:
138
+ """获取性能指标"""
139
+ return self._metrics
140
+
141
+ @property
142
+ def settings(self):
143
+ """获取配置"""
144
+ return self._settings
145
+
146
+ @property
147
+ def engine(self):
148
+ """获取Engine实例(向后兼容)"""
149
+ return self._engine
150
+
151
+ @property
152
+ def subscriber(self):
153
+ """获取Subscriber实例(向后兼容)"""
154
+ return self._subscriber
155
+
156
+ @property
157
+ def extension(self):
158
+ """获取Extension实例(向后兼容)"""
159
+ return self._extension
160
+
161
+ @extension.setter
162
+ def extension(self, value):
163
+ """设置Extension实例(向后兼容)"""
164
+ self._extension = value
165
+
166
+ def _create_extension(self):
167
+ """创建Extension管理器(向后兼容)"""
168
+ if self._extension is None:
169
+ try:
170
+ registry = get_component_registry()
171
+ self._extension = registry.create('extension_manager', crawler=self)
172
+ except Exception as e:
173
+ self._logger.warning(f"Failed to create extension manager: {e}")
174
+ return self._extension
175
+
176
+ async def close(self):
177
+ """关闭爹虫(向后兼容)"""
178
+ await self._cleanup()
179
+
180
+ async def crawl(self):
181
+ """执行爬取任务"""
182
+ async with self._lifecycle_manager():
183
+ await self._initialize_components()
184
+ await self._run_crawler()
185
+
186
+ @asynccontextmanager
187
+ async def _lifecycle_manager(self):
188
+ """生命周期管理"""
189
+ self._metrics.start_time = time.time()
190
+
191
+ try:
192
+ yield
193
+ except Exception as e:
194
+ await self._handle_error(e)
195
+ raise
196
+ finally:
197
+ await self._cleanup()
198
+ self._metrics.end_time = time.time()
199
+
200
+ async def _initialize_components(self):
201
+ """初始化组件"""
202
+ async with self._state_lock:
203
+ if self._state != CrawlerState.CREATED:
204
+ raise RuntimeError(f"Cannot initialize from state {self._state}")
205
+
206
+ self._state = CrawlerState.INITIALIZING
207
+
208
+ init_start = time.time()
209
+
210
+ try:
211
+ # 使用组件工厂创建组件
212
+ registry = get_component_registry()
213
+
214
+ # 创建Subscriber(无依赖)
215
+ self._subscriber = registry.create('subscriber')
216
+
217
+ # 创建Spider
218
+ self._spider = self._create_spider()
219
+
220
+ # 创建Engine(需要crawler参数)
221
+ self._engine = registry.create('engine', crawler=self)
222
+ # 注册Engine到资源管理器
223
+ if self._engine and hasattr(self._engine, 'close'):
224
+ self._resource_manager.register(
225
+ self._engine,
226
+ lambda e: e.close() if hasattr(e, 'close') else None,
227
+ ResourceType.OTHER,
228
+ name="engine"
229
+ )
230
+
231
+ # 创建Stats(需要crawler参数)
232
+ self._stats = registry.create('stats', crawler=self)
233
+
234
+ # 创建Extension Manager (可选,需要crawler参数)
235
+ try:
236
+ self._extension = registry.create('extension_manager', crawler=self)
237
+ except Exception as e:
238
+ self._logger.warning(f"Failed to create extension manager: {e}")
239
+
240
+ self._metrics.initialization_duration = time.time() - init_start
241
+
242
+ async with self._state_lock:
243
+ self._state = CrawlerState.READY
244
+
245
+ self._logger.debug(f"Crawler components initialized successfully in {self._metrics.initialization_duration:.2f}s")
246
+
247
+ except Exception as e:
248
+ async with self._state_lock:
249
+ self._state = CrawlerState.ERROR
250
+ raise RuntimeError(f"Component initialization failed: {e}")
251
+
252
+ def _create_spider(self):
253
+ """创建Spider实例"""
254
+ if not self._spider_cls:
255
+ raise ValueError("Spider class not provided")
256
+
257
+ # 检查Spider类的有效性
258
+ if not hasattr(self._spider_cls, 'name'):
259
+ raise ValueError("Spider class must have 'name' attribute")
260
+
261
+ # 创建Spider实例
262
+ spider = self._spider_cls()
263
+
264
+ # 设置crawler引用
265
+ if hasattr(spider, 'crawler'):
266
+ spider.crawler = self
267
+
268
+ return spider
269
+
270
+ async def _run_crawler(self):
271
+ """运行爬虫引擎"""
272
+ async with self._state_lock:
273
+ if self._state != CrawlerState.READY:
274
+ raise RuntimeError(f"Cannot run from state {self._state}")
275
+
276
+ self._state = CrawlerState.RUNNING
277
+
278
+ crawl_start = time.time()
279
+
280
+ try:
281
+ # 启动引擎
282
+ if self._engine:
283
+ await self._engine.start_spider(self._spider)
284
+ else:
285
+ raise RuntimeError("Engine not initialized")
286
+
287
+ self._metrics.crawl_duration = time.time() - crawl_start
288
+
289
+ self._logger.info(f"Crawler completed successfully in {self._metrics.crawl_duration:.2f}s")
290
+
291
+ except Exception as e:
292
+ self._metrics.crawl_duration = time.time() - crawl_start
293
+ raise RuntimeError(f"Crawler execution failed: {e}")
294
+
295
+ async def _handle_error(self, error: Exception):
296
+ """处理错误"""
297
+ async with self._state_lock:
298
+ self._state = CrawlerState.ERROR
299
+
300
+ self._metrics.error_count += 1
301
+ self._logger.error(f"Crawler error: {error}", exc_info=True)
302
+
303
+ # 这里可以添加错误恢复逻辑
304
+
305
+ async def _cleanup(self):
306
+ """清理资源"""
307
+ async with self._state_lock:
308
+ if self._state not in [CrawlerState.CLOSING, CrawlerState.CLOSED]:
309
+ self._state = CrawlerState.CLOSING
310
+
311
+ try:
312
+ # 使用资源管理器统一清理
313
+ self._logger.debug("开始清理Crawler资源...")
314
+ cleanup_result = await self._resource_manager.cleanup_all()
315
+ self._logger.debug(
316
+ f"资源清理完成: {cleanup_result['success']}成功, "
317
+ f"{cleanup_result['errors']}失败, 耗时{cleanup_result['duration']:.2f}s"
318
+ )
319
+
320
+ # 关闭各个组件(继续兼容旧逻辑)
321
+ if self._engine and hasattr(self._engine, 'close'):
322
+ try:
323
+ await self._engine.close()
324
+ except Exception as e:
325
+ self._logger.warning(f"Engine cleanup failed: {e}")
326
+
327
+ # 调用Spider的spider_closed方法
328
+ if self._spider:
329
+ try:
330
+ if asyncio.iscoroutinefunction(self._spider.spider_closed):
331
+ await self._spider.spider_closed()
332
+ else:
333
+ self._spider.spider_closed()
334
+ except Exception as e:
335
+ self._logger.warning(f"Spider cleanup failed: {e}")
336
+
337
+ # 调用StatsCollector的close_spider方法,设置reason和spider_name
338
+ if self._stats and hasattr(self._stats, 'close_spider'):
339
+ try:
340
+ # 使用默认的'finished'作为reason
341
+ self._stats.close_spider(self._spider, reason='finished')
342
+ except Exception as e:
343
+ self._logger.warning(f"Stats close_spider failed: {e}")
344
+
345
+ # 触发spider_closed事件,通知所有订阅者(包括扩展)
346
+ # 传递reason参数,这里使用默认的'finished'作为reason
347
+ if self.subscriber:
348
+ from crawlo.event import CrawlerEvent
349
+ await self.subscriber.notify(CrawlerEvent.SPIDER_CLOSED, reason='finished')
350
+
351
+ if self._stats and hasattr(self._stats, 'close'):
352
+ try:
353
+ close_result = self._stats.close()
354
+ if asyncio.iscoroutine(close_result):
355
+ await close_result
356
+ except Exception as e:
357
+ self._logger.warning(f"Stats cleanup failed: {e}")
358
+
359
+ async with self._state_lock:
360
+ self._state = CrawlerState.CLOSED
361
+
362
+ self._logger.debug("Crawler cleanup completed")
363
+
364
+ except Exception as e:
365
+ self._logger.error(f"Cleanup error: {e}")
366
+
367
+
368
+ class CrawlerProcess:
369
+ """
370
+ Crawler进程管理器 - 管理多个Crawler的执行
371
+
372
+ 简化版本,专注于核心功能
373
+ """
374
+
375
+ def __init__(self, settings=None, max_concurrency: int = 3, spider_modules=None):
376
+ # 初始化框架配置
377
+ self._settings = settings or initialize_framework()
378
+ self._max_concurrency = max_concurrency
379
+ self._crawlers: List[Crawler] = []
380
+ self._semaphore = asyncio.Semaphore(max_concurrency)
381
+ self._logger = get_logger('crawler.process')
382
+
383
+ # 如果没有显式提供spider_modules,则从settings中获取
384
+ if spider_modules is None and self._settings:
385
+ spider_modules = self._settings.get('SPIDER_MODULES', [])
386
+ self._logger.debug(f"从settings中获取SPIDER_MODULES: {spider_modules}")
387
+
388
+ self._spider_modules = spider_modules or [] # 保存spider_modules
389
+
390
+ # 如果提供了spider_modules,自动注册这些模块中的爬虫
391
+ if self._spider_modules:
392
+ self._register_spider_modules(self._spider_modules)
393
+
394
+ # 指标
395
+ self._start_time: Optional[float] = None
396
+ self._end_time: Optional[float] = None
397
+
398
+ def _register_spider_modules(self, spider_modules):
399
+ """注册爬虫模块"""
400
+ try:
401
+ from crawlo.spider import get_global_spider_registry
402
+ registry = get_global_spider_registry()
403
+
404
+ self._logger.debug(f"Registering spider modules: {spider_modules}")
405
+
406
+ initial_spider_count = len(registry)
407
+
408
+ for module_path in spider_modules:
409
+ try:
410
+ # 导入模块
411
+ __import__(module_path)
412
+ self._logger.debug(f"Successfully imported spider module: {module_path}")
413
+ except ImportError as e:
414
+ self._logger.warning(f"Failed to import spider module {module_path}: {e}")
415
+ # 如果导入失败,尝试自动发现
416
+ self._auto_discover_spider_modules([module_path])
417
+
418
+ # 检查注册表中的爬虫
419
+ spider_names = list(registry.keys())
420
+ self._logger.debug(f"Registered spiders after import: {spider_names}")
421
+
422
+ # 如果导入模块后没有新的爬虫被注册,则尝试自动发现
423
+ final_spider_count = len(registry)
424
+ if final_spider_count == initial_spider_count:
425
+ self._logger.debug("No new spiders registered after importing modules, attempting auto-discovery")
426
+ self._auto_discover_spider_modules(spider_modules)
427
+ spider_names = list(registry.keys())
428
+ self._logger.debug(f"Registered spiders after auto-discovery: {spider_names}")
429
+ except Exception as e:
430
+ self._logger.warning(f"Error registering spider modules: {e}")
431
+
432
+ def _auto_discover_spider_modules(self, spider_modules):
433
+ """
434
+ 自动发现并导入爬虫模块中的所有爬虫
435
+ 这个方法会扫描指定模块目录下的所有Python文件并自动导入
436
+ """
437
+ try:
438
+ from crawlo.spider import get_global_spider_registry
439
+ import importlib
440
+ from pathlib import Path
441
+ import sys
442
+
443
+ registry = get_global_spider_registry()
444
+ initial_spider_count = len(registry)
445
+
446
+ for module_path in spider_modules:
447
+ try:
448
+ # 将模块路径转换为文件系统路径
449
+ # 例如: ofweek_standalone.spiders -> ofweek_standalone/spiders
450
+ package_parts = module_path.split('.')
451
+ if len(package_parts) < 2:
452
+ continue
453
+
454
+ # 获取项目根目录
455
+ project_root = None
456
+ for path in sys.path:
457
+ if path and Path(path).exists():
458
+ possible_module_path = Path(path) / package_parts[0]
459
+ if possible_module_path.exists():
460
+ project_root = path
461
+ break
462
+
463
+ if not project_root:
464
+ # 尝试使用当前工作目录
465
+ project_root = str(Path.cwd())
466
+
467
+ # 构建模块目录路径
468
+ module_dir = Path(project_root)
469
+ for part in package_parts:
470
+ module_dir = module_dir / part
471
+
472
+ # 如果目录存在,扫描其中的Python文件
473
+ if module_dir.exists() and module_dir.is_dir():
474
+ # 导入目录下的所有Python文件(除了__init__.py)
475
+ for py_file in module_dir.glob("*.py"):
476
+ if py_file.name.startswith('_'):
477
+ continue
478
+
479
+ # 构造模块名
480
+ module_name = py_file.stem # 文件名(不含扩展名)
481
+ full_module_path = f"{module_path}.{module_name}"
482
+
483
+ try:
484
+ # 导入模块以触发Spider注册
485
+ importlib.import_module(full_module_path)
486
+ except ImportError as e:
487
+ self._logger.warning(f"Failed to auto-import spider module {full_module_path}: {e}")
488
+ except Exception as e:
489
+ self._logger.warning(f"Error during auto-discovery for module {module_path}: {e}")
490
+
491
+ # 检查是否有新的爬虫被注册
492
+ final_spider_count = len(registry)
493
+ if final_spider_count > initial_spider_count:
494
+ new_spiders = list(registry.keys())
495
+ self._logger.info(f"Auto-discovered {final_spider_count - initial_spider_count} new spiders: {new_spiders}")
496
+
497
+ except Exception as e:
498
+ self._logger.warning(f"Error during auto-discovery of spider modules: {e}")
499
+
500
+ def is_spider_registered(self, name: str) -> bool:
501
+ """检查爬虫是否已注册"""
502
+ from crawlo.spider import get_global_spider_registry
503
+ registry = get_global_spider_registry()
504
+ return name in registry
505
+
506
+ def get_spider_class(self, name: str):
507
+ """获取爬虫类"""
508
+ from crawlo.spider import get_global_spider_registry
509
+ registry = get_global_spider_registry()
510
+ return registry.get(name)
511
+
512
+ def get_spider_names(self):
513
+ """获取所有注册的爬虫名称"""
514
+ from crawlo.spider import get_global_spider_registry
515
+ registry = get_global_spider_registry()
516
+ return list(registry.keys())
517
+
518
+ async def crawl(self, spider_cls_or_name, settings=None):
519
+ """运行单个爬虫"""
520
+ spider_cls = self._resolve_spider_class(spider_cls_or_name)
521
+
522
+ # 记录启动的爬虫名称(符合规范要求)
523
+ from crawlo.logging import get_logger
524
+ logger = get_logger('crawlo.framework')
525
+ logger.info(f"Starting spider: {spider_cls.name}")
526
+
527
+ merged_settings = self._merge_settings(settings)
528
+ crawler = Crawler(spider_cls, merged_settings)
529
+
530
+ async with self._semaphore:
531
+ await crawler.crawl()
532
+
533
+ return crawler
534
+
535
+ async def crawl_multiple(self, spider_classes_or_names, settings=None):
536
+ """运行多个爬虫"""
537
+ self._start_time = time.time()
538
+
539
+ try:
540
+ spider_classes = []
541
+ for cls_or_name in spider_classes_or_names:
542
+ spider_cls = self._resolve_spider_class(cls_or_name)
543
+ spider_classes.append(spider_cls)
544
+
545
+ # 记录启动的爬虫名称(符合规范要求)
546
+ spider_names = [cls.name for cls in spider_classes]
547
+ from crawlo.logging import get_logger
548
+ logger = get_logger('crawlo.framework')
549
+ if len(spider_names) == 1:
550
+ logger.info(f"Starting spider: {spider_names[0]}")
551
+ else:
552
+ logger.info(f"Starting spiders: {', '.join(spider_names)}")
553
+
554
+ tasks = []
555
+ for spider_cls in spider_classes:
556
+ merged_settings = self._merge_settings(settings)
557
+ crawler = Crawler(spider_cls, merged_settings)
558
+ self._crawlers.append(crawler)
559
+
560
+ task = asyncio.create_task(self._run_with_semaphore(crawler))
561
+ tasks.append(task)
562
+
563
+ results = await asyncio.gather(*tasks, return_exceptions=True)
564
+
565
+ # 处理结果
566
+ successful = sum(1 for r in results if not isinstance(r, Exception))
567
+ failed = len(results) - successful
568
+
569
+ self._logger.info(f"Crawl completed: {successful} successful, {failed} failed")
570
+
571
+ return results
572
+
573
+ finally:
574
+ # 清理所有crawler,防止资源累积
575
+ self._logger.debug(f"Cleaning up {len(self._crawlers)} crawler(s)...")
576
+ for crawler in self._crawlers:
577
+ try:
578
+ # 确保每个crawler都被清理
579
+ if hasattr(crawler, '_resource_manager'):
580
+ await crawler._resource_manager.cleanup_all()
581
+ except Exception as e:
582
+ self._logger.warning(f"Failed to cleanup crawler: {e}")
583
+
584
+ # 清空crawlers列表,释放引用
585
+ self._crawlers.clear()
586
+
587
+ self._end_time = time.time()
588
+ if self._start_time:
589
+ duration = self._end_time - self._start_time
590
+ self._logger.info(f"Total execution time: {duration:.2f}s")
591
+
592
+ async def _run_with_semaphore(self, crawler: Crawler):
593
+ """在信号量控制下运行爬虫"""
594
+ async with self._semaphore:
595
+ await crawler.crawl()
596
+ return crawler
597
+
598
+ def _resolve_spider_class(self, spider_cls_or_name):
599
+ """解析Spider类"""
600
+ if isinstance(spider_cls_or_name, str):
601
+ # 从注册表中查找
602
+ try:
603
+ from crawlo.spider import get_global_spider_registry
604
+ registry = get_global_spider_registry()
605
+ if spider_cls_or_name in registry:
606
+ return registry[spider_cls_or_name]
607
+ else:
608
+ # 如果在注册表中找不到,尝试通过spider_modules导入所有模块来触发注册
609
+ # 然后再次检查注册表
610
+ if hasattr(self, '_spider_modules') and self._spider_modules:
611
+ for module_path in self._spider_modules:
612
+ try:
613
+ # 导入模块来触发爬虫注册
614
+ __import__(module_path)
615
+ except ImportError:
616
+ pass # 忽略导入错误
617
+
618
+ # 再次检查注册表
619
+ if spider_cls_or_name in registry:
620
+ return registry[spider_cls_or_name]
621
+
622
+ # 如果仍然找不到,尝试自动发现模式
623
+ if hasattr(self, '_spider_modules') and self._spider_modules:
624
+ self._auto_discover_spider_modules(self._spider_modules)
625
+ if spider_cls_or_name in registry:
626
+ return registry[spider_cls_or_name]
627
+
628
+ # 如果仍然找不到,尝试直接导入模块
629
+ try:
630
+ # 假设格式为 module.SpiderClass
631
+ if '.' in spider_cls_or_name:
632
+ module_path, class_name = spider_cls_or_name.rsplit('.', 1)
633
+ module = __import__(module_path, fromlist=[class_name])
634
+ spider_class = getattr(module, class_name)
635
+ # 注册到全局注册表
636
+ registry[spider_class.name] = spider_class
637
+ return spider_class
638
+ else:
639
+ # 尝试在spider_modules中查找
640
+ if hasattr(self, '_spider_modules') and self._spider_modules:
641
+ for module_path in self._spider_modules:
642
+ try:
643
+ # 构造完整的模块路径
644
+ full_module_path = f"{module_path}.{spider_cls_or_name}"
645
+ module = __import__(full_module_path, fromlist=[spider_cls_or_name])
646
+ # 获取模块中的Spider子类
647
+ for attr_name in dir(module):
648
+ attr_value = getattr(module, attr_name)
649
+ if (isinstance(attr_value, type) and
650
+ issubclass(attr_value, registry.__class__.__bases__[0]) and
651
+ hasattr(attr_value, 'name') and
652
+ attr_value.name == spider_cls_or_name):
653
+ # 注册到全局注册表
654
+ registry[spider_cls_or_name] = attr_value
655
+ return attr_value
656
+ except ImportError:
657
+ continue
658
+ raise ValueError(f"Spider '{spider_cls_or_name}' not found in registry")
659
+ except (ImportError, AttributeError):
660
+ raise ValueError(f"Spider '{spider_cls_or_name}' not found in registry")
661
+ except ImportError:
662
+ raise ValueError(f"Cannot resolve spider name '{spider_cls_or_name}'")
663
+ else:
664
+ return spider_cls_or_name
665
+
666
+ def _merge_settings(self, additional_settings):
667
+ """合并配置"""
668
+ if not additional_settings:
669
+ return self._settings
670
+
671
+ # 这里可以实现更复杂的配置合并逻辑
672
+ from crawlo.settings.setting_manager import SettingManager
673
+ merged = SettingManager()
674
+
675
+ # 复制基础配置
676
+ if self._settings:
677
+ merged.update_attributes(self._settings.__dict__)
678
+
679
+ # 应用额外配置
680
+ merged.update_attributes(additional_settings)
681
+
682
+ return merged
683
+
684
+ def get_metrics(self) -> Dict[str, Any]:
685
+ """获取整体指标"""
686
+ total_duration = 0.0
687
+ if self._start_time and self._end_time:
688
+ total_duration = self._end_time - self._start_time
689
+
690
+ crawler_metrics = [crawler.metrics for crawler in self._crawlers]
691
+
692
+ return {
693
+ 'total_duration': total_duration,
694
+ 'crawler_count': len(self._crawlers),
695
+ 'total_requests': sum(m.request_count for m in crawler_metrics),
696
+ 'total_success': sum(m.success_count for m in crawler_metrics),
697
+ 'total_errors': sum(m.error_count for m in crawler_metrics),
698
+ 'average_success_rate': sum(m.get_success_rate() for m in crawler_metrics) / len(crawler_metrics) if crawler_metrics else 0.0
658
699
  }
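
For orientation, the sketch below shows how the renamed Crawler / CrawlerProcess API from the 1.4.7 crawler.py above might be driven. It is a minimal, hypothetical example: only the CrawlerProcess constructor (settings=None, max_concurrency=3, spider_modules=None) and the crawl() / crawl_multiple() coroutines are taken from the diff; the spider names and the "myproject.spiders" module path are placeholders.

import asyncio

from crawlo.crawler import CrawlerProcess


async def main():
    # When spider_modules is omitted, CrawlerProcess falls back to the
    # SPIDER_MODULES entry in the framework settings (see __init__ above).
    process = CrawlerProcess(
        max_concurrency=3,                     # default shown in the diff
        spider_modules=["myproject.spiders"],  # hypothetical module path
    )

    # Run one spider by its registered name (resolved through the global
    # spider registry) or by passing the spider class directly.
    await process.crawl("example_spider")

    # Or run several spiders concurrently; execution is bounded by the
    # internal semaphore (max_concurrency), and 1.4.7 additionally cleans
    # up each crawler's ResourceManager when the batch finishes.
    await process.crawl_multiple(["spider_a", "spider_b"])


if __name__ == "__main__":
    asyncio.run(main())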