crawlo-1.4.7-py3-none-any.whl → crawlo-1.4.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crawlo might be problematic.

Files changed (348)
  1. crawlo/__init__.py +90 -90
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +186 -186
  7. crawlo/commands/help.py +140 -140
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +379 -379
  10. crawlo/commands/startproject.py +460 -460
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +320 -320
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +52 -52
  16. crawlo/core/engine.py +451 -451
  17. crawlo/core/processor.py +47 -47
  18. crawlo/core/scheduler.py +290 -290
  19. crawlo/crawler.py +698 -698
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +280 -280
  23. crawlo/downloader/aiohttp_downloader.py +233 -233
  24. crawlo/downloader/cffi_downloader.py +250 -250
  25. crawlo/downloader/httpx_downloader.py +265 -265
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +425 -425
  28. crawlo/downloader/selenium_downloader.py +486 -486
  29. crawlo/event.py +45 -45
  30. crawlo/exceptions.py +214 -214
  31. crawlo/extension/__init__.py +64 -64
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +94 -94
  34. crawlo/extension/log_stats.py +70 -70
  35. crawlo/extension/logging_extension.py +53 -53
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +104 -104
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/factories/utils.py +134 -134
  44. crawlo/filters/__init__.py +170 -170
  45. crawlo/filters/aioredis_filter.py +347 -347
  46. crawlo/filters/memory_filter.py +261 -261
  47. crawlo/framework.py +306 -306
  48. crawlo/initialization/__init__.py +44 -44
  49. crawlo/initialization/built_in.py +391 -391
  50. crawlo/initialization/context.py +141 -141
  51. crawlo/initialization/core.py +240 -240
  52. crawlo/initialization/phases.py +229 -229
  53. crawlo/initialization/registry.py +143 -143
  54. crawlo/initialization/utils.py +48 -48
  55. crawlo/interfaces.py +23 -23
  56. crawlo/items/__init__.py +23 -23
  57. crawlo/items/base.py +23 -23
  58. crawlo/items/fields.py +52 -52
  59. crawlo/items/items.py +104 -104
  60. crawlo/logging/__init__.py +42 -42
  61. crawlo/logging/config.py +280 -276
  62. crawlo/logging/factory.py +175 -175
  63. crawlo/logging/manager.py +104 -104
  64. crawlo/middleware/__init__.py +87 -87
  65. crawlo/middleware/default_header.py +132 -132
  66. crawlo/middleware/download_delay.py +104 -104
  67. crawlo/middleware/middleware_manager.py +142 -142
  68. crawlo/middleware/offsite.py +123 -123
  69. crawlo/middleware/proxy.py +209 -209
  70. crawlo/middleware/request_ignore.py +86 -86
  71. crawlo/middleware/response_code.py +150 -150
  72. crawlo/middleware/response_filter.py +136 -136
  73. crawlo/middleware/retry.py +124 -124
  74. crawlo/mode_manager.py +287 -287
  75. crawlo/network/__init__.py +21 -21
  76. crawlo/network/request.py +408 -376
  77. crawlo/network/response.py +598 -569
  78. crawlo/pipelines/__init__.py +52 -52
  79. crawlo/pipelines/base_pipeline.py +452 -452
  80. crawlo/pipelines/bloom_dedup_pipeline.py +145 -146
  81. crawlo/pipelines/console_pipeline.py +39 -39
  82. crawlo/pipelines/csv_pipeline.py +316 -316
  83. crawlo/pipelines/database_dedup_pipeline.py +196 -197
  84. crawlo/pipelines/json_pipeline.py +218 -218
  85. crawlo/pipelines/memory_dedup_pipeline.py +104 -105
  86. crawlo/pipelines/mongo_pipeline.py +140 -139
  87. crawlo/pipelines/mysql_pipeline.py +468 -469
  88. crawlo/pipelines/pipeline_manager.py +100 -100
  89. crawlo/pipelines/redis_dedup_pipeline.py +155 -155
  90. crawlo/project.py +347 -347
  91. crawlo/queue/__init__.py +9 -9
  92. crawlo/queue/pqueue.py +38 -38
  93. crawlo/queue/queue_manager.py +591 -591
  94. crawlo/queue/redis_priority_queue.py +518 -518
  95. crawlo/settings/__init__.py +7 -7
  96. crawlo/settings/default_settings.py +287 -284
  97. crawlo/settings/setting_manager.py +219 -219
  98. crawlo/spider/__init__.py +658 -657
  99. crawlo/stats_collector.py +81 -81
  100. crawlo/subscriber.py +129 -129
  101. crawlo/task_manager.py +138 -138
  102. crawlo/templates/crawlo.cfg.tmpl +10 -10
  103. crawlo/templates/project/__init__.py.tmpl +1 -1
  104. crawlo/templates/project/items.py.tmpl +13 -13
  105. crawlo/templates/project/middlewares.py.tmpl +38 -38
  106. crawlo/templates/project/pipelines.py.tmpl +35 -35
  107. crawlo/templates/project/settings.py.tmpl +113 -109
  108. crawlo/templates/project/settings_distributed.py.tmpl +160 -156
  109. crawlo/templates/project/settings_gentle.py.tmpl +174 -170
  110. crawlo/templates/project/settings_high_performance.py.tmpl +175 -171
  111. crawlo/templates/project/settings_minimal.py.tmpl +102 -98
  112. crawlo/templates/project/settings_simple.py.tmpl +172 -168
  113. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  114. crawlo/templates/run.py.tmpl +23 -23
  115. crawlo/templates/spider/spider.py.tmpl +32 -32
  116. crawlo/templates/spiders_init.py.tmpl +4 -4
  117. crawlo/tools/__init__.py +86 -86
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +384 -384
  120. crawlo/tools/scenario_adapter.py +262 -262
  121. crawlo/tools/text_cleaner.py +232 -232
  122. crawlo/utils/__init__.py +74 -50
  123. crawlo/utils/batch_processor.py +276 -276
  124. crawlo/utils/config_manager.py +442 -442
  125. crawlo/utils/controlled_spider_mixin.py +439 -439
  126. crawlo/utils/db_helper.py +250 -250
  127. crawlo/utils/encoding_helper.py +190 -0
  128. crawlo/utils/error_handler.py +410 -410
  129. crawlo/utils/fingerprint.py +121 -121
  130. crawlo/utils/func_tools.py +82 -82
  131. crawlo/utils/large_scale_helper.py +344 -344
  132. crawlo/utils/leak_detector.py +335 -335
  133. crawlo/utils/misc.py +81 -81
  134. crawlo/utils/mongo_connection_pool.py +157 -157
  135. crawlo/utils/mysql_connection_pool.py +197 -197
  136. crawlo/utils/performance_monitor.py +285 -285
  137. crawlo/utils/queue_helper.py +175 -175
  138. crawlo/utils/redis_checker.py +90 -90
  139. crawlo/utils/redis_connection_pool.py +578 -578
  140. crawlo/utils/redis_key_validator.py +198 -198
  141. crawlo/utils/request.py +278 -278
  142. crawlo/utils/request_serializer.py +225 -225
  143. crawlo/utils/resource_manager.py +337 -337
  144. crawlo/utils/response_helper.py +113 -0
  145. crawlo/utils/selector_helper.py +138 -137
  146. crawlo/utils/singleton.py +69 -69
  147. crawlo/utils/spider_loader.py +201 -201
  148. crawlo/utils/text_helper.py +94 -94
  149. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/METADATA +831 -689
  150. crawlo-1.4.8.dist-info/RECORD +347 -0
  151. examples/__init__.py +7 -7
  152. tests/__init__.py +7 -7
  153. tests/advanced_tools_example.py +217 -217
  154. tests/authenticated_proxy_example.py +110 -110
  155. tests/baidu_performance_test.py +108 -108
  156. tests/baidu_test.py +59 -59
  157. tests/bug_check_test.py +250 -250
  158. tests/cleaners_example.py +160 -160
  159. tests/comprehensive_framework_test.py +212 -212
  160. tests/comprehensive_test.py +81 -81
  161. tests/comprehensive_testing_summary.md +186 -186
  162. tests/config_validation_demo.py +142 -142
  163. tests/controlled_spider_example.py +205 -205
  164. tests/date_tools_example.py +180 -180
  165. tests/debug_configure.py +69 -69
  166. tests/debug_framework_logger.py +84 -84
  167. tests/debug_log_config.py +126 -126
  168. tests/debug_log_levels.py +63 -63
  169. tests/debug_pipelines.py +66 -66
  170. tests/detailed_log_test.py +233 -233
  171. tests/direct_selector_helper_test.py +96 -96
  172. tests/distributed_dedup_test.py +467 -467
  173. tests/distributed_test.py +66 -66
  174. tests/distributed_test_debug.py +76 -76
  175. tests/dynamic_loading_example.py +523 -523
  176. tests/dynamic_loading_test.py +104 -104
  177. tests/error_handling_example.py +171 -171
  178. tests/explain_mysql_update_behavior.py +76 -76
  179. tests/final_comprehensive_test.py +151 -151
  180. tests/final_log_test.py +260 -260
  181. tests/final_validation_test.py +182 -182
  182. tests/fix_log_test.py +142 -142
  183. tests/framework_performance_test.py +202 -202
  184. tests/log_buffering_test.py +111 -111
  185. tests/log_generation_timing_test.py +153 -153
  186. tests/monitor_redis_dedup.sh +72 -72
  187. tests/ofweek_scrapy/ofweek_scrapy/items.py +12 -12
  188. tests/ofweek_scrapy/ofweek_scrapy/middlewares.py +100 -100
  189. tests/ofweek_scrapy/ofweek_scrapy/pipelines.py +13 -13
  190. tests/ofweek_scrapy/ofweek_scrapy/settings.py +84 -84
  191. tests/ofweek_scrapy/ofweek_scrapy/spiders/__init__.py +4 -4
  192. tests/ofweek_scrapy/scrapy.cfg +11 -11
  193. tests/optimized_performance_test.py +211 -211
  194. tests/performance_comparison.py +244 -244
  195. tests/queue_blocking_test.py +113 -113
  196. tests/queue_test.py +89 -89
  197. tests/redis_key_validation_demo.py +130 -130
  198. tests/request_params_example.py +150 -150
  199. tests/response_improvements_example.py +144 -144
  200. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  201. tests/scrapy_comparison/scrapy_test.py +133 -133
  202. tests/simple_cli_test.py +54 -54
  203. tests/simple_command_test.py +119 -119
  204. tests/simple_crawlo_test.py +126 -126
  205. tests/simple_follow_test.py +38 -38
  206. tests/simple_log_test2.py +137 -137
  207. tests/simple_optimization_test.py +128 -128
  208. tests/simple_queue_type_test.py +41 -41
  209. tests/simple_response_selector_test.py +94 -94
  210. tests/simple_selector_helper_test.py +154 -154
  211. tests/simple_selector_test.py +207 -207
  212. tests/simple_spider_test.py +49 -49
  213. tests/simple_url_test.py +73 -73
  214. tests/simulate_mysql_update_test.py +139 -139
  215. tests/spider_log_timing_test.py +177 -177
  216. tests/test_advanced_tools.py +148 -148
  217. tests/test_all_commands.py +230 -230
  218. tests/test_all_pipeline_fingerprints.py +133 -133
  219. tests/test_all_redis_key_configs.py +145 -145
  220. tests/test_asyncmy_usage.py +56 -56
  221. tests/test_batch_processor.py +178 -178
  222. tests/test_cleaners.py +54 -54
  223. tests/test_cli_arguments.py +118 -118
  224. tests/test_component_factory.py +174 -174
  225. tests/test_config_consistency.py +80 -80
  226. tests/test_config_merge.py +152 -152
  227. tests/test_config_validator.py +182 -182
  228. tests/test_controlled_spider_mixin.py +79 -79
  229. tests/test_crawler_process_import.py +38 -38
  230. tests/test_crawler_process_spider_modules.py +47 -47
  231. tests/test_crawlo_proxy_integration.py +114 -114
  232. tests/test_date_tools.py +123 -123
  233. tests/test_dedup_fix.py +220 -220
  234. tests/test_dedup_pipeline_consistency.py +124 -124
  235. tests/test_default_header_middleware.py +313 -313
  236. tests/test_distributed.py +65 -65
  237. tests/test_double_crawlo_fix.py +204 -204
  238. tests/test_double_crawlo_fix_simple.py +124 -124
  239. tests/test_download_delay_middleware.py +221 -221
  240. tests/test_downloader_proxy_compatibility.py +272 -272
  241. tests/test_edge_cases.py +305 -305
  242. tests/test_encoding_core.py +56 -56
  243. tests/test_encoding_detection.py +126 -126
  244. tests/test_enhanced_error_handler.py +270 -270
  245. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  246. tests/test_error_handler_compatibility.py +112 -112
  247. tests/test_factories.py +252 -252
  248. tests/test_factory_compatibility.py +196 -196
  249. tests/test_final_validation.py +153 -153
  250. tests/test_fingerprint_consistency.py +135 -135
  251. tests/test_fingerprint_simple.py +51 -51
  252. tests/test_get_component_logger.py +83 -83
  253. tests/test_hash_performance.py +99 -99
  254. tests/test_integration.py +169 -169
  255. tests/test_item_dedup_redis_key.py +122 -122
  256. tests/test_large_scale_helper.py +235 -235
  257. tests/test_logging_enhancements.py +374 -374
  258. tests/test_logging_final.py +184 -184
  259. tests/test_logging_integration.py +312 -312
  260. tests/test_logging_system.py +282 -282
  261. tests/test_middleware_debug.py +141 -141
  262. tests/test_mode_consistency.py +51 -51
  263. tests/test_multi_directory.py +67 -67
  264. tests/test_multiple_spider_modules.py +80 -80
  265. tests/test_mysql_pipeline_config.py +164 -164
  266. tests/test_mysql_pipeline_error.py +98 -98
  267. tests/test_mysql_pipeline_init_log.py +82 -82
  268. tests/test_mysql_pipeline_integration.py +132 -132
  269. tests/test_mysql_pipeline_refactor.py +143 -143
  270. tests/test_mysql_pipeline_refactor_simple.py +85 -85
  271. tests/test_mysql_pipeline_robustness.py +195 -195
  272. tests/test_mysql_pipeline_types.py +88 -88
  273. tests/test_mysql_update_columns.py +93 -93
  274. tests/test_offsite_middleware.py +244 -244
  275. tests/test_offsite_middleware_simple.py +203 -203
  276. tests/test_optimized_selector_naming.py +100 -100
  277. tests/test_parsel.py +29 -29
  278. tests/test_performance.py +327 -327
  279. tests/test_performance_monitor.py +115 -115
  280. tests/test_pipeline_fingerprint_consistency.py +86 -86
  281. tests/test_priority_behavior.py +211 -211
  282. tests/test_priority_consistency.py +151 -151
  283. tests/test_priority_consistency_fixed.py +249 -249
  284. tests/test_proxy_health_check.py +32 -32
  285. tests/test_proxy_middleware.py +217 -217
  286. tests/test_proxy_middleware_enhanced.py +212 -212
  287. tests/test_proxy_middleware_integration.py +142 -142
  288. tests/test_proxy_middleware_refactored.py +207 -207
  289. tests/test_proxy_only.py +83 -83
  290. tests/test_proxy_providers.py +56 -56
  291. tests/test_proxy_stats.py +19 -19
  292. tests/test_proxy_strategies.py +59 -59
  293. tests/test_proxy_with_downloader.py +152 -152
  294. tests/test_queue_empty_check.py +41 -41
  295. tests/test_queue_manager_double_crawlo.py +173 -173
  296. tests/test_queue_manager_redis_key.py +179 -179
  297. tests/test_queue_naming.py +154 -154
  298. tests/test_queue_type.py +106 -106
  299. tests/test_queue_type_redis_config_consistency.py +130 -130
  300. tests/test_random_headers_default.py +322 -322
  301. tests/test_random_headers_necessity.py +308 -308
  302. tests/test_random_user_agent.py +72 -72
  303. tests/test_redis_config.py +28 -28
  304. tests/test_redis_connection_pool.py +294 -294
  305. tests/test_redis_key_naming.py +181 -181
  306. tests/test_redis_key_validator.py +123 -123
  307. tests/test_redis_queue.py +224 -224
  308. tests/test_redis_queue_name_fix.py +175 -175
  309. tests/test_redis_queue_type_fallback.py +129 -129
  310. tests/test_request_ignore_middleware.py +182 -182
  311. tests/test_request_params.py +111 -111
  312. tests/test_request_serialization.py +70 -70
  313. tests/test_response_code_middleware.py +349 -349
  314. tests/test_response_filter_middleware.py +427 -427
  315. tests/test_response_follow.py +104 -104
  316. tests/test_response_improvements.py +152 -152
  317. tests/test_response_selector_methods.py +92 -92
  318. tests/test_response_url_methods.py +70 -70
  319. tests/test_response_urljoin.py +86 -86
  320. tests/test_retry_middleware.py +333 -333
  321. tests/test_retry_middleware_realistic.py +273 -273
  322. tests/test_scheduler.py +252 -252
  323. tests/test_scheduler_config_update.py +133 -133
  324. tests/test_scrapy_style_encoding.py +112 -112
  325. tests/test_selector_helper.py +100 -100
  326. tests/test_selector_optimizations.py +146 -146
  327. tests/test_simple_response.py +61 -61
  328. tests/test_spider_loader.py +49 -49
  329. tests/test_spider_loader_comprehensive.py +69 -69
  330. tests/test_spider_modules.py +84 -84
  331. tests/test_spiders/test_spider.py +9 -9
  332. tests/test_telecom_spider_redis_key.py +205 -205
  333. tests/test_template_content.py +87 -87
  334. tests/test_template_redis_key.py +134 -134
  335. tests/test_tools.py +159 -159
  336. tests/test_user_agent_randomness.py +176 -176
  337. tests/test_user_agents.py +96 -96
  338. tests/untested_features_report.md +138 -138
  339. tests/verify_debug.py +51 -51
  340. tests/verify_distributed.py +117 -117
  341. tests/verify_log_fix.py +111 -111
  342. tests/verify_mysql_warnings.py +109 -109
  343. crawlo/utils/log.py +0 -80
  344. crawlo/utils/url_utils.py +0 -40
  345. crawlo-1.4.7.dist-info/RECORD +0 -347
  346. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/WHEEL +0 -0
  347. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/entry_points.txt +0 -0
  348. {crawlo-1.4.7.dist-info → crawlo-1.4.8.dist-info}/top_level.txt +0 -0
@@ -1,591 +1,591 @@
1
- #!/usr/bin/python
2
- # -*- coding: UTF-8 -*-
3
- """
4
- 统一的队列管理器
5
- 提供简洁、一致的队列接口,自动处理不同队列类型的差异
6
- """
7
- import asyncio
8
- import time
9
- import traceback
10
- from enum import Enum
11
- from typing import Optional, Dict, Any, Union, TYPE_CHECKING
12
-
13
- if TYPE_CHECKING:
14
- from crawlo import Request
15
-
16
- from crawlo.queue.pqueue import SpiderPriorityQueue
17
- from crawlo.utils.error_handler import ErrorHandler
18
- from crawlo.logging import get_logger
19
- from crawlo.utils.request_serializer import RequestSerializer
20
-
21
- try:
22
- # 使用完整版Redis队列
23
- from crawlo.queue.redis_priority_queue import RedisPriorityQueue
24
-
25
- REDIS_AVAILABLE = True
26
- except ImportError:
27
- RedisPriorityQueue = None
28
- REDIS_AVAILABLE = False
29
-
30
-
31
- class QueueType(Enum):
32
- """Queue type enumeration"""
33
- MEMORY = "memory"
34
- REDIS = "redis"
35
- AUTO = "auto" # 自动选择
36
-
37
-
38
- class IntelligentScheduler:
39
- """智能调度器"""
40
-
41
- def __init__(self):
42
- self.domain_stats = {} # 域名统计信息
43
- self.url_stats = {} # URL统计信息
44
- self.last_request_time = {} # 最后请求时间
45
-
46
- def calculate_priority(self, request: "Request") -> int:
47
- """计算请求的智能优先级"""
48
- priority = getattr(request, 'priority', 0)
49
-
50
- # 获取域名
51
- domain = self._extract_domain(request.url)
52
-
53
- # 基于域名访问频率调整优先级
54
- if domain in self.domain_stats:
55
- domain_access_count = self.domain_stats[domain]['count']
56
- last_access_time = self.domain_stats[domain]['last_time']
57
-
58
- # 如果最近访问过该域名,降低优先级(避免过度集中访问同一域名)
59
- time_since_last = time.time() - last_access_time
60
- if time_since_last < 5: # 5秒内访问过
61
- priority -= 2
62
- elif time_since_last < 30: # 30秒内访问过
63
- priority -= 1
64
-
65
- # 如果该域名访问次数过多,进一步降低优先级
66
- if domain_access_count > 10:
67
- priority -= 1
68
-
69
- # 基于URL访问历史调整优先级
70
- if request.url in self.url_stats:
71
- url_access_count = self.url_stats[request.url]
72
- if url_access_count > 1:
73
- # 重复URL降低优先级
74
- priority -= url_access_count
75
-
76
- # 基于深度调整优先级
77
- depth = getattr(request, 'meta', {}).get('depth', 0)
78
- priority -= depth # 深度越大,优先级越低
79
-
80
- return priority
81
-
82
- def update_stats(self, request: "Request"):
83
- """更新统计信息"""
84
- domain = self._extract_domain(request.url)
85
-
86
- # 更新域名统计
87
- if domain not in self.domain_stats:
88
- self.domain_stats[domain] = {'count': 0, 'last_time': 0}
89
-
90
- self.domain_stats[domain]['count'] += 1
91
- self.domain_stats[domain]['last_time'] = time.time()
92
-
93
- # 更新URL统计
94
- if request.url not in self.url_stats:
95
- self.url_stats[request.url] = 0
96
- self.url_stats[request.url] += 1
97
-
98
- # 更新最后请求时间
99
- self.last_request_time[domain] = time.time()
100
-
101
- def _extract_domain(self, url: str) -> str:
102
- """提取域名"""
103
- try:
104
- from urllib.parse import urlparse
105
- parsed = urlparse(url)
106
- return parsed.netloc
107
- except:
108
- return "unknown"
109
-
110
-
111
- class QueueConfig:
112
- """Queue configuration class"""
113
-
114
- def __init__(
115
- self,
116
- queue_type: Union[QueueType, str] = QueueType.AUTO,
117
- redis_url: Optional[str] = None,
118
- redis_host: str = "127.0.0.1",
119
- redis_port: int = 6379,
120
- redis_password: Optional[str] = None,
121
- redis_db: int = 0,
122
- queue_name: str = "crawlo:requests",
123
- max_queue_size: int = 1000,
124
- max_retries: int = 3,
125
- timeout: int = 300,
126
- run_mode: Optional[str] = None, # 新增:运行模式
127
- **kwargs
128
- ):
129
- self.queue_type = QueueType(queue_type) if isinstance(queue_type, str) else queue_type
130
- self.run_mode = run_mode # 保存运行模式
131
-
132
- # Redis 配置
133
- if redis_url:
134
- self.redis_url = redis_url
135
- else:
136
- if redis_password:
137
- self.redis_url = f"redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}"
138
- else:
139
- self.redis_url = f"redis://{redis_host}:{redis_port}/{redis_db}"
140
-
141
- self.queue_name = queue_name
142
- self.max_queue_size = max_queue_size
143
- self.max_retries = max_retries
144
- self.timeout = timeout
145
- self.extra_config = kwargs
146
-
147
- @classmethod
148
- def from_settings(cls, settings) -> 'QueueConfig':
149
- """Create configuration from settings"""
150
- # 获取项目名称,用于生成默认队列名称
151
- project_name = settings.get('PROJECT_NAME', 'default')
152
- default_queue_name = f"crawlo:{project_name}:queue:requests"
153
-
154
- # 如果设置了SCHEDULER_QUEUE_NAME,则使用该值,否则使用基于项目名称的默认值
155
- scheduler_queue_name = settings.get('SCHEDULER_QUEUE_NAME')
156
- if scheduler_queue_name is not None:
157
- queue_name = scheduler_queue_name
158
- else:
159
- queue_name = default_queue_name
160
-
161
- return cls(
162
- queue_type=settings.get('QUEUE_TYPE', QueueType.AUTO),
163
- redis_url=settings.get('REDIS_URL'),
164
- redis_host=settings.get('REDIS_HOST', '127.0.0.1'),
165
- redis_port=settings.get_int('REDIS_PORT', 6379),
166
- redis_password=settings.get('REDIS_PASSWORD'),
167
- redis_db=settings.get_int('REDIS_DB', 0),
168
- queue_name=queue_name,
169
- max_queue_size=settings.get_int('SCHEDULER_MAX_QUEUE_SIZE', 1000),
170
- max_retries=settings.get_int('QUEUE_MAX_RETRIES', 3),
171
- timeout=settings.get_int('QUEUE_TIMEOUT', 300),
172
- run_mode=settings.get('RUN_MODE') # 传递运行模式
173
- )
174
-
175
-
176
- class QueueManager:
177
- """Unified queue manager"""
178
-
179
- def __init__(self, config: QueueConfig):
180
- self.config = config
181
- # 延迟初始化logger和error_handler避免循环依赖
182
- self._logger = None
183
- self._error_handler = None
184
- self.request_serializer = RequestSerializer()
185
- self._queue = None
186
- self._queue_semaphore = None
187
- self._queue_type = None
188
- self._health_status = "unknown"
189
- self._intelligent_scheduler = IntelligentScheduler() # 智能调度器
190
-
191
- @property
192
- def logger(self):
193
- if self._logger is None:
194
- self._logger = get_logger(self.__class__.__name__)
195
- return self._logger
196
-
197
- @property
198
- def error_handler(self):
199
- if self._error_handler is None:
200
- self._error_handler = ErrorHandler(self.__class__.__name__)
201
- return self._error_handler
202
-
203
- async def initialize(self) -> bool:
204
- """初始化队列"""
205
- try:
206
- queue_type = await self._determine_queue_type()
207
- self._queue = await self._create_queue(queue_type)
208
- self._queue_type = queue_type
209
-
210
- # 测试队列健康状态
211
- health_check_result = await self._health_check()
212
-
213
- self.logger.info(f"Queue initialized successfully Type: {queue_type.value}")
214
- # 只在调试模式下输出详细配置信息
215
- self.logger.debug(f"Queue configuration: {self._get_queue_info()}")
216
-
217
- # 如果健康检查返回True,表示队列类型发生了切换,需要更新配置
218
- if health_check_result:
219
- return True
220
-
221
- # 如果队列类型是Redis,检查是否需要更新配置
222
- if queue_type == QueueType.REDIS:
223
- # 这个检查需要在调度器中进行,因为队列管理器无法访问crawler.settings
224
- # 但我们不需要总是返回True,只有在确实需要更新时才返回True
225
- # 调度器会进行更详细的检查
226
- pass
227
-
228
- return False # 默认不需要更新配置
229
-
230
- except RuntimeError as e:
231
- # Distributed 模式下的 RuntimeError 必须重新抛出
232
- if self.config.run_mode == 'distributed':
233
- self.logger.error(f"Queue initialization failed: {e}")
234
- self._health_status = "error"
235
- raise # 重新抛出异常
236
- # 其他模式记录错误但不抛出
237
- self.logger.error(f"Queue initialization failed: {e}")
238
- self.logger.debug(f"详细错误信息:\n{traceback.format_exc()}")
239
- self._health_status = "error"
240
- return False
241
- except Exception as e:
242
- # 记录详细的错误信息和堆栈跟踪
243
- self.logger.error(f"Queue initialization failed: {e}")
244
- self.logger.debug(f"详细错误信息:\n{traceback.format_exc()}")
245
- self._health_status = "error"
246
- return False
247
-
248
- async def put(self, request: "Request", priority: int = 0) -> bool:
249
- """Unified enqueue interface"""
250
- if not self._queue:
251
- raise RuntimeError("队列未初始化")
252
-
253
- try:
254
- # 应用智能调度算法计算优先级
255
- intelligent_priority = self._intelligent_scheduler.calculate_priority(request)
256
- # 结合原始优先级和智能优先级
257
- final_priority = priority + intelligent_priority
258
-
259
- # 更新统计信息
260
- self._intelligent_scheduler.update_stats(request)
261
-
262
- # 序列化处理(仅对 Redis 队列)
263
- if self._queue_type == QueueType.REDIS:
264
- request = self.request_serializer.prepare_for_serialization(request)
265
-
266
- # 背压控制(仅对内存队列)
267
- if self._queue_semaphore:
268
- # 对于大量请求,使用阻塞式等待而不是跳过
269
- # 这样可以确保不会丢失任何请求
270
- await self._queue_semaphore.acquire()
271
-
272
- # 统一的入队操作
273
- if hasattr(self._queue, 'put'):
274
- if self._queue_type == QueueType.REDIS:
275
- success = await self._queue.put(request, final_priority)
276
- else:
277
- # 对于内存队列,我们需要手动处理优先级
278
- # 在SpiderPriorityQueue中,元素应该是(priority, item)的元组
279
- await self._queue.put((final_priority, request))
280
- success = True
281
- else:
282
- raise RuntimeError(f"队列类型 {self._queue_type} 不支持 put 操作")
283
-
284
- if success:
285
- self.logger.debug(f"Request enqueued successfully: {request.url} with priority {final_priority}")
286
-
287
- return success
288
-
289
- except Exception as e:
290
- self.logger.error(f"Failed to enqueue request: {e}")
291
- if self._queue_semaphore:
292
- self._queue_semaphore.release()
293
- return False
294
-
295
- async def get(self, timeout: float = 5.0) -> Optional["Request"]:
296
- """Unified dequeue interface"""
297
- if not self._queue:
298
- raise RuntimeError("队列未初始化")
299
-
300
- try:
301
- request = await self._queue.get(timeout=timeout)
302
-
303
- # 释放信号量(仅对内存队列)
304
- if self._queue_semaphore and request:
305
- self._queue_semaphore.release()
306
-
307
- # 反序列化处理(仅对 Redis 队列)
308
- if request and self._queue_type == QueueType.REDIS:
309
- # 这里需要 spider 实例,暂时返回原始请求
310
- # 实际的 callback 恢复在 scheduler 中处理
311
- pass
312
-
313
- # 如果是内存队列,需要解包(priority, request)元组
314
- if request and self._queue_type == QueueType.MEMORY:
315
- if isinstance(request, tuple) and len(request) == 2:
316
- request = request[1] # 取元组中的请求对象
317
-
318
- return request
319
-
320
- except Exception as e:
321
- self.logger.error(f"Failed to dequeue request: {e}")
322
- return None
323
-
324
- async def size(self) -> int:
325
- """Get queue size"""
326
- if not self._queue:
327
- return 0
328
-
329
- try:
330
- if hasattr(self._queue, 'qsize'):
331
- if asyncio.iscoroutinefunction(self._queue.qsize):
332
- return await self._queue.qsize()
333
- else:
334
- return self._queue.qsize()
335
- return 0
336
- except Exception as e:
337
- self.logger.warning(f"Failed to get queue size: {e}")
338
- return 0
339
-
340
- def empty(self) -> bool:
341
- """Check if queue is empty (synchronous version, for compatibility)"""
342
- try:
343
- # 对于内存队列,可以同步检查
344
- if self._queue_type == QueueType.MEMORY:
345
- # 确保正确检查队列大小
346
- if hasattr(self._queue, 'qsize'):
347
- return self._queue.qsize() == 0
348
- else:
349
- # 如果没有qsize方法,假设队列为空
350
- return True
351
- # 对于 Redis 队列,由于需要异步操作,这里返回近似值
352
- # 为了确保程序能正常退出,我们返回True,让上层通过更精确的异步检查来判断
353
- return True
354
- except Exception:
355
- return True
356
-
357
- async def async_empty(self) -> bool:
358
- """Check if queue is empty (asynchronous version, more accurate)"""
359
- try:
360
- # 对于内存队列
361
- if self._queue_type == QueueType.MEMORY:
362
- # 确保正确检查队列大小
363
- if hasattr(self._queue, 'qsize'):
364
- if asyncio.iscoroutinefunction(self._queue.qsize):
365
- size = await self._queue.qsize()
366
- else:
367
- size = self._queue.qsize()
368
- return size == 0
369
- else:
370
- # 如果没有qsize方法,假设队列为空
371
- return True
372
- # 对于 Redis 队列,使用异步检查
373
- elif self._queue_type == QueueType.REDIS:
374
- size = await self.size()
375
- return size == 0
376
- return True
377
- except Exception:
378
- return True
379
-
380
- async def close(self) -> None:
381
- """Close queue"""
382
- if self._queue and hasattr(self._queue, 'close'):
383
- try:
384
- await self._queue.close()
385
- # Change INFO level log to DEBUG level to avoid redundant output
386
- self.logger.debug("Queue closed")
387
- except Exception as e:
388
- self.logger.warning(f"Error closing queue: {e}")
389
-
390
- def get_status(self) -> Dict[str, Any]:
391
- """Get queue status information"""
392
- return {
393
- "type": self._queue_type.value if self._queue_type else "unknown",
394
- "health": self._health_status,
395
- "config": self._get_queue_info(),
396
- "initialized": self._queue is not None
397
- }
398
-
399
- async def _determine_queue_type(self) -> QueueType:
400
- """Determine queue type"""
401
- if self.config.queue_type == QueueType.AUTO:
402
- # 自动选择:优先使用 Redis(如果可用)
403
- if REDIS_AVAILABLE and self.config.redis_url:
404
- # 测试 Redis 连接
405
- try:
406
- from crawlo.queue.redis_priority_queue import RedisPriorityQueue
407
- test_queue = RedisPriorityQueue(self.config.redis_url)
408
- await test_queue.connect()
409
- await test_queue.close()
410
- self.logger.debug("Auto-detection: Redis available, using distributed queue")
411
- return QueueType.REDIS
412
- except Exception as e:
413
- self.logger.debug(f"Auto-detection: Redis unavailable ({e}), using memory queue")
414
- return QueueType.MEMORY
415
- else:
416
- self.logger.debug("Auto-detection: Redis not configured, using memory queue")
417
- return QueueType.MEMORY
418
-
419
- elif self.config.queue_type == QueueType.REDIS:
420
- # Distributed 模式:必须使用 Redis,不允许降级
421
- if self.config.run_mode == 'distributed':
422
- # 分布式模式必须确保 Redis 可用
423
- if not REDIS_AVAILABLE:
424
- error_msg = (
425
- "Distributed 模式要求 Redis 可用,但 Redis 客户端库未安装。\n"
426
- "请安装 Redis 支持: pip install redis"
427
- )
428
- self.logger.error(error_msg)
429
- raise RuntimeError(error_msg)
430
-
431
- if not self.config.redis_url:
432
- error_msg = (
433
- "Distributed 模式要求配置 Redis 连接信息。\n"
434
- "请在 settings.py 中配置 REDIS_HOST、REDIS_PORT 等参数"
435
- )
436
- self.logger.error(error_msg)
437
- raise RuntimeError(error_msg)
438
-
439
- # 测试 Redis 连接
440
- try:
441
- from crawlo.queue.redis_priority_queue import RedisPriorityQueue
442
- test_queue = RedisPriorityQueue(self.config.redis_url)
443
- await test_queue.connect()
444
- await test_queue.close()
445
- self.logger.debug("Distributed mode: Redis connection verified")
446
- return QueueType.REDIS
447
- except Exception as e:
448
- error_msg = (
449
- f"Distributed 模式要求 Redis 可用,但无法连接到 Redis 服务器。\n"
450
- f"错误信息: {e}\n"
451
- f"Redis URL: {self.config.redis_url}\n"
452
- f"请检查:\n"
453
- f" 1. Redis 服务是否正在运行\n"
454
- f" 2. Redis 连接配置是否正确\n"
455
- f" 3. 网络连接是否正常"
456
- )
457
- self.logger.error(error_msg)
458
- raise RuntimeError(error_msg) from e
459
- else:
460
- # 非 distributed 模式:QUEUE_TYPE='redis' 时允许降级到 memory
461
- # 这提供了向后兼容性和更好的容错性
462
- if REDIS_AVAILABLE and self.config.redis_url:
463
- # 测试 Redis 连接
464
- try:
465
- from crawlo.queue.redis_priority_queue import RedisPriorityQueue
466
- test_queue = RedisPriorityQueue(self.config.redis_url)
467
- await test_queue.connect()
468
- await test_queue.close()
469
- self.logger.debug("Redis mode: Redis available, using distributed queue")
470
- return QueueType.REDIS
471
- except Exception as e:
472
- self.logger.warning(f"Redis mode: Redis unavailable ({e}), falling back to memory queue")
473
- return QueueType.MEMORY
474
- else:
475
- self.logger.warning("Redis mode: Redis not configured, falling back to memory queue")
476
- return QueueType.MEMORY
477
-
478
- elif self.config.queue_type == QueueType.MEMORY:
479
- return QueueType.MEMORY
480
-
481
- else:
482
- raise ValueError(f"不支持的队列类型: {self.config.queue_type}")
483
-
484
- async def _create_queue(self, queue_type: QueueType):
485
- """Create queue instance"""
486
- if queue_type == QueueType.REDIS:
487
- # 延迟导入Redis队列
488
- try:
489
- from crawlo.queue.redis_priority_queue import RedisPriorityQueue
490
- except ImportError as e:
491
- raise RuntimeError(f"Redis队列不可用:未能导入RedisPriorityQueue ({e})")
492
-
493
- # 修复项目名称提取逻辑,严格按照测试文件中的逻辑实现
494
- project_name = "default"
495
- if ':' in self.config.queue_name:
496
- parts = self.config.queue_name.split(':')
497
- if len(parts) >= 2:
498
- # 处理可能的双重 crawlo 前缀
499
- if parts[0] == "crawlo" and parts[1] == "crawlo":
500
- # 双重 crawlo 前缀,取"crawlo"作为项目名称
501
- project_name = "crawlo"
502
- elif parts[0] == "crawlo":
503
- # 正常的 crawlo 前缀,取第二个部分作为项目名称
504
- project_name = parts[1]
505
- else:
506
- # 没有 crawlo 前缀,使用第一个部分作为项目名称
507
- project_name = parts[0]
508
- else:
509
- project_name = self.config.queue_name or "default"
510
- else:
511
- project_name = self.config.queue_name or "default"
512
-
513
- queue = RedisPriorityQueue(
514
- redis_url=self.config.redis_url,
515
- queue_name=self.config.queue_name,
516
- max_retries=self.config.max_retries,
517
- timeout=self.config.timeout,
518
- module_name=project_name # 传递项目名称作为module_name
519
- )
520
- # 不需要立即连接,使用 lazy connect
521
- return queue
522
-
523
- elif queue_type == QueueType.MEMORY:
524
- queue = SpiderPriorityQueue()
525
- # 为内存队列设置背压控制
526
- self._queue_semaphore = asyncio.Semaphore(self.config.max_queue_size)
527
- return queue
528
-
529
- else:
530
- raise ValueError(f"不支持的队列类型: {queue_type}")
531
-
532
- async def _health_check(self) -> bool:
533
- """Health check"""
534
- try:
535
- if self._queue_type == QueueType.REDIS:
536
- # 测试 Redis 连接
537
- await self._queue.connect()
538
- self._health_status = "healthy"
539
- else:
540
- # 内存队列总是健康的
541
- self._health_status = "healthy"
542
- return False # 内存队列不需要更新配置
543
- except Exception as e:
544
- self.logger.warning(f"Queue health check failed: {e}")
545
- self._health_status = "unhealthy"
546
-
547
- # Distributed 模式下 Redis 健康检查失败应该报错
548
- if self.config.run_mode == 'distributed':
549
- error_msg = (
550
- f"Distributed 模式下 Redis 健康检查失败。\n"
551
- f"错误信息: {e}\n"
552
- f"Redis URL: {self.config.redis_url}\n"
553
- f"分布式模式不允许降级到内存队列,请修复 Redis 连接问题。"
554
- )
555
- self.logger.error(error_msg)
556
- raise RuntimeError(error_msg) from e
557
-
558
- # 非 Distributed 模式:如果是Redis队列且健康检查失败,尝试切换到内存队列
559
- # 对于 AUTO 模式允许回退
560
- if self._queue_type == QueueType.REDIS and self.config.queue_type == QueueType.AUTO:
561
- self.logger.info("Redis queue unavailable, attempting to switch to memory queue...")
562
- try:
563
- await self._queue.close()
564
- except:
565
- pass
566
- self._queue = None
567
- # 重新创建内存队列
568
- self._queue = await self._create_queue(QueueType.MEMORY)
569
- self._queue_type = QueueType.MEMORY
570
- self._queue_semaphore = asyncio.Semaphore(self.config.max_queue_size)
571
- self._health_status = "healthy"
572
- self.logger.info("Switched to memory queue")
573
- # 返回一个信号,表示需要更新过滤器和去重管道配置
574
- return True
575
- return False
576
-
577
- def _get_queue_info(self) -> Dict[str, Any]:
578
- """Get queue configuration information"""
579
- info = {
580
- "queue_name": self.config.queue_name,
581
- "max_queue_size": self.config.max_queue_size
582
- }
583
-
584
- if self._queue_type == QueueType.REDIS:
585
- info.update({
586
- "redis_url": self.config.redis_url,
587
- "max_retries": self.config.max_retries,
588
- "timeout": self.config.timeout
589
- })
590
-
591
- return info
1
+ #!/usr/bin/python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ 统一的队列管理器
5
+ 提供简洁、一致的队列接口,自动处理不同队列类型的差异
6
+ """
7
+ import asyncio
8
+ import time
9
+ import traceback
10
+ from enum import Enum
11
+ from typing import Optional, Dict, Any, Union, TYPE_CHECKING
12
+
13
+ if TYPE_CHECKING:
14
+ from crawlo import Request
15
+
16
+ from crawlo.queue.pqueue import SpiderPriorityQueue
17
+ from crawlo.utils.error_handler import ErrorHandler
18
+ from crawlo.logging import get_logger
19
+ from crawlo.utils.request_serializer import RequestSerializer
20
+
21
+ try:
22
+ # 使用完整版Redis队列
23
+ from crawlo.queue.redis_priority_queue import RedisPriorityQueue
24
+
25
+ REDIS_AVAILABLE = True
26
+ except ImportError:
27
+ RedisPriorityQueue = None
28
+ REDIS_AVAILABLE = False
29
+
30
+
31
+ class QueueType(Enum):
32
+ """Queue type enumeration"""
33
+ MEMORY = "memory"
34
+ REDIS = "redis"
35
+ AUTO = "auto" # 自动选择
36
+
37
+
38
+ class IntelligentScheduler:
39
+ """智能调度器"""
40
+
41
+ def __init__(self):
42
+ self.domain_stats = {} # 域名统计信息
43
+ self.url_stats = {} # URL统计信息
44
+ self.last_request_time = {} # 最后请求时间
45
+
46
+ def calculate_priority(self, request: "Request") -> int:
47
+ """计算请求的智能优先级"""
48
+ priority = getattr(request, 'priority', 0)
49
+
50
+ # 获取域名
51
+ domain = self._extract_domain(request.url)
52
+
53
+ # 基于域名访问频率调整优先级
54
+ if domain in self.domain_stats:
55
+ domain_access_count = self.domain_stats[domain]['count']
56
+ last_access_time = self.domain_stats[domain]['last_time']
57
+
58
+ # 如果最近访问过该域名,降低优先级(避免过度集中访问同一域名)
59
+ time_since_last = time.time() - last_access_time
60
+ if time_since_last < 5: # 5秒内访问过
61
+ priority -= 2
62
+ elif time_since_last < 30: # 30秒内访问过
63
+ priority -= 1
64
+
65
+ # 如果该域名访问次数过多,进一步降低优先级
66
+ if domain_access_count > 10:
67
+ priority -= 1
68
+
69
+ # 基于URL访问历史调整优先级
70
+ if request.url in self.url_stats:
71
+ url_access_count = self.url_stats[request.url]
72
+ if url_access_count > 1:
73
+ # 重复URL降低优先级
74
+ priority -= url_access_count
75
+
76
+ # 基于深度调整优先级
77
+ depth = getattr(request, 'meta', {}).get('depth', 0)
78
+ priority -= depth # 深度越大,优先级越低
79
+
80
+ return priority
81
+
82
+ def update_stats(self, request: "Request"):
83
+ """更新统计信息"""
84
+ domain = self._extract_domain(request.url)
85
+
86
+ # 更新域名统计
87
+ if domain not in self.domain_stats:
88
+ self.domain_stats[domain] = {'count': 0, 'last_time': 0}
89
+
90
+ self.domain_stats[domain]['count'] += 1
91
+ self.domain_stats[domain]['last_time'] = time.time()
92
+
93
+ # 更新URL统计
94
+ if request.url not in self.url_stats:
95
+ self.url_stats[request.url] = 0
96
+ self.url_stats[request.url] += 1
97
+
98
+ # 更新最后请求时间
99
+ self.last_request_time[domain] = time.time()
100
+
101
+ def _extract_domain(self, url: str) -> str:
102
+ """提取域名"""
103
+ try:
104
+ from urllib.parse import urlparse
105
+ parsed = urlparse(url)
106
+ return parsed.netloc
107
+ except:
108
+ return "unknown"
109
+
110
+
111
+ class QueueConfig:
112
+ """Queue configuration class"""
113
+
114
+ def __init__(
115
+ self,
116
+ queue_type: Union[QueueType, str] = QueueType.AUTO,
117
+ redis_url: Optional[str] = None,
118
+ redis_host: str = "127.0.0.1",
119
+ redis_port: int = 6379,
120
+ redis_password: Optional[str] = None,
121
+ redis_db: int = 0,
122
+ queue_name: str = "crawlo:requests",
123
+ max_queue_size: int = 1000,
124
+ max_retries: int = 3,
125
+ timeout: int = 300,
126
+ run_mode: Optional[str] = None, # 新增:运行模式
127
+ **kwargs
128
+ ):
129
+ self.queue_type = QueueType(queue_type) if isinstance(queue_type, str) else queue_type
130
+ self.run_mode = run_mode # 保存运行模式
131
+
132
+ # Redis 配置
133
+ if redis_url:
134
+ self.redis_url = redis_url
135
+ else:
136
+ if redis_password:
137
+ self.redis_url = f"redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}"
138
+ else:
139
+ self.redis_url = f"redis://{redis_host}:{redis_port}/{redis_db}"
140
+
141
+ self.queue_name = queue_name
142
+ self.max_queue_size = max_queue_size
143
+ self.max_retries = max_retries
144
+ self.timeout = timeout
145
+ self.extra_config = kwargs
146
+
147
+ @classmethod
148
+ def from_settings(cls, settings) -> 'QueueConfig':
149
+ """Create configuration from settings"""
150
+ # 获取项目名称,用于生成默认队列名称
151
+ project_name = settings.get('PROJECT_NAME', 'default')
152
+ default_queue_name = f"crawlo:{project_name}:queue:requests"
153
+
154
+ # 如果设置了SCHEDULER_QUEUE_NAME,则使用该值,否则使用基于项目名称的默认值
155
+ scheduler_queue_name = settings.get('SCHEDULER_QUEUE_NAME')
156
+ if scheduler_queue_name is not None:
157
+ queue_name = scheduler_queue_name
158
+ else:
159
+ queue_name = default_queue_name
160
+
161
+ return cls(
162
+ queue_type=settings.get('QUEUE_TYPE', QueueType.AUTO),
163
+ redis_url=settings.get('REDIS_URL'),
164
+ redis_host=settings.get('REDIS_HOST', '127.0.0.1'),
165
+ redis_port=settings.get_int('REDIS_PORT', 6379),
166
+ redis_password=settings.get('REDIS_PASSWORD'),
167
+ redis_db=settings.get_int('REDIS_DB', 0),
168
+ queue_name=queue_name,
169
+ max_queue_size=settings.get_int('SCHEDULER_MAX_QUEUE_SIZE', 1000),
170
+ max_retries=settings.get_int('QUEUE_MAX_RETRIES', 3),
171
+ timeout=settings.get_int('QUEUE_TIMEOUT', 300),
172
+ run_mode=settings.get('RUN_MODE') # 传递运行模式
173
+ )
174
+
175
+
176
+ class QueueManager:
177
+ """Unified queue manager"""
178
+
179
+ def __init__(self, config: QueueConfig):
180
+ self.config = config
181
+ # 延迟初始化logger和error_handler避免循环依赖
182
+ self._logger = None
183
+ self._error_handler = None
184
+ self.request_serializer = RequestSerializer()
185
+ self._queue = None
186
+ self._queue_semaphore = None
187
+ self._queue_type = None
188
+ self._health_status = "unknown"
189
+ self._intelligent_scheduler = IntelligentScheduler() # 智能调度器
190
+
191
+ @property
192
+ def logger(self):
193
+ if self._logger is None:
194
+ self._logger = get_logger(self.__class__.__name__)
195
+ return self._logger
196
+
197
+ @property
198
+ def error_handler(self):
199
+ if self._error_handler is None:
200
+ self._error_handler = ErrorHandler(self.__class__.__name__)
201
+ return self._error_handler
202
+
203
+ async def initialize(self) -> bool:
204
+ """初始化队列"""
205
+ try:
206
+ queue_type = await self._determine_queue_type()
207
+ self._queue = await self._create_queue(queue_type)
208
+ self._queue_type = queue_type
209
+
210
+ # 测试队列健康状态
211
+ health_check_result = await self._health_check()
212
+
213
+ self.logger.info(f"Queue initialized successfully Type: {queue_type.value}")
214
+ # 只在调试模式下输出详细配置信息
215
+ self.logger.debug(f"Queue configuration: {self._get_queue_info()}")
216
+
217
+ # 如果健康检查返回True,表示队列类型发生了切换,需要更新配置
218
+ if health_check_result:
219
+ return True
220
+
221
+ # 如果队列类型是Redis,检查是否需要更新配置
222
+ if queue_type == QueueType.REDIS:
223
+ # 这个检查需要在调度器中进行,因为队列管理器无法访问crawler.settings
224
+ # 但我们不需要总是返回True,只有在确实需要更新时才返回True
225
+ # 调度器会进行更详细的检查
226
+ pass
227
+
228
+ return False # 默认不需要更新配置
229
+
230
+ except RuntimeError as e:
231
+ # Distributed 模式下的 RuntimeError 必须重新抛出
232
+ if self.config.run_mode == 'distributed':
233
+ self.logger.error(f"Queue initialization failed: {e}")
234
+ self._health_status = "error"
235
+ raise # 重新抛出异常
236
+ # 其他模式记录错误但不抛出
237
+ self.logger.error(f"Queue initialization failed: {e}")
238
+ self.logger.debug(f"详细错误信息:\n{traceback.format_exc()}")
239
+ self._health_status = "error"
240
+ return False
241
+ except Exception as e:
242
+ # 记录详细的错误信息和堆栈跟踪
243
+ self.logger.error(f"Queue initialization failed: {e}")
244
+ self.logger.debug(f"详细错误信息:\n{traceback.format_exc()}")
245
+ self._health_status = "error"
246
+ return False
247
+
248
+ async def put(self, request: "Request", priority: int = 0) -> bool:
249
+ """Unified enqueue interface"""
250
+ if not self._queue:
251
+ raise RuntimeError("队列未初始化")
252
+
253
+ try:
254
+ # 应用智能调度算法计算优先级
255
+ intelligent_priority = self._intelligent_scheduler.calculate_priority(request)
256
+ # 结合原始优先级和智能优先级
257
+ final_priority = priority + intelligent_priority
258
+
259
+ # 更新统计信息
260
+ self._intelligent_scheduler.update_stats(request)
261
+
262
+ # 序列化处理(仅对 Redis 队列)
263
+ if self._queue_type == QueueType.REDIS:
264
+ request = self.request_serializer.prepare_for_serialization(request)
265
+
266
+ # 背压控制(仅对内存队列)
267
+ if self._queue_semaphore:
268
+ # 对于大量请求,使用阻塞式等待而不是跳过
269
+ # 这样可以确保不会丢失任何请求
270
+ await self._queue_semaphore.acquire()
271
+
272
+ # 统一的入队操作
273
+ if hasattr(self._queue, 'put'):
274
+ if self._queue_type == QueueType.REDIS:
275
+ success = await self._queue.put(request, final_priority)
276
+ else:
277
+ # 对于内存队列,我们需要手动处理优先级
278
+ # 在SpiderPriorityQueue中,元素应该是(priority, item)的元组
279
+ await self._queue.put((final_priority, request))
280
+ success = True
281
+ else:
282
+ raise RuntimeError(f"队列类型 {self._queue_type} 不支持 put 操作")
283
+
284
+ if success:
285
+ self.logger.debug(f"Request enqueued successfully: {request.url} with priority {final_priority}")
286
+
287
+ return success
288
+
289
+ except Exception as e:
290
+ self.logger.error(f"Failed to enqueue request: {e}")
291
+ if self._queue_semaphore:
292
+ self._queue_semaphore.release()
293
+ return False
294
+
295
+ async def get(self, timeout: float = 5.0) -> Optional["Request"]:
296
+ """Unified dequeue interface"""
297
+ if not self._queue:
298
+ raise RuntimeError("队列未初始化")
299
+
300
+ try:
301
+ request = await self._queue.get(timeout=timeout)
302
+
303
+ # 释放信号量(仅对内存队列)
304
+ if self._queue_semaphore and request:
305
+ self._queue_semaphore.release()
306
+
307
+ # 反序列化处理(仅对 Redis 队列)
308
+ if request and self._queue_type == QueueType.REDIS:
309
+ # 这里需要 spider 实例,暂时返回原始请求
310
+ # 实际的 callback 恢复在 scheduler 中处理
311
+ pass
312
+
313
+ # 如果是内存队列,需要解包(priority, request)元组
314
+ if request and self._queue_type == QueueType.MEMORY:
315
+ if isinstance(request, tuple) and len(request) == 2:
316
+ request = request[1] # 取元组中的请求对象
317
+
318
+ return request
319
+
320
+ except Exception as e:
321
+ self.logger.error(f"Failed to dequeue request: {e}")
322
+ return None
323
+
324
+ async def size(self) -> int:
325
+ """Get queue size"""
326
+ if not self._queue:
327
+ return 0
328
+
329
+ try:
330
+ if hasattr(self._queue, 'qsize'):
331
+ if asyncio.iscoroutinefunction(self._queue.qsize):
332
+ return await self._queue.qsize()
333
+ else:
334
+ return self._queue.qsize()
335
+ return 0
336
+ except Exception as e:
337
+ self.logger.warning(f"Failed to get queue size: {e}")
338
+ return 0
339
+
340
+ def empty(self) -> bool:
341
+ """Check if queue is empty (synchronous version, for compatibility)"""
342
+ try:
343
+ # 对于内存队列,可以同步检查
344
+ if self._queue_type == QueueType.MEMORY:
345
+ # 确保正确检查队列大小
346
+ if hasattr(self._queue, 'qsize'):
347
+ return self._queue.qsize() == 0
348
+ else:
349
+ # 如果没有qsize方法,假设队列为空
350
+ return True
351
+ # 对于 Redis 队列,由于需要异步操作,这里返回近似值
352
+ # 为了确保程序能正常退出,我们返回True,让上层通过更精确的异步检查来判断
353
+ return True
354
+ except Exception:
355
+ return True
356
+
357
+ async def async_empty(self) -> bool:
358
+ """Check if queue is empty (asynchronous version, more accurate)"""
359
+ try:
360
+ # 对于内存队列
361
+ if self._queue_type == QueueType.MEMORY:
362
+ # 确保正确检查队列大小
363
+ if hasattr(self._queue, 'qsize'):
364
+ if asyncio.iscoroutinefunction(self._queue.qsize):
365
+ size = await self._queue.qsize()
366
+ else:
367
+ size = self._queue.qsize()
368
+ return size == 0
369
+ else:
370
+ # 如果没有qsize方法,假设队列为空
371
+ return True
372
+ # 对于 Redis 队列,使用异步检查
373
+ elif self._queue_type == QueueType.REDIS:
374
+ size = await self.size()
375
+ return size == 0
376
+ return True
377
+ except Exception:
378
+ return True
379
+
380
+ async def close(self) -> None:
381
+ """Close queue"""
382
+ if self._queue and hasattr(self._queue, 'close'):
383
+ try:
384
+ await self._queue.close()
385
+ # Change INFO level log to DEBUG level to avoid redundant output
386
+ self.logger.debug("Queue closed")
387
+ except Exception as e:
388
+ self.logger.warning(f"Error closing queue: {e}")
389
+
390
+ def get_status(self) -> Dict[str, Any]:
391
+ """Get queue status information"""
392
+ return {
393
+ "type": self._queue_type.value if self._queue_type else "unknown",
394
+ "health": self._health_status,
395
+ "config": self._get_queue_info(),
396
+ "initialized": self._queue is not None
397
+ }
398
+
399
+ async def _determine_queue_type(self) -> QueueType:
400
+ """Determine queue type"""
401
+ if self.config.queue_type == QueueType.AUTO:
402
+ # 自动选择:优先使用 Redis(如果可用)
403
+ if REDIS_AVAILABLE and self.config.redis_url:
404
+ # 测试 Redis 连接
405
+ try:
406
+ from crawlo.queue.redis_priority_queue import RedisPriorityQueue
407
+ test_queue = RedisPriorityQueue(self.config.redis_url)
408
+ await test_queue.connect()
409
+ await test_queue.close()
410
+ self.logger.debug("Auto-detection: Redis available, using distributed queue")
411
+ return QueueType.REDIS
412
+ except Exception as e:
413
+ self.logger.debug(f"Auto-detection: Redis unavailable ({e}), using memory queue")
414
+ return QueueType.MEMORY
415
+ else:
416
+ self.logger.debug("Auto-detection: Redis not configured, using memory queue")
417
+ return QueueType.MEMORY
418
+
419
+ elif self.config.queue_type == QueueType.REDIS:
420
+ # Distributed 模式:必须使用 Redis,不允许降级
421
+ if self.config.run_mode == 'distributed':
422
+ # 分布式模式必须确保 Redis 可用
423
+ if not REDIS_AVAILABLE:
424
+ error_msg = (
425
+ "Distributed 模式要求 Redis 可用,但 Redis 客户端库未安装。\n"
426
+ "请安装 Redis 支持: pip install redis"
427
+ )
428
+ self.logger.error(error_msg)
429
+ raise RuntimeError(error_msg)
430
+
431
+ if not self.config.redis_url:
432
+ error_msg = (
433
+ "Distributed 模式要求配置 Redis 连接信息。\n"
434
+ "请在 settings.py 中配置 REDIS_HOST、REDIS_PORT 等参数"
435
+ )
436
+ self.logger.error(error_msg)
437
+ raise RuntimeError(error_msg)
438
+
439
+ # 测试 Redis 连接
440
+ try:
441
+ from crawlo.queue.redis_priority_queue import RedisPriorityQueue
442
+ test_queue = RedisPriorityQueue(self.config.redis_url)
443
+ await test_queue.connect()
444
+ await test_queue.close()
445
+ self.logger.debug("Distributed mode: Redis connection verified")
446
+ return QueueType.REDIS
447
+ except Exception as e:
448
+ error_msg = (
449
+ f"Distributed 模式要求 Redis 可用,但无法连接到 Redis 服务器。\n"
450
+ f"错误信息: {e}\n"
451
+ f"Redis URL: {self.config.redis_url}\n"
452
+ f"请检查:\n"
453
+ f" 1. Redis 服务是否正在运行\n"
454
+ f" 2. Redis 连接配置是否正确\n"
455
+ f" 3. 网络连接是否正常"
456
+ )
457
+ self.logger.error(error_msg)
458
+ raise RuntimeError(error_msg) from e
459
+ else:
460
+ # 非 distributed 模式:QUEUE_TYPE='redis' 时允许降级到 memory
461
+ # 这提供了向后兼容性和更好的容错性
462
+ if REDIS_AVAILABLE and self.config.redis_url:
463
+ # 测试 Redis 连接
464
+ try:
465
+ from crawlo.queue.redis_priority_queue import RedisPriorityQueue
466
+ test_queue = RedisPriorityQueue(self.config.redis_url)
467
+ await test_queue.connect()
468
+ await test_queue.close()
469
+ self.logger.debug("Redis mode: Redis available, using distributed queue")
470
+ return QueueType.REDIS
471
+ except Exception as e:
472
+ self.logger.warning(f"Redis mode: Redis unavailable ({e}), falling back to memory queue")
473
+ return QueueType.MEMORY
474
+ else:
475
+ self.logger.warning("Redis mode: Redis not configured, falling back to memory queue")
476
+ return QueueType.MEMORY
477
+
478
+ elif self.config.queue_type == QueueType.MEMORY:
479
+ return QueueType.MEMORY
480
+
481
+ else:
482
+ raise ValueError(f"不支持的队列类型: {self.config.queue_type}")
483
+
484
+ async def _create_queue(self, queue_type: QueueType):
485
+ """Create queue instance"""
486
+ if queue_type == QueueType.REDIS:
487
+ # 延迟导入Redis队列
488
+ try:
489
+ from crawlo.queue.redis_priority_queue import RedisPriorityQueue
490
+ except ImportError as e:
491
+ raise RuntimeError(f"Redis队列不可用:未能导入RedisPriorityQueue ({e})")
492
+
493
+ # 修复项目名称提取逻辑,严格按照测试文件中的逻辑实现
494
+ project_name = "default"
495
+ if ':' in self.config.queue_name:
496
+ parts = self.config.queue_name.split(':')
497
+ if len(parts) >= 2:
498
+ # 处理可能的双重 crawlo 前缀
499
+ if parts[0] == "crawlo" and parts[1] == "crawlo":
500
+ # 双重 crawlo 前缀,取"crawlo"作为项目名称
501
+ project_name = "crawlo"
502
+ elif parts[0] == "crawlo":
503
+ # 正常的 crawlo 前缀,取第二个部分作为项目名称
504
+ project_name = parts[1]
505
+ else:
506
+ # 没有 crawlo 前缀,使用第一个部分作为项目名称
507
+ project_name = parts[0]
508
+ else:
509
+ project_name = self.config.queue_name or "default"
510
+ else:
511
+ project_name = self.config.queue_name or "default"
512
+
513
+ queue = RedisPriorityQueue(
514
+ redis_url=self.config.redis_url,
515
+ queue_name=self.config.queue_name,
516
+ max_retries=self.config.max_retries,
517
+ timeout=self.config.timeout,
518
+ module_name=project_name # 传递项目名称作为module_name
519
+ )
520
+ # 不需要立即连接,使用 lazy connect
521
+ return queue
522
+
523
+ elif queue_type == QueueType.MEMORY:
524
+ queue = SpiderPriorityQueue()
525
+ # 为内存队列设置背压控制
526
+ self._queue_semaphore = asyncio.Semaphore(self.config.max_queue_size)
527
+ return queue
528
+
529
+ else:
530
+ raise ValueError(f"不支持的队列类型: {queue_type}")
531
+
532
+ async def _health_check(self) -> bool:
533
+ """Health check"""
534
+ try:
535
+ if self._queue_type == QueueType.REDIS:
536
+ # 测试 Redis 连接
537
+ await self._queue.connect()
538
+ self._health_status = "healthy"
539
+ else:
540
+ # 内存队列总是健康的
541
+ self._health_status = "healthy"
542
+ return False # 内存队列不需要更新配置
543
+ except Exception as e:
544
+ self.logger.warning(f"Queue health check failed: {e}")
545
+ self._health_status = "unhealthy"
546
+
547
+ # Distributed 模式下 Redis 健康检查失败应该报错
548
+ if self.config.run_mode == 'distributed':
549
+ error_msg = (
550
+ f"Distributed 模式下 Redis 健康检查失败。\n"
551
+ f"错误信息: {e}\n"
552
+ f"Redis URL: {self.config.redis_url}\n"
553
+ f"分布式模式不允许降级到内存队列,请修复 Redis 连接问题。"
554
+ )
555
+ self.logger.error(error_msg)
556
+ raise RuntimeError(error_msg) from e
557
+
558
+ # 非 Distributed 模式:如果是Redis队列且健康检查失败,尝试切换到内存队列
559
+ # 对于 AUTO 模式允许回退
560
+ if self._queue_type == QueueType.REDIS and self.config.queue_type == QueueType.AUTO:
561
+ self.logger.info("Redis queue unavailable, attempting to switch to memory queue...")
562
+ try:
563
+ await self._queue.close()
564
+ except:
565
+ pass
566
+ self._queue = None
567
+ # 重新创建内存队列
568
+ self._queue = await self._create_queue(QueueType.MEMORY)
569
+ self._queue_type = QueueType.MEMORY
570
+ self._queue_semaphore = asyncio.Semaphore(self.config.max_queue_size)
571
+ self._health_status = "healthy"
572
+ self.logger.info("Switched to memory queue")
573
+ # 返回一个信号,表示需要更新过滤器和去重管道配置
574
+ return True
575
+ return False
576
+
577
+ def _get_queue_info(self) -> Dict[str, Any]:
578
+ """Get queue configuration information"""
579
+ info = {
580
+ "queue_name": self.config.queue_name,
581
+ "max_queue_size": self.config.max_queue_size
582
+ }
583
+
584
+ if self._queue_type == QueueType.REDIS:
585
+ info.update({
586
+ "redis_url": self.config.redis_url,
587
+ "max_retries": self.config.max_retries,
588
+ "timeout": self.config.timeout
589
+ })
590
+
591
+ return info
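
For orientation, the following is a minimal usage sketch of the QueueManager API that appears in the diff above (QueueConfig, initialize, put, get, size, get_status, close). It is illustrative only and not part of either wheel; the Request(url=...) constructor call and the demo queue name are assumptions, since this module only imports Request for type checking.

import asyncio

from crawlo import Request  # assumed constructor: Request(url=...)
from crawlo.queue.queue_manager import QueueConfig, QueueManager, QueueType


async def main():
    # Memory-backed queue; QueueType.AUTO prefers Redis when the client
    # library is installed and the configured redis_url is reachable.
    config = QueueConfig(
        queue_type=QueueType.MEMORY,
        queue_name="crawlo:demo:queue:requests",  # hypothetical name
        max_queue_size=100,
    )
    manager = QueueManager(config)

    # initialize() returns True only when the health check switched the
    # queue type (e.g. Redis falling back to memory under AUTO).
    await manager.initialize()

    await manager.put(Request(url="https://example.com"), priority=0)
    print(await manager.size())   # 1
    print(manager.get_status())   # {'type': 'memory', 'health': 'healthy', ...}

    request = await manager.get(timeout=5.0)
    print(request.url if request else None)

    await manager.close()


asyncio.run(main())

With QUEUE_TYPE set to redis, or in distributed run mode, the same put()/get() calls route to RedisPriorityQueue instead, and each request is passed through RequestSerializer.prepare_for_serialization() before being enqueued.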