crawlo-1.3.5-py3-none-any.whl → crawlo-1.3.6-py3-none-any.whl

This diff shows the content of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.

Potentially problematic release.

Files changed (289)
  1. crawlo/__init__.py +87 -87
  2. crawlo/__version__.py +1 -1
  3. crawlo/cli.py +75 -75
  4. crawlo/commands/__init__.py +14 -14
  5. crawlo/commands/check.py +594 -594
  6. crawlo/commands/genspider.py +151 -151
  7. crawlo/commands/help.py +138 -138
  8. crawlo/commands/list.py +155 -155
  9. crawlo/commands/run.py +341 -341
  10. crawlo/commands/startproject.py +436 -436
  11. crawlo/commands/stats.py +187 -187
  12. crawlo/commands/utils.py +196 -196
  13. crawlo/config.py +312 -312
  14. crawlo/config_validator.py +277 -277
  15. crawlo/core/__init__.py +45 -45
  16. crawlo/core/engine.py +439 -439
  17. crawlo/core/processor.py +40 -40
  18. crawlo/core/scheduler.py +257 -257
  19. crawlo/crawler.py +638 -638
  20. crawlo/data/__init__.py +5 -5
  21. crawlo/data/user_agents.py +194 -194
  22. crawlo/downloader/__init__.py +273 -273
  23. crawlo/downloader/aiohttp_downloader.py +228 -228
  24. crawlo/downloader/cffi_downloader.py +245 -245
  25. crawlo/downloader/httpx_downloader.py +259 -259
  26. crawlo/downloader/hybrid_downloader.py +212 -212
  27. crawlo/downloader/playwright_downloader.py +402 -402
  28. crawlo/downloader/selenium_downloader.py +472 -472
  29. crawlo/event.py +11 -11
  30. crawlo/exceptions.py +81 -81
  31. crawlo/extension/__init__.py +39 -39
  32. crawlo/extension/health_check.py +141 -141
  33. crawlo/extension/log_interval.py +57 -57
  34. crawlo/extension/log_stats.py +81 -81
  35. crawlo/extension/logging_extension.py +61 -61
  36. crawlo/extension/memory_monitor.py +104 -104
  37. crawlo/extension/performance_profiler.py +133 -133
  38. crawlo/extension/request_recorder.py +107 -107
  39. crawlo/factories/__init__.py +27 -27
  40. crawlo/factories/base.py +68 -68
  41. crawlo/factories/crawler.py +103 -103
  42. crawlo/factories/registry.py +84 -84
  43. crawlo/filters/__init__.py +154 -154
  44. crawlo/filters/aioredis_filter.py +257 -257
  45. crawlo/filters/memory_filter.py +269 -269
  46. crawlo/framework.py +292 -291
  47. crawlo/initialization/__init__.py +39 -39
  48. crawlo/initialization/built_in.py +425 -425
  49. crawlo/initialization/context.py +141 -141
  50. crawlo/initialization/core.py +193 -193
  51. crawlo/initialization/phases.py +148 -148
  52. crawlo/initialization/registry.py +145 -145
  53. crawlo/items/__init__.py +23 -23
  54. crawlo/items/base.py +23 -23
  55. crawlo/items/fields.py +52 -52
  56. crawlo/items/items.py +104 -104
  57. crawlo/logging/__init__.py +37 -37
  58. crawlo/logging/config.py +96 -96
  59. crawlo/logging/factory.py +128 -128
  60. crawlo/logging/manager.py +111 -111
  61. crawlo/middleware/__init__.py +21 -21
  62. crawlo/middleware/default_header.py +132 -132
  63. crawlo/middleware/download_delay.py +104 -104
  64. crawlo/middleware/middleware_manager.py +135 -135
  65. crawlo/middleware/offsite.py +123 -123
  66. crawlo/middleware/proxy.py +386 -386
  67. crawlo/middleware/request_ignore.py +86 -86
  68. crawlo/middleware/response_code.py +163 -163
  69. crawlo/middleware/response_filter.py +136 -136
  70. crawlo/middleware/retry.py +124 -124
  71. crawlo/middleware/simple_proxy.py +65 -65
  72. crawlo/mode_manager.py +212 -212
  73. crawlo/network/__init__.py +21 -21
  74. crawlo/network/request.py +379 -379
  75. crawlo/network/response.py +359 -359
  76. crawlo/pipelines/__init__.py +21 -21
  77. crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
  78. crawlo/pipelines/console_pipeline.py +39 -39
  79. crawlo/pipelines/csv_pipeline.py +316 -316
  80. crawlo/pipelines/database_dedup_pipeline.py +222 -222
  81. crawlo/pipelines/json_pipeline.py +218 -218
  82. crawlo/pipelines/memory_dedup_pipeline.py +115 -115
  83. crawlo/pipelines/mongo_pipeline.py +131 -131
  84. crawlo/pipelines/mysql_pipeline.py +325 -325
  85. crawlo/pipelines/pipeline_manager.py +76 -76
  86. crawlo/pipelines/redis_dedup_pipeline.py +166 -166
  87. crawlo/project.py +327 -327
  88. crawlo/queue/pqueue.py +42 -42
  89. crawlo/queue/queue_manager.py +503 -503
  90. crawlo/queue/redis_priority_queue.py +326 -326
  91. crawlo/settings/__init__.py +7 -7
  92. crawlo/settings/default_settings.py +321 -321
  93. crawlo/settings/setting_manager.py +214 -214
  94. crawlo/spider/__init__.py +657 -657
  95. crawlo/stats_collector.py +73 -73
  96. crawlo/subscriber.py +129 -129
  97. crawlo/task_manager.py +138 -138
  98. crawlo/templates/crawlo.cfg.tmpl +10 -10
  99. crawlo/templates/project/__init__.py.tmpl +3 -3
  100. crawlo/templates/project/items.py.tmpl +17 -17
  101. crawlo/templates/project/middlewares.py.tmpl +118 -118
  102. crawlo/templates/project/pipelines.py.tmpl +96 -96
  103. crawlo/templates/project/settings.py.tmpl +167 -167
  104. crawlo/templates/project/settings_distributed.py.tmpl +166 -166
  105. crawlo/templates/project/settings_gentle.py.tmpl +166 -166
  106. crawlo/templates/project/settings_high_performance.py.tmpl +167 -167
  107. crawlo/templates/project/settings_minimal.py.tmpl +65 -65
  108. crawlo/templates/project/settings_simple.py.tmpl +164 -164
  109. crawlo/templates/project/spiders/__init__.py.tmpl +9 -9
  110. crawlo/templates/run.py.tmpl +34 -34
  111. crawlo/templates/spider/spider.py.tmpl +143 -143
  112. crawlo/templates/spiders_init.py.tmpl +9 -9
  113. crawlo/tools/__init__.py +200 -200
  114. crawlo/tools/anti_crawler.py +268 -268
  115. crawlo/tools/authenticated_proxy.py +240 -240
  116. crawlo/tools/data_formatter.py +225 -225
  117. crawlo/tools/data_validator.py +180 -180
  118. crawlo/tools/date_tools.py +289 -289
  119. crawlo/tools/distributed_coordinator.py +388 -388
  120. crawlo/tools/encoding_converter.py +127 -127
  121. crawlo/tools/network_diagnostic.py +364 -364
  122. crawlo/tools/request_tools.py +82 -82
  123. crawlo/tools/retry_mechanism.py +224 -224
  124. crawlo/tools/scenario_adapter.py +262 -262
  125. crawlo/tools/text_cleaner.py +232 -232
  126. crawlo/utils/__init__.py +34 -34
  127. crawlo/utils/batch_processor.py +259 -259
  128. crawlo/utils/class_loader.py +25 -25
  129. crawlo/utils/controlled_spider_mixin.py +439 -439
  130. crawlo/utils/db_helper.py +343 -343
  131. crawlo/utils/enhanced_error_handler.py +356 -356
  132. crawlo/utils/env_config.py +142 -142
  133. crawlo/utils/error_handler.py +165 -165
  134. crawlo/utils/func_tools.py +82 -82
  135. crawlo/utils/large_scale_config.py +286 -286
  136. crawlo/utils/large_scale_helper.py +344 -344
  137. crawlo/utils/log.py +79 -79
  138. crawlo/utils/performance_monitor.py +285 -285
  139. crawlo/utils/queue_helper.py +175 -175
  140. crawlo/utils/redis_connection_pool.py +388 -388
  141. crawlo/utils/redis_key_validator.py +198 -198
  142. crawlo/utils/request.py +267 -267
  143. crawlo/utils/request_serializer.py +225 -225
  144. crawlo/utils/spider_loader.py +61 -61
  145. crawlo/utils/system.py +11 -11
  146. crawlo/utils/tools.py +4 -4
  147. crawlo/utils/url.py +39 -39
  148. {crawlo-1.3.5.dist-info → crawlo-1.3.6.dist-info}/METADATA +1126 -1126
  149. crawlo-1.3.6.dist-info/RECORD +290 -0
  150. examples/__init__.py +7 -7
  151. tests/__init__.py +7 -7
  152. tests/advanced_tools_example.py +275 -275
  153. tests/authenticated_proxy_example.py +106 -106
  154. tests/baidu_performance_test.py +108 -108
  155. tests/baidu_test.py +59 -59
  156. tests/cleaners_example.py +160 -160
  157. tests/comprehensive_framework_test.py +212 -212
  158. tests/comprehensive_test.py +81 -81
  159. tests/comprehensive_testing_summary.md +186 -186
  160. tests/config_validation_demo.py +142 -142
  161. tests/controlled_spider_example.py +205 -205
  162. tests/date_tools_example.py +180 -180
  163. tests/debug_configure.py +69 -69
  164. tests/debug_framework_logger.py +84 -84
  165. tests/debug_log_config.py +126 -126
  166. tests/debug_log_levels.py +63 -63
  167. tests/debug_pipelines.py +66 -66
  168. tests/detailed_log_test.py +233 -233
  169. tests/distributed_test.py +66 -66
  170. tests/distributed_test_debug.py +76 -76
  171. tests/dynamic_loading_example.py +523 -523
  172. tests/dynamic_loading_test.py +104 -104
  173. tests/env_config_example.py +133 -133
  174. tests/error_handling_example.py +171 -171
  175. tests/final_comprehensive_test.py +151 -151
  176. tests/final_log_test.py +260 -260
  177. tests/final_validation_test.py +182 -182
  178. tests/fix_log_test.py +142 -142
  179. tests/framework_performance_test.py +202 -202
  180. tests/log_buffering_test.py +111 -111
  181. tests/log_generation_timing_test.py +153 -153
  182. tests/optimized_performance_test.py +211 -211
  183. tests/performance_comparison.py +245 -245
  184. tests/queue_blocking_test.py +113 -113
  185. tests/queue_test.py +89 -89
  186. tests/redis_key_validation_demo.py +130 -130
  187. tests/request_params_example.py +150 -150
  188. tests/response_improvements_example.py +144 -144
  189. tests/scrapy_comparison/ofweek_scrapy.py +138 -138
  190. tests/scrapy_comparison/scrapy_test.py +133 -133
  191. tests/simple_command_test.py +119 -119
  192. tests/simple_crawlo_test.py +127 -127
  193. tests/simple_log_test.py +57 -57
  194. tests/simple_log_test2.py +137 -137
  195. tests/simple_optimization_test.py +128 -128
  196. tests/simple_queue_type_test.py +42 -0
  197. tests/simple_spider_test.py +49 -49
  198. tests/simple_test.py +47 -47
  199. tests/spider_log_timing_test.py +177 -177
  200. tests/test_advanced_tools.py +148 -148
  201. tests/test_all_commands.py +230 -230
  202. tests/test_all_redis_key_configs.py +145 -145
  203. tests/test_authenticated_proxy.py +141 -141
  204. tests/test_batch_processor.py +178 -178
  205. tests/test_cleaners.py +54 -54
  206. tests/test_component_factory.py +174 -174
  207. tests/test_comprehensive.py +146 -146
  208. tests/test_config_consistency.py +80 -80
  209. tests/test_config_merge.py +152 -152
  210. tests/test_config_validator.py +182 -182
  211. tests/test_controlled_spider_mixin.py +79 -79
  212. tests/test_crawlo_proxy_integration.py +108 -108
  213. tests/test_date_tools.py +123 -123
  214. tests/test_default_header_middleware.py +158 -158
  215. tests/test_distributed.py +65 -65
  216. tests/test_double_crawlo_fix.py +207 -207
  217. tests/test_double_crawlo_fix_simple.py +124 -124
  218. tests/test_download_delay_middleware.py +221 -221
  219. tests/test_downloader_proxy_compatibility.py +268 -268
  220. tests/test_dynamic_downloaders_proxy.py +124 -124
  221. tests/test_dynamic_proxy.py +92 -92
  222. tests/test_dynamic_proxy_config.py +146 -146
  223. tests/test_dynamic_proxy_real.py +109 -109
  224. tests/test_edge_cases.py +303 -303
  225. tests/test_enhanced_error_handler.py +270 -270
  226. tests/test_enhanced_error_handler_comprehensive.py +245 -245
  227. tests/test_env_config.py +121 -121
  228. tests/test_error_handler_compatibility.py +112 -112
  229. tests/test_factories.py +252 -252
  230. tests/test_final_validation.py +153 -153
  231. tests/test_framework_env_usage.py +103 -103
  232. tests/test_framework_logger.py +66 -66
  233. tests/test_framework_startup.py +64 -64
  234. tests/test_get_component_logger.py +83 -83
  235. tests/test_integration.py +169 -169
  236. tests/test_item_dedup_redis_key.py +122 -122
  237. tests/test_large_scale_config.py +112 -112
  238. tests/test_large_scale_helper.py +235 -235
  239. tests/test_logging_system.py +282 -282
  240. tests/test_mode_change.py +72 -72
  241. tests/test_mode_consistency.py +51 -51
  242. tests/test_offsite_middleware.py +221 -221
  243. tests/test_parsel.py +29 -29
  244. tests/test_performance.py +327 -327
  245. tests/test_performance_monitor.py +115 -115
  246. tests/test_proxy_api.py +264 -264
  247. tests/test_proxy_health_check.py +32 -32
  248. tests/test_proxy_middleware.py +121 -121
  249. tests/test_proxy_middleware_enhanced.py +216 -216
  250. tests/test_proxy_middleware_integration.py +136 -136
  251. tests/test_proxy_middleware_refactored.py +184 -184
  252. tests/test_proxy_providers.py +56 -56
  253. tests/test_proxy_stats.py +19 -19
  254. tests/test_proxy_strategies.py +59 -59
  255. tests/test_queue_empty_check.py +41 -41
  256. tests/test_queue_manager_double_crawlo.py +173 -173
  257. tests/test_queue_manager_redis_key.py +176 -176
  258. tests/test_queue_type.py +107 -0
  259. tests/test_random_user_agent.py +72 -72
  260. tests/test_real_scenario_proxy.py +195 -195
  261. tests/test_redis_config.py +28 -28
  262. tests/test_redis_connection_pool.py +294 -294
  263. tests/test_redis_key_naming.py +181 -181
  264. tests/test_redis_key_validator.py +123 -123
  265. tests/test_redis_queue.py +224 -224
  266. tests/test_request_ignore_middleware.py +182 -182
  267. tests/test_request_params.py +111 -111
  268. tests/test_request_serialization.py +70 -70
  269. tests/test_response_code_middleware.py +349 -349
  270. tests/test_response_filter_middleware.py +427 -427
  271. tests/test_response_improvements.py +152 -152
  272. tests/test_retry_middleware.py +241 -241
  273. tests/test_scheduler.py +252 -252
  274. tests/test_scheduler_config_update.py +133 -133
  275. tests/test_simple_response.py +61 -61
  276. tests/test_telecom_spider_redis_key.py +205 -205
  277. tests/test_template_content.py +87 -87
  278. tests/test_template_redis_key.py +134 -134
  279. tests/test_tools.py +159 -159
  280. tests/test_user_agents.py +96 -96
  281. tests/tools_example.py +260 -260
  282. tests/untested_features_report.md +138 -138
  283. tests/verify_debug.py +51 -51
  284. tests/verify_distributed.py +117 -117
  285. tests/verify_log_fix.py +111 -111
  286. crawlo-1.3.5.dist-info/RECORD +0 -288
  287. {crawlo-1.3.5.dist-info → crawlo-1.3.6.dist-info}/WHEEL +0 -0
  288. {crawlo-1.3.5.dist-info → crawlo-1.3.6.dist-info}/entry_points.txt +0 -0
  289. {crawlo-1.3.5.dist-info → crawlo-1.3.6.dist-info}/top_level.txt +0 -0
crawlo/queue/queue_manager.py
@@ -1,503 +1,503 @@
+ #!/usr/bin/python
+ # -*- coding: UTF-8 -*-
+ """
+ Unified queue manager.
+ Provides a clean, consistent queue interface and automatically handles the differences between queue types.
+ """
+ import asyncio
+ import traceback
+ from enum import Enum
+ from typing import Optional, Dict, Any, Union, TYPE_CHECKING
+ import time
+ import random
+
+ if TYPE_CHECKING:
+     from crawlo import Request
+
+ from crawlo.queue.pqueue import SpiderPriorityQueue
+ from crawlo.utils.error_handler import ErrorHandler
+ from crawlo.utils.log import get_logger
+ from crawlo.utils.request_serializer import RequestSerializer
+
+ try:
+     # Use the full-featured Redis queue
+     from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+
+     REDIS_AVAILABLE = True
+ except ImportError:
+     RedisPriorityQueue = None
+     REDIS_AVAILABLE = False
+
+
+ class QueueType(Enum):
+     """Queue type enumeration"""
+     MEMORY = "memory"
+     REDIS = "redis"
+     AUTO = "auto"  # automatic selection
+
+
+ class IntelligentScheduler:
+     """Intelligent scheduler"""
+
+     def __init__(self):
+         self.domain_stats = {}  # per-domain statistics
+         self.url_stats = {}  # per-URL statistics
+         self.last_request_time = {}  # time of the last request per domain
+
+     def calculate_priority(self, request: "Request") -> int:
+         """Compute an intelligent priority for the request"""
+         priority = getattr(request, 'priority', 0)
+
+         # Extract the domain
+         domain = self._extract_domain(request.url)
+
+         # Adjust priority based on domain access frequency
+         if domain in self.domain_stats:
+             domain_access_count = self.domain_stats[domain]['count']
+             last_access_time = self.domain_stats[domain]['last_time']
+
+             # If the domain was visited recently, lower the priority
+             # (avoids concentrating requests on a single domain)
+             time_since_last = time.time() - last_access_time
+             if time_since_last < 5:  # visited within the last 5 seconds
+                 priority -= 2
+             elif time_since_last < 30:  # visited within the last 30 seconds
+                 priority -= 1
+
+             # If the domain has been visited too many times, lower the priority further
+             if domain_access_count > 10:
+                 priority -= 1
+
+         # Adjust priority based on URL access history
+         if request.url in self.url_stats:
+             url_access_count = self.url_stats[request.url]
+             if url_access_count > 1:
+                 # Lower the priority of repeated URLs
+                 priority -= url_access_count
+
+         # Adjust priority based on crawl depth
+         depth = getattr(request, 'meta', {}).get('depth', 0)
+         priority -= depth  # the deeper the request, the lower its priority
+
+         return priority
+
+     def update_stats(self, request: "Request"):
+         """Update statistics"""
+         domain = self._extract_domain(request.url)
+
+         # Update domain statistics
+         if domain not in self.domain_stats:
+             self.domain_stats[domain] = {'count': 0, 'last_time': 0}
+
+         self.domain_stats[domain]['count'] += 1
+         self.domain_stats[domain]['last_time'] = time.time()
+
+         # Update URL statistics
+         if request.url not in self.url_stats:
+             self.url_stats[request.url] = 0
+         self.url_stats[request.url] += 1
+
+         # Update the time of the last request
+         self.last_request_time[domain] = time.time()
+
+     def _extract_domain(self, url: str) -> str:
+         """Extract the domain from a URL"""
+         try:
+             from urllib.parse import urlparse
+             parsed = urlparse(url)
+             return parsed.netloc
+         except:
+             return "unknown"
+
+
+ class QueueConfig:
+     """Queue configuration class"""
+
+     def __init__(
+         self,
+         queue_type: Union[QueueType, str] = QueueType.AUTO,
+         redis_url: Optional[str] = None,
+         redis_host: str = "127.0.0.1",
+         redis_port: int = 6379,
+         redis_password: Optional[str] = None,
+         redis_db: int = 0,
+         queue_name: str = "crawlo:requests",
+         max_queue_size: int = 1000,
+         max_retries: int = 3,
+         timeout: int = 300,
+         **kwargs
+     ):
+         self.queue_type = QueueType(queue_type) if isinstance(queue_type, str) else queue_type
+
+         # Redis configuration
+         if redis_url:
+             self.redis_url = redis_url
+         else:
+             if redis_password:
+                 self.redis_url = f"redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}"
+             else:
+                 self.redis_url = f"redis://{redis_host}:{redis_port}/{redis_db}"
+
+         self.queue_name = queue_name
+         self.max_queue_size = max_queue_size
+         self.max_retries = max_retries
+         self.timeout = timeout
+         self.extra_config = kwargs
+
+     @classmethod
+     def from_settings(cls, settings) -> 'QueueConfig':
+         """Create configuration from settings"""
+         return cls(
+             queue_type=settings.get('QUEUE_TYPE', QueueType.AUTO),
+             redis_url=settings.get('REDIS_URL'),
+             redis_host=settings.get('REDIS_HOST', '127.0.0.1'),
+             redis_port=settings.get_int('REDIS_PORT', 6379),
+             redis_password=settings.get('REDIS_PASSWORD'),
+             redis_db=settings.get_int('REDIS_DB', 0),
+             queue_name=settings.get('SCHEDULER_QUEUE_NAME', 'crawlo:requests'),
+             max_queue_size=settings.get_int('SCHEDULER_MAX_QUEUE_SIZE', 1000),
+             max_retries=settings.get_int('QUEUE_MAX_RETRIES', 3),
+             timeout=settings.get_int('QUEUE_TIMEOUT', 300)
+         )
+
+
+ class QueueManager:
+     """Unified queue manager"""
+
+     def __init__(self, config: QueueConfig):
+         self.config = config
+         # Lazily initialize the logger and error handler to avoid circular dependencies
+         self._logger = None
+         self._error_handler = None
+         self.request_serializer = RequestSerializer()
+         self._queue = None
+         self._queue_semaphore = None
+         self._queue_type = None
+         self._health_status = "unknown"
+         self._intelligent_scheduler = IntelligentScheduler()  # intelligent scheduler
+
+     @property
+     def logger(self):
+         if self._logger is None:
+             self._logger = get_logger(self.__class__.__name__)
+         return self._logger
+
+     @property
+     def error_handler(self):
+         if self._error_handler is None:
+             self._error_handler = ErrorHandler(self.__class__.__name__)
+         return self._error_handler
+
+     async def initialize(self) -> bool:
+         """Initialize the queue"""
+         try:
+             queue_type = await self._determine_queue_type()
+             self._queue = await self._create_queue(queue_type)
+             self._queue_type = queue_type
+
+             # Test queue health
+             health_check_result = await self._health_check()
+
+             self.logger.info(f"Queue initialized successfully Type: {queue_type.value}")
+             # Only log detailed configuration information in debug mode
+             self.logger.debug(f"Queue configuration: {self._get_queue_info()}")
+
+             # If the health check returned True, the queue type was switched
+             # and the configuration must be updated
+             if health_check_result:
+                 return True
+
+             # If the queue type is Redis, check whether the configuration needs updating
+             if queue_type == QueueType.REDIS:
+                 # This check has to happen in the scheduler, because the queue manager
+                 # cannot access crawler.settings. We should not always return True here,
+                 # only when an update is actually needed; the scheduler performs the
+                 # more detailed check.
+                 pass
+
+             return False  # by default, no configuration update is needed
+
+         except Exception as e:
+             # Log detailed error information and a stack trace
+             self.logger.error(f"Queue initialization failed: {e}")
+             self.logger.debug(f"Detailed error information:\n{traceback.format_exc()}")
+             self._health_status = "error"
+             return False
+
+     async def put(self, request: "Request", priority: int = 0) -> bool:
+         """Unified enqueue interface"""
+         if not self._queue:
+             raise RuntimeError("Queue not initialized")
+
+         try:
+             # Apply the intelligent scheduling algorithm to compute a priority
+             intelligent_priority = self._intelligent_scheduler.calculate_priority(request)
+             # Combine the original priority with the intelligent priority
+             final_priority = priority + intelligent_priority
+
+             # Update statistics
+             self._intelligent_scheduler.update_stats(request)
+
+             # Serialization (Redis queue only)
+             if self._queue_type == QueueType.REDIS:
+                 request = self.request_serializer.prepare_for_serialization(request)
+
+             # Backpressure control (memory queue only)
+             if self._queue_semaphore:
+                 # For large volumes of requests, block and wait instead of skipping;
+                 # this guarantees that no request is lost
+                 await self._queue_semaphore.acquire()
+
+             # Unified enqueue operation
+             if hasattr(self._queue, 'put'):
+                 if self._queue_type == QueueType.REDIS:
+                     success = await self._queue.put(request, final_priority)
+                 else:
+                     # For the memory queue, priority is handled manually:
+                     # SpiderPriorityQueue elements are (priority, item) tuples
+                     await self._queue.put((final_priority, request))
+                     success = True
+             else:
+                 raise RuntimeError(f"Queue type {self._queue_type} does not support the put operation")
+
+             if success:
+                 self.logger.debug(f"Request enqueued successfully: {request.url} with priority {final_priority}")
+
+             return success
+
+         except Exception as e:
+             self.logger.error(f"Failed to enqueue request: {e}")
+             if self._queue_semaphore:
+                 self._queue_semaphore.release()
+             return False
+
+     async def get(self, timeout: float = 5.0) -> Optional["Request"]:
+         """Unified dequeue interface"""
+         if not self._queue:
+             raise RuntimeError("Queue not initialized")
+
+         try:
+             request = await self._queue.get(timeout=timeout)
+
+             # Release the semaphore (memory queue only)
+             if self._queue_semaphore and request:
+                 self._queue_semaphore.release()
+
+             # Deserialization (Redis queue only)
+             if request and self._queue_type == QueueType.REDIS:
+                 # A spider instance is needed here, so the raw request is returned for now;
+                 # the actual callback restoration happens in the scheduler
+                 pass
+
+             # For the memory queue, unpack the (priority, request) tuple
+             if request and self._queue_type == QueueType.MEMORY:
+                 if isinstance(request, tuple) and len(request) == 2:
+                     request = request[1]  # take the request object from the tuple
+
+             return request
+
+         except Exception as e:
+             self.logger.error(f"Failed to dequeue request: {e}")
+             return None
+
+     async def size(self) -> int:
+         """Get queue size"""
+         if not self._queue:
+             return 0
+
+         try:
+             if hasattr(self._queue, 'qsize'):
+                 if asyncio.iscoroutinefunction(self._queue.qsize):
+                     return await self._queue.qsize()
+                 else:
+                     return self._queue.qsize()
+             return 0
+         except Exception as e:
+             self.logger.warning(f"Failed to get queue size: {e}")
+             return 0
+
+     def empty(self) -> bool:
+         """Check if queue is empty (synchronous version, for compatibility)"""
+         try:
+             # The memory queue can be checked synchronously
+             if self._queue_type == QueueType.MEMORY:
+                 # Make sure the queue size is checked correctly
+                 if hasattr(self._queue, 'qsize'):
+                     return self._queue.qsize() == 0
+                 else:
+                     # Without a qsize method, assume the queue is empty
+                     return True
+             # The Redis queue requires asynchronous operations, so this is an approximation.
+             # To make sure the program can exit normally, return True and let the caller
+             # decide via the more precise asynchronous check.
+             return True
+         except Exception:
+             return True
+
+     async def async_empty(self) -> bool:
+         """Check if queue is empty (asynchronous version, more accurate)"""
+         try:
+             # Memory queue
+             if self._queue_type == QueueType.MEMORY:
+                 # Make sure the queue size is checked correctly
+                 if hasattr(self._queue, 'qsize'):
+                     if asyncio.iscoroutinefunction(self._queue.qsize):
+                         size = await self._queue.qsize()
+                     else:
+                         size = self._queue.qsize()
+                     return size == 0
+                 else:
+                     # Without a qsize method, assume the queue is empty
+                     return True
+             # For the Redis queue, use the asynchronous check
+             elif self._queue_type == QueueType.REDIS:
+                 size = await self.size()
+                 return size == 0
+             return True
+         except Exception:
+             return True
+
+     async def close(self) -> None:
+         """Close queue"""
+         if self._queue and hasattr(self._queue, 'close'):
+             try:
+                 await self._queue.close()
+                 # Log at DEBUG level instead of INFO to avoid redundant output
+                 self.logger.debug("Queue closed")
+             except Exception as e:
+                 self.logger.warning(f"Error closing queue: {e}")
+
+     def get_status(self) -> Dict[str, Any]:
+         """Get queue status information"""
+         return {
+             "type": self._queue_type.value if self._queue_type else "unknown",
+             "health": self._health_status,
+             "config": self._get_queue_info(),
+             "initialized": self._queue is not None
+         }
+
+     async def _determine_queue_type(self) -> QueueType:
+         """Determine queue type"""
+         if self.config.queue_type == QueueType.AUTO:
+             # Automatic selection: prefer Redis when it is available
+             if REDIS_AVAILABLE and self.config.redis_url:
+                 # Test the Redis connection
+                 try:
+                     from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+                     test_queue = RedisPriorityQueue(self.config.redis_url)
+                     await test_queue.connect()
+                     await test_queue.close()
+                     self.logger.debug("Auto-detection: Redis available, using distributed queue")
+                     return QueueType.REDIS
+                 except Exception as e:
+                     self.logger.debug(f"Auto-detection: Redis unavailable ({e}), using memory queue")
+                     return QueueType.MEMORY
+             else:
+                 self.logger.debug("Auto-detection: Redis not configured, using memory queue")
+                 return QueueType.MEMORY
+
+         elif self.config.queue_type == QueueType.REDIS:
+             if not REDIS_AVAILABLE:
+                 raise RuntimeError("Redis queue unavailable: the redis dependency is not installed")
+             if not self.config.redis_url:
+                 raise RuntimeError("Redis queue unavailable: REDIS_URL is not configured")
+             # Test the Redis connection
+             try:
+                 from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+                 test_queue = RedisPriorityQueue(self.config.redis_url)
+                 await test_queue.connect()
+                 await test_queue.close()
+                 return QueueType.REDIS
+             except Exception as e:
+                 # If Redis is required but the connection fails, raise an exception
+                 raise RuntimeError(f"Redis queue unavailable: cannot connect to Redis ({e})")
+
+         elif self.config.queue_type == QueueType.MEMORY:
+             return QueueType.MEMORY
+
+         else:
+             raise ValueError(f"Unsupported queue type: {self.config.queue_type}")
+
+     async def _create_queue(self, queue_type: QueueType):
+         """Create queue instance"""
+         if queue_type == QueueType.REDIS:
+             # Import the Redis queue lazily
+             try:
+                 from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+             except ImportError as e:
+                 raise RuntimeError(f"Redis queue unavailable: failed to import RedisPriorityQueue ({e})")
+
+             # Simplified project-name extraction logic
+             project_name = "default"
+             if ':' in self.config.queue_name:
+                 parts = self.config.queue_name.split(':')
+                 # Skip all "crawlo" prefixes and take the first non-"crawlo" part as the project name
+                 for part in parts:
+                     if part != "crawlo":
+                         project_name = part
+                         break
+             else:
+                 project_name = self.config.queue_name or "default"
+
+             queue = RedisPriorityQueue(
+                 redis_url=self.config.redis_url,
+                 queue_name=self.config.queue_name,
+                 max_retries=self.config.max_retries,
+                 timeout=self.config.timeout,
+                 module_name=project_name  # pass the project name as module_name
+             )
+             # No need to connect immediately; the queue connects lazily
+             return queue
+
+         elif queue_type == QueueType.MEMORY:
+             queue = SpiderPriorityQueue()
+             # Set up backpressure control for the memory queue
+             self._queue_semaphore = asyncio.Semaphore(self.config.max_queue_size)
+             return queue
+
+         else:
+             raise ValueError(f"Unsupported queue type: {queue_type}")
+
+     async def _health_check(self) -> bool:
+         """Health check"""
+         try:
+             if self._queue_type == QueueType.REDIS:
+                 # Test the Redis connection
+                 await self._queue.connect()
+                 self._health_status = "healthy"
+             else:
+                 # The memory queue is always healthy
+                 self._health_status = "healthy"
+             return False  # no configuration update needed
+         except Exception as e:
+             self.logger.warning(f"Queue health check failed: {e}")
+             self._health_status = "unhealthy"
+             # If this is a Redis queue and the health check failed, try switching to the memory queue
+             if self._queue_type == QueueType.REDIS and self.config.queue_type == QueueType.AUTO:
+                 self.logger.info("Redis queue unavailable, attempting to switch to memory queue...")
+                 try:
+                     await self._queue.close()
+                 except:
+                     pass
+                 self._queue = None
+                 # Recreate the queue as a memory queue
+                 self._queue = await self._create_queue(QueueType.MEMORY)
+                 self._queue_type = QueueType.MEMORY
+                 self._queue_semaphore = asyncio.Semaphore(self.config.max_queue_size)
+                 self._health_status = "healthy"
+                 self.logger.info("Switched to memory queue")
+                 # Return a signal indicating that the filter and dedup pipeline configuration must be updated
+                 return True
+             return False
+
+     def _get_queue_info(self) -> Dict[str, Any]:
+         """Get queue configuration information"""
+         info = {
+             "queue_name": self.config.queue_name,
+             "max_queue_size": self.config.max_queue_size
+         }
+
+         if self._queue_type == QueueType.REDIS:
+             info.update({
+                 "redis_url": self.config.redis_url,
+                 "max_retries": self.config.max_retries,
+                 "timeout": self.config.timeout
+             })
+
+         return info
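
For orientation, a minimal usage sketch of the QueueManager defined in this file follows. It is not taken from crawlo's documentation: the runtime "from crawlo import Request" import (the file above imports it only under TYPE_CHECKING) and the url= keyword on Request are assumptions, and the memory queue is forced so the sketch runs without a Redis server.

import asyncio

from crawlo import Request  # assumed runtime import path
from crawlo.queue.queue_manager import QueueConfig, QueueManager, QueueType


async def main():
    # Force the in-memory queue so the sketch runs without a Redis server.
    config = QueueConfig(queue_type=QueueType.MEMORY, max_queue_size=100)
    manager = QueueManager(config)

    # initialize() returns True only when the queue type was switched during
    # startup and the caller needs to refresh filter/dedup configuration.
    switched = await manager.initialize()

    # Enqueue a request; IntelligentScheduler adjusts the final priority based
    # on domain frequency, URL repetition, and crawl depth.
    await manager.put(Request(url="https://example.com"), priority=0)  # url= keyword is an assumption

    # Dequeue with a timeout; for the memory queue the manager unpacks the
    # (priority, request) tuple before returning the request.
    request = await manager.get(timeout=1.0)
    print(request.url if request else None, manager.get_status())

    await manager.close()


asyncio.run(main())

With QueueType.AUTO instead of MEMORY, the same code would prefer Redis when a reachable REDIS_URL is configured and fall back to the memory queue otherwise, as implemented in _determine_queue_type and _health_check above.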