crawlo 1.3.1__py3-none-any.whl → 1.3.3__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Potentially problematic release: this version of crawlo might be problematic.
- crawlo/__init__.py +63 -63
- crawlo/__version__.py +1 -1
- crawlo/cli.py +75 -75
- crawlo/commands/__init__.py +14 -14
- crawlo/commands/check.py +594 -594
- crawlo/commands/genspider.py +151 -151
- crawlo/commands/help.py +138 -138
- crawlo/commands/list.py +155 -155
- crawlo/commands/run.py +322 -314
- crawlo/commands/startproject.py +436 -436
- crawlo/commands/stats.py +187 -187
- crawlo/commands/utils.py +196 -196
- crawlo/config.py +312 -312
- crawlo/config_validator.py +277 -277
- crawlo/core/__init__.py +2 -2
- crawlo/core/engine.py +365 -365
- crawlo/core/processor.py +40 -40
- crawlo/core/scheduler.py +256 -256
- crawlo/crawler.py +1166 -1168
- crawlo/data/__init__.py +5 -5
- crawlo/data/user_agents.py +194 -194
- crawlo/downloader/__init__.py +273 -273
- crawlo/downloader/aiohttp_downloader.py +226 -226
- crawlo/downloader/cffi_downloader.py +245 -245
- crawlo/downloader/httpx_downloader.py +259 -259
- crawlo/downloader/hybrid_downloader.py +212 -212
- crawlo/downloader/playwright_downloader.py +402 -402
- crawlo/downloader/selenium_downloader.py +472 -472
- crawlo/event.py +11 -11
- crawlo/exceptions.py +81 -81
- crawlo/extension/__init__.py +39 -39
- crawlo/extension/health_check.py +141 -141
- crawlo/extension/log_interval.py +57 -57
- crawlo/extension/log_stats.py +81 -81
- crawlo/extension/logging_extension.py +52 -45
- crawlo/extension/memory_monitor.py +104 -104
- crawlo/extension/performance_profiler.py +133 -133
- crawlo/extension/request_recorder.py +107 -107
- crawlo/filters/__init__.py +154 -154
- crawlo/filters/aioredis_filter.py +234 -234
- crawlo/filters/memory_filter.py +269 -269
- crawlo/items/__init__.py +23 -23
- crawlo/items/base.py +21 -21
- crawlo/items/fields.py +52 -52
- crawlo/items/items.py +104 -104
- crawlo/middleware/__init__.py +21 -21
- crawlo/middleware/default_header.py +132 -132
- crawlo/middleware/download_delay.py +104 -104
- crawlo/middleware/middleware_manager.py +135 -135
- crawlo/middleware/offsite.py +123 -115
- crawlo/middleware/proxy.py +386 -386
- crawlo/middleware/request_ignore.py +86 -86
- crawlo/middleware/response_code.py +163 -163
- crawlo/middleware/response_filter.py +136 -136
- crawlo/middleware/retry.py +124 -124
- crawlo/middleware/simple_proxy.py +65 -65
- crawlo/mode_manager.py +187 -148
- crawlo/network/__init__.py +21 -21
- crawlo/network/request.py +379 -379
- crawlo/network/response.py +359 -359
- crawlo/pipelines/__init__.py +21 -21
- crawlo/pipelines/bloom_dedup_pipeline.py +156 -156
- crawlo/pipelines/console_pipeline.py +39 -39
- crawlo/pipelines/csv_pipeline.py +316 -316
- crawlo/pipelines/database_dedup_pipeline.py +222 -222
- crawlo/pipelines/json_pipeline.py +218 -218
- crawlo/pipelines/memory_dedup_pipeline.py +115 -115
- crawlo/pipelines/mongo_pipeline.py +131 -131
- crawlo/pipelines/mysql_pipeline.py +318 -318
- crawlo/pipelines/pipeline_manager.py +75 -75
- crawlo/pipelines/redis_dedup_pipeline.py +166 -166
- crawlo/project.py +325 -297
- crawlo/queue/pqueue.py +37 -37
- crawlo/queue/queue_manager.py +379 -379
- crawlo/queue/redis_priority_queue.py +306 -306
- crawlo/settings/__init__.py +7 -7
- crawlo/settings/default_settings.py +225 -225
- crawlo/settings/setting_manager.py +198 -198
- crawlo/spider/__init__.py +639 -639
- crawlo/stats_collector.py +59 -59
- crawlo/subscriber.py +129 -129
- crawlo/task_manager.py +30 -30
- crawlo/templates/crawlo.cfg.tmpl +10 -10
- crawlo/templates/project/__init__.py.tmpl +3 -3
- crawlo/templates/project/items.py.tmpl +17 -17
- crawlo/templates/project/middlewares.py.tmpl +118 -118
- crawlo/templates/project/pipelines.py.tmpl +96 -96
- crawlo/templates/project/settings.py.tmpl +266 -261
- crawlo/templates/project/settings_distributed.py.tmpl +179 -174
- crawlo/templates/project/settings_gentle.py.tmpl +60 -95
- crawlo/templates/project/settings_high_performance.py.tmpl +130 -125
- crawlo/templates/project/settings_minimal.py.tmpl +34 -29
- crawlo/templates/project/settings_simple.py.tmpl +101 -96
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -5
- crawlo/templates/run.py.tmpl +38 -47
- crawlo/templates/spider/spider.py.tmpl +143 -143
- crawlo/tools/__init__.py +200 -200
- crawlo/tools/anti_crawler.py +268 -268
- crawlo/tools/authenticated_proxy.py +240 -240
- crawlo/tools/data_formatter.py +225 -225
- crawlo/tools/data_validator.py +180 -180
- crawlo/tools/date_tools.py +289 -289
- crawlo/tools/distributed_coordinator.py +388 -388
- crawlo/tools/encoding_converter.py +127 -127
- crawlo/tools/request_tools.py +82 -82
- crawlo/tools/retry_mechanism.py +224 -224
- crawlo/tools/scenario_adapter.py +262 -262
- crawlo/tools/text_cleaner.py +232 -232
- crawlo/utils/__init__.py +34 -34
- crawlo/utils/batch_processor.py +259 -259
- crawlo/utils/controlled_spider_mixin.py +439 -439
- crawlo/utils/db_helper.py +343 -343
- crawlo/utils/enhanced_error_handler.py +356 -356
- crawlo/utils/env_config.py +142 -142
- crawlo/utils/error_handler.py +123 -123
- crawlo/utils/func_tools.py +82 -82
- crawlo/utils/large_scale_config.py +286 -286
- crawlo/utils/large_scale_helper.py +344 -344
- crawlo/utils/log.py +199 -146
- crawlo/utils/performance_monitor.py +285 -285
- crawlo/utils/queue_helper.py +175 -175
- crawlo/utils/redis_connection_pool.py +351 -351
- crawlo/utils/redis_key_validator.py +198 -198
- crawlo/utils/request.py +267 -267
- crawlo/utils/request_serializer.py +218 -218
- crawlo/utils/spider_loader.py +61 -61
- crawlo/utils/system.py +11 -11
- crawlo/utils/tools.py +4 -4
- crawlo/utils/url.py +39 -39
- {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/METADATA +1020 -1020
- crawlo-1.3.3.dist-info/RECORD +219 -0
- examples/__init__.py +7 -7
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +81 -81
- tests/__init__.py +7 -7
- tests/advanced_tools_example.py +275 -275
- tests/authenticated_proxy_example.py +107 -107
- tests/cleaners_example.py +160 -160
- tests/config_validation_demo.py +142 -142
- tests/controlled_spider_example.py +205 -205
- tests/date_tools_example.py +180 -180
- tests/debug_pipelines.py +66 -66
- tests/dynamic_loading_example.py +523 -523
- tests/dynamic_loading_test.py +104 -104
- tests/env_config_example.py +133 -133
- tests/error_handling_example.py +171 -171
- tests/redis_key_validation_demo.py +130 -130
- tests/request_params_example.py +150 -150
- tests/response_improvements_example.py +144 -144
- tests/test_advanced_tools.py +148 -148
- tests/test_all_redis_key_configs.py +145 -145
- tests/test_authenticated_proxy.py +141 -141
- tests/test_cleaners.py +54 -54
- tests/test_comprehensive.py +146 -146
- tests/test_config_consistency.py +80 -80
- tests/test_config_merge.py +152 -152
- tests/test_config_validator.py +182 -182
- tests/test_crawlo_proxy_integration.py +108 -108
- tests/test_date_tools.py +123 -123
- tests/test_default_header_middleware.py +158 -158
- tests/test_distributed.py +65 -65
- tests/test_double_crawlo_fix.py +207 -207
- tests/test_double_crawlo_fix_simple.py +124 -124
- tests/test_download_delay_middleware.py +221 -221
- tests/test_downloader_proxy_compatibility.py +268 -268
- tests/test_dynamic_downloaders_proxy.py +124 -124
- tests/test_dynamic_proxy.py +92 -92
- tests/test_dynamic_proxy_config.py +146 -146
- tests/test_dynamic_proxy_real.py +109 -109
- tests/test_edge_cases.py +303 -303
- tests/test_enhanced_error_handler.py +270 -270
- tests/test_env_config.py +121 -121
- tests/test_error_handler_compatibility.py +112 -112
- tests/test_final_validation.py +153 -153
- tests/test_framework_env_usage.py +103 -103
- tests/test_integration.py +169 -169
- tests/test_item_dedup_redis_key.py +122 -122
- tests/test_mode_consistency.py +51 -51
- tests/test_offsite_middleware.py +221 -221
- tests/test_parsel.py +29 -29
- tests/test_performance.py +327 -327
- tests/test_proxy_api.py +264 -264
- tests/test_proxy_health_check.py +32 -32
- tests/test_proxy_middleware.py +121 -121
- tests/test_proxy_middleware_enhanced.py +216 -216
- tests/test_proxy_middleware_integration.py +136 -136
- tests/test_proxy_middleware_refactored.py +184 -184
- tests/test_proxy_providers.py +56 -56
- tests/test_proxy_stats.py +19 -19
- tests/test_proxy_strategies.py +59 -59
- tests/test_queue_manager_double_crawlo.py +173 -173
- tests/test_queue_manager_redis_key.py +176 -176
- tests/test_random_user_agent.py +72 -72
- tests/test_real_scenario_proxy.py +195 -195
- tests/test_redis_config.py +28 -28
- tests/test_redis_connection_pool.py +294 -294
- tests/test_redis_key_naming.py +181 -181
- tests/test_redis_key_validator.py +123 -123
- tests/test_redis_queue.py +224 -224
- tests/test_request_ignore_middleware.py +182 -182
- tests/test_request_params.py +111 -111
- tests/test_request_serialization.py +70 -70
- tests/test_response_code_middleware.py +349 -349
- tests/test_response_filter_middleware.py +427 -427
- tests/test_response_improvements.py +152 -152
- tests/test_retry_middleware.py +241 -241
- tests/test_scheduler.py +252 -252
- tests/test_scheduler_config_update.py +133 -133
- tests/test_simple_response.py +61 -61
- tests/test_telecom_spider_redis_key.py +205 -205
- tests/test_template_content.py +87 -87
- tests/test_template_redis_key.py +134 -134
- tests/test_tools.py +159 -159
- tests/test_user_agents.py +96 -96
- tests/tools_example.py +260 -260
- tests/verify_distributed.py +117 -117
- crawlo-1.3.1.dist-info/RECORD +0 -219
- {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/WHEEL +0 -0
- {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/entry_points.txt +0 -0
- {crawlo-1.3.1.dist-info → crawlo-1.3.3.dist-info}/top_level.txt +0 -0
crawlo/queue/queue_manager.py
CHANGED
@@ -1,379 +1,379 @@

The diff marks all 379 lines as removed and re-added, but the old and new contents shown are identical, so the change is most likely whitespace- or line-ending-only. The file, shown once:

```python
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
Unified queue manager.
Provides a simple, consistent queue interface and transparently handles
the differences between queue types.
"""
import asyncio
import traceback
from enum import Enum
from typing import Optional, Dict, Any, Union

from crawlo import Request
from crawlo.queue.pqueue import SpiderPriorityQueue
from crawlo.utils.error_handler import ErrorHandler
from crawlo.utils.log import get_logger
from crawlo.utils.request_serializer import RequestSerializer

try:
    from crawlo.queue.redis_priority_queue import RedisPriorityQueue

    REDIS_AVAILABLE = True
except ImportError:
    RedisPriorityQueue = None
    REDIS_AVAILABLE = False


class QueueType(Enum):
    """Queue type enumeration"""
    MEMORY = "memory"
    REDIS = "redis"
    AUTO = "auto"  # choose automatically


class QueueConfig:
    """Queue configuration class"""

    def __init__(
        self,
        queue_type: Union[QueueType, str] = QueueType.AUTO,
        redis_url: Optional[str] = None,
        redis_host: str = "127.0.0.1",
        redis_port: int = 6379,
        redis_password: Optional[str] = None,
        redis_db: int = 0,
        queue_name: str = "crawlo:requests",
        max_queue_size: int = 1000,
        max_retries: int = 3,
        timeout: int = 300,
        **kwargs
    ):
        self.queue_type = QueueType(queue_type) if isinstance(queue_type, str) else queue_type

        # Redis configuration
        if redis_url:
            self.redis_url = redis_url
        else:
            if redis_password:
                self.redis_url = f"redis://:{redis_password}@{redis_host}:{redis_port}/{redis_db}"
            else:
                self.redis_url = f"redis://{redis_host}:{redis_port}/{redis_db}"

        self.queue_name = queue_name
        self.max_queue_size = max_queue_size
        self.max_retries = max_retries
        self.timeout = timeout
        self.extra_config = kwargs

    @classmethod
    def from_settings(cls, settings) -> 'QueueConfig':
        """Create configuration from settings"""
        return cls(
            queue_type=settings.get('QUEUE_TYPE', QueueType.AUTO),
            redis_url=settings.get('REDIS_URL'),
            redis_host=settings.get('REDIS_HOST', '127.0.0.1'),
            redis_port=settings.get_int('REDIS_PORT', 6379),
            redis_password=settings.get('REDIS_PASSWORD'),
            redis_db=settings.get_int('REDIS_DB', 0),
            queue_name=settings.get('SCHEDULER_QUEUE_NAME', 'crawlo:requests'),
            max_queue_size=settings.get_int('SCHEDULER_MAX_QUEUE_SIZE', 1000),
            max_retries=settings.get_int('QUEUE_MAX_RETRIES', 3),
            timeout=settings.get_int('QUEUE_TIMEOUT', 300)
        )


class QueueManager:
    """Unified queue manager"""

    def __init__(self, config: QueueConfig):
        self.config = config
        self.logger = get_logger(self.__class__.__name__)
        self.error_handler = ErrorHandler(self.__class__.__name__)
        self.request_serializer = RequestSerializer()
        self._queue = None
        self._queue_semaphore = None
        self._queue_type = None
        self._health_status = "unknown"

    async def initialize(self) -> bool:
        """Initialize the queue"""
        try:
            queue_type = await self._determine_queue_type()
            self._queue = await self._create_queue(queue_type)
            self._queue_type = queue_type

            # Probe queue health
            health_check_result = await self._health_check()

            self.logger.info(f"Queue initialized successfully (type: {queue_type.value})")
            # Emit detailed configuration only in debug mode
            self.logger.debug(f"Queue configuration: {self._get_queue_info()}")

            # If the health check returned True, the queue type was switched
            # and the configuration needs to be updated
            if health_check_result:
                return True

            # For Redis queues, check whether the configuration needs updating
            if queue_type == QueueType.REDIS:
                # This check has to happen in the scheduler, because the queue
                # manager cannot access crawler.settings. We only return True
                # when an update is actually required; the scheduler performs
                # the more detailed check.
                pass

            return False  # by default no configuration update is needed

        except Exception as e:
            # Log detailed error information and the stack trace
            self.logger.error(f"Queue initialization failed: {e}")
            self.logger.debug(f"Detailed error information:\n{traceback.format_exc()}")
            self._health_status = "error"
            return False

    async def put(self, request: Request, priority: int = 0) -> bool:
        """Unified enqueue interface"""
        if not self._queue:
            raise RuntimeError("Queue not initialized")

        try:
            # Serialization (Redis queue only)
            if self._queue_type == QueueType.REDIS:
                request = self.request_serializer.prepare_for_serialization(request)

            # Backpressure control (memory queue only)
            if self._queue_semaphore:
                # Use a non-blocking check when requests arrive in bulk
                if not self._queue_semaphore.locked():
                    await self._queue_semaphore.acquire()
                else:
                    # If the queue is full, return False instead of blocking
                    self.logger.warning("Queue is full, skipping current request")
                    return False

            # Unified enqueue operation
            if hasattr(self._queue, 'put'):
                if self._queue_type == QueueType.REDIS:
                    success = await self._queue.put(request, priority)
                else:
                    await self._queue.put(request)
                    success = True
            else:
                raise RuntimeError(f"Queue type {self._queue_type} does not support the put operation")

            if success:
                self.logger.debug(f"Request enqueued successfully: {request.url}")

            return success

        except Exception as e:
            self.logger.error(f"Failed to enqueue request: {e}")
            if self._queue_semaphore:
                self._queue_semaphore.release()
            return False

    async def get(self, timeout: float = 5.0) -> Optional[Request]:
        """Unified dequeue interface"""
        if not self._queue:
            raise RuntimeError("Queue not initialized")

        try:
            request = await self._queue.get(timeout=timeout)

            # Release the semaphore (memory queue only)
            if self._queue_semaphore and request:
                self._queue_semaphore.release()

            # Deserialization (Redis queue only)
            if request and self._queue_type == QueueType.REDIS:
                # A spider instance is needed here, so the raw request is
                # returned for now; the actual callback restoration is
                # handled in the scheduler.
                pass

            return request

        except Exception as e:
            self.logger.error(f"Failed to dequeue request: {e}")
            return None

    async def size(self) -> int:
        """Get queue size"""
        if not self._queue:
            return 0

        try:
            if hasattr(self._queue, 'qsize'):
                if asyncio.iscoroutinefunction(self._queue.qsize):
                    return await self._queue.qsize()
                else:
                    return self._queue.qsize()
            return 0
        except Exception as e:
            self.logger.warning(f"Failed to get queue size: {e}")
            return 0

    def empty(self) -> bool:
        """Check if queue is empty (synchronous version, for compatibility)"""
        try:
            # Memory queues can be checked synchronously
            if self._queue_type == QueueType.MEMORY:
                return self._queue.qsize() == 0
            # Redis queues require an async call, so this is an approximation.
            # Returning True lets the program exit normally; callers should
            # rely on the more precise asynchronous check.
            return True
        except Exception:
            return True

    async def async_empty(self) -> bool:
        """Check if queue is empty (asynchronous version, more accurate)"""
        try:
            # Memory queue
            if self._queue_type == QueueType.MEMORY:
                return self._queue.qsize() == 0
            # Redis queue: use the asynchronous size check
            elif self._queue_type == QueueType.REDIS:
                size = await self.size()
                return size == 0
            return True
        except Exception:
            return True

    async def close(self) -> None:
        """Close queue"""
        if self._queue and hasattr(self._queue, 'close'):
            try:
                await self._queue.close()
                # Change INFO level log to DEBUG level to avoid redundant output
                self.logger.debug("Queue closed")
            except Exception as e:
                self.logger.warning(f"Error closing queue: {e}")

    def get_status(self) -> Dict[str, Any]:
        """Get queue status information"""
        return {
            "type": self._queue_type.value if self._queue_type else "unknown",
            "health": self._health_status,
            "config": self._get_queue_info(),
            "initialized": self._queue is not None
        }

    async def _determine_queue_type(self) -> QueueType:
        """Determine queue type"""
        if self.config.queue_type == QueueType.AUTO:
            # Auto-selection: prefer Redis when available
            if REDIS_AVAILABLE and self.config.redis_url:
                # Test the Redis connection
                try:
                    test_queue = RedisPriorityQueue(self.config.redis_url)
                    await test_queue.connect()
                    await test_queue.close()
                    # Change INFO level log to DEBUG level to avoid redundant output
                    self.logger.debug("Auto-detection: Redis available, using distributed queue")
                    return QueueType.REDIS
                except Exception as e:
                    self.logger.debug(f"Auto-detection: Redis unavailable ({e}), using memory queue")
                    return QueueType.MEMORY
            else:
                self.logger.debug("Auto-detection: Redis not configured, using memory queue")
                return QueueType.MEMORY

        elif self.config.queue_type == QueueType.REDIS:
            if not REDIS_AVAILABLE:
                raise RuntimeError("Redis queue unavailable: the redis dependency is not installed")
            if not self.config.redis_url:
                raise RuntimeError("Redis queue unavailable: REDIS_URL is not configured")
            # Test the Redis connection
            try:
                test_queue = RedisPriorityQueue(self.config.redis_url)
                await test_queue.connect()
                await test_queue.close()
                return QueueType.REDIS
            except Exception as e:
                # Redis was explicitly requested, so a failed connection is fatal
                raise RuntimeError(f"Redis queue unavailable: cannot connect to Redis ({e})")

        elif self.config.queue_type == QueueType.MEMORY:
            return QueueType.MEMORY

        else:
            raise ValueError(f"Unsupported queue type: {self.config.queue_type}")

    async def _create_queue(self, queue_type: QueueType):
        """Create queue instance"""
        if queue_type == QueueType.REDIS:
            # Simplified project-name extraction
            project_name = "default"
            if ':' in self.config.queue_name:
                parts = self.config.queue_name.split(':')
                # Skip any "crawlo" prefixes and take the first
                # non-"crawlo" part as the project name
                for part in parts:
                    if part != "crawlo":
                        project_name = part
                        break
            else:
                project_name = self.config.queue_name or "default"

            queue = RedisPriorityQueue(
                redis_url=self.config.redis_url,
                queue_name=self.config.queue_name,
                max_retries=self.config.max_retries,
                timeout=self.config.timeout,
                module_name=project_name  # pass the project name as module_name
            )
            # No immediate connection needed; connects lazily
            return queue

        elif queue_type == QueueType.MEMORY:
            queue = SpiderPriorityQueue()
            # Apply backpressure control to the memory queue
            self._queue_semaphore = asyncio.Semaphore(self.config.max_queue_size)
            return queue

        else:
            raise ValueError(f"Unsupported queue type: {queue_type}")

    async def _health_check(self) -> bool:
        """Health check"""
        try:
            if self._queue_type == QueueType.REDIS:
                # Test the Redis connection
                await self._queue.connect()
                self._health_status = "healthy"
            else:
                # Memory queues are always healthy
                self._health_status = "healthy"
            return False  # memory queues never require a configuration update
        except Exception as e:
            self.logger.warning(f"Queue health check failed: {e}")
            self._health_status = "unhealthy"
            # If this is a Redis queue and the health check failed,
            # try to fall back to a memory queue
            if self._queue_type == QueueType.REDIS and self.config.queue_type == QueueType.AUTO:
                self.logger.info("Redis queue unavailable, attempting to switch to memory queue...")
                try:
                    await self._queue.close()
                except Exception:
                    pass
                self._queue = None
                # Recreate as a memory queue
                self._queue = await self._create_queue(QueueType.MEMORY)
                self._queue_type = QueueType.MEMORY
                self._queue_semaphore = asyncio.Semaphore(self.config.max_queue_size)
                self._health_status = "healthy"
                self.logger.info("Switched to memory queue")
                # Signal that the filter and dedup-pipeline
                # configuration must be updated
                return True
            return False

    def _get_queue_info(self) -> Dict[str, Any]:
        """Get queue configuration information"""
        info = {
            "queue_name": self.config.queue_name,
            "max_queue_size": self.config.max_queue_size
        }

        if self._queue_type == QueueType.REDIS:
            info.update({
                "redis_url": self.config.redis_url,
                "max_retries": self.config.max_retries,
                "timeout": self.config.timeout
            })

        return info
```
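For orientation, here is a minimal sketch of how this module might be driven end to end. It exercises only the interface shown in the listing above; the `Request(url=...)` constructor signature and the local Redis URL are assumptions for illustration, not confirmed crawlo API.

```python
import asyncio

from crawlo import Request
from crawlo.queue.queue_manager import QueueConfig, QueueManager


async def main() -> None:
    # AUTO mode: prefers Redis when reachable, otherwise falls back to the
    # in-memory SpiderPriorityQueue with semaphore-based backpressure.
    config = QueueConfig(
        queue_type="auto",
        redis_url="redis://127.0.0.1:6379/0",  # assumed local instance
        max_queue_size=100,
    )
    manager = QueueManager(config)

    # initialize() returns True only when the queue type was switched and
    # the caller must update its filter/dedup-pipeline configuration.
    switched = await manager.initialize()
    print(manager.get_status(), "switched:", switched)

    # Request(url=...) is an assumed signature for this sketch.
    if await manager.put(Request(url="https://example.com"), priority=0):
        request = await manager.get(timeout=5.0)
        print(request.url if request else "queue empty")

    await manager.close()


if __name__ == "__main__":
    asyncio.run(main())
```

Note that `put()` deliberately returns `False` rather than blocking when the in-memory queue is full, so callers are expected to throttle or retry on their side.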