crawlo-1.3.2-py3-none-any.whl → crawlo-1.3.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- crawlo/__init__.py +24 -0
- crawlo/__version__.py +1 -1
- crawlo/commands/run.py +58 -32
- crawlo/core/__init__.py +44 -0
- crawlo/core/engine.py +119 -45
- crawlo/core/scheduler.py +4 -3
- crawlo/crawler.py +603 -1133
- crawlo/downloader/aiohttp_downloader.py +4 -2
- crawlo/extension/__init__.py +1 -1
- crawlo/extension/logging_extension.py +23 -7
- crawlo/factories/__init__.py +28 -0
- crawlo/factories/base.py +69 -0
- crawlo/factories/crawler.py +104 -0
- crawlo/factories/registry.py +85 -0
- crawlo/filters/aioredis_filter.py +25 -2
- crawlo/framework.py +292 -0
- crawlo/initialization/__init__.py +40 -0
- crawlo/initialization/built_in.py +426 -0
- crawlo/initialization/context.py +142 -0
- crawlo/initialization/core.py +194 -0
- crawlo/initialization/phases.py +149 -0
- crawlo/initialization/registry.py +146 -0
- crawlo/items/base.py +2 -1
- crawlo/logging/__init__.py +38 -0
- crawlo/logging/config.py +97 -0
- crawlo/logging/factory.py +129 -0
- crawlo/logging/manager.py +112 -0
- crawlo/middleware/middleware_manager.py +1 -1
- crawlo/middleware/offsite.py +1 -1
- crawlo/mode_manager.py +26 -1
- crawlo/pipelines/pipeline_manager.py +2 -1
- crawlo/project.py +76 -46
- crawlo/queue/pqueue.py +11 -5
- crawlo/queue/queue_manager.py +143 -19
- crawlo/queue/redis_priority_queue.py +69 -49
- crawlo/settings/default_settings.py +110 -14
- crawlo/settings/setting_manager.py +29 -13
- crawlo/spider/__init__.py +34 -16
- crawlo/stats_collector.py +17 -3
- crawlo/task_manager.py +112 -3
- crawlo/templates/project/settings.py.tmpl +103 -202
- crawlo/templates/project/settings_distributed.py.tmpl +122 -135
- crawlo/templates/project/settings_gentle.py.tmpl +149 -43
- crawlo/templates/project/settings_high_performance.py.tmpl +127 -90
- crawlo/templates/project/settings_minimal.py.tmpl +46 -15
- crawlo/templates/project/settings_simple.py.tmpl +138 -75
- crawlo/templates/project/spiders/__init__.py.tmpl +5 -1
- crawlo/templates/run.py.tmpl +10 -14
- crawlo/templates/spiders_init.py.tmpl +10 -0
- crawlo/tools/network_diagnostic.py +365 -0
- crawlo/utils/class_loader.py +26 -0
- crawlo/utils/error_handler.py +76 -35
- crawlo/utils/log.py +41 -144
- crawlo/utils/redis_connection_pool.py +43 -6
- crawlo/utils/request_serializer.py +8 -1
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/METADATA +120 -14
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/RECORD +104 -45
- tests/authenticated_proxy_example.py +2 -2
- tests/baidu_performance_test.py +109 -0
- tests/baidu_test.py +60 -0
- tests/comprehensive_framework_test.py +213 -0
- tests/comprehensive_test.py +82 -0
- tests/comprehensive_testing_summary.md +187 -0
- tests/debug_configure.py +70 -0
- tests/debug_framework_logger.py +85 -0
- tests/debug_log_levels.py +64 -0
- tests/distributed_test.py +67 -0
- tests/distributed_test_debug.py +77 -0
- tests/final_command_test_report.md +0 -0
- tests/final_comprehensive_test.py +152 -0
- tests/final_validation_test.py +183 -0
- tests/framework_performance_test.py +203 -0
- tests/optimized_performance_test.py +212 -0
- tests/performance_comparison.py +246 -0
- tests/queue_blocking_test.py +114 -0
- tests/queue_test.py +90 -0
- tests/scrapy_comparison/ofweek_scrapy.py +139 -0
- tests/scrapy_comparison/scrapy_test.py +134 -0
- tests/simple_command_test.py +120 -0
- tests/simple_crawlo_test.py +128 -0
- tests/simple_log_test.py +58 -0
- tests/simple_optimization_test.py +129 -0
- tests/simple_spider_test.py +50 -0
- tests/simple_test.py +48 -0
- tests/test_all_commands.py +231 -0
- tests/test_batch_processor.py +179 -0
- tests/test_component_factory.py +175 -0
- tests/test_controlled_spider_mixin.py +80 -0
- tests/test_enhanced_error_handler_comprehensive.py +246 -0
- tests/test_factories.py +253 -0
- tests/test_framework_logger.py +67 -0
- tests/test_framework_startup.py +65 -0
- tests/test_large_scale_config.py +113 -0
- tests/test_large_scale_helper.py +236 -0
- tests/test_mode_change.py +73 -0
- tests/test_mode_consistency.py +1 -1
- tests/test_performance_monitor.py +116 -0
- tests/test_queue_empty_check.py +42 -0
- tests/untested_features_report.md +139 -0
- tests/verify_debug.py +52 -0
- tests/verify_log_fix.py +112 -0
- tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/WHEEL +0 -0
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/entry_points.txt +0 -0
- {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/top_level.txt +0 -0
crawlo/queue/queue_manager.py
CHANGED
@@ -7,15 +7,20 @@
 import asyncio
 import traceback
 from enum import Enum
-from typing import Optional, Dict, Any, Union
+from typing import Optional, Dict, Any, Union, TYPE_CHECKING
+import time
+import random
+
+if TYPE_CHECKING:
+    from crawlo import Request
 
-from crawlo import Request
 from crawlo.queue.pqueue import SpiderPriorityQueue
 from crawlo.utils.error_handler import ErrorHandler
 from crawlo.utils.log import get_logger
 from crawlo.utils.request_serializer import RequestSerializer
 
 try:
+    # Use the full Redis queue implementation
     from crawlo.queue.redis_priority_queue import RedisPriorityQueue
 
     REDIS_AVAILABLE = True
@@ -31,6 +36,79 @@ class QueueType(Enum):
     AUTO = "auto"  # select automatically
 
 
+class IntelligentScheduler:
+    """Intelligent scheduler"""
+
+    def __init__(self):
+        self.domain_stats = {}  # per-domain statistics
+        self.url_stats = {}  # per-URL statistics
+        self.last_request_time = {}  # time of the most recent request
+
+    def calculate_priority(self, request: "Request") -> int:
+        """Compute an intelligent priority for the request"""
+        priority = getattr(request, 'priority', 0)
+
+        # Extract the domain
+        domain = self._extract_domain(request.url)
+
+        # Adjust the priority based on domain access frequency
+        if domain in self.domain_stats:
+            domain_access_count = self.domain_stats[domain]['count']
+            last_access_time = self.domain_stats[domain]['last_time']
+
+            # If the domain was visited recently, lower the priority
+            # (avoids concentrating requests on a single domain)
+            time_since_last = time.time() - last_access_time
+            if time_since_last < 5:  # visited within the last 5 seconds
+                priority -= 2
+            elif time_since_last < 30:  # visited within the last 30 seconds
+                priority -= 1
+
+            # If the domain has been visited very often, lower it further
+            if domain_access_count > 10:
+                priority -= 1
+
+        # Adjust the priority based on URL access history
+        if request.url in self.url_stats:
+            url_access_count = self.url_stats[request.url]
+            if url_access_count > 1:
+                # Repeated URLs get a lower priority
+                priority -= url_access_count
+
+        # Adjust the priority based on crawl depth
+        depth = getattr(request, 'meta', {}).get('depth', 0)
+        priority -= depth  # the deeper the request, the lower its priority
+
+        return priority
+
+    def update_stats(self, request: "Request"):
+        """Update the statistics"""
+        domain = self._extract_domain(request.url)
+
+        # Update the domain statistics
+        if domain not in self.domain_stats:
+            self.domain_stats[domain] = {'count': 0, 'last_time': 0}
+
+        self.domain_stats[domain]['count'] += 1
+        self.domain_stats[domain]['last_time'] = time.time()
+
+        # Update the URL statistics
+        if request.url not in self.url_stats:
+            self.url_stats[request.url] = 0
+        self.url_stats[request.url] += 1
+
+        # Update the time of the most recent request
+        self.last_request_time[domain] = time.time()
+
+    def _extract_domain(self, url: str) -> str:
+        """Extract the domain"""
+        try:
+            from urllib.parse import urlparse
+            parsed = urlparse(url)
+            return parsed.netloc
+        except:
+            return "unknown"
+
+
 class QueueConfig:
     """Queue configuration class"""
 
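Taken together, calculate_priority() applies additive penalties to a request's base priority: -2 or -1 for recent access to the same domain, -1 once a domain has been hit more than 10 times, -N for a URL already seen N times, and -1 per unit of crawl depth. A short usage sketch of the new class (assumes crawlo 1.3.4 is installed; the SimpleNamespace stand-in is sufficient because calculate_priority only reads .url, .priority, and .meta via getattr):

from types import SimpleNamespace

from crawlo.queue.queue_manager import IntelligentScheduler

scheduler = IntelligentScheduler()

# First request: the domain has never been seen, so only the depth penalty applies.
req_a = SimpleNamespace(url="https://example.com/a", priority=0, meta={"depth": 2})
print(scheduler.calculate_priority(req_a))  # -2
scheduler.update_stats(req_a)

# A second request to the same domain within 5 seconds takes the recency
# penalty (-2) on top of its own depth penalty (-1).
req_b = SimpleNamespace(url="https://example.com/b", priority=0, meta={"depth": 1})
print(scheduler.calculate_priority(req_b))  # -3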
@@ -87,13 +165,27 @@ class QueueManager:
 
     def __init__(self, config: QueueConfig):
         self.config = config
-
-        self.
+        # Initialize logger and error_handler lazily to avoid circular dependencies
+        self._logger = None
+        self._error_handler = None
         self.request_serializer = RequestSerializer()
         self._queue = None
        self._queue_semaphore = None
         self._queue_type = None
         self._health_status = "unknown"
+        self._intelligent_scheduler = IntelligentScheduler()  # intelligent scheduler
+
+    @property
+    def logger(self):
+        if self._logger is None:
+            self._logger = get_logger(self.__class__.__name__)
+        return self._logger
+
+    @property
+    def error_handler(self):
+        if self._error_handler is None:
+            self._error_handler = ErrorHandler(self.__class__.__name__)
+        return self._error_handler
 
     async def initialize(self) -> bool:
         """Initialize the queue"""
@@ -129,38 +221,44 @@ class QueueManager:
             self._health_status = "error"
             return False
 
-    async def put(self, request: Request, priority: int = 0) -> bool:
+    async def put(self, request: "Request", priority: int = 0) -> bool:
         """Unified enqueue interface"""
         if not self._queue:
             raise RuntimeError("Queue not initialized")
 
         try:
+            # Apply the intelligent scheduling algorithm to compute a priority
+            intelligent_priority = self._intelligent_scheduler.calculate_priority(request)
+            # Combine the original priority with the intelligent priority
+            final_priority = priority + intelligent_priority
+
+            # Update the statistics
+            self._intelligent_scheduler.update_stats(request)
+
             # Serialization (Redis queue only)
             if self._queue_type == QueueType.REDIS:
                 request = self.request_serializer.prepare_for_serialization(request)
 
             # Backpressure control (memory queue only)
             if self._queue_semaphore:
-                #
-
-
-                else:
-                    # If the queue is full, return False instead of blocking
-                    self.logger.warning("Queue is full, skipping current request")
-                    return False
+                # With large request volumes, block and wait instead of skipping
+                # so that no request is ever lost
+                await self._queue_semaphore.acquire()
 
             # Unified enqueue operation
             if hasattr(self._queue, 'put'):
                 if self._queue_type == QueueType.REDIS:
-                    success = await self._queue.put(request,
+                    success = await self._queue.put(request, final_priority)
                 else:
-
+                    # For the memory queue the priority is handled manually;
+                    # SpiderPriorityQueue elements should be (priority, item) tuples
+                    await self._queue.put((final_priority, request))
                    success = True
             else:
                 raise RuntimeError(f"Queue type {self._queue_type} does not support the put operation")
 
             if success:
-                self.logger.debug(f"Request enqueued successfully: {request.url}")
+                self.logger.debug(f"Request enqueued successfully: {request.url} with priority {final_priority}")
 
             return success
 
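In the memory path, put() now stores (final_priority, request) tuples and get() unpacks them (see the @@ -188,6 +286,11 @@ hunk below). If SpiderPriorityQueue pops the smallest element first, as Python's standard priority queues do, a numerically lower final_priority is dequeued earlier; the Redis path instead negates the priority into a sorted-set score. A minimal self-contained sketch of the tuple convention; it is illustrative only, not crawlo's SpiderPriorityQueue, and adds an entry counter as a tiebreaker because tuple comparison falls through to the second element when priorities tie:

import asyncio
import itertools

async def main():
    queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
    counter = itertools.count()  # tiebreaker: request objects may not be comparable

    async def put(url: str, final_priority: int) -> None:
        await queue.put((final_priority, next(counter), url))

    await put("https://example.com/deep/page", 3)  # depth-penalized request
    await put("https://example.com/", 0)           # fresh, high-value request

    while not queue.empty():
        final_priority, _, url = await queue.get()
        print(final_priority, url)  # priority 0 pops first, then priority 3

asyncio.run(main())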
@@ -170,7 +268,7 @@ class QueueManager:
             self._queue_semaphore.release()
             return False
 
-    async def get(self, timeout: float = 5.0) -> Optional[Request]:
+    async def get(self, timeout: float = 5.0) -> Optional["Request"]:
         """Unified dequeue interface"""
         if not self._queue:
             raise RuntimeError("Queue not initialized")
@@ -188,6 +286,11 @@ class QueueManager:
                 # The actual callback restoration is handled in the scheduler
                 pass
 
+            # For the memory queue, unpack the (priority, request) tuple
+            if request and self._queue_type == QueueType.MEMORY:
+                if isinstance(request, tuple) and len(request) == 2:
+                    request = request[1]  # take the request object out of the tuple
+
             return request
 
         except Exception as e:
@@ -215,7 +318,12 @@ class QueueManager:
         try:
             # The memory queue can be checked synchronously
             if self._queue_type == QueueType.MEMORY:
-
+                # Make sure the queue size is actually checked
+                if hasattr(self._queue, 'qsize'):
+                    return self._queue.qsize() == 0
+                else:
+                    # Without a qsize method, assume the queue is empty
+                    return True
             # For the Redis queue an async operation would be required, so this returns an approximation
             # To make sure the program can exit normally, return True and let the caller judge via a more precise async check
             return True
@@ -227,7 +335,16 @@ class QueueManager:
         try:
             # For the memory queue
             if self._queue_type == QueueType.MEMORY:
-
+                # Make sure the queue size is actually checked
+                if hasattr(self._queue, 'qsize'):
+                    if asyncio.iscoroutinefunction(self._queue.qsize):
+                        size = await self._queue.qsize()
+                    else:
+                        size = self._queue.qsize()
+                    return size == 0
+                else:
+                    # Without a qsize method, assume the queue is empty
+                    return True
             # For the Redis queue, use an async check
             elif self._queue_type == QueueType.REDIS:
                 size = await self.size()
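The async empty check dispatches on whether qsize is a coroutine function, letting one code path serve queues with either a synchronous or an asynchronous size method. The same pattern in isolation (helper name is illustrative):

import asyncio

async def queue_size(queue) -> int:
    # Return a queue's size whether qsize is sync or async.
    if not hasattr(queue, "qsize"):
        return 0
    if asyncio.iscoroutinefunction(queue.qsize):
        return await queue.qsize()  # e.g. a Redis-backed queue
    return queue.qsize()            # e.g. asyncio.Queue

async def main():
    q = asyncio.Queue()
    await q.put("item")
    print(await queue_size(q))  # 1; asyncio.Queue.qsize is synchronous

asyncio.run(main())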
@@ -262,10 +379,10 @@ class QueueManager:
         if REDIS_AVAILABLE and self.config.redis_url:
             # Test the Redis connection
             try:
+                from crawlo.queue.redis_priority_queue import RedisPriorityQueue
                 test_queue = RedisPriorityQueue(self.config.redis_url)
                 await test_queue.connect()
                 await test_queue.close()
-                # Change INFO level log to DEBUG level to avoid redundant output
                 self.logger.debug("Auto-detection: Redis available, using distributed queue")
                 return QueueType.REDIS
             except Exception as e:
@@ -282,6 +399,7 @@ class QueueManager:
                 raise RuntimeError("Redis queue unavailable: REDIS_URL is not configured")
             # Test the Redis connection
             try:
+                from crawlo.queue.redis_priority_queue import RedisPriorityQueue
                 test_queue = RedisPriorityQueue(self.config.redis_url)
                 await test_queue.connect()
                 await test_queue.close()
@@ -299,6 +417,12 @@ class QueueManager:
     async def _create_queue(self, queue_type: QueueType):
         """Create queue instance"""
         if queue_type == QueueType.REDIS:
+            # Import the Redis queue lazily
+            try:
+                from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+            except ImportError as e:
+                raise RuntimeError(f"Redis queue unavailable: failed to import RedisPriorityQueue ({e})")
+
             # Simplified project-name extraction logic
             project_name = "default"
             if ':' in self.config.queue_name:
crawlo/queue/redis_priority_queue.py
CHANGED

@@ -1,19 +1,38 @@
 import asyncio
+import asyncio
 import pickle
 import time
 import traceback
-from typing import Optional
+from typing import Optional, TYPE_CHECKING
 
 import redis.asyncio as aioredis
 
-
+# Use TYPE_CHECKING to avoid circular imports at runtime
+if TYPE_CHECKING:
+    from crawlo import Request
+
 from crawlo.utils.error_handler import ErrorHandler
 from crawlo.utils.log import get_logger
 from crawlo.utils.redis_connection_pool import get_redis_pool, OptimizedRedisConnectionPool
 from crawlo.utils.request_serializer import RequestSerializer
 
-
-
+# Lazy initialization to avoid circular dependencies
+_logger = None
+_error_handler = None
+
+
+def get_module_logger():
+    global _logger
+    if _logger is None:
+        _logger = get_logger(__name__)
+    return _logger
+
+
+def get_module_error_handler():
+    global _error_handler
+    if _error_handler is None:
+        _error_handler = ErrorHandler(__name__)
+    return _error_handler
 
 
 class RedisPriorityQueue:
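Both files apply the same two-part remedy for the circular-import problem: type-only imports behind TYPE_CHECKING, plus loggers and error handlers created on first use rather than at import time. A generic sketch of the lazy-singleton half, with the stdlib logger standing in for crawlo's get_logger:

import logging

_logger = None

def get_module_logger() -> logging.Logger:
    global _logger
    if _logger is None:  # created on the first call, not when the module is imported
        _logger = logging.getLogger(__name__)
    return _logger

# Importing the module no longer touches the logging machinery; the first
# caller pays the setup cost and every later call reuses the same instance.
print(get_module_logger() is get_module_logger())  # True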
@@ -39,14 +58,14 @@ class RedisPriorityQueue:
 
         self.redis_url = redis_url
         self.module_name = module_name  # store module_name
-
+
         # If queue_name is not provided, derive it from module_name
         if queue_name is None:
             self.queue_name = f"crawlo:{module_name}:queue:requests"
         else:
             # Keep a user-supplied queue name unchanged
             self.queue_name = queue_name
-
+
         # If processing_queue is not provided, derive it from queue_name
         if processing_queue is None:
             if ":queue:requests" in self.queue_name:
@@ -55,7 +74,7 @@ class RedisPriorityQueue:
                 self.processing_queue = f"{self.queue_name}:processing"
         else:
             self.processing_queue = processing_queue
-
+
         # If failed_queue is not provided, derive it from queue_name
         if failed_queue is None:
             if ":queue:requests" in self.queue_name:
@@ -64,7 +83,7 @@ class RedisPriorityQueue:
                 self.failed_queue = f"{self.queue_name}:failed"
         else:
             self.failed_queue = failed_queue
-
+
         self.max_retries = max_retries
         self.timeout = timeout
         self.max_connections = max_connections
@@ -98,18 +117,18 @@ class RedisPriorityQueue:
                     decode_responses=False,  # make sure responses are not auto-decoded
                     encoding='utf-8'
                 )
-
+
                 self._redis = await self._redis_pool.get_connection()
-
+
                 # Test the connection
                 await self._redis.ping()
                 # Only log detailed connection info in debug mode
-                #
+                # get_module_logger().debug(f"Redis connection succeeded (Module: {self.module_name})")  # commented out the duplicate log
                 return self._redis
             except Exception as e:
                 error_msg = f"Redis connection failed (attempt {attempt + 1}/{max_retries}, Module: {self.module_name}): {e}"
-
-
+                get_module_logger().warning(error_msg)
+                get_module_logger().debug(f"Detailed error info:\n{traceback.format_exc()}")
                 if attempt < max_retries - 1:
                     await asyncio.sleep(delay)
                 else:
@@ -122,46 +141,46 @@ class RedisPriorityQueue:
         try:
             await self._redis.ping()
         except Exception as e:
-
+            get_module_logger().warning(f"Redis connection lost (Module: {self.module_name}), reconnecting...: {e}")
             self._redis = None
             await self.connect()
 
-    async def put(self, request
+    async def put(self, request, priority: int = 0) -> bool:
         """Put a request onto the queue"""
         try:
             await self._ensure_connection()
             score = -priority
             key = self._get_request_key(request)
-
+
             # 🔥 Use the dedicated serialization helper to clean the Request
             clean_request = self.request_serializer.prepare_for_serialization(request)
-
+
             # Make sure the serialized data can be deserialized correctly
             try:
                 serialized = pickle.dumps(clean_request)
                 # Verify that the serialized data can be loaded back
                 pickle.loads(serialized)
             except Exception as serialize_error:
-
+                get_module_logger().error(f"Request serialization validation failed (Module: {self.module_name}): {serialize_error}")
                 return False
-
+
             pipe = self._redis.pipeline()
             pipe.zadd(self.queue_name, {key: score})
             pipe.hset(f"{self.queue_name}:data", key, serialized)
             result = await pipe.execute()
-
+
             if result[0] > 0:
-
+                get_module_logger().debug(f"Enqueued successfully (Module: {self.module_name}): {request.url}")  # commented out the duplicate log
             return result[0] > 0
         except Exception as e:
-
-                e,
-                context=f"Failed to put onto the queue (Module: {self.module_name})",
+            get_module_error_handler().handle_error(
+                e,
+                context=f"Failed to put onto the queue (Module: {self.module_name})",
                 raise_error=False
             )
             return False
 
-    async def get(self, timeout: float = 5.0)
+    async def get(self, timeout: float = 5.0):
         """
         Get a request (with timeout)
         :param timeout: maximum wait time in seconds, to avoid endless polling
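The storage layout behind put() is a Redis sorted set of request keys scored by -priority, paired with a hash holding the pickled payloads, both written atomically through a pipeline. A stripped-down sketch of that layout (queue name is illustrative; it pops with ZPOPMIN instead of the real get()/processing-queue flow and omits retries and the failed queue):

import asyncio
import pickle

import redis.asyncio as aioredis

QUEUE = "crawlo:demo:queue:requests"

async def enqueue(r: aioredis.Redis, key: str, payload: dict, priority: int) -> None:
    pipe = r.pipeline()
    pipe.zadd(QUEUE, {key: -priority})                     # lower score = higher priority
    pipe.hset(f"{QUEUE}:data", key, pickle.dumps(payload))
    await pipe.execute()                                   # both writes in one round trip

async def dequeue(r: aioredis.Redis):
    popped = await r.zpopmin(QUEUE, 1)                     # [(member, score)] or []
    if not popped:
        return None
    key = popped[0][0]
    raw = await r.hget(f"{QUEUE}:data", key)
    await r.hdel(f"{QUEUE}:data", key)
    return pickle.loads(raw) if raw else None

async def main():
    r = aioredis.from_url("redis://localhost:6379", decode_responses=False)
    await enqueue(r, "url:1", {"url": "https://example.com/high"}, priority=5)
    await enqueue(r, "url:2", {"url": "https://example.com/low"}, priority=1)
    print(await dequeue(r))  # the priority-5 request comes out first
    await r.aclose()         # redis-py >= 5; older versions use close()

asyncio.run(main())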
@@ -198,7 +217,7 @@ class RedisPriorityQueue:
                         return request
                     except Exception as pickle_error:
                         # If pickle deserialization fails, log the error and skip this task
-
+                        get_module_logger().error(f"Unable to deserialize request data (Module: {self.module_name}): {pickle_error}")
                         # Remove the invalid task from the processing queue
                         await self._redis.zrem(self.processing_queue, processing_key)
                         await self._redis.hdel(f"{self.processing_queue}:data", processing_key)
@@ -209,18 +228,18 @@ class RedisPriorityQueue:
             if asyncio.get_event_loop().time() - start_time > timeout:
                 return None
 
-            #
-            await asyncio.sleep(0.
+            # Wait briefly to avoid empty polling, with a shorter delay for faster response
+            await asyncio.sleep(0.001)  # reduced from 0.01 to 0.001
 
         except Exception as e:
-
-                e,
-                context=f"Failed to get a task from the queue (Module: {self.module_name})",
+            get_module_error_handler().handle_error(
+                e,
+                context=f"Failed to get a task from the queue (Module: {self.module_name})",
                 raise_error=False
             )
             return None
 
-    async def ack(self, request: Request):
+    async def ack(self, request: "Request"):
         """Acknowledge task completion"""
         try:
             await self._ensure_connection()
@@ -237,13 +256,13 @@ class RedisPriorityQueue:
                 if cursor == 0:
                     break
         except Exception as e:
-
-                e,
-                context=f"Failed to acknowledge task completion (Module: {self.module_name})",
+            get_module_error_handler().handle_error(
+                e,
+                context=f"Failed to acknowledge task completion (Module: {self.module_name})",
                 raise_error=False
             )
 
-    async def fail(self, request: Request, reason: str = ""):
+    async def fail(self, request: "Request", reason: str = ""):
         """Mark a task as failed"""
         try:
             await self._ensure_connection()
@@ -256,7 +275,8 @@ class RedisPriorityQueue:
 
             if retries <= self.max_retries:
                 await self.put(request, priority=request.priority + 1)
-
+                get_module_logger().info(
+                    f"Retrying task [{retries}/{self.max_retries}] (Module: {self.module_name}): {request.url}")
             else:
                 failed_data = {
                     "url": request.url,
@@ -266,15 +286,15 @@ class RedisPriorityQueue:
                     "request_pickle": pickle.dumps(request).hex(),  # optional: keep the full request
                 }
                 await self._redis.lpush(self.failed_queue, pickle.dumps(failed_data))
-
+                get_module_logger().error(f"Task failed permanently [{retries} attempts] (Module: {self.module_name}): {request.url}")
         except Exception as e:
-
-                e,
-                context=f"Failed to mark the task as failed (Module: {self.module_name})",
+            get_module_error_handler().handle_error(
+                e,
+                context=f"Failed to mark the task as failed (Module: {self.module_name})",
                 raise_error=False
             )
 
-    def _get_request_key(self, request
+    def _get_request_key(self, request) -> str:
         """Generate a unique key for the request"""
         return f"{self.module_name}:url:{hash(request.url) & 0x7FFFFFFF}"  # ensure a positive number
 
@@ -284,9 +304,9 @@ class RedisPriorityQueue:
             await self._ensure_connection()
             return await self._redis.zcard(self.queue_name)
         except Exception as e:
-
-                e,
-                context=f"Failed to get queue size (Module: {self.module_name})",
+            get_module_error_handler().handle_error(
+                e,
+                context=f"Failed to get queue size (Module: {self.module_name})",
                 raise_error=False
             )
             return 0
@@ -297,10 +317,10 @@ class RedisPriorityQueue:
             # The connection pool manages connections automatically; no need to explicitly close individual connections here
             self._redis = None
             self._redis_pool = None
-
+            get_module_logger().debug(f"Redis connection released (Module: {self.module_name})")
         except Exception as e:
-
-                e,
-                context=f"Failed to release the Redis connection (Module: {self.module_name})",
+            get_module_error_handler().handle_error(
+                e,
+                context=f"Failed to release the Redis connection (Module: {self.module_name})",
                 raise_error=False
             )