crawlo-1.3.2-py3-none-any.whl → crawlo-1.3.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crawlo might be problematic.

Files changed (105)
  1. crawlo/__init__.py +24 -0
  2. crawlo/__version__.py +1 -1
  3. crawlo/commands/run.py +58 -32
  4. crawlo/core/__init__.py +44 -0
  5. crawlo/core/engine.py +119 -45
  6. crawlo/core/scheduler.py +4 -3
  7. crawlo/crawler.py +603 -1133
  8. crawlo/downloader/aiohttp_downloader.py +4 -2
  9. crawlo/extension/__init__.py +1 -1
  10. crawlo/extension/logging_extension.py +23 -7
  11. crawlo/factories/__init__.py +28 -0
  12. crawlo/factories/base.py +69 -0
  13. crawlo/factories/crawler.py +104 -0
  14. crawlo/factories/registry.py +85 -0
  15. crawlo/filters/aioredis_filter.py +25 -2
  16. crawlo/framework.py +292 -0
  17. crawlo/initialization/__init__.py +40 -0
  18. crawlo/initialization/built_in.py +426 -0
  19. crawlo/initialization/context.py +142 -0
  20. crawlo/initialization/core.py +194 -0
  21. crawlo/initialization/phases.py +149 -0
  22. crawlo/initialization/registry.py +146 -0
  23. crawlo/items/base.py +2 -1
  24. crawlo/logging/__init__.py +38 -0
  25. crawlo/logging/config.py +97 -0
  26. crawlo/logging/factory.py +129 -0
  27. crawlo/logging/manager.py +112 -0
  28. crawlo/middleware/middleware_manager.py +1 -1
  29. crawlo/middleware/offsite.py +1 -1
  30. crawlo/mode_manager.py +26 -1
  31. crawlo/pipelines/pipeline_manager.py +2 -1
  32. crawlo/project.py +76 -46
  33. crawlo/queue/pqueue.py +11 -5
  34. crawlo/queue/queue_manager.py +143 -19
  35. crawlo/queue/redis_priority_queue.py +69 -49
  36. crawlo/settings/default_settings.py +110 -14
  37. crawlo/settings/setting_manager.py +29 -13
  38. crawlo/spider/__init__.py +34 -16
  39. crawlo/stats_collector.py +17 -3
  40. crawlo/task_manager.py +112 -3
  41. crawlo/templates/project/settings.py.tmpl +103 -202
  42. crawlo/templates/project/settings_distributed.py.tmpl +122 -135
  43. crawlo/templates/project/settings_gentle.py.tmpl +149 -43
  44. crawlo/templates/project/settings_high_performance.py.tmpl +127 -90
  45. crawlo/templates/project/settings_minimal.py.tmpl +46 -15
  46. crawlo/templates/project/settings_simple.py.tmpl +138 -75
  47. crawlo/templates/project/spiders/__init__.py.tmpl +5 -1
  48. crawlo/templates/run.py.tmpl +10 -14
  49. crawlo/templates/spiders_init.py.tmpl +10 -0
  50. crawlo/tools/network_diagnostic.py +365 -0
  51. crawlo/utils/class_loader.py +26 -0
  52. crawlo/utils/error_handler.py +76 -35
  53. crawlo/utils/log.py +41 -144
  54. crawlo/utils/redis_connection_pool.py +43 -6
  55. crawlo/utils/request_serializer.py +8 -1
  56. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/METADATA +120 -14
  57. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/RECORD +104 -45
  58. tests/authenticated_proxy_example.py +2 -2
  59. tests/baidu_performance_test.py +109 -0
  60. tests/baidu_test.py +60 -0
  61. tests/comprehensive_framework_test.py +213 -0
  62. tests/comprehensive_test.py +82 -0
  63. tests/comprehensive_testing_summary.md +187 -0
  64. tests/debug_configure.py +70 -0
  65. tests/debug_framework_logger.py +85 -0
  66. tests/debug_log_levels.py +64 -0
  67. tests/distributed_test.py +67 -0
  68. tests/distributed_test_debug.py +77 -0
  69. tests/final_command_test_report.md +0 -0
  70. tests/final_comprehensive_test.py +152 -0
  71. tests/final_validation_test.py +183 -0
  72. tests/framework_performance_test.py +203 -0
  73. tests/optimized_performance_test.py +212 -0
  74. tests/performance_comparison.py +246 -0
  75. tests/queue_blocking_test.py +114 -0
  76. tests/queue_test.py +90 -0
  77. tests/scrapy_comparison/ofweek_scrapy.py +139 -0
  78. tests/scrapy_comparison/scrapy_test.py +134 -0
  79. tests/simple_command_test.py +120 -0
  80. tests/simple_crawlo_test.py +128 -0
  81. tests/simple_log_test.py +58 -0
  82. tests/simple_optimization_test.py +129 -0
  83. tests/simple_spider_test.py +50 -0
  84. tests/simple_test.py +48 -0
  85. tests/test_all_commands.py +231 -0
  86. tests/test_batch_processor.py +179 -0
  87. tests/test_component_factory.py +175 -0
  88. tests/test_controlled_spider_mixin.py +80 -0
  89. tests/test_enhanced_error_handler_comprehensive.py +246 -0
  90. tests/test_factories.py +253 -0
  91. tests/test_framework_logger.py +67 -0
  92. tests/test_framework_startup.py +65 -0
  93. tests/test_large_scale_config.py +113 -0
  94. tests/test_large_scale_helper.py +236 -0
  95. tests/test_mode_change.py +73 -0
  96. tests/test_mode_consistency.py +1 -1
  97. tests/test_performance_monitor.py +116 -0
  98. tests/test_queue_empty_check.py +42 -0
  99. tests/untested_features_report.md +139 -0
  100. tests/verify_debug.py +52 -0
  101. tests/verify_log_fix.py +112 -0
  102. tests/DOUBLE_CRAWLO_PREFIX_FIX_REPORT.md +0 -82
  103. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/WHEEL +0 -0
  104. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/entry_points.txt +0 -0
  105. {crawlo-1.3.2.dist-info → crawlo-1.3.4.dist-info}/top_level.txt +0 -0
crawlo/queue/queue_manager.py

@@ -7,15 +7,20 @@
 import asyncio
 import traceback
 from enum import Enum
-from typing import Optional, Dict, Any, Union
+from typing import Optional, Dict, Any, Union, TYPE_CHECKING
+import time
+import random
+
+if TYPE_CHECKING:
+    from crawlo import Request
 
-from crawlo import Request
 from crawlo.queue.pqueue import SpiderPriorityQueue
 from crawlo.utils.error_handler import ErrorHandler
 from crawlo.utils.log import get_logger
 from crawlo.utils.request_serializer import RequestSerializer
 
 try:
+    # Use the full-featured Redis queue
    from crawlo.queue.redis_priority_queue import RedisPriorityQueue
 
    REDIS_AVAILABLE = True
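Moving `from crawlo import Request` under `typing.TYPE_CHECKING` means the name is imported only while type checking, never at runtime, which is what breaks the import cycle between the queue module and the package root. A minimal generic sketch of the pattern; the module name `payloads` is purely illustrative and not part of crawlo:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:                # evaluated by static type checkers, never at runtime
        from payloads import Item    # hypothetical module that would otherwise import this one back

    def handle(item: "Item") -> None:    # string annotation, resolved lazily
        print(item)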
@@ -31,6 +36,79 @@ class QueueType(Enum):
     AUTO = "auto"  # auto-select
 
 
+class IntelligentScheduler:
+    """Intelligent scheduler"""
+
+    def __init__(self):
+        self.domain_stats = {}  # per-domain statistics
+        self.url_stats = {}  # per-URL statistics
+        self.last_request_time = {}  # last request time
+
+    def calculate_priority(self, request: "Request") -> int:
+        """Compute an intelligent priority for a request"""
+        priority = getattr(request, 'priority', 0)
+
+        # Get the domain
+        domain = self._extract_domain(request.url)
+
+        # Adjust priority based on domain access frequency
+        if domain in self.domain_stats:
+            domain_access_count = self.domain_stats[domain]['count']
+            last_access_time = self.domain_stats[domain]['last_time']
+
+            # If the domain was accessed recently, lower the priority (avoid hammering one domain)
+            time_since_last = time.time() - last_access_time
+            if time_since_last < 5:  # accessed within the last 5 seconds
+                priority -= 2
+            elif time_since_last < 30:  # accessed within the last 30 seconds
+                priority -= 1
+
+            # If the domain has already been accessed many times, lower the priority further
+            if domain_access_count > 10:
+                priority -= 1
+
+        # Adjust priority based on URL access history
+        if request.url in self.url_stats:
+            url_access_count = self.url_stats[request.url]
+            if url_access_count > 1:
+                # Repeated URLs get a lower priority
+                priority -= url_access_count
+
+        # Adjust priority based on depth
+        depth = getattr(request, 'meta', {}).get('depth', 0)
+        priority -= depth  # the deeper the request, the lower its priority
+
+        return priority
+
+    def update_stats(self, request: "Request"):
+        """Update statistics"""
+        domain = self._extract_domain(request.url)
+
+        # Update domain statistics
+        if domain not in self.domain_stats:
+            self.domain_stats[domain] = {'count': 0, 'last_time': 0}
+
+        self.domain_stats[domain]['count'] += 1
+        self.domain_stats[domain]['last_time'] = time.time()
+
+        # Update URL statistics
+        if request.url not in self.url_stats:
+            self.url_stats[request.url] = 0
+        self.url_stats[request.url] += 1
+
+        # Update the last request time
+        self.last_request_time[domain] = time.time()
+
+    def _extract_domain(self, url: str) -> str:
+        """Extract the domain"""
+        try:
+            from urllib.parse import urlparse
+            parsed = urlparse(url)
+            return parsed.netloc
+        except:
+            return "unknown"
+
+
 class QueueConfig:
     """Queue configuration class"""
 
@@ -87,13 +165,27 @@ class QueueManager:
 
     def __init__(self, config: QueueConfig):
         self.config = config
-        self.logger = get_logger(self.__class__.__name__)
-        self.error_handler = ErrorHandler(self.__class__.__name__)
+        # Lazily initialize the logger and error handler to avoid circular dependencies
+        self._logger = None
+        self._error_handler = None
         self.request_serializer = RequestSerializer()
         self._queue = None
         self._queue_semaphore = None
         self._queue_type = None
         self._health_status = "unknown"
+        self._intelligent_scheduler = IntelligentScheduler()  # intelligent scheduler
+
+    @property
+    def logger(self):
+        if self._logger is None:
+            self._logger = get_logger(self.__class__.__name__)
+        return self._logger
+
+    @property
+    def error_handler(self):
+        if self._error_handler is None:
+            self._error_handler = ErrorHandler(self.__class__.__name__)
+        return self._error_handler
 
     async def initialize(self) -> bool:
         """Initialize the queue"""
@@ -129,38 +221,44 @@ class QueueManager:
             self._health_status = "error"
             return False
 
-    async def put(self, request: Request, priority: int = 0) -> bool:
+    async def put(self, request: "Request", priority: int = 0) -> bool:
         """Unified enqueue interface"""
         if not self._queue:
             raise RuntimeError("队列未初始化")
 
         try:
+            # Apply the intelligent scheduling algorithm to compute a priority
+            intelligent_priority = self._intelligent_scheduler.calculate_priority(request)
+            # Combine the original priority with the intelligent priority
+            final_priority = priority + intelligent_priority
+
+            # Update statistics
+            self._intelligent_scheduler.update_stats(request)
+
             # Serialization handling (Redis queue only)
             if self._queue_type == QueueType.REDIS:
                 request = self.request_serializer.prepare_for_serialization(request)
 
             # Backpressure control (memory queue only)
             if self._queue_semaphore:
-                # For large volumes of requests, use a non-blocking check
-                if not self._queue_semaphore.locked():
-                    await self._queue_semaphore.acquire()
-                else:
-                    # If the queue is full, return False instead of blocking
-                    self.logger.warning("Queue is full, skipping current request")
-                    return False
+                # For large volumes of requests, block and wait instead of skipping,
+                # so that no request is ever lost
+                await self._queue_semaphore.acquire()
 
             # Unified enqueue operation
             if hasattr(self._queue, 'put'):
                 if self._queue_type == QueueType.REDIS:
-                    success = await self._queue.put(request, priority)
+                    success = await self._queue.put(request, final_priority)
                 else:
-                    await self._queue.put(request)
+                    # For the memory queue, priority has to be handled manually:
+                    # in SpiderPriorityQueue, items should be (priority, item) tuples
+                    await self._queue.put((final_priority, request))
                     success = True
             else:
                 raise RuntimeError(f"队列类型 {self._queue_type} 不支持 put 操作")
 
             if success:
-                self.logger.debug(f"Request enqueued successfully: {request.url}")
+                self.logger.debug(f"Request enqueued successfully: {request.url} with priority {final_priority}")
 
             return success
 
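The backpressure change in put() replaces the non-blocking locked() check, which dropped requests when the queue was full, with a plain blocking acquire(). A minimal sketch of that producer/consumer pattern with asyncio primitives (illustrative code, not crawlo's):

    import asyncio

    async def producer(queue: asyncio.Queue, slots: asyncio.Semaphore, n: int) -> None:
        for i in range(n):
            await slots.acquire()   # blocks when too many items are in flight, instead of dropping
            await queue.put(i)
        await queue.put(None)       # sentinel telling the consumer to stop

    async def consumer(queue: asyncio.Queue, slots: asyncio.Semaphore) -> None:
        while True:
            item = await queue.get()
            if item is None:
                break
            slots.release()         # frees a slot as soon as the item leaves the queue
            print("processed", item)

    async def main() -> None:
        queue: asyncio.Queue = asyncio.Queue()
        slots = asyncio.Semaphore(2)    # allow at most 2 unprocessed items at a time
        await asyncio.gather(producer(queue, slots, 5), consumer(queue, slots))

    asyncio.run(main())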
@@ -170,7 +268,7 @@
                 self._queue_semaphore.release()
             return False
 
-    async def get(self, timeout: float = 5.0) -> Optional[Request]:
+    async def get(self, timeout: float = 5.0) -> Optional["Request"]:
         """Unified dequeue interface"""
         if not self._queue:
             raise RuntimeError("队列未初始化")
@@ -188,6 +286,11 @@
                 # The actual callback restoration is handled in the scheduler
                 pass
 
+            # For the memory queue, the (priority, request) tuple has to be unpacked
+            if request and self._queue_type == QueueType.MEMORY:
+                if isinstance(request, tuple) and len(request) == 2:
+                    request = request[1]  # take the request object out of the tuple
+
             return request
 
         except Exception as e:
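With the memory queue now storing (priority, request) tuples on put() and unwrapping them here on get(), ordering comes from tuple comparison. A standalone illustration using the standard library's asyncio.PriorityQueue rather than SpiderPriorityQueue; the counter tiebreaker is added here because equal priorities would otherwise force a comparison of the payload objects:

    import asyncio
    from itertools import count

    async def demo() -> None:
        queue: asyncio.PriorityQueue = asyncio.PriorityQueue()
        tiebreak = count()  # keeps insertion order when priorities are equal

        await queue.put((5, next(tiebreak), "low-priority URL"))
        await queue.put((-3, next(tiebreak), "high-priority URL"))

        while not queue.empty():
            priority, _, item = await queue.get()   # unpack the tuple, keep only the payload
            print(priority, item)                   # -3 first, then 5

    asyncio.run(demo())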
@@ -215,7 +318,12 @@
         try:
             # For the memory queue this can be checked synchronously
             if self._queue_type == QueueType.MEMORY:
-                return self._queue.qsize() == 0
+                # Make sure the queue size is checked correctly
+                if hasattr(self._queue, 'qsize'):
+                    return self._queue.qsize() == 0
+                else:
+                    # If there is no qsize method, assume the queue is empty
+                    return True
             # For the Redis queue an async operation would be needed, so this returns an approximation;
             # to make sure the program can exit normally, return True and let callers use the more precise async check
             return True
@@ -227,7 +335,16 @@
         try:
             # For the memory queue
             if self._queue_type == QueueType.MEMORY:
-                return self._queue.qsize() == 0
+                # Make sure the queue size is checked correctly
+                if hasattr(self._queue, 'qsize'):
+                    if asyncio.iscoroutinefunction(self._queue.qsize):
+                        size = await self._queue.qsize()
+                    else:
+                        size = self._queue.qsize()
+                    return size == 0
+                else:
+                    # If there is no qsize method, assume the queue is empty
+                    return True
             # For the Redis queue, use an async check
             elif self._queue_type == QueueType.REDIS:
                 size = await self.size()
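The async empty-check now calls qsize() either way and awaits it only when it is declared as a coroutine function. A tiny standalone sketch of that dispatch, with illustrative class names:

    import asyncio

    class SyncQueue:
        def qsize(self) -> int:
            return 0

    class AsyncQueue:
        async def qsize(self) -> int:
            return 0

    async def is_empty(queue) -> bool:
        if asyncio.iscoroutinefunction(queue.qsize):  # await only real coroutine functions
            size = await queue.qsize()
        else:
            size = queue.qsize()
        return size == 0

    async def main() -> None:
        print(await is_empty(SyncQueue()), await is_empty(AsyncQueue()))  # True True

    asyncio.run(main())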
@@ -262,10 +379,10 @@
         if REDIS_AVAILABLE and self.config.redis_url:
             # Test the Redis connection
             try:
+                from crawlo.queue.redis_priority_queue import RedisPriorityQueue
                 test_queue = RedisPriorityQueue(self.config.redis_url)
                 await test_queue.connect()
                 await test_queue.close()
-                # Change INFO level log to DEBUG level to avoid redundant output
                 self.logger.debug("Auto-detection: Redis available, using distributed queue")
                 return QueueType.REDIS
             except Exception as e:
@@ -282,6 +399,7 @@
                 raise RuntimeError("Redis 队列不可用:未配置 REDIS_URL")
             # Test the Redis connection
             try:
+                from crawlo.queue.redis_priority_queue import RedisPriorityQueue
                 test_queue = RedisPriorityQueue(self.config.redis_url)
                 await test_queue.connect()
                 await test_queue.close()
@@ -299,6 +417,12 @@
     async def _create_queue(self, queue_type: QueueType):
         """Create queue instance"""
         if queue_type == QueueType.REDIS:
+            # Lazily import the Redis queue
+            try:
+                from crawlo.queue.redis_priority_queue import RedisPriorityQueue
+            except ImportError as e:
+                raise RuntimeError(f"Redis队列不可用:未能导入RedisPriorityQueue ({e})")
+
             # Simplified project-name extraction logic
             project_name = "default"
             if ':' in self.config.queue_name:
crawlo/queue/redis_priority_queue.py

@@ -1,19 +1,38 @@
 import asyncio
+import asyncio
 import pickle
 import time
 import traceback
-from typing import Optional
+from typing import Optional, TYPE_CHECKING
 
 import redis.asyncio as aioredis
 
-from crawlo import Request
+# Use TYPE_CHECKING to avoid a runtime circular import
+if TYPE_CHECKING:
+    from crawlo import Request
+
 from crawlo.utils.error_handler import ErrorHandler
 from crawlo.utils.log import get_logger
 from crawlo.utils.redis_connection_pool import get_redis_pool, OptimizedRedisConnectionPool
 from crawlo.utils.request_serializer import RequestSerializer
 
-logger = get_logger(__name__)
-error_handler = ErrorHandler(__name__)
+# Lazy initialization to avoid circular dependencies
+_logger = None
+_error_handler = None
+
+
+def get_module_logger():
+    global _logger
+    if _logger is None:
+        _logger = get_logger(__name__)
+    return _logger
+
+
+def get_module_error_handler():
+    global _error_handler
+    if _error_handler is None:
+        _error_handler = ErrorHandler(__name__)
+    return _error_handler
 
 
 class RedisPriorityQueue:
@@ -39,14 +58,14 @@ class RedisPriorityQueue:
 
         self.redis_url = redis_url
         self.module_name = module_name  # store module_name
-
+
         # If queue_name is not provided, generate one from module_name
         if queue_name is None:
             self.queue_name = f"crawlo:{module_name}:queue:requests"
         else:
             # Keep the user-provided queue name unchanged
             self.queue_name = queue_name
-
+
         # If processing_queue is not provided, generate one from queue_name
         if processing_queue is None:
             if ":queue:requests" in self.queue_name:
@@ -55,7 +74,7 @@
                 self.processing_queue = f"{self.queue_name}:processing"
         else:
             self.processing_queue = processing_queue
-
+
         # If failed_queue is not provided, generate one from queue_name
         if failed_queue is None:
             if ":queue:requests" in self.queue_name:
@@ -64,7 +83,7 @@
                 self.failed_queue = f"{self.queue_name}:failed"
         else:
             self.failed_queue = failed_queue
-
+
         self.max_retries = max_retries
         self.timeout = timeout
         self.max_connections = max_connections
@@ -98,18 +117,18 @@
                     decode_responses=False,  # make sure responses are not auto-decoded
                     encoding='utf-8'
                 )
-
+
                 self._redis = await self._redis_pool.get_connection()
-
+
                 # Test the connection
                 await self._redis.ping()
                 # Only log detailed connection info in debug mode
-                # logger.debug(f"Redis 连接成功 (Module: {self.module_name})")  # duplicate log, commented out
+                # get_module_logger().debug(f"Redis 连接成功 (Module: {self.module_name})")  # duplicate log, commented out
                 return self._redis
             except Exception as e:
                 error_msg = f"Redis 连接失败 (尝试 {attempt + 1}/{max_retries}, Module: {self.module_name}): {e}"
-                logger.warning(error_msg)
-                logger.debug(f"详细错误信息:\n{traceback.format_exc()}")
+                get_module_logger().warning(error_msg)
+                get_module_logger().debug(f"详细错误信息:\n{traceback.format_exc()}")
                 if attempt < max_retries - 1:
                     await asyncio.sleep(delay)
                 else:
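The connect() path above retries the connection and ping() a few times before giving up. A minimal standalone sketch of that retry loop using redis.asyncio directly; the URL and retry parameters are placeholders, and running it requires a reachable Redis instance:

    import asyncio
    import redis.asyncio as aioredis

    async def connect_with_retry(url: str, max_retries: int = 3, delay: float = 1.0):
        for attempt in range(max_retries):
            try:
                client = aioredis.from_url(url)
                await client.ping()   # verify the connection actually works
                return client
            except Exception as exc:
                print(f"Redis connection failed (attempt {attempt + 1}/{max_retries}): {exc}")
                if attempt < max_retries - 1:
                    await asyncio.sleep(delay)
                else:
                    raise

    # asyncio.run(connect_with_retry("redis://localhost:6379/0"))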
@@ -122,46 +141,46 @@
         try:
             await self._redis.ping()
         except Exception as e:
-            logger.warning(f"Redis 连接失效 (Module: {self.module_name}),尝试重连...: {e}")
+            get_module_logger().warning(f"Redis 连接失效 (Module: {self.module_name}),尝试重连...: {e}")
             self._redis = None
             await self.connect()
 
-    async def put(self, request: Request, priority: int = 0) -> bool:
+    async def put(self, request, priority: int = 0) -> bool:
         """Put a request into the queue"""
         try:
             await self._ensure_connection()
             score = -priority
             key = self._get_request_key(request)
-
+
             # 🔥 Use the dedicated serialization helper to clean the Request
             clean_request = self.request_serializer.prepare_for_serialization(request)
-
+
             # Make sure the serialized data can be deserialized correctly
             try:
                 serialized = pickle.dumps(clean_request)
                 # Verify that the serialized data can be deserialized
                 pickle.loads(serialized)
             except Exception as serialize_error:
-                logger.error(f"请求序列化验证失败 (Module: {self.module_name}): {serialize_error}")
+                get_module_logger().error(f"请求序列化验证失败 (Module: {self.module_name}): {serialize_error}")
                 return False
-
+
             pipe = self._redis.pipeline()
             pipe.zadd(self.queue_name, {key: score})
             pipe.hset(f"{self.queue_name}:data", key, serialized)
             result = await pipe.execute()
-
+
             if result[0] > 0:
-                logger.debug(f"成功入队 (Module: {self.module_name}): {request.url}")  # duplicate log, commented out
+                get_module_logger().debug(f"成功入队 (Module: {self.module_name}): {request.url}")  # duplicate log, commented out
             return result[0] > 0
         except Exception as e:
-            error_handler.handle_error(
-                e,
-                context=f"放入队列失败 (Module: {self.module_name})",
+            get_module_error_handler().handle_error(
+                e,
+                context=f"放入队列失败 (Module: {self.module_name})",
                 raise_error=False
             )
             return False
 
-    async def get(self, timeout: float = 5.0) -> Optional[Request]:
+    async def get(self, timeout: float = 5.0):
         """
         Get a request (with timeout)
         :param timeout: maximum wait time in seconds, to avoid endless polling
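For reference, put() keeps ordering and payload in two structures: the member key goes into a sorted set scored by the negated priority, and the pickled request goes into a parallel hash, both written in one pipeline round trip. A minimal standalone sketch of that pattern (names and the connection URL are illustrative; running it needs a reachable Redis instance):

    import pickle
    import redis.asyncio as aioredis

    async def enqueue(client, queue_name: str, key: str, payload, priority: int = 0) -> bool:
        score = -priority                    # higher priority -> smaller score -> popped first
        pipe = client.pipeline()
        pipe.zadd(queue_name, {key: score})  # ordering lives in the sorted set
        pipe.hset(f"{queue_name}:data", key, pickle.dumps(payload))  # payload lives in a parallel hash
        result = await pipe.execute()
        return result[0] > 0

    # client = aioredis.from_url("redis://localhost:6379/0", decode_responses=False)
    # ok = await enqueue(client, "demo:queue:requests", "url:12345", {"url": "https://example.com"}, priority=3)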
@@ -198,7 +217,7 @@
                         return request
                     except Exception as pickle_error:
                         # If pickle deserialization fails, log the error and skip this task
-                        logger.error(f"无法反序列化请求数据 (Module: {self.module_name}): {pickle_error}")
+                        get_module_logger().error(f"无法反序列化请求数据 (Module: {self.module_name}): {pickle_error}")
                         # Remove this invalid task from the processing queue
                         await self._redis.zrem(self.processing_queue, processing_key)
                         await self._redis.hdel(f"{self.processing_queue}:data", processing_key)
@@ -209,18 +228,18 @@
                 if asyncio.get_event_loop().time() - start_time > timeout:
                     return None
 
-                # Wait briefly to avoid busy polling
-                await asyncio.sleep(0.1)
+                # Wait briefly to avoid busy polling, with a shorter wait to improve responsiveness
+                await asyncio.sleep(0.001)  # reduced from 0.01 to 0.001
 
         except Exception as e:
-            error_handler.handle_error(
-                e,
-                context=f"获取队列任务失败 (Module: {self.module_name})",
+            get_module_error_handler().handle_error(
+                e,
+                context=f"获取队列任务失败 (Module: {self.module_name})",
                 raise_error=False
             )
             return None
 
-    async def ack(self, request: Request):
+    async def ack(self, request: "Request"):
         """Acknowledge task completion"""
         try:
             await self._ensure_connection()
@@ -237,13 +256,13 @@
                 if cursor == 0:
                     break
         except Exception as e:
-            error_handler.handle_error(
-                e,
-                context=f"确认任务完成失败 (Module: {self.module_name})",
+            get_module_error_handler().handle_error(
+                e,
+                context=f"确认任务完成失败 (Module: {self.module_name})",
                 raise_error=False
             )
 
-    async def fail(self, request: Request, reason: str = ""):
+    async def fail(self, request: "Request", reason: str = ""):
         """Mark a task as failed"""
         try:
             await self._ensure_connection()
@@ -256,7 +275,8 @@
 
             if retries <= self.max_retries:
                 await self.put(request, priority=request.priority + 1)
-                logger.info(f"任务重试 [{retries}/{self.max_retries}] (Module: {self.module_name}): {request.url}")
+                get_module_logger().info(
+                    f"任务重试 [{retries}/{self.max_retries}] (Module: {self.module_name}): {request.url}")
             else:
                 failed_data = {
                     "url": request.url,
@@ -266,15 +286,15 @@
                     "request_pickle": pickle.dumps(request).hex(),  # optional: keep the full request
                 }
                 await self._redis.lpush(self.failed_queue, pickle.dumps(failed_data))
-                logger.error(f"任务彻底失败 [{retries}次] (Module: {self.module_name}): {request.url}")
+                get_module_logger().error(f"任务彻底失败 [{retries}次] (Module: {self.module_name}): {request.url}")
         except Exception as e:
-            error_handler.handle_error(
-                e,
-                context=f"标记任务失败失败 (Module: {self.module_name})",
+            get_module_error_handler().handle_error(
+                e,
+                context=f"标记任务失败失败 (Module: {self.module_name})",
                 raise_error=False
             )
 
-    def _get_request_key(self, request: Request) -> str:
+    def _get_request_key(self, request) -> str:
         """Generate a unique key for the request"""
         return f"{self.module_name}:url:{hash(request.url) & 0x7FFFFFFF}"  # ensure a positive number
 
@@ -284,9 +304,9 @@
             await self._ensure_connection()
             return await self._redis.zcard(self.queue_name)
         except Exception as e:
-            error_handler.handle_error(
-                e,
-                context=f"Failed to get queue size (Module: {self.module_name})",
+            get_module_error_handler().handle_error(
+                e,
+                context=f"Failed to get queue size (Module: {self.module_name})",
                 raise_error=False
             )
             return 0
@@ -297,10 +317,10 @@
             # The connection pool manages connections automatically; individual connections need not be closed explicitly
             self._redis = None
             self._redis_pool = None
-            logger.debug(f"Redis 连接已释放 (Module: {self.module_name})")
+            get_module_logger().debug(f"Redis 连接已释放 (Module: {self.module_name})")
         except Exception as e:
-            error_handler.handle_error(
-                e,
-                context=f"释放 Redis 连接失败 (Module: {self.module_name})",
+            get_module_error_handler().handle_error(
+                e,
+                context=f"释放 Redis 连接失败 (Module: {self.module_name})",
                 raise_error=False
             )