jettask 0.2.19__py3-none-any.whl → 0.2.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. jettask/__init__.py +10 -3
  2. jettask/cli.py +314 -228
  3. jettask/config/__init__.py +9 -1
  4. jettask/config/config.py +245 -0
  5. jettask/config/env_loader.py +381 -0
  6. jettask/config/lua_scripts.py +158 -0
  7. jettask/config/nacos_config.py +132 -5
  8. jettask/core/__init__.py +1 -1
  9. jettask/core/app.py +1573 -666
  10. jettask/core/app_importer.py +33 -16
  11. jettask/core/container.py +532 -0
  12. jettask/core/task.py +1 -4
  13. jettask/core/unified_manager_base.py +2 -2
  14. jettask/executor/__init__.py +38 -0
  15. jettask/executor/core.py +625 -0
  16. jettask/executor/executor.py +338 -0
  17. jettask/executor/orchestrator.py +290 -0
  18. jettask/executor/process_entry.py +638 -0
  19. jettask/executor/task_executor.py +317 -0
  20. jettask/messaging/__init__.py +68 -0
  21. jettask/messaging/event_pool.py +2188 -0
  22. jettask/messaging/reader.py +519 -0
  23. jettask/messaging/registry.py +266 -0
  24. jettask/messaging/scanner.py +369 -0
  25. jettask/messaging/sender.py +312 -0
  26. jettask/persistence/__init__.py +118 -0
  27. jettask/persistence/backlog_monitor.py +567 -0
  28. jettask/{backend/data_access.py → persistence/base.py} +58 -57
  29. jettask/persistence/consumer.py +315 -0
  30. jettask/{core → persistence}/db_manager.py +23 -22
  31. jettask/persistence/maintenance.py +81 -0
  32. jettask/persistence/message_consumer.py +259 -0
  33. jettask/{backend/namespace_data_access.py → persistence/namespace.py} +66 -98
  34. jettask/persistence/offline_recovery.py +196 -0
  35. jettask/persistence/queue_discovery.py +215 -0
  36. jettask/persistence/task_persistence.py +218 -0
  37. jettask/persistence/task_updater.py +583 -0
  38. jettask/scheduler/__init__.py +2 -2
  39. jettask/scheduler/loader.py +6 -5
  40. jettask/scheduler/run_scheduler.py +1 -1
  41. jettask/scheduler/scheduler.py +7 -7
  42. jettask/scheduler/{unified_scheduler_manager.py → scheduler_coordinator.py} +18 -13
  43. jettask/task/__init__.py +16 -0
  44. jettask/{router.py → task/router.py} +26 -8
  45. jettask/task/task_center/__init__.py +9 -0
  46. jettask/task/task_executor.py +318 -0
  47. jettask/task/task_registry.py +291 -0
  48. jettask/test_connection_monitor.py +73 -0
  49. jettask/utils/__init__.py +31 -1
  50. jettask/{monitor/run_backlog_collector.py → utils/backlog_collector.py} +1 -1
  51. jettask/utils/db_connector.py +1629 -0
  52. jettask/{db_init.py → utils/db_init.py} +1 -1
  53. jettask/utils/rate_limit/__init__.py +30 -0
  54. jettask/utils/rate_limit/concurrency_limiter.py +665 -0
  55. jettask/utils/rate_limit/config.py +145 -0
  56. jettask/utils/rate_limit/limiter.py +41 -0
  57. jettask/utils/rate_limit/manager.py +269 -0
  58. jettask/utils/rate_limit/qps_limiter.py +154 -0
  59. jettask/utils/rate_limit/task_limiter.py +384 -0
  60. jettask/utils/serializer.py +3 -0
  61. jettask/{monitor/stream_backlog_monitor.py → utils/stream_backlog.py} +14 -6
  62. jettask/utils/time_sync.py +173 -0
  63. jettask/webui/__init__.py +27 -0
  64. jettask/{api/v1 → webui/api}/alerts.py +1 -1
  65. jettask/{api/v1 → webui/api}/analytics.py +2 -2
  66. jettask/{api/v1 → webui/api}/namespaces.py +1 -1
  67. jettask/{api/v1 → webui/api}/overview.py +1 -1
  68. jettask/{api/v1 → webui/api}/queues.py +3 -3
  69. jettask/{api/v1 → webui/api}/scheduled.py +1 -1
  70. jettask/{api/v1 → webui/api}/settings.py +1 -1
  71. jettask/{api.py → webui/app.py} +253 -145
  72. jettask/webui/namespace_manager/__init__.py +10 -0
  73. jettask/{multi_namespace_consumer.py → webui/namespace_manager/multi.py} +69 -22
  74. jettask/{unified_consumer_manager.py → webui/namespace_manager/unified.py} +1 -1
  75. jettask/{run.py → webui/run.py} +2 -2
  76. jettask/{services → webui/services}/__init__.py +1 -3
  77. jettask/{services → webui/services}/overview_service.py +34 -16
  78. jettask/{services → webui/services}/queue_service.py +1 -1
  79. jettask/{backend → webui/services}/queue_stats_v2.py +1 -1
  80. jettask/{services → webui/services}/settings_service.py +1 -1
  81. jettask/worker/__init__.py +53 -0
  82. jettask/worker/lifecycle.py +1507 -0
  83. jettask/worker/manager.py +583 -0
  84. jettask/{core/offline_worker_recovery.py → worker/recovery.py} +268 -175
  85. {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/METADATA +2 -71
  86. jettask-0.2.20.dist-info/RECORD +145 -0
  87. jettask/__main__.py +0 -140
  88. jettask/api/__init__.py +0 -103
  89. jettask/backend/__init__.py +0 -1
  90. jettask/backend/api/__init__.py +0 -3
  91. jettask/backend/api/v1/__init__.py +0 -17
  92. jettask/backend/api/v1/monitoring.py +0 -431
  93. jettask/backend/api/v1/namespaces.py +0 -504
  94. jettask/backend/api/v1/queues.py +0 -342
  95. jettask/backend/api/v1/tasks.py +0 -367
  96. jettask/backend/core/__init__.py +0 -3
  97. jettask/backend/core/cache.py +0 -221
  98. jettask/backend/core/database.py +0 -200
  99. jettask/backend/core/exceptions.py +0 -102
  100. jettask/backend/dependencies.py +0 -261
  101. jettask/backend/init_meta_db.py +0 -158
  102. jettask/backend/main.py +0 -1426
  103. jettask/backend/main_unified.py +0 -78
  104. jettask/backend/main_v2.py +0 -394
  105. jettask/backend/models/__init__.py +0 -3
  106. jettask/backend/models/requests.py +0 -236
  107. jettask/backend/models/responses.py +0 -230
  108. jettask/backend/namespace_api_old.py +0 -267
  109. jettask/backend/services/__init__.py +0 -3
  110. jettask/backend/start.py +0 -42
  111. jettask/backend/unified_api_router.py +0 -1541
  112. jettask/cleanup_deprecated_tables.sql +0 -16
  113. jettask/core/consumer_manager.py +0 -1695
  114. jettask/core/delay_scanner.py +0 -256
  115. jettask/core/event_pool.py +0 -1700
  116. jettask/core/heartbeat_process.py +0 -222
  117. jettask/core/task_batch.py +0 -153
  118. jettask/core/worker_scanner.py +0 -271
  119. jettask/executors/__init__.py +0 -5
  120. jettask/executors/asyncio.py +0 -876
  121. jettask/executors/base.py +0 -30
  122. jettask/executors/common.py +0 -148
  123. jettask/executors/multi_asyncio.py +0 -309
  124. jettask/gradio_app.py +0 -570
  125. jettask/integrated_gradio_app.py +0 -1088
  126. jettask/main.py +0 -0
  127. jettask/monitoring/__init__.py +0 -3
  128. jettask/pg_consumer.py +0 -1896
  129. jettask/run_monitor.py +0 -22
  130. jettask/run_webui.py +0 -148
  131. jettask/scheduler/multi_namespace_scheduler.py +0 -294
  132. jettask/scheduler/unified_manager.py +0 -450
  133. jettask/task_center_client.py +0 -150
  134. jettask/utils/serializer_optimized.py +0 -33
  135. jettask/webui_exceptions.py +0 -67
  136. jettask-0.2.19.dist-info/RECORD +0 -150
  137. /jettask/{constants.py → config/constants.py} +0 -0
  138. /jettask/{backend/config.py → config/task_center.py} +0 -0
  139. /jettask/{pg_consumer → messaging/pg_consumer}/pg_consumer_v2.py +0 -0
  140. /jettask/{pg_consumer → messaging/pg_consumer}/sql/add_execution_time_field.sql +0 -0
  141. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_new_tables.sql +0 -0
  142. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_tables_v3.sql +0 -0
  143. /jettask/{pg_consumer → messaging/pg_consumer}/sql/migrate_to_new_structure.sql +0 -0
  144. /jettask/{pg_consumer → messaging/pg_consumer}/sql/modify_time_fields.sql +0 -0
  145. /jettask/{pg_consumer → messaging/pg_consumer}/sql_utils.py +0 -0
  146. /jettask/{models.py → persistence/models.py} +0 -0
  147. /jettask/scheduler/{manager.py → task_crud.py} +0 -0
  148. /jettask/{schema.sql → schemas/schema.sql} +0 -0
  149. /jettask/{task_center.py → task/task_center/client.py} +0 -0
  150. /jettask/{monitoring → utils}/file_watcher.py +0 -0
  151. /jettask/{services/redis_monitor_service.py → utils/redis_monitor.py} +0 -0
  152. /jettask/{api/v1 → webui/api}/__init__.py +0 -0
  153. /jettask/{webui_config.py → webui/config.py} +0 -0
  154. /jettask/{webui_models → webui/models}/__init__.py +0 -0
  155. /jettask/{webui_models → webui/models}/namespace.py +0 -0
  156. /jettask/{services → webui/services}/alert_service.py +0 -0
  157. /jettask/{services → webui/services}/analytics_service.py +0 -0
  158. /jettask/{services → webui/services}/scheduled_task_service.py +0 -0
  159. /jettask/{services → webui/services}/task_service.py +0 -0
  160. /jettask/{webui_sql → webui/sql}/batch_upsert_functions.sql +0 -0
  161. /jettask/{webui_sql → webui/sql}/verify_database.sql +0 -0
  162. {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/WHEEL +0 -0
  163. {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/entry_points.txt +0 -0
  164. {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/licenses/LICENSE +0 -0
  165. {jettask-0.2.19.dist-info → jettask-0.2.20.dist-info}/top_level.txt +0 -0
jettask/core/consumer_manager.py (deleted)
@@ -1,1695 +0,0 @@
- import os
- import time
- import uuid
- import json
- import logging
- import threading
- import asyncio
- import multiprocessing
- from typing import Dict, Any
- from enum import Enum
- from collections import defaultdict, namedtuple
-
- import redis
- from redis.asyncio.lock import Lock as AsyncLock
-
- from ..utils.serializer import dumps_str
-
- logger = logging.getLogger('app')
-
- from .heartbeat_process import HeartbeatProcessManager
- from .worker_scanner import WorkerScanner
-
-
- class ConsumerStrategy(Enum):
-     """Consumer naming strategy
-
-     Strategy selection guide:
-
-     ⚠️ POD (recommended for single-process use only):
-     Fixed consumer name based on the K8s Pod name
-     Use case: single-process applications (asyncio/thread executors)
-     Pros: clear semantics, easy to monitor
-     Cons: causes conflicts under multiprocessing
-
-     🔧 FIXED (advanced users):
-     Fully custom consumer name
-     Use case: scenarios with special naming requirements
-     Pros: full control
-     Cons: the user must guarantee uniqueness
-
-     🔥 HEARTBEAT (recommended for production):
-     Simplified heartbeat-based strategy
-     Use case: stateless service platforms (Cloud Run, Serverless, K8s)
-     Pros: simple logic, stable and reliable, automatic failure recovery
-     Details: uses a random consumer name and maintains heartbeats in a sorted set
-     """
-     FIXED = "fixed"          # Fixed name
-     POD = "pod"              # K8s Pod name (⚠️ not recommended with multiple processes)
-     HEARTBEAT = "heartbeat"  # Heartbeat strategy (recommended for production)
-
-
- class ConsumerManager:
-     """Consumer name manager"""
-
-     def __init__(
-         self,
-         redis_client: redis.StrictRedis,
-         strategy: ConsumerStrategy = ConsumerStrategy.HEARTBEAT,
-         config: Dict[str, Any] = None
-     ):
-         self.redis_client = redis_client
-         self.strategy = strategy
-         self.config = config or {}
-         self._consumer_name = None
-
-         # Redis prefix configuration
-         self.redis_prefix = config.get('redis_prefix', 'jettask')
-
-         # Validate that the strategy configuration is sane
-         self._validate_strategy_configuration()
-
-         # Heartbeat strategy instance - initialize immediately for the HEARTBEAT strategy
-         if self.strategy == ConsumerStrategy.HEARTBEAT:
-             # Pass queue information to the heartbeat strategy
-             heartbeat_config = self.config.copy()
-             heartbeat_config['queues'] = self.config.get('queues', [])
-             self._heartbeat_strategy = HeartbeatConsumerStrategy(
-                 self.redis_client,
-                 heartbeat_config
-             )
-         else:
-             self._heartbeat_strategy = None
-
-     def get_prefixed_queue_name(self, queue: str) -> str:
-         """Add the prefix to a queue name"""
-         return f"{self.redis_prefix}:QUEUE:{queue}"
-
-     def _validate_strategy_configuration(self):
-         """Validate that the consumer strategy configuration is sane"""
-         # Check whether we are running in a multiprocessing environment
-         current_process = multiprocessing.current_process()
-         is_multiprocess = current_process.name != 'MainProcess'
-
-         if self.strategy == ConsumerStrategy.POD and is_multiprocess:
-             # The POD strategy is not allowed in a multiprocessing environment; exit immediately
-             error_msg = (
-                 "\n"
-                 "❌ Error: the POD strategy cannot be used in a multiprocessing environment!\n"
-                 "\n"
-                 "Reason: the POD strategy uses a fixed consumer name; multiple processes would consume messages more than once.\n"
-                 "\n"
-                 "Solutions:\n"
-                 "  1. Use ConsumerStrategy.HEARTBEAT - heartbeat strategy (recommended)\n"
-                 "  2. Use ConsumerStrategy.FIXED - custom fixed name\n"
-                 "  3. Use a single-process executor (asyncio/thread)\n"
-                 "\n"
-                 f"Current environment: {current_process.name} (PID: {os.getpid()})\n"
-             )
-             logger.error(error_msg)
-             # Exit the program immediately
-             import sys
-             sys.exit(1)
-
-         # Log the chosen strategy for debugging
-         if logger.isEnabledFor(logging.DEBUG):
-             logger.debug(f"Consumer strategy: {self.strategy.value}, Process: {current_process.name}")
-
-     def get_consumer_name(self, queue: str) -> str:
-         """Get the consumer name"""
-         # print(f'Getting consumer name: {self.strategy} {queue}')
-         if self.strategy == ConsumerStrategy.FIXED:
-             return self._get_fixed_name(queue)
-         elif self.strategy == ConsumerStrategy.POD:
-             return self._get_pod_name(queue)
-         elif self.strategy == ConsumerStrategy.HEARTBEAT:
-             return self._get_heartbeat_name(queue)
-         else:
-             raise ValueError(f"Unknown consumer strategy: {self.strategy}")
-
-     def _get_fixed_name(self, queue: str) -> str:
-         """Get a fixed consumer name"""
-         if not self._consumer_name:
-             # May be read from configuration, environment variables, or a file
-             self._consumer_name = self.config.get('consumer_name') or \
-                 os.environ.get('EASYTASK_CONSUMER_NAME') or \
-                 f"worker-{os.getpid()}"
-         return f"{self._consumer_name}-{queue}"
-
-     def _get_pod_name(self, queue: str) -> str:
-         """Get a consumer name based on the K8s Pod
-
-         Note: the POD strategy can only be used in a single-process environment
-         """
-         if not self._consumer_name:
-             # In K8s, the Pod name is usually obtained from environment variables
-             pod_name = os.environ.get('HOSTNAME') or \
-                 os.environ.get('POD_NAME') or \
-                 os.environ.get('K8S_POD_NAME')
-
-             if not pod_name:
-                 logger.warning("Pod name not found, falling back to hostname")
-                 import socket
-                 pod_name = socket.gethostname()
-
-             # Since this was already validated in _validate_strategy_configuration,
-             # this should only run in the MainProcess
-             self._consumer_name = pod_name
-             logger.debug(f"Consumer name using the Pod strategy: {self._consumer_name}")
-
-         return f"{self._consumer_name}-{queue}"
-
-
-     def _get_heartbeat_name(self, queue: str) -> str:
-         """Get the consumer name based on the heartbeat strategy"""
-         if not self._heartbeat_strategy:
-             raise RuntimeError("Heartbeat strategy not initialized properly")
-
-         return self._heartbeat_strategy.get_consumer_name(queue)
-
-     def cleanup(self):
-         """Clean up resources (called during graceful shutdown)"""
-         # Handle cleanup for the heartbeat strategy
-         if self.strategy == ConsumerStrategy.HEARTBEAT and self._heartbeat_strategy:
-             self._heartbeat_strategy.cleanup()
-
-     def update_stats(self, queue: str, success: bool = True, processing_time: float = 0.0,
-                      total_latency: float = None):
-         """Update the consumer's statistics (HEARTBEAT strategy only)"""
-         if self.strategy == ConsumerStrategy.HEARTBEAT and self._heartbeat_strategy:
-             self._heartbeat_strategy.update_stats(queue, success, processing_time, total_latency)
-
-     def task_started(self, queue: str):
-         """Called when a task starts executing (HEARTBEAT strategy only)"""
-         if self.strategy == ConsumerStrategy.HEARTBEAT and self._heartbeat_strategy:
-             self._heartbeat_strategy.task_started(queue)
-
-     def task_finished(self, queue: str):
-         """Called when a task finishes (HEARTBEAT strategy only)"""
-         if self.strategy == ConsumerStrategy.HEARTBEAT and self._heartbeat_strategy:
-             self._heartbeat_strategy.task_finished(queue)
-
-     def is_heartbeat_timeout(self) -> bool:
-         """Check whether the heartbeat has timed out (HEARTBEAT strategy only)"""
-         if self.strategy == ConsumerStrategy.HEARTBEAT and self._heartbeat_strategy:
-             return self._heartbeat_strategy.is_heartbeat_timeout()
-         return False
-
-     def record_group_info(self, queue: str, task_name: str, group_name: str, consumer_name: str):
-         """Record a task's group info into the worker hash (HEARTBEAT strategy only)"""
-         if self.strategy == ConsumerStrategy.HEARTBEAT and self._heartbeat_strategy:
-             self._heartbeat_strategy.record_group_info(queue, task_name, group_name, consumer_name)
-
-     async def record_group_info_async(self, queue: str, task_name: str, group_name: str, consumer_name: str):
-         """Asynchronously record a task's group info into the worker hash (HEARTBEAT strategy only)"""
-         if self.strategy == ConsumerStrategy.HEARTBEAT and self._heartbeat_strategy:
-             await self._heartbeat_strategy.record_group_info_async(queue, task_name, group_name, consumer_name)
-
-     def cleanup_expired_consumers(self, queue: str):
-         """Clean up expired consumers (optional feature)"""
-         try:
-             # Get pending-message info for the consumer group
-             prefixed_queue = self.get_prefixed_queue_name(queue)
-             pending_info = self.redis_client.xpending(prefixed_queue, prefixed_queue)
-             if not pending_info:
-                 return
-
-             # Get detailed pending messages
-             consumers = self.redis_client.xpending_range(
-                 prefixed_queue, prefixed_queue, min='-', max='+', count=100
-             )
-
-             for consumer_info in consumers:
-                 consumer_name = consumer_info['consumer']
-                 idle_time = consumer_info['time_since_delivered']
-
-                 # If a message has been idle longer than the threshold, the consumer may be dead
-                 # Use 120 seconds as the default dead-detection threshold
-                 if idle_time > 120 * 1000:  # 120 seconds
-                     logger.warning(
-                         f"Consumer {consumer_name} has pending messages "
-                         f"idle for {idle_time/1000}s, may be dead"
-                     )
-                     # Message re-assignment logic could be implemented here
-
-         except Exception as e:
-             logger.error(f"Error cleaning up expired consumers: {e}")
-
- class HeartbeatConsumerStrategy:
-     """Simplified heartbeat-based consumer strategy
-
-     Features:
-     1. Uses random consumer names
-     2. Each queue maintains its own heartbeat sorted set
-     3. Heartbeat data contains detailed worker information
-     4. Automatically resets the pending tasks of dead workers
-     """
-
-     def __init__(self, redis_client: redis.StrictRedis, config: Dict = None):
-         self.redis = redis_client
-         self.config = config or {}
-         # Get the async Redis client (from the app module)
-         try:
-             from ..core.app import get_async_redis_pool
-             from redis import asyncio as aioredis
-             redis_url = config.get('redis_url') or 'redis://localhost:6379'
-             async_pool = get_async_redis_pool(redis_url)
-             self.async_redis = aioredis.StrictRedis(connection_pool=async_pool)
-         except Exception as e:
-             logger.warning(f"Failed to create async redis client: {e}")
-             self.async_redis = None
-         # Configuration parameters
-         self.heartbeat_interval = self.config.get('heartbeat_interval', 1)  # 5-second heartbeat
-         self.heartbeat_timeout = self.config.get('heartbeat_timeout', 3)  # 30-second timeout
-         self.scan_interval = self.config.get('scan_interval', 5)  # scan every 10 seconds
-
-         # Get the Redis prefix (from the config)
-         self.redis_prefix = config.get('redis_prefix', 'jettask')
-
-         # Get the worker prefix (from the config, defaults to WORKER)
-         # Allows different services to use different prefixes to separate namespaces
-         self.worker_prefix = config.get('worker_prefix', 'WORKER')
-
-         # Save the queue list from the config
-         self.configured_queues = config.get('queues', [])
-
-         # Get the hostname prefix
-         import socket
-         try:
-             # First try to get the hostname
-             hostname = socket.gethostname()
-             # Try to get the IP address
-             ip = socket.gethostbyname(hostname)
-             # Prefer the hostname; if it is localhost, use the IP instead
-             prefix = hostname if hostname != 'localhost' else ip
-         except:
-             # If that fails, use an environment variable or a default value
-             prefix = os.environ.get('HOSTNAME', 'unknown')
-
-         # Save the hostname prefix; defer creating consumer_id
-         self.hostname_prefix = prefix
-         self.consumer_id = None  # created lazily to avoid creating it in the main process
-
-         # New data structure design - worker_key is also created lazily
-         self._worker_key = None
-
-         self.consumer_names = {}  # queue -> consumer_name mapping
-         self.active_queues = set()  # track the currently active queues
-
-         # Background control
-         self._scanner_thread = None
-         self._scanner_task = None
-         self._scanner_stop = threading.Event()
-
-         # Stats flusher thread/coroutine
-         self._stats_flusher_thread = None
-         self._stats_flusher_task = None
-         self._stats_flusher_stop = threading.Event()
-
-         # Heartbeat process manager
-         self._heartbeat_process_manager = None
-         self._heartbeat_processes = {}  # queue -> process mapping
-         logger.debug("HeartbeatStrategy initialized with process-based heartbeat support")
-
-         # Stats buffer - lock-free design
-         # Define the statistics event type
-         self.StatsEvent = namedtuple('StatsEvent', ['type', 'queue', 'value', 'timestamp'])
-
-         # Use a simple list instead of a queue (this is now a purely async environment)
-         self.stats_events = []  # list of statistics events
-
-         # Local accumulation buffer (used only during flush)
-         self.stats_accumulator = {
-             'running_tasks': defaultdict(int),
-             'success_count': defaultdict(int),
-             'failed_count': defaultdict(int),
-             'total_time': defaultdict(float),
-             'total_count': defaultdict(int),
-             'total_latency': defaultdict(float)
-         }
-
-         self.stats_flush_interval = self.config.get('stats_flush_interval', 0.5)  # flush more frequently
-         self.last_stats_flush = time.time()
-
-         # Defer starting the scanner thread; start it only when actually needed
-         self._scanner_started = False
-         self._scanner_needs_start = False  # whether it needs to be started from an async context
-         self._startup_time = time.time()  # record startup time, used for the heartbeat-timeout grace period
-
-         # Worker scanner - initialize directly
-         self.scanner = WorkerScanner(
-             self.redis, self.async_redis,
-             self.redis_prefix, self.heartbeat_timeout
-         )
-
-         # Defer starting the stats flusher thread
-         self._stats_flusher_started = False
-
-         # Register the exit handler
-         import atexit
-         atexit.register(self.cleanup)
-
-     def _find_reusable_worker_id(self, prefix: str) -> str:
-         """Find an offline worker ID that can be reused
-
-         Uses a distributed lock to keep multiple processes from reusing the same worker ID at the same time
-
-         Args:
-             prefix: hostname prefix
-
-         Returns:
-             A reusable consumer_id, or None if none was found
-         """
-         # Use a Redis distributed lock, which can wait automatically for the lock to be released
-         reuse_lock_key = f"{self.redis_prefix}:{self.worker_prefix}:REUSE:LOCK"
-         # Create the Redis lock object: 5-second timeout, block for at most 2 seconds
-         from redis.lock import Lock
-         lock = Lock(self.redis, reuse_lock_key, timeout=5, blocking=True, blocking_timeout=2)
-
-         try:
-             acquired = lock.acquire()
-             if not acquired:
-                 logger.debug("Could not acquire worker reuse lock, creating new ID")
-                 return None
-
-             # Scan all worker keys
-             pattern = f"{self.redis_prefix}:{self.worker_prefix}:*"
-             worker_keys = []
-             cursor = 0
-
-             # Use SCAN to iterate over all worker keys
-             while True:
-                 cursor, keys = self.redis.scan(cursor, match=pattern, count=100)
-                 # Filter out HISTORY-related keys, lock keys and REUSING marker keys
-                 for key in keys:
-                     # key is bytes; decode it or compare using bytes
-                     key_str = key.decode('utf-8') if isinstance(key, bytes) else key
-                     if ':HISTORY:' not in key_str and ':REUSE:LOCK' not in key_str and ':REUSING' not in key_str:
-                         worker_keys.append(key)
-                 if cursor == 0:
-                     break
-
-             if not worker_keys:
-                 logger.debug("No worker keys found during scan")
-                 return None
-             else:
-                 logger.debug(f"Found {len(worker_keys)} worker keys to check")
-
-             # Look for offline workers that meet the criteria
-             offline_workers = []
-
-             for worker_key in worker_keys:
-                 try:
-                     # Get the worker data
-                     worker_data = self.redis.hgetall(worker_key)
-                     # logger.debug(f'{worker_key=} {worker_data=}')
-                     if not worker_data:
-                         continue
-                     # Get the worker's status information
-                     is_alive_val = worker_data.get('is_alive', 'true')
-                     if isinstance(is_alive_val, bytes):
-                         is_alive_val = is_alive_val.decode('utf-8')
-                     is_alive = is_alive_val.lower() == 'true'
-
-                     last_heartbeat_val = worker_data.get('last_heartbeat', 0)
-                     if isinstance(last_heartbeat_val, bytes):
-                         last_heartbeat_val = last_heartbeat_val.decode('utf-8')
-                     last_heartbeat = float(last_heartbeat_val)
-                     current_time = time.time()
-
-                     # Get the offline time
-                     offline_time_str = worker_data.get('offline_time', '0')
-                     if isinstance(offline_time_str, bytes):
-                         offline_time_str = offline_time_str.decode('utf-8')
-                     try:
-                         offline_time = float(offline_time_str) if offline_time_str else last_heartbeat
-                     except:
-                         offline_time = last_heartbeat
-
-                     # Determine whether the worker is really offline:
-                     # 1. is_alive is marked false, or
-                     # 2. the last heartbeat is older than heartbeat_timeout
-                     is_truly_offline = (not is_alive) or (current_time - last_heartbeat > self.heartbeat_timeout)
-                     # logger.debug(f'{is_truly_offline=} {worker_data=}')
-                     if not is_truly_offline:
-                         logger.debug(f"Worker {is_alive=} {current_time - last_heartbeat} {self.heartbeat_timeout} {worker_data.get('consumer_id')} is still active (last_heartbeat: {current_time - last_heartbeat:.1f}s ago)")
-                         continue
-
-                     # The worker must have been offline longer than heartbeat_timeout to be reusable (consistent with offline detection)
-                     # min_offline_duration = self.heartbeat_timeout
-                     # if offline_time > 0 and (current_time - offline_time) < min_offline_duration:
-                     #     logger.debug(f"Worker {worker_data.get('consumer_id')} offline for only {current_time - offline_time:.1f}s, need {min_offline_duration}s")
-                     #     continue
-
-                     # Get the consumer_id
-                     consumer_id = worker_data.get('consumer_id', '')
-                     if isinstance(consumer_id, bytes):
-                         consumer_id = consumer_id.decode('utf-8')
-                     if not consumer_id:
-                         continue
-
-                     # No longer check the prefix; allow reusing any offline worker
-
-                     # Use the offline time or the last heartbeat time
-                     if 'offline_time' in worker_data:
-                         offline_time = float(worker_data.get('offline_time'))
-                     else:
-                         # If there is no offline_time, use the last heartbeat time
-                         offline_time = last_heartbeat
-                         logger.debug(f"Worker {consumer_id} has no offline_time, using last_heartbeat")
-
-                     offline_workers.append((consumer_id, offline_time, worker_key))
-
-                 except Exception as e:
-                     logger.debug(f"Error checking worker {worker_key}: {e}")
-                     continue
-
-             if not offline_workers:
-                 logger.debug(f"No offline workers found matching prefix {prefix}")
-                 return None
-             else:
-                 logger.debug(f"Found {len(offline_workers)} offline workers: {[w[0] for w in offline_workers]}")
-
-             # Sort by offline time and pick the one that has been offline the longest (earliest to go offline)
-             offline_workers.sort(key=lambda x: x[1])
-             selected_consumer_id, selected_offline_time, selected_worker_key = offline_workers[0]
-
-             # Reset that worker's state - keep all statistics, but not the queues
-             pipeline = self.redis.pipeline()
-
-             # Update the basic info; keep the original queues field
-             pipeline.hset(selected_worker_key, mapping={
-                 'consumer_id': selected_consumer_id,
-                 'is_alive': 'true',
-                 'last_heartbeat': str(time.time()),
-                 'pid': str(os.getpid()),
-                 'created_at': str(time.time()),
-                 'messages_transferred': 'false'  # reset the message-transfer flag; this is a new lifecycle
-             })
-
-             # Note: do not delete the queues field; let the heartbeat thread update it based on the actual situation
-             # This avoids display issues caused by clearing queues during reuse
-
-             # Keep all statistics; do not clear them
-
-             pipeline.execute()
-
-             logger.debug(f"Found reusable worker: {selected_consumer_id}, offline since {time.time() - selected_offline_time:.1f}s ago")
-             return selected_consumer_id
-
-         except Exception as e:
-             logger.error(f"Error finding reusable worker ID: {e}")
-             return None
-         finally:
-             try:
-                 lock.release()
-             except:
-                 pass
-
-     def get_prefixed_queue_name(self, queue: str) -> str:
-         """Add the prefix to a queue name"""
-         return f"{self.redis_prefix}:QUEUE:{queue}"
-
-     def update_stats(self, queue: str, success: bool = True, processing_time: float = 0.0,
-                      total_latency: float = None):
-         """Update the worker's statistics - using a lock-free queue
-
-         Args:
-             queue: queue name
-             success: whether execution succeeded
-             processing_time: processing time in seconds - the actual execution time
-             total_latency: total latency in seconds - total time from task creation to completion
-         """
-         try:
-             # Create statistics events and append them to the list
-             timestamp = time.time()
-
-             # Success/failure counters
-             event_type = 'success' if success else 'failed'
-             self.stats_events.append(
-                 self.StatsEvent(event_type, queue, 1, timestamp)
-             )
-
-             # Processing time
-             if processing_time > 0:
-                 self.stats_events.append(
-                     self.StatsEvent('processing_time', queue, processing_time, timestamp)
-                 )
-
-             # Total latency
-             if total_latency is not None and total_latency > 0:
-                 self.stats_events.append(
-                     self.StatsEvent('total_latency', queue, total_latency, timestamp)
-                 )
-
-         except Exception as e:
-             logger.error(f"Error updating stats: {e}")
-
-     def task_started(self, queue: str):
-         """Called when a task starts executing - append to the event list"""
-         self.stats_events.append(
-             self.StatsEvent('task_started', queue, 1, time.time())
-         )
-
-     def task_finished(self, queue: str):
-         """Called when a task finishes - append to the event list"""
-         self.stats_events.append(
-             self.StatsEvent('task_finished', queue, -1, time.time())
-         )
-
-     async def flush_stats_buffer(self):
-         """Flush the stats buffer to Redis - optimized version (async)"""
-         # If the worker was never initialized, return immediately
-         if self.consumer_id is None or self._worker_key is None:
-             logger.debug("Worker not initialized, skipping stats flush")
-             return
-
-         # Grab all pending events directly
-         events = self.stats_events.copy()  # copy the current event list
-         self.stats_events.clear()  # clear the original list
-         start_time = time.time()
-
-         try:
-             if not events:
-                 return
-
-             # Clear the accumulator
-             for buffer in self.stats_accumulator.values():
-                 buffer.clear()
-
-             # Process all events and accumulate them into the local buffer
-             for event in events:
-                 if event.type == 'success':
-                     self.stats_accumulator['success_count'][event.queue] += event.value
-                     self.stats_accumulator['total_count'][event.queue] += event.value
-                 elif event.type == 'failed':
-                     self.stats_accumulator['failed_count'][event.queue] += event.value
-                     self.stats_accumulator['total_count'][event.queue] += event.value
-                 elif event.type == 'processing_time':
-                     self.stats_accumulator['total_time'][event.queue] += event.value
-                 elif event.type == 'total_latency':
-                     self.stats_accumulator['total_latency'][event.queue] += event.value
-                 elif event.type == 'task_started':
-                     self.stats_accumulator['running_tasks'][event.queue] += event.value
-                 elif event.type == 'task_finished':
-                     self.stats_accumulator['running_tasks'][event.queue] += event.value  # note: task_finished carries value -1
-
-             # Batch update to Redis
-             pipeline = self.async_redis.pipeline()
-             processed_queues = set()
-
-             # Collect all queues that need updating
-             for buffer in self.stats_accumulator.values():
-                 processed_queues.update(buffer.keys())
-
-             # Build the batch update for each queue
-             for queue in processed_queues:
-                 # Number of running tasks (can be negative, meaning a decrease)
-                 if queue in self.stats_accumulator['running_tasks']:
-                     delta = self.stats_accumulator['running_tasks'][queue]
-                     if delta != 0:
-                         pipeline.hincrby(self._worker_key, f'{queue}:running_tasks', delta)
-
-                 # Success count
-                 if queue in self.stats_accumulator['success_count']:
-                     pipeline.hincrby(self._worker_key, f'{queue}:success_count',
-                                      self.stats_accumulator['success_count'][queue])
-
-                 # Failure count
-                 if queue in self.stats_accumulator['failed_count']:
-                     pipeline.hincrby(self._worker_key, f'{queue}:failed_count',
-                                      self.stats_accumulator['failed_count'][queue])
-
-                 # Total count
-                 if queue in self.stats_accumulator['total_count']:
-                     pipeline.hincrby(self._worker_key, f'{queue}:total_count',
-                                      self.stats_accumulator['total_count'][queue])
-
-                 # Processing time
-                 if queue in self.stats_accumulator['total_time']:
-                     pipeline.hincrbyfloat(self._worker_key, f'{queue}:total_processing_time',
-                                           self.stats_accumulator['total_time'][queue])
-
-                 # Latency
-                 if queue in self.stats_accumulator['total_latency']:
-                     pipeline.hincrbyfloat(self._worker_key, f'{queue}:total_latency_time',
-                                           self.stats_accumulator['total_latency'][queue])
-
-             # Execute all updates
-             redis_start = time.time()
-             await pipeline.execute()
-             redis_duration = time.time() - redis_start
-
-             # Compute and update the averages in batch (use a separate pipeline for efficiency)
-             if processed_queues:
-                 # Fetch all the required data in one batch
-                 fields = []
-                 for queue in processed_queues:
-                     fields.extend([
-                         f'{queue}:total_count',
-                         f'{queue}:total_processing_time',
-                         f'{queue}:total_latency_time'
-                     ])
-
-                 if fields:
-                     values = await self.async_redis.hmget(self._worker_key, fields)
-
-                     # Compute averages and update them in batch
-                     pipeline = self.async_redis.pipeline()
-                     idx = 0
-                     for queue in processed_queues:
-                         total_count = values[idx] if values[idx] else '0'
-                         total_time = values[idx + 1] if values[idx + 1] else '0'
-                         total_latency = values[idx + 2] if values[idx + 2] else '0'
-                         idx += 3
-
-                         if int(total_count) > 0:
-                             # Compute the average processing time
-                             if float(total_time) > 0:
-                                 avg_time = float(total_time) / int(total_count)
-                                 pipeline.hset(self._worker_key, f'{queue}:avg_processing_time', f'{avg_time:.3f}')
-
-                             # Compute the average latency
-                             if float(total_latency) > 0:
-                                 avg_latency = float(total_latency) / int(total_count)
-                                 pipeline.hset(self._worker_key, f'{queue}:avg_latency_time', f'{avg_latency:.3f}')
-
-                     await pipeline.execute()
-
-             # Performance logging
-             total_duration = time.time() - start_time
-             if total_duration > 0.05 or len(events) > 100:  # log when over 50 ms or more than 100 events were processed
-                 logger.info(
-                     f"Stats flush performance: "
-                     f"events={len(events)}, "
-                     f"queues={len(processed_queues)}, "
-                     f"total_time={total_duration:.3f}s, "
-                     f"redis_time={redis_duration:.3f}s, "
-                     f"events_remaining={len(self.stats_events)}, "
-                     f"dropped=0"
-                 )
-
-         except Exception as e:
-             logger.error(f"Failed to flush stats buffer: {e}")
-             # Put unprocessed events back into the list (best effort)
-             # Only put back the second half to avoid an infinite loop
-             self.stats_events.extend(events[len(events) - len(events) // 2:])
-
-     def get_stats(self, queue: str) -> dict:
-         """Get statistics for a queue - read from the Redis hash"""
-         try:
-             # If the worker is not initialized, return empty statistics
-             if self.consumer_id is None or self._worker_key is None:
-                 return {
-                     'success_count': 0,
-                     'failed_count': 0,
-                     'total_count': 0,
-                     'running_tasks': 0,
-                     'avg_processing_time': 0.0
-                 }
-
-             # Fetch all statistics fields for the queue in one call
-             fields = [
-                 f'{queue}:success_count',
-                 f'{queue}:failed_count',
-                 f'{queue}:total_count',
-                 f'{queue}:running_tasks',
-                 f'{queue}:avg_processing_time'
-             ]
-
-             values = self.redis.hmget(self._worker_key, fields)
-
-             return {
-                 'success_count': int(values[0] or 0),
-                 'failed_count': int(values[1] or 0),
-                 'total_count': int(values[2] or 0),
-                 'running_tasks': int(values[3] or 0),
-                 'avg_processing_time': float(values[4] or 0.0)
-             }
-         except Exception as e:
-             logger.error(f"Failed to get stats for queue {queue}: {e}")
-             return {
-                 'success_count': 0,
-                 'failed_count': 0,
-                 'total_count': 0,
-                 'running_tasks': 0,
-                 'avg_processing_time': 0.0
-             }
-
-     def _ensure_consumer_id(self):
-         """Ensure that consumer_id has been created"""
-         if self.consumer_id is None:
-             # Create consumer_id lazily
-             self.consumer_id = self._find_reusable_worker_id(self.hostname_prefix)
-             if not self.consumer_id:
-                 # If there is nothing to reuse, generate a new consumer ID
-                 self.consumer_id = f"{self.hostname_prefix}-{uuid.uuid4().hex[:8]}-{os.getpid()}"
-                 logger.debug(f"Created new consumer ID: {self.consumer_id}")
-             else:
-                 logger.debug(f"Reusing offline worker ID: {self.consumer_id}")
-
-             # Update worker_key
-             self._worker_key = f'{self.redis_prefix}:{self.worker_prefix}:{self.consumer_id}'
-
-     @property
-     def worker_key(self):
-         """Get worker_key, making sure consumer_id has been initialized"""
-         self._ensure_consumer_id()
-         return self._worker_key
-
-     def get_consumer_name(self, queue: str) -> str:
-         """Get the consumer name"""
-         # Make sure consumer_id has been created
-         self._ensure_consumer_id()
-
-         # Start the scanner on the first call
-         if not self._scanner_started:
-             self._start_scanner()
-             self._scanner_started = True
-
-         # Start the stats flusher on the first call
-         if not self._stats_flusher_started:
-             self._start_stats_flusher()
-             self._stats_flusher_started = True
-
-         if queue not in self.consumer_names:
-             # Generate a unique consumer name for each queue
-             self.consumer_names[queue] = f"{self.consumer_id}-{queue}"
-             self.active_queues.add(queue)
-
-             # Start a heartbeat process for this queue
-             if queue not in self._heartbeat_processes:
-                 self._start_heartbeat_process_for_queue(queue)
-
-             logger.debug(f"Created consumer name for queue {queue}: {self.consumer_names[queue]}")
-         return self.consumer_names[queue]
-
-     def record_group_info(self, queue: str, task_name: str, group_name: str, consumer_name: str):
-         """Record a task's group info into the worker hash
-
-         Args:
-             queue: queue name
-             task_name: task name
-             group_name: consumer group name
-             consumer_name: consumer name
-         """
-         try:
-             # Make sure worker_key has been initialized
-             if not self._worker_key:
-                 self._ensure_consumer_id()
-                 if not self._worker_key:
-                     logger.warning("Cannot record group info: worker_key not initialized")
-                     return
-
-             # Build the group info
-             import json
-             group_info = {
-                 'queue': queue,
-                 'task_name': task_name,
-                 'group_name': group_name,
-                 'consumer_name': consumer_name,
-                 'stream_key': f"{self.redis_prefix}:QUEUE:{queue}"
-             }
-
-             # Store the group info in the worker's hash
-             # Use group_info:{group_name} as the field
-             field_name = f"group_info:{group_name}"
-             self.redis.hset(
-                 self._worker_key,
-                 field_name,
-                 json.dumps(group_info)
-             )
-
-             logger.debug(f"Recorded group info for task {task_name}: {group_info}")
-
-         except Exception as e:
-             logger.error(f"Error recording task group info: {e}")
-
-     async def record_group_info_async(self, queue: str, task_name: str, group_name: str, consumer_name: str):
-         """Asynchronously record a task's group info into the worker hash
-
-         Args:
-             queue: queue name
-             task_name: task name
-             group_name: consumer group name
-             consumer_name: consumer name
-         """
-         try:
-             # Make sure worker_key has been initialized
-             if not self._worker_key:
-                 self._ensure_consumer_id()
-                 if not self._worker_key:
-                     logger.warning("Cannot record group info: worker_key not initialized")
-                     return
-
-             # Build the group info
-             import json
-             group_info = {
-                 'queue': queue,
-                 'task_name': task_name,
-                 'group_name': group_name,
-                 'consumer_name': consumer_name,
-                 'stream_key': f"{self.redis_prefix}:QUEUE:{queue}"
-             }
-
-             # Store the group info in the worker's hash
-             # Use group_info:{group_name} as the field
-             field_name = f"group_info:{group_name}"
-             await self.async_redis.hset(
-                 self._worker_key,
-                 field_name,
-                 json.dumps(group_info)
-             )
-
-             logger.debug(f"Recorded group info for task {task_name}: {group_info}")
-
-         except Exception as e:
-             logger.error(f"Error recording task group info: {e}")
-
-     def _ensure_worker_initialized(self):
-         """Ensure the worker has been initialized"""
-         if self.consumer_id is None:
-             self._ensure_consumer_id()
-         if self._worker_key is None:
-             self._worker_key = f"{self.redis_prefix}:{self.worker_prefix}:{self.consumer_id}"
-
-     def _start_heartbeat_process_for_queue(self, queue: str):
-         """Start a heartbeat process for a specific queue"""
-         # The heartbeat process only needs to be started once, not once per queue
-         if self._heartbeat_process_manager is not None:
-             # The heartbeat process is already running; just record this queue
-             self._heartbeat_processes[queue] = True
-             return
-         logger.debug('Starting heartbeat process')
-         # Create the heartbeat process manager on the first call
-         if self._heartbeat_process_manager is None:
-             # Get the Redis URL
-             redis_url = None
-             if hasattr(self.redis.connection_pool, 'connection_kwargs'):
-                 redis_url = self.redis.connection_pool.connection_kwargs.get('url')
-
-             if not redis_url:
-                 # Construct the Redis URL
-                 connection_kwargs = self.redis.connection_pool.connection_kwargs
-                 host = connection_kwargs.get('host', 'localhost')
-                 port = connection_kwargs.get('port', 6379)
-                 db = connection_kwargs.get('db', 0)
-                 password = connection_kwargs.get('password')
-                 if password:
-                     redis_url = f"redis://:{password}@{host}:{port}/{db}"
-                 else:
-                     redis_url = f"redis://{host}:{port}/{db}"
-
-             self._heartbeat_process_manager = HeartbeatProcessManager(
-                 redis_url=redis_url,
-                 consumer_id=self.consumer_id,
-                 heartbeat_interval=self.heartbeat_interval,
-                 heartbeat_timeout=self.heartbeat_timeout
-             )
-
-         # Make sure the worker key exists and is initialized
-         self._ensure_worker_initialized()
-
-         # Initialize the worker info (the heartbeat process only updates last_heartbeat)
-         current_time = time.time()
-         import socket
-         try:
-             hostname = socket.gethostname()
-             if not hostname or hostname == 'localhost':
-                 hostname = socket.gethostbyname(socket.gethostname())
-         except:
-             hostname = os.environ.get('HOSTNAME', 'unknown')
-
-         # Set the initial worker info
-         worker_info = {
-             'consumer_id': self.consumer_id,
-             'host': hostname,
-             'pid': str(os.getpid()),
-             'created_at': str(current_time),
-             'last_heartbeat': str(current_time),
-             'is_alive': 'true',
-             'heartbeat_timeout': str(self.heartbeat_timeout),
-             'queues': ','.join(sorted(self.configured_queues)) if self.configured_queues else queue,
-             'messages_transferred': 'false'  # a new worker's messages have not been transferred
-         }
-
-         # Use hset directly to make sure the data is written
-         self.redis.hset(self._worker_key, mapping=worker_info)
-         # Also add it to the sorted set
-         self.redis.zadd(f"{self.redis_prefix}:ACTIVE_WORKERS", {self.consumer_id: current_time})
-         logger.debug(f"Initialized worker {self.consumer_id} with key {self._worker_key}")
-
-         self._heartbeat_process_manager.add_queue(queue, self._worker_key)
-         self._heartbeat_processes[queue] = True
-         # logger.debug(f"Started heartbeat process for queue {queue}")
-
-     def _start_scanner(self):
-         """Start the scanner coroutine"""
-         try:
-             loop = asyncio.get_running_loop()
-             self._scanner_task = loop.create_task(self._scanner_loop())
-             # Run one scan immediately to clean up any workers that may already be dead
-             loop.create_task(self._immediate_scan())
-             # logger.debug("Started heartbeat scanner coroutine")
-         except RuntimeError:
-             # No running event loop; mark it as needing to be started later
-             logger.debug("No running event loop, scanner will be started when async context is available")
-             self._scanner_needs_start = True
-
-     async def _immediate_scan(self):
-         """Run one scan immediately at startup (coroutine version)"""
-         try:
-             # logger.debug("Performing immediate scan for dead workers...")
-             await self._perform_scan()
-             # logger.debug("Immediate scan completed")
-         except Exception as e:
-             logger.error(f"Error in immediate scan: {e}")
-
-
-     async def _perform_scan(self):
-         """Perform the scan - uses an efficient O(log N) algorithm"""
-         try:
-             # Use the worker scanner
-             timeout_workers = await self.scanner.scan_timeout_workers()
-
-             if timeout_workers:
-                 for worker_info in timeout_workers:
-                     await self._mark_worker_offline(
-                         worker_info['worker_key'],
-                         worker_info['worker_data']
-                     )
-             return
-         except Exception as e:
-             logger.error(f"Scanner error: {e}")
-
-         # Original scan logic as a fallback
-         current_time = time.time()
-         # Note: the global heartbeat_timeout is no longer used; each worker's own value is used instead
-
-         try:
-             # Scan all worker hash keys
-             pattern = f"{self.redis_prefix}:{self.worker_prefix}:*"
-             worker_keys = []
-             cursor = 0
-
-             # Use SCAN to iterate over all worker keys, excluding HISTORY-related keys
-             while True:
-                 cursor, keys = await self.async_redis.scan(cursor, match=pattern, count=100)
-                 # Filter out HISTORY-related keys, lock keys and REUSING marker keys
-                 for key in keys:
-                     # key is bytes; decode it or compare using bytes
-                     key_str = key.decode('utf-8') if isinstance(key, bytes) else key
-                     if ':HISTORY:' not in key_str and ':REUSE:LOCK' not in key_str and ':REUSING' not in key_str:
-                         worker_keys.append(key)
-                 if cursor == 0:
-                     break
-
-             # Also clean up leftover recovery consumers
-             await self._cleanup_stale_recovery_consumers()
-
-             if not worker_keys:
-                 logger.debug("No worker keys found")
-                 return
-
-             timeout_workers = []
-
-             # Check each worker's heartbeat time
-             for worker_key in worker_keys:
-                 try:
-                     # First check the key type (should no longer be needed, kept as a safety check)
-                     key_type = await self.async_redis.type(worker_key)
-                     if key_type != 'hash':
-                         logger.warning(f"Worker key {worker_key} is not a hash, type: {key_type}, skipping")
-                         continue
-
-                     worker_data = await self.async_redis.hgetall(worker_key)
-                     if not worker_data:
-                         continue
-
-                     last_heartbeat = float(worker_data.get('last_heartbeat', 0))
-                     consumer_id = worker_data.get('consumer_id')
-                     is_alive = worker_data.get('is_alive', 'true').lower() == 'true'
-
-                     # Get this worker's own heartbeat_timeout
-                     # If none is recorded, use the default value (backwards compatible)
-                     worker_heartbeat_timeout = float(worker_data.get('heartbeat_timeout', self.heartbeat_timeout))
-
-                     # Skip ourselves (if consumer_id has been initialized)
-                     if self.consumer_id and consumer_id == self.consumer_id:
-                         continue
-
-                     # Use this worker's own timeout to decide
-                     worker_timeout_threshold = current_time - worker_heartbeat_timeout
-
-                     # Check whether this worker needs to be handled
-                     # Only handle active workers whose heartbeat has timed out
-                     needs_processing = False
-
-                     if is_alive and last_heartbeat < worker_timeout_threshold:
-                         # An active worker whose heartbeat has timed out
-                         logger.debug(f"Worker {consumer_id} timeout detected: "
-                                      f"last_heartbeat={last_heartbeat}, "
-                                      f"timeout={worker_heartbeat_timeout}s, "
-                                      f"threshold={worker_timeout_threshold}")
-                         needs_processing = True
-
-                     if needs_processing:
-                         timeout_workers.append((worker_key, worker_data))
-
-                 except (ValueError, TypeError) as e:
-                     logger.error(f"Error parsing worker data from {worker_key}: {e}")
-                     continue
-
-             if timeout_workers:
-                 logger.debug(f"Found {len(timeout_workers)} timeout workers")
-
-                 for worker_key, worker_data in timeout_workers:
-                     consumer_id = worker_data.get('consumer_id')
-                     # queues = worker_data.get('queues', '').split(',') if worker_data.get('queues') else []
-
-                     # Use a native Redis distributed lock so multiple scanners do not process the same worker at once
-                     lock_key = f"{self.redis_prefix}:SCANNER:LOCK:{consumer_id}"
-                     lock_ttl = max(1, int(self.scan_interval * 2))  # must be an integer, minimum 1 second
-
-                     # Create the Redis lock
-                     lock = AsyncLock(
-                         self.async_redis,
-                         lock_key,
-                         timeout=lock_ttl,
-                         blocking=False  # do not block; just skip
-                     )
-
-                     # Try to acquire the lock
-                     if not await lock.acquire():
-                         logger.debug(f"Another scanner is processing worker {consumer_id}, skipping")
-                         continue
-
-                     try:
-                         # Check again whether the worker really timed out (avoid a race condition)
-                         current_heartbeat = await self.async_redis.hget(worker_key, 'last_heartbeat')
-                         if current_heartbeat and float(current_heartbeat) >= timeout_threshold:
-                             logger.debug(f"Worker {consumer_id} is now alive, skipping")
-                             continue
-
-                         logger.debug(f"Processing timeout worker: {consumer_id}")
-                         # Only mark the worker as offline
-                         await self._mark_worker_offline(worker_key, worker_data)
-
-                     except Exception as e:
-                         logger.error(f"Error processing timeout worker {consumer_id}: {e}")
-                     finally:
-                         # Release the lock
-                         await lock.release()
-
-         except Exception as e:
-             logger.error(f"Error in scanner: {e}")
-
-     async def _mark_worker_offline(self, worker_key: str, worker_data: dict):
-         """Only mark the worker as offline"""
-         consumer_id = worker_data.get('consumer_id')
-
-         try:
-             current_time = time.time()
-             is_alive = worker_data.get('is_alive', 'true').lower() == 'true'
-
-             # Only workers that were previously online need their message-transfer state initialized
-             if is_alive:
-                 # Mark the worker as offline and set the message-transfer state to not transferred
-                 await self.async_redis.hset(worker_key, mapping={
-                     'is_alive': 'false',
-                     'offline_time': str(current_time),
-                     'shutdown_reason': 'heartbeat_timeout',
-                     'messages_transferred': 'false'  # initial state: messages not transferred
-                 })
-                 logger.debug(f"Marked worker {consumer_id} as offline with messages_transferred=false")
-             else:
-                 # The worker was already offline; just update the offline time
-                 await self.async_redis.hset(worker_key, 'offline_time', str(current_time))
-                 logger.debug(f"Worker {consumer_id} was already offline, updated offline_time")
-
-         except Exception as e:
-             logger.error(f"Error marking worker {consumer_id} offline: {e}")
-
-
-     async def _scanner_loop(self):
-         """Loop that scans for timed-out workers (coroutine version)"""
-         while not self._scanner_stop.is_set():
-             try:
-                 await self._perform_scan()
-                 await asyncio.sleep(self.scan_interval)
-             except Exception as e:
-                 logger.error(f"Error in scanner loop: {e}")
-                 await asyncio.sleep(5)  # wait 5 seconds before retrying after an error
-
-
-     def _start_stats_flusher(self):
-         """Start the stats flusher coroutine"""
-         try:
-             loop = asyncio.get_running_loop()
-             self._stats_flusher_task = loop.create_task(self._stats_flusher_loop())
-             logger.debug("Started stats flusher coroutine")
-         except RuntimeError:
-             # No running event loop; mark it as needing to be started later
-             logger.debug("No running event loop for stats flusher, will be started when async context is available")
-             self._stats_flusher_needs_start = True
-
-     async def _stats_flusher_loop(self):
-         """Stats flush loop (coroutine version)"""
-         while not self._stats_flusher_stop.is_set():
-             try:
-                 # Periodically flush the stats buffer
-                 if len(self.stats_events) > 0:
-                     # Call the async flush_stats_buffer directly
-                     await self.flush_stats_buffer()
-
-                 # Wait for the next flush cycle
-                 await asyncio.sleep(self.stats_flush_interval)
-             except Exception as e:
-                 logger.error(f"Error in stats flusher loop: {e}")
-                 await asyncio.sleep(1)  # wait 1 second before retrying after an error
-
-
-
-     def _cleanup_stream_consumer(self, queue: str, consumer_name: str):
-         """Delete a consumer from the Redis Stream consumer group"""
-         try:
-             # Delete the consumer (this prevents it from rejoining and continuing to consume messages)
-             prefixed_queue = self.get_prefixed_queue_name(queue)
-             result = self.redis.execute_command('XGROUP', 'DELCONSUMER', prefixed_queue, prefixed_queue, consumer_name)
-             if result > 0:
-                 logger.debug(f"Deleted stream consumer {consumer_name} from group {queue}")
-             else:
-                 logger.debug(f"Stream consumer {consumer_name} was not found in group {queue}")
-         except Exception as e:
-             logger.error(f"Error deleting stream consumer {consumer_name}: {e}")
-
-     async def _handle_dead_worker(self, queue: str, worker_info: dict, worker_data: bytes):
-         """Handle a dead worker (async version)"""
-         consumer_name = worker_info.get('consumer_name', 'unknown')
-
-         # Use a native Redis distributed lock so multiple scanners do not handle the same consumer at once
-         consumer_lock_key = f"{self.redis_prefix}:CONSUMER:LOCK:{consumer_name}"
-         consumer_lock_ttl = 30  # 30-second lock timeout
-
-         # Create the Redis lock
-         lock = AsyncLock(
-             self.async_redis,
-             consumer_lock_key,
-             timeout=consumer_lock_ttl,
-             blocking=False  # do not block; return immediately
-         )
-
-         # Try to acquire the lock
-         if not await lock.acquire():
-             logger.debug(f"Another scanner is handling consumer {consumer_name}, skipping")
-             return
-
-         try:
-             heartbeat_key = f"{self.heartbeat_key_prefix}{queue}"
-
-             # Check again whether the worker really timed out (avoid a race condition)
-             current_score = await self.async_redis.zscore(heartbeat_key, worker_data)
-             if current_score and time.time() - current_score < self.heartbeat_timeout:
-                 logger.debug(f"Worker {consumer_name} is now alive, skipping")
-                 return
-
-             # Remove the dead worker from the sorted set (using the original worker_data)
-             removed = await self.async_redis.zrem(heartbeat_key, worker_data)
-             if removed:
-                 logger.debug(f"Removed dead worker {consumer_name} from heartbeat set for queue {queue}")
-
-                 # Reset this consumer's pending messages
-                 await self._reset_consumer_pending_messages(queue, consumer_name)
-             else:
-                 logger.debug(f"Worker {consumer_name} already removed by another scanner")
-
-         except Exception as e:
-             logger.error(f"Error handling dead worker {consumer_name}: {e}")
-         finally:
-             # Release the lock
-             await lock.release()
-
-     async def _reset_consumer_pending_messages(self, queue: str, consumer_name: str):
-         """Reset the pending messages of a given consumer - optimized version that ensures no tasks are lost (async version)"""
-         recovery_lock_key = f"RECOVERY:{queue}:{consumer_name}"
-         max_retries = 3
-
-         try:
-             # Use a native Redis distributed lock to prevent concurrent recovery of the same consumer
-             recovery_lock = AsyncLock(
-                 self.async_redis,
-                 recovery_lock_key,
-                 timeout=300,  # 5-minute timeout
-                 blocking=False  # do not block
-             )
-
-             # Try to acquire the lock
-             if not await recovery_lock.acquire():
-                 logger.warning(f"Another process is recovering messages for {consumer_name}, skipping")
-                 return
-
-             # First get all pending messages of this consumer
-             consumer_messages = []
-             try:
-                 # Fetch all of this consumer's pending messages in batches
-                 batch_size = 1000
-                 last_id = '-'
-
-                 while True:
-                     # Fetch one batch of pending messages
-                     prefixed_queue = self.get_prefixed_queue_name(queue)
-                     pending_batch = await self.async_redis.xpending_range(
-                         prefixed_queue, prefixed_queue,
-                         min=last_id, max='+',
-                         count=batch_size
-                     )
-
-                     if not pending_batch:
-                         break
-
-                     # Filter the messages that belong to this consumer
-                     for msg in pending_batch:
-                         msg_consumer = msg['consumer']
-                         # Handle the bytes type
-                         if isinstance(msg_consumer, bytes):
-                             msg_consumer = msg_consumer.decode('utf-8')
-                         if msg_consumer == consumer_name:
-                             consumer_messages.append(msg)
-
-                     # If fewer messages than batch_size were returned, all messages have been fetched
-                     if len(pending_batch) < batch_size:
-                         break
-
-                     # Update last_id to the ID of the last message for the next batch query
-                     last_id = pending_batch[-1]['message_id']
-
-                 if not consumer_messages:
-                     logger.debug(f"No pending messages for consumer {consumer_name}")
-                     # Still try to delete the consumer
-                     try:
-                         prefixed_queue = self.get_prefixed_queue_name(queue)
-                         await self.async_redis.execute_command('XGROUP', 'DELCONSUMER', prefixed_queue, prefixed_queue, consumer_name)
-                     except:
-                         pass
-                     return
-
-                 logger.debug(f"Found {len(consumer_messages)} pending messages for dead consumer {consumer_name}")
-
-                 # Get the list of message IDs
-                 message_ids = [msg['message_id'] for msg in consumer_messages]
-
-                 # Use a special consumer to claim these messages
-                 temp_consumer = f"recovery-{consumer_name}-{uuid.uuid4().hex[:8]}"
-
-                 # Record the start of the recovery
-                 await self.async_redis.hset(f"RECOVERY:STATUS:{temp_consumer}", mapping={
-                     'start_time': str(time.time()),
-                     'total_messages': str(len(message_ids)),
-                     'queue': queue,
-                     'original_consumer': consumer_name,
-                     'status': 'in_progress'
-                 })
-                 await self.async_redis.expire(f"RECOVERY:STATUS:{temp_consumer}", 3600)  # expires after 1 hour
-
-                 # Process messages in batches, with retries
-                 recovered_count = 0
-                 failed_messages = []
-
-                 for i in range(0, len(message_ids), 100):
-                     batch = message_ids[i:i+100]
-
-                     for retry in range(max_retries):
-                         try:
-                             # Use a pipeline to ensure atomicity
-                             pipeline = self.async_redis.pipeline()
-
-                             # 1. Claim the messages for the temporary consumer
-                             prefixed_queue = self.get_prefixed_queue_name(queue)
-                             claimed = await self.async_redis.xclaim(
-                                 prefixed_queue, prefixed_queue,
-                                 temp_consumer,
-                                 min_idle_time=0,
-                                 message_ids=batch,
-                                 force=True
-                             )
-
-                             if claimed:
-                                 # 2. Prepare the data to add in batch
-                                 messages_to_add = []
-                                 claimed_ids = []
-
-                                 for msg_id, msg_data in claimed:
-                                     messages_to_add.append((msg_data, msg_id))
-                                     claimed_ids.append(msg_id)
-
-                                 # 3. Run all operations inside the pipeline
-                                 for msg_data, original_id in messages_to_add:
-                                     # Add recovery markers
-                                     msg_data['_recovered_from'] = consumer_name
-                                     msg_data['_recovery_time'] = str(time.time())
-                                     msg_data['_original_id'] = original_id
-                                     pipeline.xadd(prefixed_queue, msg_data)
-
-                                 # 4. ACK the original messages
-                                 pipeline.xack(prefixed_queue, prefixed_queue, *claimed_ids)
-
-                                 # 5. Execute the pipeline
-                                 results = await pipeline.execute()
-
-                                 # Verify that all operations succeeded
-                                 new_ids = [r for r in results[:-1]]  # everything before the last result is an xadd result
-                                 if all(new_ids):
-                                     recovered_count += len(claimed_ids)
-                                     logger.debug(f"Successfully recovered batch of {len(claimed_ids)} messages")
-                                     break
-                                 else:
-                                     logger.error(f"Pipeline execution failed for some messages, retry {retry + 1}/{max_retries}")
-                             else:
-                                 # No messages were claimed; they may have been handled by another process
-                                 logger.warning(f"No messages claimed from batch, they may have been processed")
-                                 break
-
-                         except Exception as e:
-                             logger.error(f"Error recovering batch (retry {retry + 1}/{max_retries}): {e}")
-                             if retry == max_retries - 1:
-                                 failed_messages.extend(batch)
-
-                     # Update the recovery progress
-                     if (i + len(batch)) % 1000 == 0 or i + len(batch) >= len(message_ids):
-                         await self.async_redis.hset(f"RECOVERY:STATUS:{temp_consumer}",
-                                                     'recovered_count', str(recovered_count))
-
-                 # Record the recovery result
-                 await self.async_redis.hset(f"RECOVERY:STATUS:{temp_consumer}", mapping={
-                     'end_time': str(time.time()),
-                     'recovered_count': str(recovered_count),
-                     'failed_count': str(len(failed_messages)),
-                     'status': 'completed' if not failed_messages else 'completed_with_errors'
-                 })
-
-                 logger.debug(f"Recovery completed: {recovered_count}/{len(message_ids)} messages recovered from {consumer_name}")
-
-                 if failed_messages:
-                     logger.error(f"Failed to recover {len(failed_messages)} messages: {failed_messages[:10]}...")
-                     # Store the failed message IDs in Redis for later analysis
-                     await self.async_redis.rpush(f"RECOVERY:FAILED:{queue}", *[str(mid) for mid in failed_messages[:100]])
-                     await self.async_redis.expire(f"RECOVERY:FAILED:{queue}", 86400)  # keep for 24 hours
-
-             except Exception as e:
-                 logger.error(f"Error getting pending messages: {e}")
-                 await self.async_redis.hset(f"RECOVERY:STATUS:{temp_consumer}", mapping={
-                     'error': str(e),
-                     'status': 'failed'
-                 })
-
-             # Clean up the temporary consumer (if one was created)
-             if 'temp_consumer' in locals():
-                 try:
-                     prefixed_queue = self.get_prefixed_queue_name(queue)
-                     # Make sure the temporary consumer has no new pending messages
-                     temp_pending = await self.async_redis.xpending(prefixed_queue, prefixed_queue)
-
-                     # Handle different return formats
-                     if temp_pending and isinstance(temp_pending, dict) and temp_pending.get('consumers'):
-                         for consumer_info in temp_pending['consumers']:
-                             # Handle different consumer_info formats
-                             if isinstance(consumer_info, dict):
-                                 # New format: {'name': 'consumer_name', 'pending': count}
-                                 consumer_name_check = consumer_info.get('name', '')
-                                 pending_count = consumer_info.get('pending', 0)
-                             elif isinstance(consumer_info, (list, tuple)) and len(consumer_info) >= 2:
-                                 # Old format: ['consumer_name', count]
-                                 consumer_name_check = consumer_info[0]
-                                 pending_count = consumer_info[1]
-                             else:
-                                 continue
-
-                             # Handle the bytes type
-                             if isinstance(consumer_name_check, bytes):
-                                 consumer_name_check = consumer_name_check.decode('utf-8')
-
-                             if consumer_name_check == temp_consumer and int(pending_count) > 0:
-                                 logger.warning(f"Temp consumer {temp_consumer} still has {pending_count} pending messages")
-                                 # Recursively recover the temporary consumer's messages
-                                 await self._reset_consumer_pending_messages(queue, temp_consumer)
-
-                     # Delete the temporary consumer
-                     await self.async_redis.execute_command('XGROUP', 'DELCONSUMER', prefixed_queue, prefixed_queue, temp_consumer)
-                     logger.debug(f"Cleaned up temp consumer {temp_consumer}")
-                 except Exception as e:
-                     logger.error(f"Error cleaning up temp consumer: {e}")
-
-             # Finally, delete the dead consumer
-             try:
-                 prefixed_queue = self.get_prefixed_queue_name(queue)
-                 await self.async_redis.execute_command('XGROUP', 'DELCONSUMER', prefixed_queue, prefixed_queue, consumer_name)
-                 logger.debug(f"Deleted consumer {consumer_name}")
-             except:
-                 pass
-
-         except Exception as e:
-             logger.error(f"Error resetting pending messages for {consumer_name}: {e}")
-         finally:
-             # Release the recovery lock
-             await self.async_redis.delete(recovery_lock_key)
-
1456
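For context, the recovery path removed above claims a dead consumer's pending entries to a temporary consumer, re-publishes them with recovery markers, and ACKs the originals in one pipeline. The following is a minimal standalone sketch of that pattern using redis.asyncio (redis-py >= 5); the stream, group, and consumer names are placeholders rather than jettask's real keys, and the consumer group is assumed to already exist.

import asyncio
import time

from redis.asyncio import Redis


async def requeue_pending(redis: Redis, stream: str, group: str,
                          dead_consumer: str, temp_consumer: str) -> int:
    """Claim a dead consumer's pending entries, re-publish them, then ACK the originals."""
    # XPENDING (range form) lists the entry IDs still owned by the dead consumer.
    pending = await redis.xpending_range(stream, group, min='-', max='+',
                                         count=100, consumername=dead_consumer)
    ids = [p['message_id'] for p in pending]
    if not ids:
        return 0

    # Take ownership of the entries so no other worker processes them concurrently.
    claimed = await redis.xclaim(stream, group, temp_consumer,
                                 min_idle_time=0, message_ids=ids, force=True)
    if not claimed:
        return 0

    pipe = redis.pipeline()
    for msg_id, fields in claimed:
        fields['_recovered_from'] = dead_consumer   # recovery marker, as in the removed code
        fields['_recovery_time'] = str(time.time())
        pipe.xadd(stream, fields)                   # re-publish at the tail of the stream
    pipe.xack(stream, group, *[msg_id for msg_id, _ in claimed])   # retire the originals
    await pipe.execute()
    return len(claimed)


async def main():
    redis = Redis(decode_responses=True)
    moved = await requeue_pending(redis, 'tasks', 'tasks', 'worker-dead', 'recovery-1')
    print(f"re-queued {moved} messages")
    await redis.aclose()   # redis-py >= 5


if __name__ == '__main__':
    asyncio.run(main())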
-    async def _cleanup_stale_recovery_consumers(self):
-        """Clean up leftover recovery consumers (async version)."""
-        try:
-            # Get all queues
-            queues_pattern = f"{self.redis_prefix}:*"
-            all_keys = []
-            cursor = 0
-
-            while True:
-                cursor, keys = await self.async_redis.scan(cursor, match=queues_pattern, count=100)
-                all_keys.extend(keys)
-                if cursor == 0:
-                    break
-
-            # Keep only the stream-type queues
-            stream_queues = []
-            for key in all_keys:
-                try:
-                    if await self.async_redis.type(key) == 'stream':
-                        stream_queues.append(key)
-                except:
-                    continue
-
-            cleaned_count = 0
-            for queue in stream_queues:
-                try:
-                    # Skip streams that are not queues (e.g. streams used for other purposes)
-                    if ':QUEUE:' not in queue:
-                        continue
-
-                    # Get all consumer information for this queue.
-                    # In jettask the consumer group name equals the stream name (both carry the prefix).
-                    try:
-                        pending_info = await self.async_redis.xpending(queue, queue)
-                    except Exception as xpending_error:
-                        # If xpending fails, the consumer group probably does not exist
-                        logger.debug(f"xpending failed for {queue}: {xpending_error}")
-                        continue
-
-                    # Handle the different return formats
-                    if not pending_info:
-                        continue
-
-                    # A plain 0 means there are no pending messages
-                    if isinstance(pending_info, int) and pending_info == 0:
-                        continue
-
-                    # Skip anything that is not a dict
-                    if not isinstance(pending_info, dict):
-                        logger.debug(f"Unexpected xpending response for {queue}: {type(pending_info)} - {pending_info}")
-                        continue
-
-                    # Check whether a consumers field is present
-                    consumers = pending_info.get('consumers')
-                    if not consumers:
-                        continue
-
-                    # Inspect the recovery consumers
-                    for consumer_info in consumers:
-                        # Handle the different consumer_info formats
-                        if isinstance(consumer_info, dict):
-                            # New format: {'name': 'consumer_name', 'pending': count}
-                            consumer_name = consumer_info.get('name', '')
-                            pending_count = consumer_info.get('pending', 0)
-                        elif isinstance(consumer_info, (list, tuple)) and len(consumer_info) >= 2:
-                            # Old format: ['consumer_name', count]
-                            consumer_name = consumer_info[0]
-                            pending_count = consumer_info[1]
-                        else:
-                            logger.warning(f"Unexpected consumer info format: {consumer_info}")
-                            continue
-
-                        # Handle bytes values
-                        if isinstance(consumer_name, bytes):
-                            consumer_name = consumer_name.decode('utf-8')
-
-                        # Make sure pending_count is an integer
-                        try:
-                            pending_count = int(pending_count)
-                        except (ValueError, TypeError):
-                            logger.warning(f"Invalid pending count for {consumer_name}: {pending_count}")
-                            continue
-
-                        # Identify recovery consumers
-                        if consumer_name.startswith('recovery-'):
-
-                            # Check the recovery status
-                            status_key = f"RECOVERY:STATUS:{consumer_name}"
-                            status = await self.async_redis.hget(status_key, 'status')
-
-                            # If the status is finished, or there is no status information at all (probably an old leftover)
-                            if not status or status in ['completed', 'completed_with_errors', 'failed']:
-                                # If there are still pending messages, recover them first
-                                if pending_count > 0:
-                                    logger.warning(f"Found stale recovery consumer {consumer_name} with {pending_count} pending messages")
-                                    # Recursively recover these messages
-                                    queue_name = queue.split(':', 1)[-1] if ':' in queue else queue
-                                    await self._reset_consumer_pending_messages(queue_name, consumer_name)
-                                else:
-                                    # No pending messages, delete the consumer directly
-                                    try:
-                                        await self.async_redis.execute_command('XGROUP', 'DELCONSUMER', queue, queue, consumer_name)
-                                        logger.debug(f"Cleaned up stale recovery consumer {consumer_name}")
-                                        cleaned_count += 1
-                                    except Exception as e:
-                                        logger.error(f"Failed to delete recovery consumer {consumer_name}: {e}")
-
-                except Exception as e:
-                    import traceback
-                    logger.error(f"Error cleaning recovery consumers in queue {queue}: {e}")
-                    logger.error(f"Traceback:\n{traceback.format_exc()}")
-
-            if cleaned_count > 0:
-                logger.debug(f"Cleaned up {cleaned_count} stale recovery consumers")
-
-        except Exception as e:
-            logger.error(f"Error in cleanup_stale_recovery_consumers: {e}")
-
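Much of the removed cleanup logic is defensive normalization of XPENDING summary replies, whose per-consumer entries may come back as dicts, sequences, or raw bytes depending on the redis-py version and decoding settings. The helper below is purely illustrative (it is not jettask API) and captures that normalization in one place.

from typing import Any, Iterable, List, Tuple


def normalize_consumers(consumers: Iterable[Any]) -> List[Tuple[str, int]]:
    """Return (consumer_name, pending_count) pairs, skipping entries that cannot be parsed."""
    normalized: List[Tuple[str, int]] = []
    for info in consumers:
        if isinstance(info, dict):                                   # e.g. {'name': 'w1', 'pending': 3}
            name, pending = info.get('name', ''), info.get('pending', 0)
        elif isinstance(info, (list, tuple)) and len(info) >= 2:     # e.g. ['w1', 3]
            name, pending = info[0], info[1]
        else:
            continue
        if isinstance(name, bytes):                                  # raw (undecoded) replies
            name = name.decode('utf-8')
        try:
            pending = int(pending)
        except (TypeError, ValueError):
            continue
        normalized.append((name, pending))
    return normalized


# Example: pick the stale recovery consumers out of a summary reply.
summary = {'pending': 4, 'consumers': [{'name': 'recovery-abc', 'pending': 4}, (b'w1', b'0')]}
stale = [(n, p) for n, p in normalize_consumers(summary['consumers']) if n.startswith('recovery-')]
print(stale)   # [('recovery-abc', 4)]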
-    def is_heartbeat_timeout(self) -> bool:
-        """Check whether the heartbeat has timed out."""
-        # Give a newly started worker a 15-second grace period to avoid false positives
-        if hasattr(self, '_startup_time'):
-            if time.time() - self._startup_time < 15:
-                return False
-
-        if self._heartbeat_process_manager:
-            return self._heartbeat_process_manager.is_heartbeat_timeout()
-        return False
-
-    def cleanup(self):
-        """Clean up resources."""
-        # If consumer_id was never created, this instance never actually ran
-        if self.consumer_id is None:
-            logger.debug("HeartbeatConsumerStrategy cleanup: never initialized, skipping")
-            return
-
-        logger.debug(f"Cleaning up heartbeat consumer {self.consumer_id}")
-
-        # Stop all heartbeat processes first so no new data is produced
-        if self._heartbeat_process_manager and self._heartbeat_process_manager.heartbeat_process:
-            logger.debug("Stopping heartbeat processes...")
-            self._heartbeat_process_manager.stop_all()
-
-        # Stop the scanner (if it was started)
-        if self._scanner_started:
-            self._scanner_stop.set()
-            # If it is running as a coroutine, cancel it
-            if self._scanner_task and not self._scanner_task.done():
-                self._scanner_task.cancel()
-
-        # Stop the stats flusher thread/coroutine (if it was started)
-        if self._stats_flusher_started:
-            self._stats_flusher_stop.set()
-            # If it is running as a coroutine, cancel it
-            if self._stats_flusher_task and not self._stats_flusher_task.done():
-                self._stats_flusher_task.cancel()
-
-        # During cleanup, simply log how many stats events were left unprocessed
-        try:
-            events_count = len(self.stats_events)
-            if events_count > 0:
-                logger.warning(f"Dropped {events_count} stats events during cleanup (async flush not available)")
-                self.stats_events.clear()  # clear to avoid leaking memory
-        except Exception as e:
-            logger.error(f"Failed to clear stats buffer during cleanup: {e}")
-
-        # Immediately mark the worker as offline
-        worker_data = None
-        try:
-            current_time = time.time()
-
-            # Only clean up if consumer_id has been initialized
-            if self.consumer_id is None:
-                logger.debug("Consumer ID was never initialized, skipping worker cleanup")
-                return
-
-            # Use the existing worker_key directly; do not trigger the getter
-            worker_key = self._worker_key
-            if not worker_key:
-                logger.debug("Worker key was never initialized, skipping worker cleanup")
-                return
-
-            # Fetch the current worker's data so its history is preserved
-            worker_data = self.redis.hgetall(worker_key)
-
-            # If the worker never ran (no data), there is nothing to do
-            if not worker_data:
-                logger.debug(f"Worker {self.consumer_id} never started, skipping cleanup")
-                return
-
-            # Mark the worker as offline (keep all existing data)
-            pipeline = self.redis.pipeline()
-            pipeline.hset(worker_key, mapping={
-                'is_alive': 'false',
-                'offline_time': str(current_time),
-                'shutdown_reason': 'graceful_shutdown',
-                'messages_transferred': 'false'  # flag that the messages still need to be transferred
-            })
-
-            # Get the worker's queue list
-            queues = worker_data.get('queues', '').split(',') if worker_data.get('queues') else []
-
-            # Zero out the running-task count of every queue
-            for queue in queues:
-                if queue.strip():
-                    pipeline.hset(worker_key, f'{queue}:running_tasks', '0')
-
-            # Execute the batched update
-            pipeline.execute()
-
-            # History records are no longer saved; the WORKER key itself already holds all the information
-
-            logger.debug(f"Marked worker {self.consumer_id} as offline immediately")
-
-        except Exception as e:
-            logger.error(f"Failed to mark worker as offline during cleanup: {e}")
-
-        # If it never ran successfully, return right away
-        if not worker_data:
-            logger.debug(f"Heartbeat consumer {self.consumer_id} stopped gracefully (never started)")
-            return
-
-        # Wait for the scanner thread to finish (non-blocking)
-        if self._scanner_started and self._scanner_thread and self._scanner_thread.is_alive():
-            max_wait_time = 0.5  # wait at most 0.5 seconds
-            self._scanner_thread.join(timeout=max_wait_time)
-            if self._scanner_thread.is_alive():
-                logger.warning("Scanner thread did not stop in time")
-
-        # Wait for the stats flusher thread to finish (non-blocking)
-        if self._stats_flusher_started and self._stats_flusher_thread and self._stats_flusher_thread.is_alive():
-            max_wait_time = 0.5  # wait at most 0.5 seconds
-            self._stats_flusher_thread.join(timeout=max_wait_time)
-            if self._stats_flusher_thread.is_alive():
-                logger.warning("Stats flusher thread did not stop in time")
-
-        # Important: do NOT delete the heartbeat record!
-        # It must be kept so the scanner can detect that the worker went offline and recover its pending messages.
-        # The heartbeat is cleaned up automatically by the scanner once it times out.
-        logger.debug(f"Heartbeat consumer {self.consumer_id} stopped")