jettask 0.2.19__py3-none-any.whl → 0.2.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (165)
  1. jettask/__init__.py +12 -3
  2. jettask/cli.py +314 -228
  3. jettask/config/__init__.py +9 -1
  4. jettask/config/config.py +245 -0
  5. jettask/config/env_loader.py +381 -0
  6. jettask/config/lua_scripts.py +158 -0
  7. jettask/config/nacos_config.py +132 -5
  8. jettask/core/__init__.py +1 -1
  9. jettask/core/app.py +1573 -666
  10. jettask/core/app_importer.py +33 -16
  11. jettask/core/container.py +532 -0
  12. jettask/core/task.py +1 -4
  13. jettask/core/unified_manager_base.py +2 -2
  14. jettask/executor/__init__.py +38 -0
  15. jettask/executor/core.py +625 -0
  16. jettask/executor/executor.py +338 -0
  17. jettask/executor/orchestrator.py +290 -0
  18. jettask/executor/process_entry.py +638 -0
  19. jettask/executor/task_executor.py +317 -0
  20. jettask/messaging/__init__.py +68 -0
  21. jettask/messaging/event_pool.py +2188 -0
  22. jettask/messaging/reader.py +519 -0
  23. jettask/messaging/registry.py +266 -0
  24. jettask/messaging/scanner.py +369 -0
  25. jettask/messaging/sender.py +312 -0
  26. jettask/persistence/__init__.py +118 -0
  27. jettask/persistence/backlog_monitor.py +567 -0
  28. jettask/{backend/data_access.py → persistence/base.py} +58 -57
  29. jettask/persistence/consumer.py +315 -0
  30. jettask/{core → persistence}/db_manager.py +23 -22
  31. jettask/persistence/maintenance.py +81 -0
  32. jettask/persistence/message_consumer.py +259 -0
  33. jettask/{backend/namespace_data_access.py → persistence/namespace.py} +66 -98
  34. jettask/persistence/offline_recovery.py +196 -0
  35. jettask/persistence/queue_discovery.py +215 -0
  36. jettask/persistence/task_persistence.py +218 -0
  37. jettask/persistence/task_updater.py +583 -0
  38. jettask/scheduler/__init__.py +2 -2
  39. jettask/scheduler/loader.py +6 -5
  40. jettask/scheduler/run_scheduler.py +1 -1
  41. jettask/scheduler/scheduler.py +7 -7
  42. jettask/scheduler/{unified_scheduler_manager.py → scheduler_coordinator.py} +18 -13
  43. jettask/task/__init__.py +16 -0
  44. jettask/{router.py → task/router.py} +26 -8
  45. jettask/task/task_center/__init__.py +9 -0
  46. jettask/task/task_executor.py +318 -0
  47. jettask/task/task_registry.py +291 -0
  48. jettask/test_connection_monitor.py +73 -0
  49. jettask/utils/__init__.py +31 -1
  50. jettask/{monitor/run_backlog_collector.py → utils/backlog_collector.py} +1 -1
  51. jettask/utils/db_connector.py +1629 -0
  52. jettask/{db_init.py → utils/db_init.py} +1 -1
  53. jettask/utils/rate_limit/__init__.py +30 -0
  54. jettask/utils/rate_limit/concurrency_limiter.py +665 -0
  55. jettask/utils/rate_limit/config.py +145 -0
  56. jettask/utils/rate_limit/limiter.py +41 -0
  57. jettask/utils/rate_limit/manager.py +269 -0
  58. jettask/utils/rate_limit/qps_limiter.py +154 -0
  59. jettask/utils/rate_limit/task_limiter.py +384 -0
  60. jettask/utils/serializer.py +3 -0
  61. jettask/{monitor/stream_backlog_monitor.py → utils/stream_backlog.py} +14 -6
  62. jettask/utils/time_sync.py +173 -0
  63. jettask/webui/__init__.py +27 -0
  64. jettask/{api/v1 → webui/api}/alerts.py +1 -1
  65. jettask/{api/v1 → webui/api}/analytics.py +2 -2
  66. jettask/{api/v1 → webui/api}/namespaces.py +1 -1
  67. jettask/{api/v1 → webui/api}/overview.py +1 -1
  68. jettask/{api/v1 → webui/api}/queues.py +3 -3
  69. jettask/{api/v1 → webui/api}/scheduled.py +1 -1
  70. jettask/{api/v1 → webui/api}/settings.py +1 -1
  71. jettask/{api.py → webui/app.py} +253 -145
  72. jettask/webui/namespace_manager/__init__.py +10 -0
  73. jettask/{multi_namespace_consumer.py → webui/namespace_manager/multi.py} +69 -22
  74. jettask/{unified_consumer_manager.py → webui/namespace_manager/unified.py} +1 -1
  75. jettask/{run.py → webui/run.py} +2 -2
  76. jettask/{services → webui/services}/__init__.py +1 -3
  77. jettask/{services → webui/services}/overview_service.py +34 -16
  78. jettask/{services → webui/services}/queue_service.py +1 -1
  79. jettask/{backend → webui/services}/queue_stats_v2.py +1 -1
  80. jettask/{services → webui/services}/settings_service.py +1 -1
  81. jettask/worker/__init__.py +53 -0
  82. jettask/worker/lifecycle.py +1507 -0
  83. jettask/worker/manager.py +583 -0
  84. jettask/{core/offline_worker_recovery.py → worker/recovery.py} +268 -175
  85. {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/METADATA +2 -71
  86. jettask-0.2.23.dist-info/RECORD +145 -0
  87. jettask/__main__.py +0 -140
  88. jettask/api/__init__.py +0 -103
  89. jettask/backend/__init__.py +0 -1
  90. jettask/backend/api/__init__.py +0 -3
  91. jettask/backend/api/v1/__init__.py +0 -17
  92. jettask/backend/api/v1/monitoring.py +0 -431
  93. jettask/backend/api/v1/namespaces.py +0 -504
  94. jettask/backend/api/v1/queues.py +0 -342
  95. jettask/backend/api/v1/tasks.py +0 -367
  96. jettask/backend/core/__init__.py +0 -3
  97. jettask/backend/core/cache.py +0 -221
  98. jettask/backend/core/database.py +0 -200
  99. jettask/backend/core/exceptions.py +0 -102
  100. jettask/backend/dependencies.py +0 -261
  101. jettask/backend/init_meta_db.py +0 -158
  102. jettask/backend/main.py +0 -1426
  103. jettask/backend/main_unified.py +0 -78
  104. jettask/backend/main_v2.py +0 -394
  105. jettask/backend/models/__init__.py +0 -3
  106. jettask/backend/models/requests.py +0 -236
  107. jettask/backend/models/responses.py +0 -230
  108. jettask/backend/namespace_api_old.py +0 -267
  109. jettask/backend/services/__init__.py +0 -3
  110. jettask/backend/start.py +0 -42
  111. jettask/backend/unified_api_router.py +0 -1541
  112. jettask/cleanup_deprecated_tables.sql +0 -16
  113. jettask/core/consumer_manager.py +0 -1695
  114. jettask/core/delay_scanner.py +0 -256
  115. jettask/core/event_pool.py +0 -1700
  116. jettask/core/heartbeat_process.py +0 -222
  117. jettask/core/task_batch.py +0 -153
  118. jettask/core/worker_scanner.py +0 -271
  119. jettask/executors/__init__.py +0 -5
  120. jettask/executors/asyncio.py +0 -876
  121. jettask/executors/base.py +0 -30
  122. jettask/executors/common.py +0 -148
  123. jettask/executors/multi_asyncio.py +0 -309
  124. jettask/gradio_app.py +0 -570
  125. jettask/integrated_gradio_app.py +0 -1088
  126. jettask/main.py +0 -0
  127. jettask/monitoring/__init__.py +0 -3
  128. jettask/pg_consumer.py +0 -1896
  129. jettask/run_monitor.py +0 -22
  130. jettask/run_webui.py +0 -148
  131. jettask/scheduler/multi_namespace_scheduler.py +0 -294
  132. jettask/scheduler/unified_manager.py +0 -450
  133. jettask/task_center_client.py +0 -150
  134. jettask/utils/serializer_optimized.py +0 -33
  135. jettask/webui_exceptions.py +0 -67
  136. jettask-0.2.19.dist-info/RECORD +0 -150
  137. /jettask/{constants.py → config/constants.py} +0 -0
  138. /jettask/{backend/config.py → config/task_center.py} +0 -0
  139. /jettask/{pg_consumer → messaging/pg_consumer}/pg_consumer_v2.py +0 -0
  140. /jettask/{pg_consumer → messaging/pg_consumer}/sql/add_execution_time_field.sql +0 -0
  141. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_new_tables.sql +0 -0
  142. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_tables_v3.sql +0 -0
  143. /jettask/{pg_consumer → messaging/pg_consumer}/sql/migrate_to_new_structure.sql +0 -0
  144. /jettask/{pg_consumer → messaging/pg_consumer}/sql/modify_time_fields.sql +0 -0
  145. /jettask/{pg_consumer → messaging/pg_consumer}/sql_utils.py +0 -0
  146. /jettask/{models.py → persistence/models.py} +0 -0
  147. /jettask/scheduler/{manager.py → task_crud.py} +0 -0
  148. /jettask/{schema.sql → schemas/schema.sql} +0 -0
  149. /jettask/{task_center.py → task/task_center/client.py} +0 -0
  150. /jettask/{monitoring → utils}/file_watcher.py +0 -0
  151. /jettask/{services/redis_monitor_service.py → utils/redis_monitor.py} +0 -0
  152. /jettask/{api/v1 → webui/api}/__init__.py +0 -0
  153. /jettask/{webui_config.py → webui/config.py} +0 -0
  154. /jettask/{webui_models → webui/models}/__init__.py +0 -0
  155. /jettask/{webui_models → webui/models}/namespace.py +0 -0
  156. /jettask/{services → webui/services}/alert_service.py +0 -0
  157. /jettask/{services → webui/services}/analytics_service.py +0 -0
  158. /jettask/{services → webui/services}/scheduled_task_service.py +0 -0
  159. /jettask/{services → webui/services}/task_service.py +0 -0
  160. /jettask/{webui_sql → webui/sql}/batch_upsert_functions.sql +0 -0
  161. /jettask/{webui_sql → webui/sql}/verify_database.sql +0 -0
  162. {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/WHEEL +0 -0
  163. {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/entry_points.txt +0 -0
  164. {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/licenses/LICENSE +0 -0
  165. {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/top_level.txt +0 -0
jettask/executor/executor.py
@@ -0,0 +1,338 @@
+ """
+ Unified executor
+ 
+ A unified interface that consolidates the single-process and multi-process execution modes
+ """
+ 
+ import asyncio
+ import multiprocessing
+ import logging
+ import os
+ import time
+ from collections import deque
+ from typing import List, Optional
+ 
+ from .core import ExecutionMode, ExecutorCore
+ from .orchestrator import ProcessConfig, ProcessOrchestrator
+ from ..worker.lifecycle import WorkerStateManager
+ from ..utils.rate_limit.manager import RateLimiterManager
+ 
+ logger = logging.getLogger('app')
+ 
+ # Try to use uvloop for better performance
+ try:
+     import uvloop
+     uvloop.install()
+     logger.debug("Using uvloop for better performance")
+ except ImportError:
+     pass
+ 
+ 
+ class UnifiedExecutor:
+     """
+     Unified executor
+ 
+     Consolidates the functionality of AsyncioExecutor and MultiAsyncioExecutor,
+     supporting both single-process and multi-process execution modes.
+ 
+     Responsibilities:
+     1. Provide a unified executor interface
+     2. Select ExecutorCore or ProcessOrchestrator based on the mode
+     3. Manage the event queue and task dispatch
+     """
+ 
+     def __init__(self, event_queue, app, concurrency=100,
+                  mode: ExecutionMode = ExecutionMode.SINGLE_PROCESS,
+                  task_name: str = None):
+         """
+         Initialize the unified executor
+ 
+         Args:
+             event_queue: Event queue
+             app: Application instance
+             concurrency: Concurrency level
+             mode: Execution mode
+             task_name: Task name (required in single-process mode)
+         """
+         self.event_queue = event_queue
+         self.app = app
+         self.concurrency = concurrency
+         self.mode = mode
+         self.task_name = task_name
+ 
+         # Initialize the core component according to the mode
+         if mode == ExecutionMode.SINGLE_PROCESS:
+             if not task_name:
+                 raise ValueError("task_name is required for SINGLE_PROCESS mode")
+ 
+             self.executor_core = ExecutorCore(
+                 app=app,
+                 task_name=task_name,
+                 concurrency=concurrency
+             )
+             self.orchestrator = None
+             logger.debug(f"UnifiedExecutor initialized in SINGLE_PROCESS mode for task {task_name}")
+ 
+         elif mode == ExecutionMode.MULTI_PROCESS:
+             self.executor_core = None
+             self.orchestrator = ProcessOrchestrator(
+                 app=app,
+                 num_processes=concurrency
+             )
+             logger.debug(f"UnifiedExecutor initialized in MULTI_PROCESS mode with {concurrency} processes")
+ 
+         else:
+             raise ValueError(f"Unsupported execution mode: {mode}")
+ 
+         # Set of active tasks (used in single-process mode)
+         self._active_tasks = set()
+ 
+     def logic(self, *args, **kwargs):
+         """
+         BaseExecutor interface method
+         Unused in single-process mode; multi-process mode delegates to ProcessOrchestrator
+         """
+         pass
+ 
+     async def loop(self):
+         """Main loop - single-process mode"""
+         if self.mode != ExecutionMode.SINGLE_PROCESS:
+             raise RuntimeError("loop() is only for SINGLE_PROCESS mode")
+ 
+         # Initialize the rate limiter
+         self.app.consumer_manager._heartbeat_strategy._ensure_consumer_id()
+         worker_id = self.app.consumer_manager._heartbeat_strategy.consumer_id
+         registry_manager = self.app.consumer_manager
+ 
+         if not self.app.worker_state_manager:
+             self.app.worker_state_manager = WorkerStateManager(
+                 redis_client=self.app.ep.async_redis_client,
+                 redis_prefix=self.executor_core.prefix,
+                 event_pool=self.app.ep  # Pass the EventPool instance to enable event-driven message recovery
+             )
+             await self.app.worker_state_manager.start_listener()
+             logger.debug(f"WorkerStateManager started for worker {worker_id}")
+ 
+         # Initialize time synchronization
+         from jettask.utils.time_sync import init_time_sync
+         time_sync = await init_time_sync(self.app.ep.async_redis_client)
+         logger.debug(f"TimeSync initialized, offset={time_sync.get_offset():.6f}s")
+ 
+         self.executor_core.rate_limiter_manager = RateLimiterManager(
+             redis_client=self.app.ep.async_redis_client,
+             worker_id=worker_id,
+             redis_prefix=self.executor_core.prefix,
+             registry_manager=registry_manager,
+             worker_state_manager=self.app.worker_state_manager
+         )
+         logger.debug(f"RateLimiterManager initialized for worker {worker_id}")
+ 
+         await self.executor_core.rate_limiter_manager.load_config_from_redis()
+ 
+         tasks_batch = []
+         max_buffer_size = 5000
+ 
+         try:
+             while True:
+                 # Check for the shutdown signal
+                 if hasattr(self.app, '_should_exit') and self.app._should_exit:
+                     logger.debug("UnifiedExecutor detected shutdown signal")
+                     break
+ 
+                 # Check whether the parent process is still alive
+                 if hasattr(os, 'getppid') and os.getppid() == 1:
+                     logger.debug("Parent process died, exiting...")
+                     break
+ 
+                 current_time = time.time()
+ 
+                 # Fetch an event
+                 event = None
+                 try:
+                     event = await asyncio.wait_for(self.event_queue.get(), timeout=0.1)
+                 except asyncio.TimeoutError:
+                     event = None
+ 
+                 if event:
+                     event.pop("execute_time", None)
+                     tasks_batch.append(event)
+                     logger.debug(f"[EVENT] Got event: {event.get('event_id', 'unknown')}, task_name={event.get('event_data', {}).get('_task_name')}")
+ 
+                 # Create tasks in batches
+                 if tasks_batch:
+                     for event in tasks_batch:
+                         event_data = event.get('event_data', {})
+                         event_task_name = event_data.get("_task_name") or event_data.get("name")
+ 
+                         if not event_task_name:
+                             logger.error(f"No task_name in event {event.get('event_id')}")
+                             continue
+ 
+                         # Verify that the task name matches
+                         if event_task_name != self.task_name:
+                             logger.error(f"Task name mismatch: {event_task_name} != {self.task_name}")
+                             continue
+ 
+                         # Rate limiting
+                         logger.debug(f"[TASK] Attempting to acquire rate limit for {self.task_name}, event_id={event.get('event_id')}")
+                         rate_limit_token = await self.executor_core.rate_limiter_manager.acquire(
+                             task_name=self.task_name,
+                             timeout=None
+                         )
+                         logger.debug(f"[TASK] rate_limit_token={rate_limit_token}")
+                         if not rate_limit_token:
+                             logger.error(f"Failed to acquire token for {self.task_name}")
+                             continue
+                         logger.debug(f"[TASK] Successfully acquired rate limit for {self.task_name}, token={rate_limit_token}, starting execution")
+ 
+                         self.executor_core.batch_counter += 1
+ 
+                         # Create a task wrapper that releases the rate-limit permit when the task completes
+                         async def execute_with_release(event_data, token):
+                             try:
+                                 await self.executor_core.execute_task(**event_data)
+                             finally:
+                                 # Release the concurrency permit whether the task succeeded or failed
+                                 await self.executor_core.rate_limiter_manager.release(self.task_name, task_id=token)
+ 
+                         task = asyncio.create_task(execute_with_release(event, rate_limit_token))
+                         self._active_tasks.add(task)
+                         task.add_done_callback(self._active_tasks.discard)
+ 
+                     tasks_batch.clear()
+ 
+                 # Smart buffer management
+                 buffer_full = (
+                     len(self.executor_core.pending_acks) >= max_buffer_size or
+                     len(self.executor_core.status_updates) >= max_buffer_size or
+                     len(self.executor_core.data_updates) >= max_buffer_size or
+                     len(self.executor_core.task_info_updates) >= max_buffer_size
+                 )
+ 
+                 should_flush_periodic = False
+                 has_pending_data = (
+                     self.executor_core.pending_acks or
+                     self.executor_core.status_updates or
+                     self.executor_core.data_updates or
+                     self.executor_core.task_info_updates
+                 )
+ 
+                 if has_pending_data:
+                     for data_type, config in self.executor_core.pipeline_config.items():
+                         time_since_flush = current_time - self.executor_core.last_pipeline_flush[data_type]
+ 
+                         if data_type == 'ack' and self.executor_core.pending_acks:
+                             if time_since_flush >= config['max_delay']:
+                                 should_flush_periodic = True
+                                 break
+                         elif data_type == 'task_info' and self.executor_core.task_info_updates:
+                             if time_since_flush >= config['max_delay']:
+                                 should_flush_periodic = True
+                                 break
+                         elif data_type == 'status' and self.executor_core.status_updates:
+                             if time_since_flush >= config['max_delay']:
+                                 should_flush_periodic = True
+                                 break
+                         elif data_type == 'data' and self.executor_core.data_updates:
+                             if time_since_flush >= config['max_delay']:
+                                 should_flush_periodic = True
+                                 break
+ 
+                 if buffer_full or should_flush_periodic:
+                     asyncio.create_task(self.executor_core._flush_all_buffers())
+ 
+                 # Adaptive sleeping
+                 has_events = False
+                 if isinstance(self.event_queue, deque):
+                     has_events = bool(self.event_queue)
+                 elif isinstance(self.event_queue, asyncio.Queue):
+                     has_events = not self.event_queue.empty()
+ 
+                 if has_events:
+                     await asyncio.sleep(0)
+                 else:
+                     if has_pending_data:
+                         await self.executor_core._flush_all_buffers()
+                     await asyncio.sleep(0.001)
+ 
+         except KeyboardInterrupt:
+             logger.debug("UnifiedExecutor received KeyboardInterrupt")
+         except Exception as e:
+             logger.error(f"UnifiedExecutor loop error: {e}")
+         finally:
+             await self._cleanup_single_process()
+ 
+     async def _cleanup_single_process(self):
+         """Clean up single-process mode resources"""
+         logger.debug("UnifiedExecutor cleaning up...")
+ 
+         # Set the stop-reading flag
+         if hasattr(self.app.ep, '_stop_reading'):
+             self.app.ep._stop_reading = True
+ 
+         # Cancel active tasks
+         if self._active_tasks:
+             logger.debug(f"Cancelling {len(self._active_tasks)} active tasks...")
+             for task in self._active_tasks:
+                 if not task.done():
+                     task.cancel()
+ 
+         if self._active_tasks:
+             try:
+                 await asyncio.wait_for(
+                     asyncio.gather(*self._active_tasks, return_exceptions=True),
+                     timeout=0.2
+                 )
+             except asyncio.TimeoutError:
+                 logger.debug("Some tasks did not complete in time")
+ 
+         # Clean up the ExecutorCore
+         await self.executor_core.cleanup()
+ 
+         # Clean up the event pool
+         if hasattr(self.app.ep, 'cleanup'):
+             try:
+                 self.app.ep.cleanup()
+             except Exception as e:
+                 logger.error(f"Error cleaning up EventPool: {e}")
+ 
+         # Mark the worker as offline
+         if self.app.consumer_manager:
+             try:
+                 self.app.consumer_manager.cleanup()
+                 logger.debug("Worker marked as offline")
+             except Exception as e:
+                 logger.error(f"Error marking worker offline: {e}")
+ 
+         logger.debug("UnifiedExecutor stopped")
+ 
+     def start_multi_process(self, queues: List[str], prefetch_multiplier: int = 100, worker_id: str = None, worker_key: str = None):
+         """Start multi-process mode
+ 
+         Args:
+             queues: List of queues
+             prefetch_multiplier: Prefetch multiplier
+             worker_id: Worker ID (generated by the main process, reused by subprocesses)
+             worker_key: Worker key (generated by the main process, reused by subprocesses)
+         """
+         if self.mode != ExecutionMode.MULTI_PROCESS:
+             raise RuntimeError("start_multi_process() is only for MULTI_PROCESS mode")
+ 
+         self.orchestrator.start(queues, prefetch_multiplier, worker_id, worker_key)
+ 
+     def shutdown(self):
+         """
+         Shut down the executor
+ 
+         Calls the shutdown method appropriate for the current execution mode
+         """
+         if self.mode == ExecutionMode.MULTI_PROCESS:
+             if self.orchestrator:
+                 self.orchestrator.shutdown()
+         elif self.mode == ExecutionMode.SINGLE_PROCESS:
+             # Single-process cleanup happens in _cleanup_single_process;
+             # this is only a placeholder, the actual cleanup is done by the event loop
+             logger.debug("UnifiedExecutor shutdown called in SINGLE_PROCESS mode")
+ 
+ 
+ __all__ = ['UnifiedExecutor']
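
For orientation, here is a minimal usage sketch of the new single-process entry point. It assumes an already-configured jettask application object and an asyncio-based event queue; the construction of the app and the shape of the queued events are defined elsewhere in this release (see jettask/core/app.py and jettask/executor/core.py), so the argument values below are placeholders rather than a verified recipe:

    import asyncio

    from jettask.executor.executor import UnifiedExecutor
    from jettask.executor.core import ExecutionMode

    async def run_worker(app):
        # `app` is a placeholder for a configured jettask application instance
        event_queue = asyncio.Queue()

        executor = UnifiedExecutor(
            event_queue=event_queue,
            app=app,
            concurrency=100,
            mode=ExecutionMode.SINGLE_PROCESS,
            task_name="my_task",  # required in SINGLE_PROCESS mode; must match the events' _task_name
        )

        # loop() drains event_queue, applies rate limiting per task, and runs
        # until app._should_exit is set; cleanup happens in its finally block.
        await executor.loop()
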
jettask/executor/orchestrator.py
@@ -0,0 +1,290 @@
+ """
+ Multi-process orchestrator
+ 
+ Responsibilities:
+ 1. Start and stop subprocesses
+ 2. Monitor subprocess health
+ 3. Automatically restart on failure
+ 
+ Note: uses the system default multiprocessing start method (fork on Linux, spawn on Windows/macOS).
+ - fork: the subprocess inherits the parent's state, which must be cleaned up via cleanup_inherited_state()
+ - spawn: the subprocess is a fresh Python interpreter and inherits no parent state, but every passed object must be picklable
+ """
+ import time
+ import os
+ import signal
+ import logging
+ import multiprocessing
+ from dataclasses import dataclass
+ from typing import Dict, List
+ 
+ logger = logging.getLogger('app')
+ 
+ 
+ @dataclass
+ class ProcessConfig:
+     """Process configuration"""
+     process_id: int
+     redis_url: str
+     redis_prefix: str
+     queues: List[str]
+     tasks: Dict
+     concurrency: int
+     prefetch_multiplier: int
+     max_connections: int
+     consumer_strategy: str
+     consumer_config: Dict
+     worker_id: str
+     worker_key: str
+ 
+ 
+ class ProcessOrchestrator:
+     """
+     Process orchestrator - responsible only for process lifecycle management
+ 
+     Responsibilities:
+     1. Start and stop subprocesses
+     2. Monitor subprocess health
+     3. Automatically restart failed processes
+     """
+ 
+     def __init__(self, app, num_processes: int = 2):
+         """
+         Initialize the process orchestrator
+ 
+         Args:
+             app: Application instance
+             num_processes: Number of processes
+         """
+         self.app = app
+         self.num_processes = num_processes
+         self.processes: Dict[int, multiprocessing.Process] = {}
+         self.process_configs: Dict[int, ProcessConfig] = {}
+         self.shutdown_event = multiprocessing.Event()
+ 
+         # Monitoring configuration
+         self._monitor_interval = 1.0
+         self._restart_delay = 2.0
+         self._max_restart_attempts = 3
+         self._restart_counts: Dict[int, int] = {}
+         self._main_received_signal = False
+         self._shutdown_called = False
+ 
+         logger.debug(f"ProcessOrchestrator initialized with {num_processes} processes")
+ 
+     def _create_process_config(
+         self,
+         process_id: int,
+         queues: List[str],
+         prefetch_multiplier: int
+     ) -> ProcessConfig:
+         """Create a process configuration"""
+         # Copy consumer_config and add the disable_heartbeat_process flag,
+         # because in subprocesses the heartbeat is already managed by the main process's HeartbeatThreadManager
+         consumer_config = dict(self.app.consumer_config or {})
+         consumer_config['disable_heartbeat_process'] = True
+ 
+         return ProcessConfig(
+             process_id=process_id,
+             redis_url=self.app.redis_url,
+             redis_prefix=self.app.redis_prefix,
+             queues=queues,
+             tasks=self.app._tasks,
+             concurrency=10000,  # Per-subprocess internal concurrency
+             prefetch_multiplier=prefetch_multiplier,
+             max_connections=self.app.max_connections,
+             consumer_strategy=self.app.consumer_strategy,
+             consumer_config=consumer_config,
+             worker_id=getattr(self.app, 'worker_id', None),
+             worker_key=getattr(self.app, 'worker_key', None)
+         )
+ 
+     def _start_process(self, process_id: int, config: ProcessConfig) -> multiprocessing.Process:
+         """Start a single subprocess
+ 
+         Note: under fork, the subprocess resets the logging locks in the first line of subprocess_main.
+         """
+         from .process_entry import subprocess_main
+         process = multiprocessing.Process(
+             target=subprocess_main,
+             args=(
+                 config.process_id,
+                 config.redis_url,
+                 config.redis_prefix,
+                 config.queues,
+                 config.tasks,
+                 config.concurrency,
+                 config.prefetch_multiplier,
+                 config.max_connections,
+                 config.consumer_strategy,
+                 config.consumer_config,
+                 config.worker_id,
+                 config.worker_key,
+                 self.shutdown_event
+             ),
+             name=f"JetTask-Worker-{process_id}"
+         )
+         process.start()
+ 
+ 
+         logger.info(f"Started process #{process_id} (PID: {process.pid})")
+         return process
+ 
+     def _restart_process(self, process_id: int):
+         """Restart a failed process"""
+         if self.shutdown_event.is_set():
+             return
+ 
+         restart_count = self._restart_counts.get(process_id, 0)
+         if restart_count >= self._max_restart_attempts:
+             logger.error(f"Process #{process_id} exceeded max restart attempts")
+             return
+ 
+         self._restart_counts[process_id] = restart_count + 1
+         delay = self._restart_delay * (2 ** restart_count)
+ 
+         logger.info(
+             f"Restarting process #{process_id} (attempt {restart_count + 1}) after {delay}s"
+         )
+         time.sleep(delay)
+ 
+         config = self.process_configs[process_id]
+         process = self._start_process(process_id, config)
+         self.processes[process_id] = process
+ 
+     def _monitor_processes(self) -> int:
+         """Monitor subprocess health"""
+         alive_count = 0
+ 
+         for process_id, process in list(self.processes.items()):
+             if process.is_alive():
+                 alive_count += 1
+                 self._restart_counts[process_id] = 0  # Reset the restart counter
+             else:
+                 exit_code = process.exitcode
+ 
+                 if self.shutdown_event.is_set():
+                     logger.debug(f"Process #{process_id} stopped during shutdown")
+                 elif exit_code in (-15, -2):  # SIGTERM, SIGINT
+                     logger.info(f"Process #{process_id} received termination signal")
+                     self.shutdown_event.set()
+                 elif exit_code == 0:
+                     if not self._main_received_signal:
+                         # Check whether all processes have stopped
+                         all_stopped = all(not p.is_alive() for p in self.processes.values())
+                         if all_stopped:
+                             logger.info("All processes stopped simultaneously")
+                             self.shutdown_event.set()
+                         else:
+                             logger.warning(f"Process #{process_id} stopped unexpectedly")
+                             self._restart_process(process_id)
+                 else:
+                     logger.error(f"Process #{process_id} exited with code {exit_code}")
+                     self._restart_process(process_id)
+ 
+         return alive_count
+ 
+     def start(
+         self,
+         queues: List[str],
+         prefetch_multiplier: int = 100,
+         worker_ids: list = None,
+     ):
+         """Start all processes
+ 
+         Args:
+             queues: List of queues
+             prefetch_multiplier: Prefetch multiplier
+         """
+         logger.info(f"Starting {self.num_processes} worker processes")
+ 
+         # Install the main-process signal handlers
+         def signal_handler(signum, frame):
+             logger.info(f"Main process received signal {signum}")
+             self._main_received_signal = True
+             self.shutdown_event.set()
+ 
+         signal.signal(signal.SIGINT, signal_handler)
+         signal.signal(signal.SIGTERM, signal_handler)
+ 
+         try:
+             # Start all processes
+             for i, (worker_id, worker_key) in enumerate(worker_ids):
+                 config = self._create_process_config(i, queues, prefetch_multiplier)
+                 if worker_id:
+                     config.worker_id = worker_id
+                     config.worker_key = worker_key
+ 
+                 self.process_configs[i] = config
+                 process = self._start_process(i, config)
+                 self.processes[i] = process
+ 
+                 time.sleep(0.1)  # Stagger the start times
+ 
+             logger.info(f"All {self.num_processes} processes started")
+ 
+             # Monitoring loop
+             while not self.shutdown_event.is_set():
+                 alive_count = self._monitor_processes()
+ 
+                 if alive_count == 0:
+                     if self._main_received_signal or self.shutdown_event.is_set():
+                         logger.info("All processes stopped during shutdown")
+                     else:
+                         logger.error("All processes stopped unexpectedly")
+                     break
+ 
+                 time.sleep(self._monitor_interval)
+ 
+         except KeyboardInterrupt:
+             logger.info("Main process received KeyboardInterrupt")
+             self._main_received_signal = True
+             self.shutdown_event.set()
+         except Exception as e:
+             logger.error(f"ProcessOrchestrator error: {e}", exc_info=True)
+         finally:
+             self.shutdown()
+ 
+     def shutdown(self):
+         """Shut down all processes"""
+         if self._shutdown_called:
+             return
+         self._shutdown_called = True
+ 
+         logger.info("Shutting down ProcessOrchestrator")
+ 
+         self.shutdown_event.set()
+ 
+         # Force-kill all processes
+         for process_id, process in self.processes.items():
+             if process.is_alive():
+                 logger.info(f"Force killing process #{process_id} (PID: {process.pid})")
+                 try:
+                     os.kill(process.pid, signal.SIGKILL)
+                 except Exception as e:
+                     logger.warning(f"Error killing process #{process_id}: {e}")
+                     try:
+                         process.kill()
+                     except:
+                         pass
+ 
+         # Wait for processes to exit (at most 1 second)
+         start_time = time.time()
+         for process_id, process in self.processes.items():
+             remaining = max(0, 1.0 - (time.time() - start_time))
+             try:
+                 process.join(timeout=remaining)
+             except:
+                 pass
+ 
+             if process.is_alive():
+                 logger.warning(f"Process #{process_id} still alive after SIGKILL")
+ 
+         self.processes.clear()
+         self.process_configs.clear()
+         self._restart_counts.clear()
+ 
+         logger.info("ProcessOrchestrator shutdown complete")
+ 
+ 
+ __all__ = ['ProcessOrchestrator', 'ProcessConfig']
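
Two behavioral details of ProcessOrchestrator are worth noting from the code above: start() iterates worker_ids as (worker_id, worker_key) pairs, one per subprocess, and then blocks in the monitoring loop until shutdown; failed processes are restarted with exponential backoff (2 s, 4 s, 8 s under the defaults _restart_delay=2.0 and _max_restart_attempts=3). A minimal invocation sketch, again treating the app object and the worker id/key values as placeholders rather than values jettask would generate:

    from jettask.executor.orchestrator import ProcessOrchestrator

    # `app` is a placeholder for a configured jettask application instance
    orchestrator = ProcessOrchestrator(app, num_processes=2)

    # One (worker_id, worker_key) pair per subprocess; these values are
    # hypothetical - in jettask they come from the main process.
    orchestrator.start(
        queues=["default"],
        prefetch_multiplier=100,
        worker_ids=[("worker-0", "worker:key:0"), ("worker-1", "worker:key:1")],
    )
    # start() blocks, monitoring and restarting subprocesses, until a
    # SIGINT/SIGTERM arrives or all subprocesses exit; it then calls shutdown().
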