jettask 0.2.19__py3-none-any.whl → 0.2.23__py3-none-any.whl

This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (165)
  1. jettask/__init__.py +12 -3
  2. jettask/cli.py +314 -228
  3. jettask/config/__init__.py +9 -1
  4. jettask/config/config.py +245 -0
  5. jettask/config/env_loader.py +381 -0
  6. jettask/config/lua_scripts.py +158 -0
  7. jettask/config/nacos_config.py +132 -5
  8. jettask/core/__init__.py +1 -1
  9. jettask/core/app.py +1573 -666
  10. jettask/core/app_importer.py +33 -16
  11. jettask/core/container.py +532 -0
  12. jettask/core/task.py +1 -4
  13. jettask/core/unified_manager_base.py +2 -2
  14. jettask/executor/__init__.py +38 -0
  15. jettask/executor/core.py +625 -0
  16. jettask/executor/executor.py +338 -0
  17. jettask/executor/orchestrator.py +290 -0
  18. jettask/executor/process_entry.py +638 -0
  19. jettask/executor/task_executor.py +317 -0
  20. jettask/messaging/__init__.py +68 -0
  21. jettask/messaging/event_pool.py +2188 -0
  22. jettask/messaging/reader.py +519 -0
  23. jettask/messaging/registry.py +266 -0
  24. jettask/messaging/scanner.py +369 -0
  25. jettask/messaging/sender.py +312 -0
  26. jettask/persistence/__init__.py +118 -0
  27. jettask/persistence/backlog_monitor.py +567 -0
  28. jettask/{backend/data_access.py → persistence/base.py} +58 -57
  29. jettask/persistence/consumer.py +315 -0
  30. jettask/{core → persistence}/db_manager.py +23 -22
  31. jettask/persistence/maintenance.py +81 -0
  32. jettask/persistence/message_consumer.py +259 -0
  33. jettask/{backend/namespace_data_access.py → persistence/namespace.py} +66 -98
  34. jettask/persistence/offline_recovery.py +196 -0
  35. jettask/persistence/queue_discovery.py +215 -0
  36. jettask/persistence/task_persistence.py +218 -0
  37. jettask/persistence/task_updater.py +583 -0
  38. jettask/scheduler/__init__.py +2 -2
  39. jettask/scheduler/loader.py +6 -5
  40. jettask/scheduler/run_scheduler.py +1 -1
  41. jettask/scheduler/scheduler.py +7 -7
  42. jettask/scheduler/{unified_scheduler_manager.py → scheduler_coordinator.py} +18 -13
  43. jettask/task/__init__.py +16 -0
  44. jettask/{router.py → task/router.py} +26 -8
  45. jettask/task/task_center/__init__.py +9 -0
  46. jettask/task/task_executor.py +318 -0
  47. jettask/task/task_registry.py +291 -0
  48. jettask/test_connection_monitor.py +73 -0
  49. jettask/utils/__init__.py +31 -1
  50. jettask/{monitor/run_backlog_collector.py → utils/backlog_collector.py} +1 -1
  51. jettask/utils/db_connector.py +1629 -0
  52. jettask/{db_init.py → utils/db_init.py} +1 -1
  53. jettask/utils/rate_limit/__init__.py +30 -0
  54. jettask/utils/rate_limit/concurrency_limiter.py +665 -0
  55. jettask/utils/rate_limit/config.py +145 -0
  56. jettask/utils/rate_limit/limiter.py +41 -0
  57. jettask/utils/rate_limit/manager.py +269 -0
  58. jettask/utils/rate_limit/qps_limiter.py +154 -0
  59. jettask/utils/rate_limit/task_limiter.py +384 -0
  60. jettask/utils/serializer.py +3 -0
  61. jettask/{monitor/stream_backlog_monitor.py → utils/stream_backlog.py} +14 -6
  62. jettask/utils/time_sync.py +173 -0
  63. jettask/webui/__init__.py +27 -0
  64. jettask/{api/v1 → webui/api}/alerts.py +1 -1
  65. jettask/{api/v1 → webui/api}/analytics.py +2 -2
  66. jettask/{api/v1 → webui/api}/namespaces.py +1 -1
  67. jettask/{api/v1 → webui/api}/overview.py +1 -1
  68. jettask/{api/v1 → webui/api}/queues.py +3 -3
  69. jettask/{api/v1 → webui/api}/scheduled.py +1 -1
  70. jettask/{api/v1 → webui/api}/settings.py +1 -1
  71. jettask/{api.py → webui/app.py} +253 -145
  72. jettask/webui/namespace_manager/__init__.py +10 -0
  73. jettask/{multi_namespace_consumer.py → webui/namespace_manager/multi.py} +69 -22
  74. jettask/{unified_consumer_manager.py → webui/namespace_manager/unified.py} +1 -1
  75. jettask/{run.py → webui/run.py} +2 -2
  76. jettask/{services → webui/services}/__init__.py +1 -3
  77. jettask/{services → webui/services}/overview_service.py +34 -16
  78. jettask/{services → webui/services}/queue_service.py +1 -1
  79. jettask/{backend → webui/services}/queue_stats_v2.py +1 -1
  80. jettask/{services → webui/services}/settings_service.py +1 -1
  81. jettask/worker/__init__.py +53 -0
  82. jettask/worker/lifecycle.py +1507 -0
  83. jettask/worker/manager.py +583 -0
  84. jettask/{core/offline_worker_recovery.py → worker/recovery.py} +268 -175
  85. {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/METADATA +2 -71
  86. jettask-0.2.23.dist-info/RECORD +145 -0
  87. jettask/__main__.py +0 -140
  88. jettask/api/__init__.py +0 -103
  89. jettask/backend/__init__.py +0 -1
  90. jettask/backend/api/__init__.py +0 -3
  91. jettask/backend/api/v1/__init__.py +0 -17
  92. jettask/backend/api/v1/monitoring.py +0 -431
  93. jettask/backend/api/v1/namespaces.py +0 -504
  94. jettask/backend/api/v1/queues.py +0 -342
  95. jettask/backend/api/v1/tasks.py +0 -367
  96. jettask/backend/core/__init__.py +0 -3
  97. jettask/backend/core/cache.py +0 -221
  98. jettask/backend/core/database.py +0 -200
  99. jettask/backend/core/exceptions.py +0 -102
  100. jettask/backend/dependencies.py +0 -261
  101. jettask/backend/init_meta_db.py +0 -158
  102. jettask/backend/main.py +0 -1426
  103. jettask/backend/main_unified.py +0 -78
  104. jettask/backend/main_v2.py +0 -394
  105. jettask/backend/models/__init__.py +0 -3
  106. jettask/backend/models/requests.py +0 -236
  107. jettask/backend/models/responses.py +0 -230
  108. jettask/backend/namespace_api_old.py +0 -267
  109. jettask/backend/services/__init__.py +0 -3
  110. jettask/backend/start.py +0 -42
  111. jettask/backend/unified_api_router.py +0 -1541
  112. jettask/cleanup_deprecated_tables.sql +0 -16
  113. jettask/core/consumer_manager.py +0 -1695
  114. jettask/core/delay_scanner.py +0 -256
  115. jettask/core/event_pool.py +0 -1700
  116. jettask/core/heartbeat_process.py +0 -222
  117. jettask/core/task_batch.py +0 -153
  118. jettask/core/worker_scanner.py +0 -271
  119. jettask/executors/__init__.py +0 -5
  120. jettask/executors/asyncio.py +0 -876
  121. jettask/executors/base.py +0 -30
  122. jettask/executors/common.py +0 -148
  123. jettask/executors/multi_asyncio.py +0 -309
  124. jettask/gradio_app.py +0 -570
  125. jettask/integrated_gradio_app.py +0 -1088
  126. jettask/main.py +0 -0
  127. jettask/monitoring/__init__.py +0 -3
  128. jettask/pg_consumer.py +0 -1896
  129. jettask/run_monitor.py +0 -22
  130. jettask/run_webui.py +0 -148
  131. jettask/scheduler/multi_namespace_scheduler.py +0 -294
  132. jettask/scheduler/unified_manager.py +0 -450
  133. jettask/task_center_client.py +0 -150
  134. jettask/utils/serializer_optimized.py +0 -33
  135. jettask/webui_exceptions.py +0 -67
  136. jettask-0.2.19.dist-info/RECORD +0 -150
  137. /jettask/{constants.py → config/constants.py} +0 -0
  138. /jettask/{backend/config.py → config/task_center.py} +0 -0
  139. /jettask/{pg_consumer → messaging/pg_consumer}/pg_consumer_v2.py +0 -0
  140. /jettask/{pg_consumer → messaging/pg_consumer}/sql/add_execution_time_field.sql +0 -0
  141. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_new_tables.sql +0 -0
  142. /jettask/{pg_consumer → messaging/pg_consumer}/sql/create_tables_v3.sql +0 -0
  143. /jettask/{pg_consumer → messaging/pg_consumer}/sql/migrate_to_new_structure.sql +0 -0
  144. /jettask/{pg_consumer → messaging/pg_consumer}/sql/modify_time_fields.sql +0 -0
  145. /jettask/{pg_consumer → messaging/pg_consumer}/sql_utils.py +0 -0
  146. /jettask/{models.py → persistence/models.py} +0 -0
  147. /jettask/scheduler/{manager.py → task_crud.py} +0 -0
  148. /jettask/{schema.sql → schemas/schema.sql} +0 -0
  149. /jettask/{task_center.py → task/task_center/client.py} +0 -0
  150. /jettask/{monitoring → utils}/file_watcher.py +0 -0
  151. /jettask/{services/redis_monitor_service.py → utils/redis_monitor.py} +0 -0
  152. /jettask/{api/v1 → webui/api}/__init__.py +0 -0
  153. /jettask/{webui_config.py → webui/config.py} +0 -0
  154. /jettask/{webui_models → webui/models}/__init__.py +0 -0
  155. /jettask/{webui_models → webui/models}/namespace.py +0 -0
  156. /jettask/{services → webui/services}/alert_service.py +0 -0
  157. /jettask/{services → webui/services}/analytics_service.py +0 -0
  158. /jettask/{services → webui/services}/scheduled_task_service.py +0 -0
  159. /jettask/{services → webui/services}/task_service.py +0 -0
  160. /jettask/{webui_sql → webui/sql}/batch_upsert_functions.sql +0 -0
  161. /jettask/{webui_sql → webui/sql}/verify_database.sql +0 -0
  162. {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/WHEEL +0 -0
  163. {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/entry_points.txt +0 -0
  164. {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/licenses/LICENSE +0 -0
  165. {jettask-0.2.19.dist-info → jettask-0.2.23.dist-info}/top_level.txt +0 -0
jettask/executors/base.py DELETED
@@ -1,30 +0,0 @@
- from collections import deque
- from abc import ABC, abstractmethod
- import time
- from typing import TYPE_CHECKING
-
- from .common import CommonExecutorMixin
-
- if TYPE_CHECKING:
-     from ..core.app import Jettask
-
-
- class BaseExecutor(CommonExecutorMixin, ABC):
-     """Base class for all executors"""
-
-     def __init__(self, event_queue: deque, app: "Jettask", concurrency: int = 1) -> None:
-         self.event_queue = event_queue
-         self.app = app
-         self.concurrency = concurrency
-         self.last_refresh_pending_time = 0
-         self.pedding_count = 0
-         self.batch_counter = 0
-
-     def logic(self, *args, **kwargs):
-         """Process a single task"""
-         pass
-
-     @abstractmethod
-     def loop(self):
-         """Main loop for the executor"""
-         pass
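
For reference, a minimal sketch (illustrative only, not part of the jettask package) of how the deleted BaseExecutor contract was consumed: a concrete subclass implements the abstract loop() and typically routes each queued event through logic().

# Hypothetical subclass, assuming the deleted BaseExecutor above is importable.
class EchoExecutor(BaseExecutor):
    def logic(self, data: bytes):
        # A real executor would parse the payload and run the matching task here.
        print(f"processing {data!r}")

    def loop(self):
        # Minimal main loop: drain the shared event queue until it is empty.
        while self.event_queue:
            self.logic(self.event_queue.popleft())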
jettask/executors/common.py DELETED
@@ -1,148 +0,0 @@
- import time
- import traceback
- from ..utils.serializer import dumps_str, loads_str
- from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
- from ..utils.traceback_filter import filter_framework_traceback
-
- if TYPE_CHECKING:
-     from ..core.app import Jettask
-     from ..core.task import Task
-
-
- class CommonExecutorMixin:
-     """Mixin class containing common functionality for all executors"""
-
-     app: "Jettask"
-     last_refresh_pending_time: float
-     pedding_count: int
-
-     def parse_task_data(self, data: bytes) -> Tuple[str, str, str, list, dict]:
-         """Parse task data from bytes"""
-         data_str = data.decode("utf-8")
-         parts = data_str.split("_____", 4)
-         event_id = parts[0]
-         task_name = parts[1]
-         trigger_time = parts[2]
-         args = loads_str(parts[3]) if parts[3] else []
-         kwargs = loads_str(parts[4]) if parts[4] else {}
-         return event_id, task_name, trigger_time, args, kwargs
-
-     def get_task(self, task_name: str) -> Optional["Task"]:
-         """Get task by name"""
-         return self.app.get_task_by_name(task_name)
-
-     def handle_task_error(self, event_id: str, task_name: str, error: Exception) -> Dict[str, Any]:
-         """Handle task execution error"""
-         # Use the filtered traceback
-         error_msg = filter_framework_traceback()
-         result = {
-             "status": "error",
-             "task_name": task_name,
-             "message": str(error),
-             "traceback": error_msg
-         }
-         self.app.set_task_status(event_id, "error")
-         self.app.set_data(event_id, dumps_str(result))
-         return result
-
-     def handle_task_success(self, event_id: str, task_name: str, result: Any) -> Dict[str, Any]:
-         """Handle successful task execution"""
-         result_data = {
-             "status": "success",
-             "task_name": task_name,
-             "result": result
-         }
-         self.app.set_task_status(event_id, "success")
-         self.app.set_data(event_id, dumps_str(result_data))
-         return result_data
-
-     def ack_message(self, data: bytes) -> None:
-         """Acknowledge message processing"""
-         self.app.ep.ack(data)
-
-     def get_routing_data(self, kwargs: dict) -> Tuple[Optional[str], Optional[str]]:
-         """Extract routing data from kwargs"""
-         routing_key = kwargs.pop("routing_key", None)
-         aggregation_key = kwargs.pop("aggregation_key", None)
-         return routing_key, aggregation_key
-
-     def should_process_routing(self, routing_key: Optional[str], aggregation_key: Optional[str]) -> bool:
-         """Check if task should be processed based on routing"""
-         if routing_key:
-             # Check solo running state
-             if self.app.is_solo_running_by_aggregation_key(aggregation_key):
-                 return False
-             # Set solo running state
-             self.app.set_solo_running_by_aggregation_key(aggregation_key)
-         return True
-
-     def clear_routing_state(self, routing_key: Optional[str], aggregation_key: Optional[str]) -> None:
-         """Clear routing state after task completion"""
-         if routing_key:
-             self.app.clear_solo_running_by_aggregation_key(aggregation_key)
-
-     def handle_urgent_retry(self, kwargs: dict, event_id: str, task_name: str,
-                             trigger_time: str, args: list) -> bool:
-         """Handle urgent retry logic"""
-         urgent_retry = kwargs.pop("urgent_retry", None)
-         if urgent_retry:
-             delay = kwargs.pop("delay", 0)
-             self.app.delay(
-                 task_name=task_name,
-                 delay=delay,
-                 args=args,
-                 kwargs=kwargs,
-                 task_id=event_id,
-                 trigger_time=trigger_time,
-                 producer_type="urgent_retry",
-             )
-             self.app.set_task_status(event_id, "retry")
-             return True
-         return False
-
-     def get_pending_count(self) -> int:
-         """Get pending count with caching"""
-         current_time = time.time()
-         if current_time - self.last_refresh_pending_time > 1:
-             self.pedding_count = self.app.ep.get_pending_count()
-             self.last_refresh_pending_time = current_time
-         return self.pedding_count
-
-     def execute_task_lifecycle(self, task: "Task", event_id: str, trigger_time: str,
-                                args: list, kwargs: dict, is_async: bool = False):
-         """Execute task with lifecycle methods"""
-         # This method will be implemented differently for sync/async executors
-         raise NotImplementedError("Subclasses must implement execute_task_lifecycle")
-
-     def format_batch_data(self, data: bytes) -> Tuple[str, str, list, dict]:
-         """Parse batch task data"""
-         data_str = data.decode("utf-8")
-         parts = data_str.split("_____", 3)
-         batch_id = parts[0]
-         task_name = parts[1]
-         args = loads_str(parts[2]) if parts[2] else []
-         kwargs = loads_str(parts[3]) if parts[3] else {}
-         return batch_id, task_name, args, kwargs
-
-     def extract_batch_params(self, kwargs: dict) -> Tuple[list, list, str, Optional[str], Optional[str]]:
-         """Extract batch-specific parameters from kwargs"""
-         event_ids = kwargs.pop("event_ids", [])
-         trigger_times = kwargs.pop("trigger_times", [])
-         producer_type = kwargs.pop("producer_type", "normal")
-         routing_key = kwargs.pop("routing_key", None)
-         aggregation_key = kwargs.pop("aggregation_key", None)
-         return event_ids, trigger_times, producer_type, routing_key, aggregation_key
-
-     def ack_batch_events(self, event_ids: list) -> None:
-         """Acknowledge multiple events in batch"""
-         for event_id in event_ids:
-             data = f"{event_id}_____batch_ack".encode()
-             self.ack_message(data)
-
-     def update_batch_status(self, event_ids: list, status: str, result: Any = None) -> None:
-         """Update status for multiple events in batch"""
-         result_str = dumps_str(result) if result is not None else None
-         for event_id in event_ids:
-             self.app.set_task_status(event_id, status)
-             if result_str:
-                 self.app.set_data(event_id, result_str)
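
The deleted mixin's parse_task_data() and format_batch_data() both decode a "_____"-delimited payload. Below is a self-contained sketch of that round trip; json stands in for the package's dumps_str/loads_str serializer, and encode_task is a hypothetical helper added here for illustration.

import json

def encode_task(event_id: str, task_name: str, trigger_time: str,
                args: list, kwargs: dict) -> bytes:
    # Join the five fields with the "_____" delimiter, serializing
    # args and kwargs the way dumps_str presumably would.
    parts = [event_id, task_name, trigger_time,
             json.dumps(args), json.dumps(kwargs)]
    return "_____".join(parts).encode("utf-8")

def decode_task(data: bytes):
    # Mirrors parse_task_data: split on at most four delimiters, so the
    # trailing kwargs field may itself safely contain "_____".
    event_id, task_name, trigger_time, raw_args, raw_kwargs = \
        data.decode("utf-8").split("_____", 4)
    return (event_id, task_name, trigger_time,
            json.loads(raw_args) if raw_args else [],
            json.loads(raw_kwargs) if raw_kwargs else {})

payload = encode_task("evt-1", "send_email", "1700000000", ["alice"], {"retries": 2})
assert decode_task(payload) == ("evt-1", "send_email", "1700000000",
                                ["alice"], {"retries": 2})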
jettask/executors/multi_asyncio.py DELETED
@@ -1,309 +0,0 @@
- import multiprocessing
- import logging
- import time
- import os
- import signal
- from typing import Dict, List
- from multiprocessing import Process, Event
- from dataclasses import dataclass
-
- from .base import BaseExecutor
-
- logger = logging.getLogger('app')
-
-
- @dataclass
- class ProcessConfig:
-     """Configuration for an AsyncIO executor process"""
-     executor_id: int
-     redis_url: str
-     queues: List[str]
-     app_tasks: Dict
-     consumer_strategy: str
-     consumer_config: Dict
-     max_connections: int
-     prefetch_multiplier: int
-     concurrency_per_process: int = 10000
-
-
- class MultiAsyncioExecutor(BaseExecutor):
-     """
-     Multi-asyncio executor that manages multiple AsyncioExecutor instances.
-     Each instance runs in its own process with its own event loop.
-
-     Features:
-     - Automatic process restart on failure
-     - Graceful shutdown with timeout
-     - Process health monitoring
-     - Configurable concurrency per process
-     """
-
-     def __init__(self, event_queue, app, concurrency=3):
-         super().__init__(event_queue, app, concurrency)
-         self.processes: Dict[int, Process] = {}
-         self.process_configs: Dict[int, ProcessConfig] = {}
-         self.shutdown_event = Event()
-         self._monitor_interval = 1  # seconds
-         self._status_log_interval = 30  # seconds
-         self._restart_delay = 2  # seconds
-         self._max_restart_attempts = 3
-         self._restart_counts: Dict[int, int] = {}
-         self._main_received_signal = False  # Track if main process received signal
-
-     def logic(self):
-         """
-         Logic method for the BaseExecutor interface.
-         Not used in MultiAsyncioExecutor, which delegates to AsyncioExecutor instances.
-         """
-         pass
-
-     @staticmethod
-     def _run_asyncio_executor(config: ProcessConfig, shutdown_event):
-         """Run an AsyncioExecutor in a separate process"""
-         # Set process name for debugging
-         multiprocessing.current_process().name = f"AsyncioExecutor-{config.executor_id}"
-
-         # Configure logging for the subprocess
-         logging.basicConfig(
-             level=logging.INFO,
-             format=f"%(asctime)s - %(levelname)s - [Executor-{config.executor_id}] - %(message)s",
-             datefmt="%Y-%m-%d %H:%M:%S",
-         )
-         logger = logging.getLogger('app')
-
-         # Handle signals gracefully
-         def signal_handler(signum, _frame):
-             logger.info(f"AsyncioExecutor #{config.executor_id} received signal {signum}")
-             shutdown_event.set()
-
-         signal.signal(signal.SIGINT, signal_handler)
-         signal.signal(signal.SIGTERM, signal_handler)
-
-         try:
-             # Import inside the process to avoid pickle issues
-             from ..core.app import Jettask
-
-             # Create an app instance for this process
-             app = Jettask(
-                 redis_url=config.redis_url,
-                 max_connections=config.max_connections,
-                 consumer_strategy=config.consumer_strategy,
-                 consumer_config=config.consumer_config,
-                 tasks=config.app_tasks
-             )
-
-             logger.info(f"Started AsyncioExecutor #{config.executor_id} in process {os.getpid()}")
-
-             # Check the shutdown event before starting
-             if shutdown_event.is_set():
-                 logger.info(f"AsyncioExecutor #{config.executor_id} shutdown before start")
-                 return
-
-             # Start the executor
-             app._start(
-                 execute_type="asyncio",
-                 queues=config.queues,
-                 concurrency=config.concurrency_per_process,
-                 prefetch_multiplier=config.prefetch_multiplier
-             )
-
-         except KeyboardInterrupt:
-             logger.info(f"AsyncioExecutor #{config.executor_id} received interrupt")
-         except Exception as e:
-             logger.error(f"AsyncioExecutor #{config.executor_id} error: {e}")
-             import traceback
-             traceback.print_exc()
-             raise  # Re-raise to trigger the restart mechanism
-         finally:
-             logger.info(f"AsyncioExecutor #{config.executor_id} stopped")
-
-     def _create_process_config(self, executor_id: int, queues: List[str],
-                                prefetch_multiplier: int) -> ProcessConfig:
-         """Create configuration for a process"""
-         return ProcessConfig(
-             executor_id=executor_id,
-             redis_url=self.app.redis_url,
-             queues=queues,
-             app_tasks=self.app._tasks,
-             consumer_strategy=self.app.consumer_strategy,
-             consumer_config=self.app.consumer_config,
-             max_connections=self.app.max_connections,
-             prefetch_multiplier=prefetch_multiplier,
-             concurrency_per_process=10000
-         )
-
-     def _start_process(self, executor_id: int, config: ProcessConfig) -> Process:
-         """Start a single AsyncioExecutor process"""
-         process = Process(
-             target=self._run_asyncio_executor,
-             args=(config, self.shutdown_event),
-             name=f"AsyncioExecutor-{executor_id}"
-         )
-         process.start()
-         logger.info(f"Started AsyncioExecutor process #{executor_id} (PID: {process.pid})")
-         return process
-
-     def _restart_process(self, executor_id: int):
-         """Restart a failed process with exponential backoff"""
-         if self.shutdown_event.is_set():
-             return
-
-         restart_count = self._restart_counts.get(executor_id, 0)
-         if restart_count >= self._max_restart_attempts:
-             logger.error(f"Process #{executor_id} exceeded max restart attempts ({self._max_restart_attempts})")
-             return
-
-         self._restart_counts[executor_id] = restart_count + 1
-         delay = self._restart_delay * (2 ** restart_count)  # Exponential backoff
-
-         logger.info(f"Restarting process #{executor_id} (attempt {restart_count + 1}/{self._max_restart_attempts}) "
-                     f"after {delay}s delay")
-         time.sleep(delay)
-
-         config = self.process_configs[executor_id]
-         process = self._start_process(executor_id, config)
-         self.processes[executor_id] = process
-
-     def _monitor_processes(self):
-         """Monitor process health and restart failed processes"""
-         alive_count = 0
-         for executor_id, process in list(self.processes.items()):
-             if process.is_alive():
-                 alive_count += 1
-                 # Reset restart count for healthy processes
-                 self._restart_counts[executor_id] = 0
-             else:
-                 exit_code = process.exitcode
-                 # If we are shutting down, do not restart the process
-                 if self.shutdown_event.is_set():
-                     logger.info(f"Process #{executor_id} stopped during shutdown")
-                 # If the process received SIGTERM (exit_code == -15) or SIGINT (exit_code == -2), treat it as a normal shutdown
-                 elif exit_code == -15 or exit_code == -2:
-                     logger.info(f"Process #{executor_id} received termination signal, marking as shutdown")
-                     # Set the shutdown event so other processes are not restarted
-                     self.shutdown_event.set()
-                 # If all processes stopped at the same time (clean exit), assume a shutdown signal was received
-                 elif exit_code == 0 or exit_code is None:
-                     # If the main process has already received a signal, do not restart
-                     if self._main_received_signal:
-                         logger.info(f"Process #{executor_id} stopped after main received signal")
-                     else:
-                         # Check whether all processes have stopped
-                         all_stopped = True
-                         for _, p in self.processes.items():
-                             if p.is_alive():
-                                 all_stopped = False
-                                 break
-
-                         if all_stopped:
-                             logger.info(f"All processes stopped simultaneously, marking as shutdown")
-                             self.shutdown_event.set()
-                         else:
-                             logger.warning(f"Process #{executor_id} stopped unexpectedly with exit code {exit_code}")
-                             self._restart_process(executor_id)
-                 else:
-                     logger.warning(f"Process #{executor_id} exited with code {exit_code}")
-                     self._restart_process(executor_id)
-
-         return alive_count
-
-     def loop(self):
-         """Main loop that starts and monitors AsyncioExecutor processes"""
-         logger.info(f"Starting MultiAsyncioExecutor with {self.concurrency} asyncio processes")
-
-         # Set up a signal handler to track when the main process receives a signal
-         def signal_handler(signum, frame):
-             logger.info(f"MultiAsyncioExecutor received signal {signum}")
-             self._main_received_signal = True
-             self.shutdown_event.set()
-
-         # Register signal handlers
-         signal.signal(signal.SIGINT, signal_handler)
-         signal.signal(signal.SIGTERM, signal_handler)
-
-         try:
-             # Get configuration
-             queues = getattr(self.app.ep, 'queues', ['robust_bench'])
-             prefetch_multiplier = getattr(self, 'prefetch_multiplier', 100)
-
-             # Start AsyncioExecutor processes
-             for i in range(self.concurrency):
-                 config = self._create_process_config(i, queues, prefetch_multiplier)
-                 self.process_configs[i] = config
-
-                 process = self._start_process(i, config)
-                 self.processes[i] = process
-
-                 # Small delay to avoid a thundering herd
-                 time.sleep(0.1)
-
-             logger.info(f"All {self.concurrency} AsyncioExecutor processes started")
-
-             # Monitor executor processes
-             last_status_log = time.time()
-
-             while not self.shutdown_event.is_set():
-                 # Monitor process health
-                 alive_count = self._monitor_processes()
-
-                 if alive_count == 0:
-                     if self._main_received_signal or self.shutdown_event.is_set():
-                         logger.info("All AsyncioExecutor processes have stopped during shutdown")
-                     else:
-                         logger.error("All AsyncioExecutor processes have stopped unexpectedly")
-                     break
-
-                 # Log status periodically
-                 current_time = time.time()
-                 if current_time - last_status_log > self._status_log_interval:
-                     dead_count = self.concurrency - alive_count
-                     logger.info(f"MultiAsyncioExecutor status - Active: {alive_count}/{self.concurrency}, "
-                                 f"Dead: {dead_count}, Restart attempts: {sum(self._restart_counts.values())}")
-                     last_status_log = current_time
-
-                 time.sleep(self._monitor_interval)
-
-         except KeyboardInterrupt:
-             logger.info("MultiAsyncioExecutor received KeyboardInterrupt")
-             self._main_received_signal = True
-             self.shutdown_event.set()
-         except Exception as e:
-             logger.error(f"MultiAsyncioExecutor error: {e}")
-             import traceback
-             traceback.print_exc()
-         finally:
-             self.shutdown()
-
-     def shutdown(self):
-         """Gracefully shut down all executor processes"""
-         logger.info("Shutting down MultiAsyncioExecutor...")
-
-         # Signal shutdown - this prevents any restart attempts
-         self.shutdown_event.set()
-
-         # Fast shutdown mode - keep wait times short
-         shutdown_timeout = 1.0  # reduced to 1 second
-         terminate_timeout = 0.5  # reduced to 0.5 seconds
-
-         # First send a termination signal to every process
-         for executor_id, process in self.processes.items():
-             if process.is_alive():
-                 logger.info(f"Sending TERM signal to process #{executor_id} (PID: {process.pid})")
-                 process.terminate()
-
-         # Wait briefly for all processes to exit on their own
-         time.sleep(shutdown_timeout)
-
-         # Force-kill any processes that are still running
-         for executor_id, process in self.processes.items():
-             if process.is_alive():
-                 logger.warning(f"Process #{executor_id} did not terminate, killing...")
-                 process.kill()
-                 # Do not wait for join; let the operating system clean up
-
-         # Clear process tracking
-         self.processes.clear()
-         self.process_configs.clear()
-         self._restart_counts.clear()
-
-         logger.info("MultiAsyncioExecutor shutdown complete")
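
The restart policy in the deleted _restart_process() backs off exponentially: delay = _restart_delay * 2 ** restart_count, for at most _max_restart_attempts restarts. A quick standalone check with the class defaults (_restart_delay = 2, _max_restart_attempts = 3):

# Worked example of the back-off schedule from _restart_process() above.
RESTART_DELAY = 2         # matches self._restart_delay
MAX_RESTART_ATTEMPTS = 3  # matches self._max_restart_attempts

for attempt in range(MAX_RESTART_ATTEMPTS):
    print(f"restart {attempt + 1}: wait {RESTART_DELAY * 2 ** attempt}s")
# restart 1: wait 2s
# restart 2: wait 4s
# restart 3: wait 8s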