digitalkin 0.3.1.dev1__py3-none-any.whl → 0.3.2a2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
- base_server/server_async_insecure.py +6 -5
- base_server/server_async_secure.py +6 -5
- base_server/server_sync_insecure.py +5 -4
- base_server/server_sync_secure.py +5 -4
- digitalkin/__version__.py +1 -1
- digitalkin/core/job_manager/base_job_manager.py +1 -1
- digitalkin/core/job_manager/single_job_manager.py +78 -36
- digitalkin/core/job_manager/taskiq_broker.py +8 -7
- digitalkin/core/job_manager/taskiq_job_manager.py +9 -5
- digitalkin/core/task_manager/base_task_manager.py +3 -1
- digitalkin/core/task_manager/surrealdb_repository.py +13 -7
- digitalkin/core/task_manager/task_executor.py +27 -10
- digitalkin/core/task_manager/task_session.py +133 -101
- digitalkin/grpc_servers/module_server.py +95 -171
- digitalkin/grpc_servers/module_servicer.py +133 -27
- digitalkin/grpc_servers/utils/grpc_client_wrapper.py +36 -10
- digitalkin/grpc_servers/utils/utility_schema_extender.py +106 -0
- digitalkin/models/__init__.py +1 -1
- digitalkin/models/core/job_manager_models.py +0 -8
- digitalkin/models/core/task_monitor.py +23 -1
- digitalkin/models/grpc_servers/models.py +95 -8
- digitalkin/models/module/__init__.py +26 -13
- digitalkin/models/module/base_types.py +61 -0
- digitalkin/models/module/module_context.py +279 -13
- digitalkin/models/module/module_types.py +29 -109
- digitalkin/models/module/setup_types.py +547 -0
- digitalkin/models/module/tool_cache.py +230 -0
- digitalkin/models/module/tool_reference.py +160 -0
- digitalkin/models/module/utility.py +167 -0
- digitalkin/models/services/cost.py +22 -1
- digitalkin/models/services/registry.py +77 -0
- digitalkin/modules/__init__.py +5 -1
- digitalkin/modules/_base_module.py +253 -90
- digitalkin/modules/archetype_module.py +6 -1
- digitalkin/modules/tool_module.py +6 -1
- digitalkin/modules/triggers/__init__.py +8 -0
- digitalkin/modules/triggers/healthcheck_ping_trigger.py +45 -0
- digitalkin/modules/triggers/healthcheck_services_trigger.py +63 -0
- digitalkin/modules/triggers/healthcheck_status_trigger.py +52 -0
- digitalkin/services/__init__.py +4 -0
- digitalkin/services/communication/__init__.py +7 -0
- digitalkin/services/communication/communication_strategy.py +87 -0
- digitalkin/services/communication/default_communication.py +104 -0
- digitalkin/services/communication/grpc_communication.py +264 -0
- digitalkin/services/cost/cost_strategy.py +36 -14
- digitalkin/services/cost/default_cost.py +61 -1
- digitalkin/services/cost/grpc_cost.py +98 -2
- digitalkin/services/filesystem/grpc_filesystem.py +9 -2
- digitalkin/services/registry/__init__.py +22 -1
- digitalkin/services/registry/default_registry.py +156 -4
- digitalkin/services/registry/exceptions.py +47 -0
- digitalkin/services/registry/grpc_registry.py +382 -0
- digitalkin/services/registry/registry_models.py +15 -0
- digitalkin/services/registry/registry_strategy.py +106 -4
- digitalkin/services/services_config.py +25 -3
- digitalkin/services/services_models.py +5 -1
- digitalkin/services/setup/default_setup.py +1 -1
- digitalkin/services/setup/grpc_setup.py +1 -1
- digitalkin/services/storage/grpc_storage.py +1 -1
- digitalkin/services/user_profile/__init__.py +11 -0
- digitalkin/services/user_profile/grpc_user_profile.py +2 -2
- digitalkin/services/user_profile/user_profile_strategy.py +0 -15
- digitalkin/utils/__init__.py +40 -0
- digitalkin/utils/conditional_schema.py +260 -0
- digitalkin/utils/dynamic_schema.py +487 -0
- digitalkin/utils/schema_splitter.py +290 -0
- {digitalkin-0.3.1.dev1.dist-info → digitalkin-0.3.2a2.dist-info}/METADATA +13 -13
- digitalkin-0.3.2a2.dist-info/RECORD +144 -0
- {digitalkin-0.3.1.dev1.dist-info → digitalkin-0.3.2a2.dist-info}/WHEEL +1 -1
- {digitalkin-0.3.1.dev1.dist-info → digitalkin-0.3.2a2.dist-info}/top_level.txt +1 -0
- modules/archetype_with_tools_module.py +232 -0
- modules/cpu_intensive_module.py +1 -1
- modules/dynamic_setup_module.py +338 -0
- modules/minimal_llm_module.py +1 -1
- modules/text_transform_module.py +1 -1
- monitoring/digitalkin_observability/__init__.py +46 -0
- monitoring/digitalkin_observability/http_server.py +150 -0
- monitoring/digitalkin_observability/interceptors.py +176 -0
- monitoring/digitalkin_observability/metrics.py +201 -0
- monitoring/digitalkin_observability/prometheus.py +137 -0
- monitoring/tests/test_metrics.py +172 -0
- services/filesystem_module.py +7 -5
- services/storage_module.py +4 -2
- digitalkin/grpc_servers/registry_server.py +0 -65
- digitalkin/grpc_servers/registry_servicer.py +0 -456
- digitalkin-0.3.1.dev1.dist-info/RECORD +0 -117
- {digitalkin-0.3.1.dev1.dist-info → digitalkin-0.3.2a2.dist-info}/licenses/LICENSE +0 -0
digitalkin/core/task_manager/task_executor.py

@@ -1,6 +1,7 @@
 """Task executor for running tasks with full lifecycle management."""
 
 import asyncio
+import contextlib
 import datetime
 from collections.abc import Coroutine
 from typing import Any
@@ -59,6 +60,8 @@ class TaskExecutor:
                     SignalMessage(
                         task_id=task_id,
                         mission_id=mission_id,
+                        setup_id=session.setup_id,
+                        setup_version_id=session.setup_version_id,
                         status=session.status,
                         action=SignalType.START,
                     ).model_dump(),
@@ -67,15 +70,21 @@
             except asyncio.CancelledError:
                 logger.debug("Signal listener cancelled", extra={"mission_id": mission_id, "task_id": task_id})
             finally:
-                await channel.create(
-                    "tasks",
-                    SignalMessage(
-                        task_id=task_id,
-                        mission_id=mission_id,
-                        status=session.status,
-                        action=SignalType.STOP,
-                    ).model_dump(),
-                )
+                with contextlib.suppress(Exception):  # Connection may already be closed
+                    await channel.create(
+                        "tasks",
+                        SignalMessage(
+                            task_id=task_id,
+                            mission_id=mission_id,
+                            setup_id=session.setup_id,
+                            setup_version_id=session.setup_version_id,
+                            status=session.status,
+                            action=SignalType.STOP,
+                            cancellation_reason=session.cancellation_reason,
+                            error_message=session._last_exception,  # noqa: SLF001
+                            exception_traceback=session._last_traceback,  # noqa: SLF001
+                        ).model_dump(),
+                    )
                 logger.info("Signal listener ended", extra={"mission_id": mission_id, "task_id": task_id})
 
         async def heartbeat_wrapper() -> None:
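The new `finally` block wraps the STOP notification in `contextlib.suppress(Exception)` so a write to an already-closed SurrealDB connection cannot raise out of the teardown path. A minimal sketch of that best-effort pattern, using a hypothetical `publish_stop` coroutine in place of `channel.create(...)`:

import asyncio
import contextlib


async def publish_stop() -> None:
    """Stand-in for the final STOP write; assume the connection is already gone."""
    raise ConnectionError("connection already closed")


async def teardown() -> None:
    # Best effort only: a failed STOP notification must not mask the task's real outcome.
    with contextlib.suppress(Exception):
        await publish_stop()
    print("teardown continued despite the failed publish")


asyncio.run(teardown())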
@@ -125,8 +134,14 @@
             # Heartbeat stopped - failure cleanup
             cleanup_reason = CancellationReason.FAILURE_CLEANUP
 
+            # Signal stream to close FIRST before any cleanup
+            session.close_stream()
+
         # Cancel pending tasks with proper reason logging
         if pending:
+            # Give stream time to see the signal and exit gracefully
+            await asyncio.sleep(0.01)  # Allow one event loop cycle
+
             pending_names = [t.get_name() for t in pending]
             logger.debug(
                 "Cancelling pending tasks: %s, reason: %s",
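`close_stream()` just sets an `asyncio.Event`, and the single `await asyncio.sleep(0.01)` yields the event loop so the stream coroutine can notice the flag and exit on its own before any hard cancellation. A rough sketch of that handshake (the `stream_worker` name is illustrative):

import asyncio


async def stream_worker(closed: asyncio.Event) -> None:
    while not closed.is_set():
        await asyncio.sleep(0)  # stand-in for yielding stream items
    print("stream saw the close signal and exited cleanly")


async def main() -> None:
    closed = asyncio.Event()
    worker = asyncio.create_task(stream_worker(closed))

    closed.set()               # equivalent of session.close_stream()
    await asyncio.sleep(0.01)  # give the loop one cycle to deliver the signal

    if not worker.done():
        worker.cancel()        # fall back to cancellation if it did not exit
    await asyncio.gather(worker, return_exceptions=True)


asyncio.run(main())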
@@ -148,6 +163,7 @@
         # Determine final status based on which task completed
         if completed is main_task:
             session.status = TaskStatus.COMPLETED
+            session.cancellation_reason = CancellationReason.COMPLETED
             logger.info(
                 "Main task completed successfully",
                 extra={"mission_id": mission_id, "task_id": task_id},
@@ -193,9 +209,10 @@
             )
             cleanup_reason = CancellationReason.FAILURE_CLEANUP
             raise
-        except Exception:
+        except Exception as e:
             session.status = TaskStatus.FAILED
             cleanup_reason = CancellationReason.FAILURE_CLEANUP
+            session.record_exception(e)
             logger.exception(
                 "Task failed with exception: '%s'",
                 task_id,
digitalkin/core/task_manager/task_session.py

@@ -1,7 +1,9 @@
 """Task session easing task lifecycle management."""
 
 import asyncio
+import contextlib
 import datetime
+import traceback
 from collections.abc import AsyncGenerator
 
 from digitalkin.core.task_manager.surrealdb_repository import SurrealDBConnection
@@ -39,9 +41,17 @@ class TaskSession:
     is_cancelled: asyncio.Event
     cancellation_reason: CancellationReason
     _paused: asyncio.Event
+    _stream_closed: asyncio.Event
     _heartbeat_interval: datetime.timedelta
     _last_heartbeat: datetime.datetime
 
+    # Exception tracking for enhanced DB logging
+    _last_exception: str | None
+    _last_traceback: str | None
+
+    # Cleanup guard for idempotent cleanup
+    _cleanup_done: bool
+
     def __init__(
         self,
         task_id: str,
@@ -49,14 +59,24 @@ class TaskSession:
         db: SurrealDBConnection,
         module: BaseModule,
         heartbeat_interval: datetime.timedelta = datetime.timedelta(seconds=2),
+        queue_maxsize: int = 1000,
     ) -> None:
-        """Initialize Task Session."""
+        """Initialize Task Session.
+
+        Args:
+            task_id: Unique task identifier
+            mission_id: Mission identifier
+            db: SurrealDB connection
+            module: Module instance
+            heartbeat_interval: Interval between heartbeats
+            queue_maxsize: Maximum size for the queue (0 = unlimited)
+        """
        self.db = db
        self.module = module
 
        self.status = TaskStatus.PENDING
        # Bounded queue to prevent unbounded memory growth (max 1000 items)
-        self.queue: asyncio.Queue = asyncio.Queue(maxsize=1000)
+        self.queue: asyncio.Queue = asyncio.Queue(maxsize=queue_maxsize)
 
        self.task_id = task_id
        self.mission_id = mission_id
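The queue bound that used to be hard-coded is now the `queue_maxsize` parameter (default 1000, with 0 meaning unlimited), so callers can tune backpressure per session. A quick illustration of how a bounded `asyncio.Queue` enforces that limit:

import asyncio


async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue(maxsize=2)

    await queue.put("a")
    await queue.put("b")

    # The queue is now full: put() would block until a consumer frees a slot,
    # which is exactly the backpressure a bounded queue provides.
    try:
        queue.put_nowait("c")
    except asyncio.QueueFull:
        print("queue full - producer must wait for the consumer")

    print(await queue.get())  # frees a slot for the producer


asyncio.run(main())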
@@ -71,12 +91,23 @@ class TaskSession:
         self.is_cancelled = asyncio.Event()
         self.cancellation_reason = CancellationReason.UNKNOWN
         self._paused = asyncio.Event()
+        self._stream_closed = asyncio.Event()
         self._heartbeat_interval = heartbeat_interval
 
+        # Exception tracking
+        self._last_exception = None
+        self._last_traceback = None
+
+        # Cleanup guard
+        self._cleanup_done = False
+
         logger.info(
-            "…
-            …
-            …
+            "TaskSession initialized",
+            extra={
+                "task_id": task_id,
+                "mission_id": mission_id,
+                "heartbeat_interval": str(heartbeat_interval),
+            },
         )
 
     @property
@@ -89,6 +120,39 @@ class TaskSession:
         """Task paused status."""
         return self._paused.is_set()
 
+    @property
+    def stream_closed(self) -> bool:
+        """Check if stream termination was signaled."""
+        return self._stream_closed.is_set()
+
+    def close_stream(self) -> None:
+        """Signal that the stream should terminate."""
+        self._stream_closed.set()
+
+    @property
+    def setup_id(self) -> str:
+        """Get setup_id from module context."""
+        return self.module.context.session.setup_id
+
+    @property
+    def setup_version_id(self) -> str:
+        """Get setup_version_id from module context."""
+        return self.module.context.session.setup_version_id
+
+    @property
+    def session_ids(self) -> dict[str, str]:
+        """Get all session IDs from module context for structured logging."""
+        return self.module.context.session.current_ids()
+
+    def record_exception(self, exc: Exception) -> None:
+        """Record exception details for DB logging.
+
+        Args:
+            exc: The exception that caused the task to fail.
+        """
+        self._last_exception = str(exc)
+        self._last_traceback = traceback.format_exc()
+
     async def send_heartbeat(self) -> bool:
         """Rate-limited heartbeat with connection resilience.
 
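`record_exception` stores both the stringified error and the formatted traceback so the final STOP signal can persist them to the database. `traceback.format_exc()` only returns useful output while an exception is being handled, which is why the executor calls it from inside the `except` block. A small standalone sketch of the same capture:

import traceback


class Recorder:
    """Minimal stand-in for the exception-tracking part of TaskSession."""

    def __init__(self) -> None:
        self.last_exception: str | None = None
        self.last_traceback: str | None = None

    def record_exception(self, exc: Exception) -> None:
        # Must run while the exception is active for format_exc() to capture it.
        self.last_exception = str(exc)
        self.last_traceback = traceback.format_exc()


recorder = Recorder()
try:
    1 / 0
except ZeroDivisionError as e:
    recorder.record_exception(e)

print(recorder.last_exception)  # "division by zero"
print(recorder.last_traceback)  # full formatted traceback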
@@ -98,6 +162,8 @@ class TaskSession:
         heartbeat = HeartbeatMessage(
             task_id=self.task_id,
             mission_id=self.mission_id,
+            setup_id=self.setup_id,
+            setup_version_id=self.setup_version_id,
             timestamp=datetime.datetime.now(datetime.timezone.utc),
         )
 
@@ -110,23 +176,17 @@ class TaskSession:
             return True
         except Exception as e:
             logger.error(
-                "Heartbeat exception …
-                self.task_id,
-                extra={"task_id": self.task_id, "error": str(e)},
+                "Heartbeat exception",
+                extra={**self.session_ids, "error": str(e)},
                 exc_info=True,
             )
-            logger.error(
-                "Initial heartbeat failed for task: '%s'",
-                self.task_id,
-                extra={"task_id": self.task_id},
-            )
+            logger.error("Initial heartbeat failed", extra=self.session_ids)
             return False
 
         if (heartbeat.timestamp - self._last_heartbeat) < self._heartbeat_interval:
             logger.debug(
-                "Heartbeat skipped due to rate limiting …
-                self.task_id,
-                heartbeat.timestamp - self._last_heartbeat,
+                "Heartbeat skipped due to rate limiting",
+                extra={**self.session_ids, "delta": str(heartbeat.timestamp - self._last_heartbeat)},
             )
             return True
 
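The logging refactor running through this file replaces per-call `"... for task: '%s'"` formatting with a constant message plus a merged `extra` mapping built from the new `session_ids` property, so every record carries the same structured identifiers. A minimal sketch of the pattern, with a plain dict standing in for the property:

import logging

logging.basicConfig(
    format="%(levelname)s %(message)s task=%(task_id)s mission=%(mission_id)s",
    level=logging.DEBUG,
)
logger = logging.getLogger("task_session")

session_ids = {"task_id": "task-1", "mission_id": "mission-9"}

# Constant message; identifiers and extra fields travel in `extra`.
logger.info("Heartbeat failed", extra=session_ids)
logger.error("Heartbeat exception", extra={**session_ids, "error": "boom"})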
@@ -137,39 +197,24 @@ class TaskSession:
             return True
         except Exception as e:
             logger.error(
-                "Heartbeat exception …
-                self.task_id,
-                extra={"task_id": self.task_id, "error": str(e)},
+                "Heartbeat exception",
+                extra={**self.session_ids, "error": str(e)},
                 exc_info=True,
             )
-            logger.warning(
-                "Heartbeat failed for task: '%s'",
-                self.task_id,
-                extra={"task_id": self.task_id},
-            )
+            logger.warning("Heartbeat failed", extra=self.session_ids)
             return False
 
     async def generate_heartbeats(self) -> None:
         """Periodic heartbeat generator with cancellation support."""
-        logger.debug(
-            "Heartbeat generator started for task: '%s'",
-            self.task_id,
-            extra={"task_id": self.task_id, "mission_id": self.mission_id},
-        )
+        logger.debug("Heartbeat generator started", extra=self.session_ids)
         while not self.cancelled:
             logger.debug(
-                "Heartbeat tick …
-                self.task_id,
-                self.cancelled,
-                extra={"task_id": self.task_id, "mission_id": self.mission_id},
+                "Heartbeat tick",
+                extra={**self.session_ids, "cancelled": self.cancelled},
             )
             success = await self.send_heartbeat()
             if not success:
-                logger.error(
-                    "Heartbeat failed, cancelling task: '%s'",
-                    self.task_id,
-                    extra={"task_id": self.task_id, "mission_id": self.mission_id},
-                )
+                logger.error("Heartbeat failed, cancelling task", extra=self.session_ids)
                 await self._handle_cancel(CancellationReason.HEARTBEAT_FAILURE)
                 break
             await asyncio.sleep(self._heartbeat_interval.total_seconds())
@@ -177,11 +222,7 @@ class TaskSession:
     async def wait_if_paused(self) -> None:
         """Block execution if task is paused."""
         if self._paused.is_set():
-            logger.info(
-                "Task paused, waiting for resume: '%s'",
-                self.task_id,
-                extra={"task_id": self.task_id},
-            )
+            logger.info("Task paused, waiting for resume", extra=self.session_ids)
             await self._paused.wait()
 
     async def listen_signals(self) -> None:  # noqa: C901
@@ -190,18 +231,14 @@ class TaskSession:
         Raises:
             CancelledError: Asyncio when task cancelling
         """
-        logger.info(
-            "Signal listener started for task: '%s'",
-            self.task_id,
-            extra={"task_id": self.task_id},
-        )
+        logger.info("Signal listener started", extra=self.session_ids)
         if self.signal_record_id is None:
             self.signal_record_id = (await self.db.select_by_task_id("tasks", self.task_id)).get("id")
 
         live_id, live_signals = await self.db.start_live("tasks")
         try:
             async for signal in live_signals:
-                logger.debug("Signal received …
+                logger.debug("Signal received", extra={**self.session_ids, "signal": signal})
                 if self.cancelled:
                     break
@@ -218,26 +255,18 @@ class TaskSession:
                     await self._handle_status_request()
 
         except asyncio.CancelledError:
-            logger.debug(
-                "Signal listener cancelled for task: '%s'",
-                self.task_id,
-                extra={"task_id": self.task_id},
-            )
+            logger.debug("Signal listener cancelled", extra=self.session_ids)
             raise
         except Exception as e:
             logger.error(
-                "Signal listener fatal error …
-                self.task_id,
-                extra={"task_id": self.task_id, "error": str(e)},
+                "Signal listener fatal error",
+                extra={**self.session_ids, "error": str(e)},
                 exc_info=True,
             )
         finally:
-            await self.db.stop_live(live_id)
-            logger.info(
-                "Signal listener stopped for task: '%s'",
-                self.task_id,
-                extra={"task_id": self.task_id},
-            )
+            with contextlib.suppress(Exception):  # Connection may already be closed
+                await self.db.stop_live(live_id)
+            logger.info("Signal listener stopped", extra=self.session_ids)
 
     async def _handle_cancel(self, reason: CancellationReason = CancellationReason.UNKNOWN) -> None:
         """Idempotent cancellation with acknowledgment and reason tracking.
@@ -247,13 +276,9 @@ class TaskSession:
         """
         if self.is_cancelled.is_set():
             logger.debug(
-                "Cancel ignored - …
-                self.task_id,
-                self.cancellation_reason.value,
-                reason.value,
+                "Cancel ignored - already cancelled",
                 extra={
-                    "task_id": self.task_id,
-                    "mission_id": self.mission_id,
+                    **self.session_ids,
                     "existing_reason": self.cancellation_reason.value,
                     "new_reason": reason.value,
                 },
@@ -267,25 +292,13 @@ class TaskSession:
         # Log with appropriate level based on reason
         if reason in {CancellationReason.SUCCESS_CLEANUP, CancellationReason.FAILURE_CLEANUP}:
             logger.debug(
-                "Task cancelled (cleanup) …
-                self.task_id,
-                reason.value,
-                extra={
-                    "task_id": self.task_id,
-                    "mission_id": self.mission_id,
-                    "cancellation_reason": reason.value,
-                },
+                "Task cancelled (cleanup)",
+                extra={**self.session_ids, "cancellation_reason": reason.value},
             )
         else:
             logger.info(
-                "Task cancelled …
-                self.task_id,
-                reason.value,
-                extra={
-                    "task_id": self.task_id,
-                    "mission_id": self.mission_id,
-                    "cancellation_reason": reason.value,
-                },
+                "Task cancelled",
+                extra={**self.session_ids, "cancellation_reason": reason.value},
             )
 
         # Resume if paused so cancellation can proceed
@@ -298,19 +311,18 @@ class TaskSession:
             SignalMessage(
                 task_id=self.task_id,
                 mission_id=self.mission_id,
+                setup_id=self.setup_id,
+                setup_version_id=self.setup_version_id,
                 action=SignalType.ACK_CANCEL,
                 status=self.status,
+                cancellation_reason=reason,
             ).model_dump(),
         )
 
     async def _handle_pause(self) -> None:
         """Pause task execution."""
         if not self._paused.is_set():
-            logger.info(
-                "Pausing task: '%s'",
-                self.task_id,
-                extra={"task_id": self.task_id},
-            )
+            logger.info("Task paused", extra=self.session_ids)
             self._paused.set()
 
             await self.db.update(
@@ -319,6 +331,8 @@ class TaskSession:
             SignalMessage(
                 task_id=self.task_id,
                 mission_id=self.mission_id,
+                setup_id=self.setup_id,
+                setup_version_id=self.setup_version_id,
                 action=SignalType.ACK_PAUSE,
                 status=self.status,
             ).model_dump(),
@@ -327,11 +341,7 @@ class TaskSession:
     async def _handle_resume(self) -> None:
         """Resume paused task."""
         if self._paused.is_set():
-            logger.info(
-                "Resuming task: '%s'",
-                self.task_id,
-                extra={"task_id": self.task_id},
-            )
+            logger.info("Task resumed", extra=self.session_ids)
             self._paused.clear()
 
             await self.db.update(
@@ -340,6 +350,8 @@ class TaskSession:
             SignalMessage(
                 task_id=self.task_id,
                 mission_id=self.mission_id,
+                setup_id=self.setup_id,
+                setup_version_id=self.setup_version_id,
                 action=SignalType.ACK_RESUME,
                 status=self.status,
             ).model_dump(),
@@ -351,28 +363,38 @@ class TaskSession:
             "tasks",
             self.signal_record_id,  # type: ignore
             SignalMessage(
-                mission_id=self.mission_id,
                 task_id=self.task_id,
+                mission_id=self.mission_id,
+                setup_id=self.setup_id,
+                setup_version_id=self.setup_version_id,
                 status=self.status,
                 action=SignalType.ACK_STATUS,
             ).model_dump(),
         )
 
-        logger.debug(
-            "Status report sent for task: '%s'",
-            self.task_id,
-            extra={"task_id": self.task_id},
-        )
+        logger.debug("Status report sent", extra=self.session_ids)
 
     async def cleanup(self) -> None:
         """Clean up task session resources.
 
+        This method is idempotent - safe to call multiple times.
+        Second and subsequent calls are no-ops.
+
         This includes:
         - Clearing queue to free memory
+        - Cleaning up module context services
         - Stopping module
         - Closing database connection
         - Clearing module reference
         """
+        if self._cleanup_done:
+            logger.debug(
+                "Cleanup already done, skipping",
+                extra={"task_id": self.task_id, "mission_id": self.mission_id},
+            )
+            return
+        self._cleanup_done = True
+
         # Clear queue to free memory
         try:
             while not self.queue.empty():
@@ -380,6 +402,16 @@ class TaskSession:
         except asyncio.QueueEmpty:
             pass
 
+        # Clean up module context services (e.g., gRPC channel pool)
+        if self.module is not None and self.module.context is not None:
+            try:
+                await self.module.context.cleanup()
+            except Exception:
+                logger.exception(
+                    "Error cleaning up module context",
+                    extra={"mission_id": self.mission_id, "task_id": self.task_id},
+                )
+
         # Stop module
         try:
             await self.module.stop()
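`cleanup()` now bails out early via the `_cleanup_done` flag, so the executor and the job manager can both trigger it without stopping the module or closing the database connection twice. A compact sketch of the idempotency guard, using a hypothetical `SessionCleanup` class:

import asyncio


class SessionCleanup:
    def __init__(self) -> None:
        self._cleanup_done = False
        self.times_cleaned = 0

    async def cleanup(self) -> None:
        # Idempotent: second and subsequent calls are no-ops.
        if self._cleanup_done:
            return
        self._cleanup_done = True
        self.times_cleaned += 1  # stands in for stopping the module, closing the DB, ...


async def main() -> None:
    session = SessionCleanup()
    await asyncio.gather(session.cleanup(), session.cleanup())
    await session.cleanup()
    print(session.times_cleaned)  # 1


asyncio.run(main())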
|