digitalkin 0.3.1__py3-none-any.whl → 0.3.1.dev0__py3-none-any.whl
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- digitalkin/__version__.py +1 -1
- digitalkin/core/job_manager/taskiq_broker.py +1 -1
- digitalkin/core/job_manager/taskiq_job_manager.py +2 -1
- digitalkin/core/task_manager/base_task_manager.py +12 -87
- digitalkin/core/task_manager/task_executor.py +27 -103
- digitalkin/core/task_manager/task_session.py +19 -75
- digitalkin/grpc_servers/module_servicer.py +9 -17
- digitalkin/models/core/task_monitor.py +0 -17
- digitalkin/models/grpc_servers/models.py +4 -4
- digitalkin/models/module/module_context.py +0 -5
- digitalkin/models/module/module_types.py +15 -299
- digitalkin/modules/_base_module.py +28 -66
- digitalkin/services/services_config.py +0 -11
- digitalkin/services/services_models.py +1 -3
- digitalkin/services/user_profile/__init__.py +0 -11
- digitalkin/services/user_profile/grpc_user_profile.py +2 -2
- digitalkin/utils/__init__.py +0 -28
- {digitalkin-0.3.1.dist-info → digitalkin-0.3.1.dev0.dist-info}/METADATA +5 -5
- {digitalkin-0.3.1.dist-info → digitalkin-0.3.1.dev0.dist-info}/RECORD +22 -24
- digitalkin/utils/dynamic_schema.py +0 -483
- modules/dynamic_setup_module.py +0 -362
- {digitalkin-0.3.1.dist-info → digitalkin-0.3.1.dev0.dist-info}/WHEEL +0 -0
- {digitalkin-0.3.1.dist-info → digitalkin-0.3.1.dev0.dist-info}/licenses/LICENSE +0 -0
- {digitalkin-0.3.1.dist-info → digitalkin-0.3.1.dev0.dist-info}/top_level.txt +0 -0
digitalkin/__version__.py
CHANGED
|
@@ -208,7 +208,7 @@ async def run_start_module(
|
|
|
208
208
|
# Reconstruct Pydantic models from dicts for type safety
|
|
209
209
|
try:
|
|
210
210
|
input_model = module_class.create_input_model(input_data)
|
|
211
|
-
setup_model =
|
|
211
|
+
setup_model = module_class.create_setup_model(setup_data)
|
|
212
212
|
except Exception as e:
|
|
213
213
|
logger.error("Failed to reconstruct models for job %s: %s", job_id, e, exc_info=True)
|
|
214
214
|
raise
|
|
@@ -140,7 +140,7 @@ class TaskiqJobManager(BaseJobManager[InputModelT, OutputModelT, SetupModelT]):
|
|
|
140
140
|
services_mode: ServicesMode,
|
|
141
141
|
default_timeout: float = 10.0,
|
|
142
142
|
max_concurrent_tasks: int = 100,
|
|
143
|
-
stream_timeout: float =
|
|
143
|
+
stream_timeout: float = 15.0,
|
|
144
144
|
) -> None:
|
|
145
145
|
"""Initialize the Taskiq job manager.
|
|
146
146
|
|
|
@@ -298,6 +298,7 @@ class TaskiqJobManager(BaseJobManager[InputModelT, OutputModelT, SetupModelT]):
|
|
|
298
298
|
while True:
|
|
299
299
|
try:
|
|
300
300
|
# Block for first item with timeout to allow termination checks
|
|
301
|
+
# Configurable timeout (default 15s) to account for distributed system latencies
|
|
301
302
|
item = await asyncio.wait_for(queue.get(), timeout=self.stream_timeout)
|
|
302
303
|
queue.task_done()
|
|
303
304
|
yield item
|
|
@@ -11,7 +11,6 @@ from typing import Any
|
|
|
11
11
|
from digitalkin.core.task_manager.surrealdb_repository import SurrealDBConnection
|
|
12
12
|
from digitalkin.core.task_manager.task_session import TaskSession
|
|
13
13
|
from digitalkin.logger import logger
|
|
14
|
-
from digitalkin.models.core.task_monitor import CancellationReason
|
|
15
14
|
from digitalkin.modules._base_module import BaseModule
|
|
16
15
|
|
|
17
16
|
|
|
@@ -85,32 +84,13 @@ class BaseTaskManager(ABC):
|
|
|
85
84
|
task_id: The ID of the task to clean up
|
|
86
85
|
mission_id: The ID of the mission associated with the task
|
|
87
86
|
"""
|
|
88
|
-
session = self.tasks_sessions.get(task_id)
|
|
89
|
-
cancellation_reason = session.cancellation_reason.value if session else "no_session"
|
|
90
|
-
final_status = session.status.value if session else "unknown"
|
|
91
|
-
|
|
92
87
|
logger.debug(
|
|
93
|
-
"Cleaning up resources",
|
|
94
|
-
extra={
|
|
95
|
-
"mission_id": mission_id,
|
|
96
|
-
"task_id": task_id,
|
|
97
|
-
"final_status": final_status,
|
|
98
|
-
"cancellation_reason": cancellation_reason,
|
|
99
|
-
},
|
|
88
|
+
"Cleaning up resources for task: '%s'", task_id, extra={"mission_id": mission_id, "task_id": task_id}
|
|
100
89
|
)
|
|
101
|
-
|
|
102
|
-
|
|
90
|
+
if task_id in self.tasks_sessions:
|
|
91
|
+
session = self.tasks_sessions[task_id]
|
|
103
92
|
await session.cleanup()
|
|
104
93
|
self.tasks_sessions.pop(task_id, None)
|
|
105
|
-
logger.debug(
|
|
106
|
-
"Task session cleanup completed",
|
|
107
|
-
extra={
|
|
108
|
-
"mission_id": mission_id,
|
|
109
|
-
"task_id": task_id,
|
|
110
|
-
"final_status": final_status,
|
|
111
|
-
"cancellation_reason": cancellation_reason,
|
|
112
|
-
},
|
|
113
|
-
)
|
|
114
94
|
|
|
115
95
|
self.tasks.pop(task_id, None)
|
|
116
96
|
|
|
@@ -281,21 +261,10 @@ class BaseTaskManager(ABC):
|
|
|
281
261
|
)
|
|
282
262
|
|
|
283
263
|
except asyncio.TimeoutError:
|
|
284
|
-
# Set timeout as cancellation reason
|
|
285
|
-
if task_id in self.tasks_sessions:
|
|
286
|
-
session = self.tasks_sessions[task_id]
|
|
287
|
-
if session.cancellation_reason == CancellationReason.UNKNOWN:
|
|
288
|
-
session.cancellation_reason = CancellationReason.TIMEOUT
|
|
289
|
-
|
|
290
264
|
logger.warning(
|
|
291
265
|
"Graceful cancellation timed out for task: '%s', forcing cancellation",
|
|
292
266
|
task_id,
|
|
293
|
-
extra={
|
|
294
|
-
"mission_id": mission_id,
|
|
295
|
-
"task_id": task_id,
|
|
296
|
-
"timeout": timeout,
|
|
297
|
-
"cancellation_reason": CancellationReason.TIMEOUT.value,
|
|
298
|
-
},
|
|
267
|
+
extra={"mission_id": mission_id, "task_id": task_id, "timeout": timeout},
|
|
299
268
|
)
|
|
300
269
|
|
|
301
270
|
# Phase 2: Force cancellation
|
|
@@ -303,16 +272,8 @@ class BaseTaskManager(ABC):
|
|
|
303
272
|
with contextlib.suppress(asyncio.CancelledError):
|
|
304
273
|
await task
|
|
305
274
|
|
|
306
|
-
logger.warning(
|
|
307
|
-
|
|
308
|
-
task_id,
|
|
309
|
-
CancellationReason.TIMEOUT.value,
|
|
310
|
-
extra={
|
|
311
|
-
"mission_id": mission_id,
|
|
312
|
-
"task_id": task_id,
|
|
313
|
-
"cancellation_reason": CancellationReason.TIMEOUT.value,
|
|
314
|
-
},
|
|
315
|
-
)
|
|
275
|
+
logger.warning("Task force-cancelled: '%s'", task_id, extra={"mission_id": mission_id, "task_id": task_id})
|
|
276
|
+
await self._cleanup_task(task_id, mission_id)
|
|
316
277
|
return True
|
|
317
278
|
|
|
318
279
|
except Exception as e:
|
|
@@ -322,9 +283,10 @@ class BaseTaskManager(ABC):
|
|
|
322
283
|
extra={"mission_id": mission_id, "task_id": task_id, "error": str(e)},
|
|
323
284
|
exc_info=True,
|
|
324
285
|
)
|
|
325
|
-
return False
|
|
326
|
-
finally:
|
|
327
286
|
await self._cleanup_task(task_id, mission_id)
|
|
287
|
+
return False
|
|
288
|
+
|
|
289
|
+
await self._cleanup_task(task_id, mission_id)
|
|
328
290
|
return True
|
|
329
291
|
|
|
330
292
|
async def clean_session(self, task_id: str, mission_id: str) -> bool:
|
|
@@ -420,16 +382,7 @@ class BaseTaskManager(ABC):
|
|
|
420
382
|
results: dict[str, bool | BaseException] = {}
|
|
421
383
|
for task_id, result in zip(task_ids, results_list):
|
|
422
384
|
if isinstance(result, Exception):
|
|
423
|
-
logger.error(
|
|
424
|
-
"Exception cancelling task: '%s', error: %s",
|
|
425
|
-
task_id,
|
|
426
|
-
result,
|
|
427
|
-
extra={
|
|
428
|
-
"mission_id": mission_id,
|
|
429
|
-
"task_id": task_id,
|
|
430
|
-
"error": str(result),
|
|
431
|
-
},
|
|
432
|
-
)
|
|
385
|
+
logger.error("Exception cancelling task %s: %s", task_id, result)
|
|
433
386
|
results[task_id] = False
|
|
434
387
|
else:
|
|
435
388
|
results[task_id] = result
|
|
@@ -450,21 +403,6 @@ class BaseTaskManager(ABC):
|
|
|
450
403
|
)
|
|
451
404
|
|
|
452
405
|
self._shutdown_event.set()
|
|
453
|
-
|
|
454
|
-
# Mark all sessions with shutdown reason before cancellation
|
|
455
|
-
for task_id, session in self.tasks_sessions.items():
|
|
456
|
-
if session.cancellation_reason == CancellationReason.UNKNOWN:
|
|
457
|
-
session.cancellation_reason = CancellationReason.SHUTDOWN
|
|
458
|
-
logger.debug(
|
|
459
|
-
"Marking task for shutdown: '%s'",
|
|
460
|
-
task_id,
|
|
461
|
-
extra={
|
|
462
|
-
"mission_id": mission_id,
|
|
463
|
-
"task_id": task_id,
|
|
464
|
-
"cancellation_reason": CancellationReason.SHUTDOWN.value,
|
|
465
|
-
},
|
|
466
|
-
)
|
|
467
|
-
|
|
468
406
|
results = await self.cancel_all_tasks(mission_id, timeout)
|
|
469
407
|
|
|
470
408
|
failed_tasks = [task_id for task_id, success in results.items() if not success]
|
|
@@ -473,26 +411,13 @@ class BaseTaskManager(ABC):
|
|
|
473
411
|
"Failed to cancel %d tasks during shutdown: %s",
|
|
474
412
|
len(failed_tasks),
|
|
475
413
|
failed_tasks,
|
|
476
|
-
extra={
|
|
477
|
-
"mission_id": mission_id,
|
|
478
|
-
"failed_tasks": failed_tasks,
|
|
479
|
-
"failed_count": len(failed_tasks),
|
|
480
|
-
"cancellation_reason": CancellationReason.SHUTDOWN.value,
|
|
481
|
-
},
|
|
414
|
+
extra={"mission_id": mission_id, "failed_tasks": failed_tasks, "failed_count": len(failed_tasks)},
|
|
482
415
|
)
|
|
483
416
|
|
|
484
417
|
# Clean up any remaining sessions (in case cancellation didn't clean them)
|
|
485
418
|
remaining_sessions = list(self.tasks_sessions.keys())
|
|
486
419
|
if remaining_sessions:
|
|
487
|
-
logger.info(
|
|
488
|
-
"Cleaning up %d remaining task sessions after shutdown",
|
|
489
|
-
len(remaining_sessions),
|
|
490
|
-
extra={
|
|
491
|
-
"mission_id": mission_id,
|
|
492
|
-
"remaining_sessions": remaining_sessions,
|
|
493
|
-
"remaining_count": len(remaining_sessions),
|
|
494
|
-
},
|
|
495
|
-
)
|
|
420
|
+
logger.info("Cleaning up %d remaining task sessions", len(remaining_sessions))
|
|
496
421
|
cleanup_coros = [self._cleanup_task(task_id, mission_id) for task_id in remaining_sessions]
|
|
497
422
|
await asyncio.gather(*cleanup_coros, return_exceptions=True)
|
|
498
423
|
|
|
@@ -8,12 +8,7 @@ from typing import Any
|
|
|
8
8
|
from digitalkin.core.task_manager.surrealdb_repository import SurrealDBConnection
|
|
9
9
|
from digitalkin.core.task_manager.task_session import TaskSession
|
|
10
10
|
from digitalkin.logger import logger
|
|
11
|
-
from digitalkin.models.core.task_monitor import
|
|
12
|
-
CancellationReason,
|
|
13
|
-
SignalMessage,
|
|
14
|
-
SignalType,
|
|
15
|
-
TaskStatus,
|
|
16
|
-
)
|
|
11
|
+
from digitalkin.models.core.task_monitor import SignalMessage, SignalType, TaskStatus
|
|
17
12
|
|
|
18
13
|
|
|
19
14
|
class TaskExecutor:
|
|
@@ -87,7 +82,7 @@ class TaskExecutor:
|
|
|
87
82
|
finally:
|
|
88
83
|
logger.info("Heartbeat task ended", extra={"mission_id": mission_id, "task_id": task_id})
|
|
89
84
|
|
|
90
|
-
async def supervisor() -> None:
|
|
85
|
+
async def supervisor() -> None:
|
|
91
86
|
"""Supervise the three concurrent tasks and handle outcomes.
|
|
92
87
|
|
|
93
88
|
Raises:
|
|
@@ -101,7 +96,6 @@ class TaskExecutor:
|
|
|
101
96
|
main_task = None
|
|
102
97
|
hb_task = None
|
|
103
98
|
sig_task = None
|
|
104
|
-
cleanup_reason = CancellationReason.UNKNOWN
|
|
105
99
|
|
|
106
100
|
try:
|
|
107
101
|
main_task = asyncio.create_task(coro, name=f"{task_id}_main")
|
|
@@ -112,37 +106,12 @@ class TaskExecutor:
|
|
|
112
106
|
return_when=asyncio.FIRST_COMPLETED,
|
|
113
107
|
)
|
|
114
108
|
|
|
115
|
-
#
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
if completed is main_task:
|
|
119
|
-
# Main task finished - cleanup is due to success
|
|
120
|
-
cleanup_reason = CancellationReason.SUCCESS_CLEANUP
|
|
121
|
-
elif completed is sig_task or (completed is hb_task and sig_task.done()):
|
|
122
|
-
# Signal task finished - external cancellation
|
|
123
|
-
cleanup_reason = CancellationReason.SIGNAL
|
|
124
|
-
elif completed is hb_task:
|
|
125
|
-
# Heartbeat stopped - failure cleanup
|
|
126
|
-
cleanup_reason = CancellationReason.FAILURE_CLEANUP
|
|
127
|
-
|
|
128
|
-
# Cancel pending tasks with proper reason logging
|
|
129
|
-
if pending:
|
|
130
|
-
pending_names = [t.get_name() for t in pending]
|
|
131
|
-
logger.debug(
|
|
132
|
-
"Cancelling pending tasks: %s, reason: %s",
|
|
133
|
-
pending_names,
|
|
134
|
-
cleanup_reason.value,
|
|
135
|
-
extra={
|
|
136
|
-
"mission_id": mission_id,
|
|
137
|
-
"task_id": task_id,
|
|
138
|
-
"pending_tasks": pending_names,
|
|
139
|
-
"cancellation_reason": cleanup_reason.value,
|
|
140
|
-
},
|
|
141
|
-
)
|
|
142
|
-
for t in pending:
|
|
143
|
-
t.cancel()
|
|
109
|
+
# One task completed -> cancel the others
|
|
110
|
+
for t in pending:
|
|
111
|
+
t.cancel()
|
|
144
112
|
|
|
145
113
|
# Propagate exception/result from the finished task
|
|
114
|
+
completed = next(iter(done))
|
|
146
115
|
await completed
|
|
147
116
|
|
|
148
117
|
# Determine final status based on which task completed
|
|
@@ -153,95 +122,50 @@ class TaskExecutor:
|
|
|
153
122
|
extra={"mission_id": mission_id, "task_id": task_id},
|
|
154
123
|
)
|
|
155
124
|
elif completed is sig_task or (completed is hb_task and sig_task.done()):
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
"Task cancelled via external signal",
|
|
160
|
-
extra={
|
|
161
|
-
"mission_id": mission_id,
|
|
162
|
-
"task_id": task_id,
|
|
163
|
-
"cancellation_reason": CancellationReason.SIGNAL.value,
|
|
164
|
-
},
|
|
125
|
+
logger.debug(
|
|
126
|
+
f"Task cancelled due to signal {sig_task=}",
|
|
127
|
+
extra={"mission_id": mission_id, "task_id": task_id},
|
|
165
128
|
)
|
|
129
|
+
session.status = TaskStatus.CANCELLED
|
|
166
130
|
elif completed is hb_task:
|
|
167
131
|
session.status = TaskStatus.FAILED
|
|
168
|
-
session.cancellation_reason = CancellationReason.HEARTBEAT_FAILURE
|
|
169
132
|
logger.error(
|
|
170
|
-
"Heartbeat stopped
|
|
171
|
-
task_id,
|
|
172
|
-
extra={
|
|
173
|
-
"mission_id": mission_id,
|
|
174
|
-
"task_id": task_id,
|
|
175
|
-
"cancellation_reason": CancellationReason.HEARTBEAT_FAILURE.value,
|
|
176
|
-
},
|
|
133
|
+
f"Heartbeat stopped for {task_id}",
|
|
134
|
+
extra={"mission_id": mission_id, "task_id": task_id},
|
|
177
135
|
)
|
|
178
136
|
msg = f"Heartbeat stopped for {task_id}"
|
|
179
137
|
raise RuntimeError(msg) # noqa: TRY301
|
|
180
138
|
|
|
181
139
|
except asyncio.CancelledError:
|
|
182
140
|
session.status = TaskStatus.CANCELLED
|
|
183
|
-
|
|
184
|
-
logger.info(
|
|
185
|
-
"Task cancelled externally: '%s', reason: %s",
|
|
186
|
-
task_id,
|
|
187
|
-
session.cancellation_reason.value,
|
|
188
|
-
extra={
|
|
189
|
-
"mission_id": mission_id,
|
|
190
|
-
"task_id": task_id,
|
|
191
|
-
"cancellation_reason": session.cancellation_reason.value,
|
|
192
|
-
},
|
|
193
|
-
)
|
|
194
|
-
cleanup_reason = CancellationReason.FAILURE_CLEANUP
|
|
141
|
+
logger.info("Task cancelled", extra={"mission_id": mission_id, "task_id": task_id})
|
|
195
142
|
raise
|
|
196
143
|
except Exception:
|
|
197
144
|
session.status = TaskStatus.FAILED
|
|
198
|
-
|
|
199
|
-
logger.exception(
|
|
200
|
-
"Task failed with exception: '%s'",
|
|
201
|
-
task_id,
|
|
202
|
-
extra={"mission_id": mission_id, "task_id": task_id},
|
|
203
|
-
)
|
|
145
|
+
logger.exception("Task failed", extra={"mission_id": mission_id, "task_id": task_id})
|
|
204
146
|
raise
|
|
205
147
|
finally:
|
|
206
148
|
session.completed_at = datetime.datetime.now(datetime.timezone.utc)
|
|
207
|
-
# Ensure all tasks are cleaned up
|
|
208
|
-
tasks_to_cleanup = [t for t in [main_task, hb_task, sig_task] if t is not None
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
logger.debug(
|
|
212
|
-
"Final cleanup of %d remaining tasks: %s, reason: %s",
|
|
213
|
-
len(tasks_to_cleanup),
|
|
214
|
-
cleanup_names,
|
|
215
|
-
cleanup_reason.value,
|
|
216
|
-
extra={
|
|
217
|
-
"mission_id": mission_id,
|
|
218
|
-
"task_id": task_id,
|
|
219
|
-
"cleanup_count": len(tasks_to_cleanup),
|
|
220
|
-
"cleanup_tasks": cleanup_names,
|
|
221
|
-
"cancellation_reason": cleanup_reason.value,
|
|
222
|
-
},
|
|
223
|
-
)
|
|
224
|
-
for t in tasks_to_cleanup:
|
|
149
|
+
# Ensure all tasks are cleaned up
|
|
150
|
+
tasks_to_cleanup = [t for t in [main_task, hb_task, sig_task] if t is not None]
|
|
151
|
+
for t in tasks_to_cleanup:
|
|
152
|
+
if not t.done():
|
|
225
153
|
t.cancel()
|
|
154
|
+
if tasks_to_cleanup:
|
|
226
155
|
await asyncio.gather(*tasks_to_cleanup, return_exceptions=True)
|
|
227
156
|
|
|
228
|
-
duration = (
|
|
229
|
-
(session.completed_at - session.started_at).total_seconds()
|
|
230
|
-
if session.started_at and session.completed_at
|
|
231
|
-
else None
|
|
232
|
-
)
|
|
233
157
|
logger.info(
|
|
234
|
-
"Task execution completed
|
|
235
|
-
|
|
236
|
-
session.status.value,
|
|
237
|
-
session.cancellation_reason.value if session.status == TaskStatus.CANCELLED else "n/a",
|
|
238
|
-
duration or 0,
|
|
158
|
+
"Task execution completed with status: %s",
|
|
159
|
+
session.status,
|
|
239
160
|
extra={
|
|
240
161
|
"mission_id": mission_id,
|
|
241
162
|
"task_id": task_id,
|
|
242
|
-
"status": session.status
|
|
243
|
-
"
|
|
244
|
-
|
|
163
|
+
"status": session.status,
|
|
164
|
+
"duration": (
|
|
165
|
+
(session.completed_at - session.started_at).total_seconds()
|
|
166
|
+
if session.started_at and session.completed_at
|
|
167
|
+
else None
|
|
168
|
+
),
|
|
245
169
|
},
|
|
246
170
|
)
|
|
247
171
|
|
|
@@ -6,13 +6,7 @@ from collections.abc import AsyncGenerator
|
|
|
6
6
|
|
|
7
7
|
from digitalkin.core.task_manager.surrealdb_repository import SurrealDBConnection
|
|
8
8
|
from digitalkin.logger import logger
|
|
9
|
-
from digitalkin.models.core.task_monitor import
|
|
10
|
-
CancellationReason,
|
|
11
|
-
HeartbeatMessage,
|
|
12
|
-
SignalMessage,
|
|
13
|
-
SignalType,
|
|
14
|
-
TaskStatus,
|
|
15
|
-
)
|
|
9
|
+
from digitalkin.models.core.task_monitor import HeartbeatMessage, SignalMessage, SignalType, TaskStatus
|
|
16
10
|
from digitalkin.modules._base_module import BaseModule
|
|
17
11
|
|
|
18
12
|
|
|
@@ -37,7 +31,6 @@ class TaskSession:
|
|
|
37
31
|
completed_at: datetime.datetime | None
|
|
38
32
|
|
|
39
33
|
is_cancelled: asyncio.Event
|
|
40
|
-
cancellation_reason: CancellationReason
|
|
41
34
|
_paused: asyncio.Event
|
|
42
35
|
_heartbeat_interval: datetime.timedelta
|
|
43
36
|
_last_heartbeat: datetime.datetime
|
|
@@ -49,24 +42,14 @@ class TaskSession:
|
|
|
49
42
|
db: SurrealDBConnection,
|
|
50
43
|
module: BaseModule,
|
|
51
44
|
heartbeat_interval: datetime.timedelta = datetime.timedelta(seconds=2),
|
|
52
|
-
queue_maxsize: int = 1000,
|
|
53
45
|
) -> None:
|
|
54
|
-
"""Initialize Task Session.
|
|
55
|
-
|
|
56
|
-
Args:
|
|
57
|
-
task_id: Unique task identifier
|
|
58
|
-
mission_id: Mission identifier
|
|
59
|
-
db: SurrealDB connection
|
|
60
|
-
module: Module instance
|
|
61
|
-
heartbeat_interval: Interval between heartbeats
|
|
62
|
-
queue_maxsize: Maximum size for the queue (0 = unlimited)
|
|
63
|
-
"""
|
|
46
|
+
"""Initialize Task Session."""
|
|
64
47
|
self.db = db
|
|
65
48
|
self.module = module
|
|
66
49
|
|
|
67
50
|
self.status = TaskStatus.PENDING
|
|
68
51
|
# Bounded queue to prevent unbounded memory growth (max 1000 items)
|
|
69
|
-
self.queue: asyncio.Queue = asyncio.Queue(maxsize=
|
|
52
|
+
self.queue: asyncio.Queue = asyncio.Queue(maxsize=1000)
|
|
70
53
|
|
|
71
54
|
self.task_id = task_id
|
|
72
55
|
self.mission_id = mission_id
|
|
@@ -79,7 +62,6 @@ class TaskSession:
|
|
|
79
62
|
self.heartbeat_record_id = None
|
|
80
63
|
|
|
81
64
|
self.is_cancelled = asyncio.Event()
|
|
82
|
-
self.cancellation_reason = CancellationReason.UNKNOWN
|
|
83
65
|
self._paused = asyncio.Event()
|
|
84
66
|
self._heartbeat_interval = heartbeat_interval
|
|
85
67
|
|
|
@@ -161,26 +143,17 @@ class TaskSession:
|
|
|
161
143
|
|
|
162
144
|
async def generate_heartbeats(self) -> None:
|
|
163
145
|
"""Periodic heartbeat generator with cancellation support."""
|
|
164
|
-
logger.debug(
|
|
165
|
-
"Heartbeat generator started for task: '%s'",
|
|
166
|
-
self.task_id,
|
|
167
|
-
extra={"task_id": self.task_id, "mission_id": self.mission_id},
|
|
168
|
-
)
|
|
146
|
+
logger.debug("Heartbeat started")
|
|
169
147
|
while not self.cancelled:
|
|
170
|
-
logger.debug(
|
|
171
|
-
"Heartbeat tick for task: '%s', cancelled=%s",
|
|
172
|
-
self.task_id,
|
|
173
|
-
self.cancelled,
|
|
174
|
-
extra={"task_id": self.task_id, "mission_id": self.mission_id},
|
|
175
|
-
)
|
|
148
|
+
logger.debug(f"Heartbeat tick for task: '{self.task_id}' | {self.cancelled=}")
|
|
176
149
|
success = await self.send_heartbeat()
|
|
177
150
|
if not success:
|
|
178
151
|
logger.error(
|
|
179
152
|
"Heartbeat failed, cancelling task: '%s'",
|
|
180
153
|
self.task_id,
|
|
181
|
-
extra={"task_id": self.task_id
|
|
154
|
+
extra={"task_id": self.task_id},
|
|
182
155
|
)
|
|
183
|
-
await self._handle_cancel(
|
|
156
|
+
await self._handle_cancel()
|
|
184
157
|
break
|
|
185
158
|
await asyncio.sleep(self._heartbeat_interval.total_seconds())
|
|
186
159
|
|
|
@@ -219,7 +192,7 @@ class TaskSession:
|
|
|
219
192
|
continue
|
|
220
193
|
|
|
221
194
|
if signal["action"] == "cancel":
|
|
222
|
-
await self._handle_cancel(
|
|
195
|
+
await self._handle_cancel()
|
|
223
196
|
elif signal["action"] == "pause":
|
|
224
197
|
await self._handle_pause()
|
|
225
198
|
elif signal["action"] == "resume":
|
|
@@ -249,55 +222,26 @@ class TaskSession:
|
|
|
249
222
|
extra={"task_id": self.task_id},
|
|
250
223
|
)
|
|
251
224
|
|
|
252
|
-
async def _handle_cancel(self
|
|
253
|
-
"""Idempotent cancellation with acknowledgment
|
|
254
|
-
|
|
255
|
-
Args:
|
|
256
|
-
reason: The reason for cancellation (signal, heartbeat failure, cleanup, etc.)
|
|
257
|
-
"""
|
|
225
|
+
async def _handle_cancel(self) -> None:
|
|
226
|
+
"""Idempotent cancellation with acknowledgment."""
|
|
227
|
+
logger.debug("Handle cancel called")
|
|
258
228
|
if self.is_cancelled.is_set():
|
|
259
229
|
logger.debug(
|
|
260
|
-
"Cancel ignored - task already cancelled: '%s'
|
|
230
|
+
"Cancel signal ignored - task already cancelled: '%s'",
|
|
261
231
|
self.task_id,
|
|
262
|
-
self.
|
|
263
|
-
reason.value,
|
|
264
|
-
extra={
|
|
265
|
-
"task_id": self.task_id,
|
|
266
|
-
"mission_id": self.mission_id,
|
|
267
|
-
"existing_reason": self.cancellation_reason.value,
|
|
268
|
-
"new_reason": reason.value,
|
|
269
|
-
},
|
|
232
|
+
extra={"task_id": self.task_id},
|
|
270
233
|
)
|
|
271
234
|
return
|
|
272
235
|
|
|
273
|
-
|
|
236
|
+
logger.info(
|
|
237
|
+
"Cancelling task: '%s'",
|
|
238
|
+
self.task_id,
|
|
239
|
+
extra={"task_id": self.task_id},
|
|
240
|
+
)
|
|
241
|
+
|
|
274
242
|
self.status = TaskStatus.CANCELLED
|
|
275
243
|
self.is_cancelled.set()
|
|
276
244
|
|
|
277
|
-
# Log with appropriate level based on reason
|
|
278
|
-
if reason in {CancellationReason.SUCCESS_CLEANUP, CancellationReason.FAILURE_CLEANUP}:
|
|
279
|
-
logger.debug(
|
|
280
|
-
"Task cancelled (cleanup): '%s', reason: %s",
|
|
281
|
-
self.task_id,
|
|
282
|
-
reason.value,
|
|
283
|
-
extra={
|
|
284
|
-
"task_id": self.task_id,
|
|
285
|
-
"mission_id": self.mission_id,
|
|
286
|
-
"cancellation_reason": reason.value,
|
|
287
|
-
},
|
|
288
|
-
)
|
|
289
|
-
else:
|
|
290
|
-
logger.info(
|
|
291
|
-
"Task cancelled: '%s', reason: %s",
|
|
292
|
-
self.task_id,
|
|
293
|
-
reason.value,
|
|
294
|
-
extra={
|
|
295
|
-
"task_id": self.task_id,
|
|
296
|
-
"mission_id": self.mission_id,
|
|
297
|
-
"cancellation_reason": reason.value,
|
|
298
|
-
},
|
|
299
|
-
)
|
|
300
|
-
|
|
301
245
|
# Resume if paused so cancellation can proceed
|
|
302
246
|
if self._paused.is_set():
|
|
303
247
|
self._paused.set()
|
|
@@ -112,7 +112,7 @@ class ModuleServicer(module_service_pb2_grpc.ModuleServiceServicer, ArgParser):
|
|
|
112
112
|
# TODO: Secret should be used here as well
|
|
113
113
|
setup_version = request.setup_version
|
|
114
114
|
config_setup_data = self.module_class.create_config_setup_model(json_format.MessageToDict(request.content))
|
|
115
|
-
setup_version_data =
|
|
115
|
+
setup_version_data = self.module_class.create_setup_model(
|
|
116
116
|
json_format.MessageToDict(request.setup_version.content),
|
|
117
117
|
config_fields=True,
|
|
118
118
|
)
|
|
@@ -185,7 +185,7 @@ class ModuleServicer(module_service_pb2_grpc.ModuleServiceServicer, ArgParser):
|
|
|
185
185
|
msg = "No setup data returned."
|
|
186
186
|
raise ServicerError(msg)
|
|
187
187
|
|
|
188
|
-
setup_data =
|
|
188
|
+
setup_data = self.module_class.create_setup_model(setup_data_class.current_setup_version.content)
|
|
189
189
|
|
|
190
190
|
# create a task to run the module in background
|
|
191
191
|
job_id = await self.job_manager.create_module_instance_job(
|
|
@@ -220,13 +220,9 @@ class ModuleServicer(module_service_pb2_grpc.ModuleServiceServicer, ArgParser):
|
|
|
220
220
|
break
|
|
221
221
|
|
|
222
222
|
if message.get("code", None) is not None and message.get("code") == "__END_OF_STREAM__":
|
|
223
|
-
|
|
224
|
-
"End of stream via __END_OF_STREAM__",
|
|
225
|
-
extra={"job_id": job_id, "mission_id": request.mission_id},
|
|
226
|
-
)
|
|
223
|
+
yield lifecycle_pb2.StartModuleResponse(success=True, job_id=job_id)
|
|
227
224
|
break
|
|
228
225
|
|
|
229
|
-
logger.info("Yielding message from job %s: %s", job_id, message)
|
|
230
226
|
proto = json_format.ParseDict(message, struct_pb2.Struct(), ignore_unknown_fields=True)
|
|
231
227
|
yield lifecycle_pb2.StartModuleResponse(success=True, output=proto, job_id=job_id)
|
|
232
228
|
finally:
|
|
@@ -350,9 +346,7 @@ class ModuleServicer(module_service_pb2_grpc.ModuleServiceServicer, ArgParser):
|
|
|
350
346
|
# Get input schema if available
|
|
351
347
|
try:
|
|
352
348
|
# Convert schema to proto format
|
|
353
|
-
input_schema_proto =
|
|
354
|
-
llm_format=request.llm_format,
|
|
355
|
-
)
|
|
349
|
+
input_schema_proto = self.module_class.get_input_format(llm_format=request.llm_format)
|
|
356
350
|
input_format_struct = json_format.Parse(
|
|
357
351
|
text=input_schema_proto,
|
|
358
352
|
message=struct_pb2.Struct(), # pylint: disable=no-member
|
|
@@ -388,9 +382,7 @@ class ModuleServicer(module_service_pb2_grpc.ModuleServiceServicer, ArgParser):
|
|
|
388
382
|
# Get output schema if available
|
|
389
383
|
try:
|
|
390
384
|
# Convert schema to proto format
|
|
391
|
-
output_schema_proto =
|
|
392
|
-
llm_format=request.llm_format,
|
|
393
|
-
)
|
|
385
|
+
output_schema_proto = self.module_class.get_output_format(llm_format=request.llm_format)
|
|
394
386
|
output_format_struct = json_format.Parse(
|
|
395
387
|
text=output_schema_proto,
|
|
396
388
|
message=struct_pb2.Struct(), # pylint: disable=no-member
|
|
@@ -426,7 +418,7 @@ class ModuleServicer(module_service_pb2_grpc.ModuleServiceServicer, ArgParser):
|
|
|
426
418
|
# Get setup schema if available
|
|
427
419
|
try:
|
|
428
420
|
# Convert schema to proto format
|
|
429
|
-
setup_schema_proto =
|
|
421
|
+
setup_schema_proto = self.module_class.get_setup_format(llm_format=request.llm_format)
|
|
430
422
|
setup_format_struct = json_format.Parse(
|
|
431
423
|
text=setup_schema_proto,
|
|
432
424
|
message=struct_pb2.Struct(), # pylint: disable=no-member
|
|
@@ -443,7 +435,7 @@ class ModuleServicer(module_service_pb2_grpc.ModuleServiceServicer, ArgParser):
|
|
|
443
435
|
setup_schema=setup_format_struct,
|
|
444
436
|
)
|
|
445
437
|
|
|
446
|
-
|
|
438
|
+
def GetModuleSecret( # noqa: N802
|
|
447
439
|
self,
|
|
448
440
|
request: information_pb2.GetModuleSecretRequest,
|
|
449
441
|
context: grpc.ServicerContext,
|
|
@@ -462,7 +454,7 @@ class ModuleServicer(module_service_pb2_grpc.ModuleServiceServicer, ArgParser):
|
|
|
462
454
|
# Get secret schema if available
|
|
463
455
|
try:
|
|
464
456
|
# Convert schema to proto format
|
|
465
|
-
secret_schema_proto =
|
|
457
|
+
secret_schema_proto = self.module_class.get_secret_format(llm_format=request.llm_format)
|
|
466
458
|
secret_format_struct = json_format.Parse(
|
|
467
459
|
text=secret_schema_proto,
|
|
468
460
|
message=struct_pb2.Struct(), # pylint: disable=no-member
|
|
@@ -498,7 +490,7 @@ class ModuleServicer(module_service_pb2_grpc.ModuleServiceServicer, ArgParser):
|
|
|
498
490
|
# Get setup schema if available
|
|
499
491
|
try:
|
|
500
492
|
# Convert schema to proto format
|
|
501
|
-
config_setup_schema_proto =
|
|
493
|
+
config_setup_schema_proto = self.module_class.get_config_setup_format(llm_format=request.llm_format)
|
|
502
494
|
config_setup_format_struct = json_format.Parse(
|
|
503
495
|
text=config_setup_schema_proto,
|
|
504
496
|
message=struct_pb2.Struct(), # pylint: disable=no-member
|
|
@@ -17,23 +17,6 @@ class TaskStatus(Enum):
|
|
|
17
17
|
FAILED = "failed"
|
|
18
18
|
|
|
19
19
|
|
|
20
|
-
class CancellationReason(Enum):
|
|
21
|
-
"""Reason for task cancellation - helps distinguish cleanup vs real cancellation."""
|
|
22
|
-
|
|
23
|
-
# Cleanup cancellations (not errors)
|
|
24
|
-
SUCCESS_CLEANUP = "success_cleanup" # Main task completed, cleaning up helper tasks
|
|
25
|
-
FAILURE_CLEANUP = "failure_cleanup" # Main task failed, cleaning up helper tasks
|
|
26
|
-
|
|
27
|
-
# Real cancellations
|
|
28
|
-
SIGNAL = "signal" # External signal requested cancellation
|
|
29
|
-
HEARTBEAT_FAILURE = "heartbeat_failure" # Heartbeat stopped working
|
|
30
|
-
TIMEOUT = "timeout" # Task timed out
|
|
31
|
-
SHUTDOWN = "shutdown" # Manager is shutting down
|
|
32
|
-
|
|
33
|
-
# Unknown/unset
|
|
34
|
-
UNKNOWN = "unknown" # Reason not determined
|
|
35
|
-
|
|
36
|
-
|
|
37
20
|
class SignalType(Enum):
|
|
38
21
|
"""Signal type enumeration."""
|
|
39
22
|
|