digitalkin 0.3.0rc1__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. digitalkin/__version__.py +1 -1
  2. digitalkin/core/common/__init__.py +9 -0
  3. digitalkin/core/common/factories.py +156 -0
  4. digitalkin/core/job_manager/base_job_manager.py +128 -28
  5. digitalkin/core/job_manager/single_job_manager.py +80 -25
  6. digitalkin/core/job_manager/taskiq_broker.py +114 -19
  7. digitalkin/core/job_manager/taskiq_job_manager.py +291 -39
  8. digitalkin/core/task_manager/base_task_manager.py +539 -0
  9. digitalkin/core/task_manager/local_task_manager.py +108 -0
  10. digitalkin/core/task_manager/remote_task_manager.py +87 -0
  11. digitalkin/core/task_manager/surrealdb_repository.py +43 -4
  12. digitalkin/core/task_manager/task_executor.py +249 -0
  13. digitalkin/core/task_manager/task_session.py +107 -19
  14. digitalkin/grpc_servers/module_server.py +2 -2
  15. digitalkin/grpc_servers/module_servicer.py +21 -12
  16. digitalkin/grpc_servers/registry_server.py +1 -1
  17. digitalkin/grpc_servers/registry_servicer.py +4 -4
  18. digitalkin/grpc_servers/utils/grpc_error_handler.py +53 -0
  19. digitalkin/models/core/task_monitor.py +17 -0
  20. digitalkin/models/grpc_servers/models.py +4 -4
  21. digitalkin/models/module/module_context.py +5 -0
  22. digitalkin/models/module/module_types.py +304 -16
  23. digitalkin/modules/_base_module.py +66 -28
  24. digitalkin/services/cost/grpc_cost.py +8 -41
  25. digitalkin/services/filesystem/grpc_filesystem.py +9 -38
  26. digitalkin/services/services_config.py +11 -0
  27. digitalkin/services/services_models.py +3 -1
  28. digitalkin/services/setup/default_setup.py +5 -6
  29. digitalkin/services/setup/grpc_setup.py +51 -14
  30. digitalkin/services/storage/grpc_storage.py +2 -2
  31. digitalkin/services/user_profile/__init__.py +12 -0
  32. digitalkin/services/user_profile/default_user_profile.py +55 -0
  33. digitalkin/services/user_profile/grpc_user_profile.py +69 -0
  34. digitalkin/services/user_profile/user_profile_strategy.py +40 -0
  35. digitalkin/utils/__init__.py +28 -0
  36. digitalkin/utils/dynamic_schema.py +483 -0
  37. {digitalkin-0.3.0rc1.dist-info → digitalkin-0.3.1.dist-info}/METADATA +9 -29
  38. {digitalkin-0.3.0rc1.dist-info → digitalkin-0.3.1.dist-info}/RECORD +42 -30
  39. modules/dynamic_setup_module.py +362 -0
  40. digitalkin/core/task_manager/task_manager.py +0 -439
  41. {digitalkin-0.3.0rc1.dist-info → digitalkin-0.3.1.dist-info}/WHEEL +0 -0
  42. {digitalkin-0.3.0rc1.dist-info → digitalkin-0.3.1.dist-info}/licenses/LICENSE +0 -0
  43. {digitalkin-0.3.0rc1.dist-info → digitalkin-0.3.1.dist-info}/top_level.txt +0 -0
@@ -1,439 +0,0 @@
1
- """Task manager with comprehensive lifecycle management."""
2
-
3
- import asyncio
4
- import contextlib
5
- import datetime
6
- from collections.abc import Coroutine
7
- from typing import Any
8
-
9
- from digitalkin.core.task_manager.surrealdb_repository import SurrealDBConnection
10
- from digitalkin.core.task_manager.task_session import TaskSession
11
- from digitalkin.logger import logger
12
- from digitalkin.models.core.task_monitor import SignalMessage, SignalType, TaskStatus
13
- from digitalkin.modules._base_module import BaseModule
14
-
15
-
16
class TaskManager:
    """Task manager with comprehensive lifecycle management.

    Handle the tasks creation, execution, monitoring, signaling, and cancellation.

    Each managed task is run by a supervisor that races three asyncio tasks:
    the user coroutine, a heartbeat generator and a signal listener. The first
    one to finish decides the final ``TaskStatus`` stored on the session.
    """

    # Tasks created by `create_task`, keyed by task id.
    # NOTE(review): each entry wraps `_task_wrapper`, which returns the supervisor
    # task without awaiting anything, so the entry finishes almost immediately and
    # its *result* is the supervisor task. Entries are never removed after
    # `clean_session`, so `len(self.tasks)` only grows - confirm this is intended.
    tasks: dict[str, asyncio.Task]
    # Per-task sessions, keyed by task id.
    tasks_sessions: dict[str, TaskSession]
    # NOTE(review): only annotated here and never assigned by this class, yet read
    # by `send_signal` and by the signal listener; presumably set by a subclass or
    # by the owner of the manager - confirm.
    channel: SurrealDBConnection
    # Timeout (seconds) used by the cancellation helpers when none is given.
    default_timeout: float
    # Upper bound on the number of entries allowed in `tasks`.
    max_concurrent_tasks: int
    # Set by `shutdown`; not read anywhere else in this class.
    _shutdown_event: asyncio.Event

    def __init__(self, default_timeout: float = 10.0, max_concurrent_tasks: int = 100) -> None:
        """Defining task manager properties.

        Args:
            default_timeout (float): Seconds to wait for a task during cancellation
                when the caller does not provide a timeout.
            max_concurrent_tasks (int): Maximum number of registered tasks accepted
                by `create_task`.
        """
        self.tasks = {}
        self.tasks_sessions = {}
        self.default_timeout = default_timeout
        self.max_concurrent_tasks = max_concurrent_tasks
        self._shutdown_event = asyncio.Event()

        logger.info(
            "TaskManager initialized with max_concurrent_tasks: %d, default_timeout: %.1f",
            max_concurrent_tasks,
            default_timeout,
            extra={
                "max_concurrent_tasks": max_concurrent_tasks,
                "default_timeout": default_timeout,
            },
        )

    @property
    def task_count(self) -> int:
        """Number of managed tasks (counted from the registered sessions)."""
        return len(self.tasks_sessions)

    @property
    def running_tasks(self) -> set[str]:
        """Get IDs of the registered tasks whose asyncio task is not done yet."""
        return {task_id for task_id, task in self.tasks.items() if not task.done()}

    async def _cleanup_task(self, task_id: str, mission_id: str) -> None:
        """Clean up task resources.

        Unregisters the task and its session, then closes the session's database
        connection so that no entry pointing at a closed connection is left behind.

        Args:
            task_id (str): The ID of the task to clean up.
            mission_id (str): The ID of the mission associated with the task.
        """
        logger.debug(
            "Cleaning up resources for task: '%s'", task_id, extra={"mission_id": mission_id, "task_id": task_id}
        )
        # Remove from collections first: even if closing fails, the manager no
        # longer advertises a task that was never properly created.
        session = self.tasks_sessions.pop(task_id, None)
        self.tasks.pop(task_id, None)
        if session is not None:
            await session.db.close()

    async def _task_wrapper(  # noqa: C901, PLR0915
        self,
        task_id: str,
        mission_id: str,
        coro: Coroutine[Any, Any, None],
        session: TaskSession,
    ) -> asyncio.Task[None]:
        """Task wrapper that runs main, heartbeat, and listener concurrently.

        The first to finish determines the outcome. Returns a Task that the
        caller can await externally.

        Args:
            task_id (str): The ID of the task, also used to name the asyncio tasks.
            mission_id (str): The ID of the mission associated with the task.
            coro (Coroutine[Any, Any, None]): The main work to execute.
            session (TaskSession): Session providing heartbeats, signal listening
                and the status/timestamp fields updated by the supervisor.

        Returns:
            asyncio.Task[None]: The supervisor task managing the lifecycle.
        """

        async def publish(action: SignalType) -> None:
            # The status is read at call time so START/STOP carry the current state.
            await self.channel.create(
                "tasks",
                SignalMessage(
                    task_id=task_id,
                    mission_id=mission_id,
                    status=session.status,
                    action=action,
                ).model_dump(),
            )

        async def signal_wrapper() -> None:
            try:
                await publish(SignalType.START)
                await session.listen_signals()
            except asyncio.CancelledError:
                logger.debug("Signal listener cancelled", extra={"mission_id": mission_id, "task_id": task_id})
            finally:
                # Always announce the stop, whether the listener ended or was cancelled.
                await publish(SignalType.STOP)
                logger.info("Signal listener ended", extra={"mission_id": mission_id, "task_id": task_id})

        async def heartbeat_wrapper() -> None:
            try:
                await session.generate_heartbeats()
            except asyncio.CancelledError:
                logger.debug("Heartbeat task cancelled", extra={"mission_id": mission_id, "task_id": task_id})
            finally:
                logger.info("Heartbeat task ended", extra={"mission_id": mission_id, "task_id": task_id})

        async def supervisor() -> None:
            session.started_at = datetime.datetime.now(datetime.timezone.utc)
            session.status = TaskStatus.RUNNING

            main_task = asyncio.create_task(coro, name=f"{task_id}_main")
            hb_task = asyncio.create_task(heartbeat_wrapper(), name=f"{task_id}_heartbeat")
            sig_task = asyncio.create_task(signal_wrapper(), name=f"{task_id}_listener")

            try:
                done, pending = await asyncio.wait(
                    [main_task, sig_task, hb_task],
                    return_when=asyncio.FIRST_COMPLETED,
                )

                # One task completed -> cancel the others
                for t in pending:
                    t.cancel()

                # Several tasks may finish in the same loop iteration and `done` is
                # an unordered set: pick deterministically (main work first, then the
                # signal listener, then the heartbeat).
                completed = next(t for t in (main_task, sig_task, hb_task) if t in done)
                # Propagate exception/result from the finished task
                await completed

                if completed is main_task:
                    session.status = TaskStatus.COMPLETED
                elif completed is sig_task or (completed is hb_task and sig_task.done()):
                    logger.debug("Task cancelled due to signal sig_task=%r", sig_task)
                    session.status = TaskStatus.CANCELLED
                elif completed is hb_task:
                    session.status = TaskStatus.FAILED
                    msg = f"Heartbeat stopped for {task_id}"
                    raise RuntimeError(msg)  # noqa: TRY301

            except asyncio.CancelledError:
                session.status = TaskStatus.CANCELLED
                raise
            except Exception:
                session.status = TaskStatus.FAILED
                raise
            finally:
                session.completed_at = datetime.datetime.now(datetime.timezone.utc)
                # Ensure all tasks are cleaned up
                for t in [main_task, hb_task, sig_task]:
                    if not t.done():
                        t.cancel()
                await asyncio.gather(main_task, hb_task, sig_task, return_exceptions=True)

        # Return the supervisor task to be awaited outside
        return asyncio.create_task(supervisor(), name=f"{task_id}_supervisor")

    async def create_task(
        self,
        task_id: str,
        mission_id: str,
        module: BaseModule,
        coro: Coroutine[Any, Any, None],
        heartbeat_interval: datetime.timedelta = datetime.timedelta(seconds=2),
        connection_timeout: datetime.timedelta = datetime.timedelta(seconds=5),
    ) -> None:
        """Create and start a new managed task.

        Args:
            task_id (str): Unique ID of the task to create.
            mission_id (str): The ID of the mission associated with the task.
            module (BaseModule): Module attached to the task session.
            coro (Coroutine[Any, Any, None]): The main work to execute; it is closed
                without running when the task is rejected.
            heartbeat_interval (datetime.timedelta): Interval handed to the session.
            connection_timeout (datetime.timedelta): Timeout handed to the
                database connection.

        Raises:
            ValueError: task_id duplicated
            RuntimeError: task overload
        """
        if task_id in self.tasks:
            # Close the coroutine so it is not reported as "never awaited".
            coro.close()
            logger.warning(
                "Task creation failed - task already exists: '%s'",
                task_id,
                extra={"mission_id": mission_id, "task_id": task_id},
            )
            msg = f"Task {task_id} already exists"
            raise ValueError(msg)

        if len(self.tasks) >= self.max_concurrent_tasks:
            coro.close()
            logger.error(
                "Task creation failed - max concurrent tasks reached: %d",
                self.max_concurrent_tasks,
                extra={
                    "mission_id": mission_id,
                    "task_id": task_id,
                    "current_count": len(self.tasks),
                    "max_concurrent": self.max_concurrent_tasks,
                },
            )
            msg = f"Maximum concurrent tasks ({self.max_concurrent_tasks}) reached"
            raise RuntimeError(msg)

        logger.info(
            "Creating new task: '%s'",
            task_id,
            extra={
                "mission_id": mission_id,
                "task_id": task_id,
                "heartbeat_interval": heartbeat_interval,
                "connection_timeout": connection_timeout,
            },
        )

        try:
            # Initialize components
            channel: SurrealDBConnection = SurrealDBConnection("task_manager", connection_timeout)
            await channel.init_surreal_instance()
            session = TaskSession(task_id, mission_id, channel, module, heartbeat_interval)

            self.tasks_sessions[task_id] = session

            # Create wrapper task.
            # NOTE(review): the stored task only schedules the supervisor and
            # resolves to it; awaiting the entry twice is needed to wait for the
            # actual work - confirm callers expect this.
            self.tasks[task_id] = asyncio.create_task(
                self._task_wrapper(
                    task_id,
                    mission_id,
                    coro,
                    session,
                ),
                name=task_id,
            )

            logger.info(
                "Task created successfully: '%s'",
                task_id,
                extra={
                    "mission_id": mission_id,
                    "task_id": task_id,
                    "total_tasks": len(self.tasks),
                },
            )

        except Exception as e:
            logger.error(
                "Failed to create task: '%s'",
                task_id,
                extra={"mission_id": mission_id, "task_id": task_id, "error": str(e)},
                exc_info=True,
            )
            # Cleanup on failure
            await self._cleanup_task(task_id, mission_id=mission_id)
            raise

    async def send_signal(self, task_id: str, mission_id: str, signal_type: str, payload: dict) -> bool:
        """Send signal to a specific task.

        Args:
            task_id (str): The ID of the target task.
            mission_id (str): The ID of the mission associated with the task.
            signal_type (str): Signal name forwarded to the channel.
            payload (dict): Data forwarded with the signal.

        Returns:
            bool: True if the signal was written to the channel, False when no
                session is registered for the task.
        """
        if task_id not in self.tasks_sessions:
            logger.warning(
                "Cannot send signal - task not found: '%s'",
                task_id,
                extra={"mission_id": mission_id, "task_id": task_id, "signal_type": signal_type},
            )
            return False

        logger.info(
            "Sending signal '%s' to task: '%s'",
            signal_type,
            task_id,
            extra={"mission_id": mission_id, "task_id": task_id, "signal_type": signal_type, "payload": payload},
        )

        await self.channel.update("tasks", signal_type, payload)
        return True

    async def cancel_task(self, task_id: str, mission_id: str, timeout: float | None = None) -> bool:
        """Cancel a task with graceful shutdown and fallback.

        Waits up to ``timeout`` seconds for the registered task to finish, then
        force-cancels it.

        Args:
            task_id (str): The ID of the task to cancel.
            mission_id (str): The ID of the mission associated with the task.
            timeout (float | None): Seconds to wait before forcing cancellation;
                ``None`` selects ``default_timeout``. An explicit ``0`` is honoured.

        Returns:
            bool: True if the task finished, was cancelled, or is unknown to the
                manager; False if an unexpected error occurred.
        """
        if task_id not in self.tasks:
            logger.warning(
                "Cannot cancel - task not found: '%s'", task_id, extra={"mission_id": mission_id, "task_id": task_id}
            )
            return True

        # `timeout or default` would silently turn an explicit 0 into the default.
        timeout = self.default_timeout if timeout is None else timeout
        task = self.tasks[task_id]

        logger.info(
            "Initiating task cancellation: '%s', timeout: %.1fs",
            task_id,
            timeout,
            extra={"mission_id": mission_id, "task_id": task_id, "timeout": timeout},
        )

        try:
            # Phase 1: cooperative cancellation. Sending a "cancel" signal is
            # currently disabled, so this phase only waits for the task to finish.
            await asyncio.wait_for(task, timeout=timeout)

            logger.info(
                "Task cancelled gracefully: '%s'", task_id, extra={"mission_id": mission_id, "task_id": task_id}
            )

        except asyncio.TimeoutError:
            logger.warning(
                "Graceful cancellation timed out for task: '%s', forcing cancellation",
                task_id,
                extra={"mission_id": mission_id, "task_id": task_id, "timeout": timeout},
            )

            # Phase 2: Force cancellation
            task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await task

            logger.warning("Task force-cancelled: '%s'", task_id, extra={"mission_id": mission_id, "task_id": task_id})
            return True

        except Exception as e:
            logger.error(
                "Error during task cancellation: '%s'",
                task_id,
                extra={"mission_id": mission_id, "task_id": task_id, "error": str(e)},
                exc_info=True,
            )
            return False
        return True

    async def clean_session(self, task_id: str, mission_id: str) -> bool:
        """Stop the task's module, cancel the task and drop its session.

        Args:
            task_id (str): The ID of the task whose session is cleaned.
            mission_id (str): The ID of the mission associated with the task.

        Returns:
            bool: True if the session was cleaned, False when no session is
                registered for the task.
        """
        if task_id not in self.tasks_sessions:
            logger.warning(
                "Cannot clean session - task not found: '%s'",
                task_id,
                extra={"mission_id": mission_id, "task_id": task_id},
            )
            return False

        await self.tasks_sessions[task_id].module.stop()
        # Argument order matters: passing (mission_id, task_id) looks the task up
        # under the mission id and leaves the real task running.
        await self.cancel_task(task_id, mission_id)

        logger.info("Cleaning up session for task: '%s'", task_id, extra={"mission_id": mission_id, "task_id": task_id})
        # NOTE(review): only the session is dropped; the `tasks` entry is kept, so
        # the id stays reserved and still counts towards `max_concurrent_tasks`.
        self.tasks_sessions.pop(task_id, None)
        return True

    async def pause_task(self, task_id: str, mission_id: str) -> bool:
        """Pause a running task.

        Returns:
            bool: True if the "pause" signal was sent, False otherwise.
        """
        return await self.send_signal(task_id, mission_id, "pause", {})

    async def resume_task(self, task_id: str, mission_id: str) -> bool:
        """Resume a paused task.

        Returns:
            bool: True if the "resume" signal was sent, False otherwise.
        """
        return await self.send_signal(task_id, mission_id, "resume", {})

    async def get_task_status(self, task_id: str, mission_id: str) -> bool:
        """Request status from a task.

        Returns:
            bool: True if the "status" signal was sent, False otherwise.
        """
        return await self.send_signal(task_id, mission_id, "status", {})

    async def cancel_all_tasks(self, mission_id: str, timeout: float | None = None) -> dict[str, bool]:
        """Cancel all running tasks, one after the other.

        Args:
            mission_id (str): Mission ID attached to the log records.
            timeout (float | None): Per-task timeout in seconds; ``None`` selects
                ``default_timeout``.

        Returns:
            dict[str, bool]: For each running task ID, the result of `cancel_task`.
        """
        timeout = self.default_timeout if timeout is None else timeout
        task_ids = list(self.running_tasks)

        logger.info(
            "Cancelling all tasks: %d tasks",
            len(task_ids),
            extra={"mission_id": mission_id, "task_count": len(task_ids), "timeout": timeout},
        )

        results: dict[str, bool] = {}
        for task_id in task_ids:
            results[task_id] = await self.cancel_task(task_id, mission_id, timeout)

        return results

    async def shutdown(self, mission_id: str, timeout: float = 30.0) -> None:
        """Graceful shutdown of all tasks.

        Args:
            mission_id (str): Mission ID attached to the log records.
            timeout (float): Per-task timeout in seconds passed to `cancel_all_tasks`.
        """
        logger.info(
            "TaskManager shutdown initiated, timeout: %.1fs",
            timeout,
            extra={"mission_id": mission_id, "timeout": timeout, "active_tasks": len(self.running_tasks)},
        )

        self._shutdown_event.set()
        results = await self.cancel_all_tasks(mission_id, timeout)

        failed_tasks = [task_id for task_id, success in results.items() if not success]
        if failed_tasks:
            logger.error(
                "Failed to cancel %d tasks during shutdown: %s",
                len(failed_tasks),
                failed_tasks,
                extra={"mission_id": mission_id, "failed_tasks": failed_tasks, "failed_count": len(failed_tasks)},
            )

        cancelled_count = len(results) - len(failed_tasks)
        logger.info(
            "TaskManager shutdown completed, cancelled: %d, failed: %d",
            cancelled_count,
            len(failed_tasks),
            extra={
                "mission_id": mission_id,
                "cancelled_count": cancelled_count,
                "failed_count": len(failed_tasks),
            },
        )