digitalkin 0.3.0rc2__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. digitalkin/__version__.py +1 -1
  2. digitalkin/core/common/__init__.py +9 -0
  3. digitalkin/core/common/factories.py +156 -0
  4. digitalkin/core/job_manager/base_job_manager.py +128 -28
  5. digitalkin/core/job_manager/single_job_manager.py +80 -25
  6. digitalkin/core/job_manager/taskiq_broker.py +114 -19
  7. digitalkin/core/job_manager/taskiq_job_manager.py +291 -39
  8. digitalkin/core/task_manager/base_task_manager.py +539 -0
  9. digitalkin/core/task_manager/local_task_manager.py +108 -0
  10. digitalkin/core/task_manager/remote_task_manager.py +87 -0
  11. digitalkin/core/task_manager/surrealdb_repository.py +43 -4
  12. digitalkin/core/task_manager/task_executor.py +249 -0
  13. digitalkin/core/task_manager/task_session.py +95 -17
  14. digitalkin/grpc_servers/module_server.py +2 -2
  15. digitalkin/grpc_servers/module_servicer.py +21 -12
  16. digitalkin/grpc_servers/registry_server.py +1 -1
  17. digitalkin/grpc_servers/registry_servicer.py +4 -4
  18. digitalkin/grpc_servers/utils/grpc_error_handler.py +53 -0
  19. digitalkin/models/core/task_monitor.py +17 -0
  20. digitalkin/models/module/module_context.py +5 -0
  21. digitalkin/models/module/module_types.py +299 -15
  22. digitalkin/modules/_base_module.py +66 -28
  23. digitalkin/services/cost/grpc_cost.py +8 -41
  24. digitalkin/services/filesystem/grpc_filesystem.py +9 -38
  25. digitalkin/services/services_config.py +11 -0
  26. digitalkin/services/services_models.py +3 -1
  27. digitalkin/services/setup/default_setup.py +5 -6
  28. digitalkin/services/setup/grpc_setup.py +51 -14
  29. digitalkin/services/storage/grpc_storage.py +2 -2
  30. digitalkin/services/user_profile/__init__.py +12 -0
  31. digitalkin/services/user_profile/default_user_profile.py +55 -0
  32. digitalkin/services/user_profile/grpc_user_profile.py +69 -0
  33. digitalkin/services/user_profile/user_profile_strategy.py +40 -0
  34. digitalkin/utils/__init__.py +28 -0
  35. digitalkin/utils/dynamic_schema.py +483 -0
  36. {digitalkin-0.3.0rc2.dist-info → digitalkin-0.3.1.dist-info}/METADATA +8 -8
  37. {digitalkin-0.3.0rc2.dist-info → digitalkin-0.3.1.dist-info}/RECORD +41 -29
  38. modules/dynamic_setup_module.py +362 -0
  39. digitalkin/core/task_manager/task_manager.py +0 -442
  40. {digitalkin-0.3.0rc2.dist-info → digitalkin-0.3.1.dist-info}/WHEEL +0 -0
  41. {digitalkin-0.3.0rc2.dist-info → digitalkin-0.3.1.dist-info}/licenses/LICENSE +0 -0
  42. {digitalkin-0.3.0rc2.dist-info → digitalkin-0.3.1.dist-info}/top_level.txt +0 -0
@@ -1,442 +0,0 @@
1
- """Task manager with comprehensive lifecycle management."""
2
-
3
- import asyncio
4
- import contextlib
5
- import datetime
6
- from collections.abc import Coroutine
7
- from typing import Any
8
-
9
- from digitalkin.core.task_manager.surrealdb_repository import SurrealDBConnection
10
- from digitalkin.core.task_manager.task_session import TaskSession
11
- from digitalkin.logger import logger
12
- from digitalkin.models.core.task_monitor import SignalMessage, SignalType, TaskStatus
13
- from digitalkin.modules._base_module import BaseModule
14
-
15
-
16
class TaskManager:
    """Task manager with comprehensive lifecycle management.

    Handle the tasks creation, execution, monitoring, signaling, and cancellation.

    Each managed task is represented by a *supervisor* ``asyncio.Task`` (stored in
    ``tasks``) that runs the user coroutine, a heartbeat generator and a signal
    listener concurrently, plus a ``TaskSession`` (stored in ``tasks_sessions``).
    """

    # task_id -> supervisor task driving main/heartbeat/listener.
    tasks: dict[str, asyncio.Task]
    # task_id -> session holding the module and its database connection.
    tasks_sessions: dict[str, TaskSession]
    # Optional manager-wide connection. It is never assigned inside this class;
    # when absent, the per-session connection is used instead.
    channel: SurrealDBConnection
    # Seconds to wait for a graceful stop when no explicit timeout is given.
    default_timeout: float
    # Upper bound on the number of entries in ``tasks``.
    max_concurrent_tasks: int
    # Set once ``shutdown`` has been requested.
    _shutdown_event: asyncio.Event

    def __init__(self, default_timeout: float = 10.0, max_concurrent_tasks: int = 1000) -> None:
        """Defining task manager properties.

        Args:
            default_timeout: Default graceful-cancellation timeout, in seconds.
            max_concurrent_tasks: Maximum number of tasks tracked at once.
        """
        self.tasks = {}
        self.tasks_sessions = {}
        self.default_timeout = default_timeout
        self.max_concurrent_tasks = max_concurrent_tasks
        self._shutdown_event = asyncio.Event()

        logger.info(
            "TaskManager initialized with max_concurrent_tasks: %d, default_timeout: %.1f",
            max_concurrent_tasks,
            default_timeout,
            extra={
                "max_concurrent_tasks": max_concurrent_tasks,
                "default_timeout": default_timeout,
            },
        )

    @property
    def task_count(self) -> int:
        """Number of managed tasks."""
        return len(self.tasks_sessions)

    @property
    def running_tasks(self) -> set[str]:
        """Get IDs of currently running tasks."""
        return {task_id for task_id, task in self.tasks.items() if not task.done()}

    def _signal_channel(self, session: TaskSession) -> SurrealDBConnection:
        """Return the connection used to publish signals for ``session``.

        A manager-level ``channel`` is preferred when one was attached from the
        outside; otherwise the session's own connection is used so that
        signalling does not fail with ``AttributeError``.

        NOTE(review): assumes ``session.db`` is the connection handed to
        ``TaskSession`` in ``create_task`` - confirm against ``TaskSession``.
        """
        channel = getattr(self, "channel", None)
        return channel if channel is not None else session.db

    async def _cleanup_task(self, task_id: str, mission_id: str) -> None:
        """Clean up task resources.

        Args:
            task_id (str): The ID of the task to clean up.
            mission_id (str): The ID of the mission associated with the task.
        """
        logger.debug(
            "Cleaning up resources for task: '%s'", task_id, extra={"mission_id": mission_id, "task_id": task_id}
        )
        session = self.tasks_sessions.get(task_id)
        try:
            if session is not None:
                await session.db.close()
        finally:
            # De-register even if closing the connection failed, so a broken
            # connection cannot pin the task in the registries forever.
            self.tasks.pop(task_id, None)
            self.tasks_sessions.pop(task_id, None)

    async def _task_wrapper(  # noqa: C901, PLR0915
        self,
        task_id: str,
        mission_id: str,
        coro: Coroutine[Any, Any, None],
        session: TaskSession,
    ) -> asyncio.Task[None]:
        """Task wrapper that runs main, heartbeat, and listener concurrently.

        The first to finish determines the outcome. Returns a Task that the
        caller can await externally.

        Args:
            task_id: Identifier of the task being supervised.
            mission_id: Identifier of the mission owning the task.
            coro: The user coroutine to run as the main task.
            session: Session providing heartbeats, signal listening and status.

        Returns:
            asyncio.Task[None]: The supervisor task managing the lifecycle.
        """
        channel = self._signal_channel(session)

        def lifecycle_message(action: SignalType) -> dict:
            # Snapshot of the session status at the moment the signal is sent.
            return SignalMessage(
                task_id=task_id,
                mission_id=mission_id,
                status=session.status,
                action=action,
            ).model_dump()

        async def signal_wrapper() -> None:
            try:
                await channel.create("tasks", lifecycle_message(SignalType.START))
                await session.listen_signals()
            except asyncio.CancelledError:
                logger.debug("Signal listener cancelled", extra={"mission_id": mission_id, "task_id": task_id})
            finally:
                # Always publish STOP, including when the listener was cancelled.
                await channel.create("tasks", lifecycle_message(SignalType.STOP))
                logger.info("Signal listener ended", extra={"mission_id": mission_id, "task_id": task_id})

        async def heartbeat_wrapper() -> None:
            try:
                await session.generate_heartbeats()
            except asyncio.CancelledError:
                logger.debug("Heartbeat task cancelled", extra={"mission_id": mission_id, "task_id": task_id})
            finally:
                logger.info("Heartbeat task ended", extra={"mission_id": mission_id, "task_id": task_id})

        async def supervisor() -> None:
            session.started_at = datetime.datetime.now(datetime.timezone.utc)
            session.status = TaskStatus.RUNNING

            main_task = asyncio.create_task(coro, name=f"{task_id}_main")
            hb_task = asyncio.create_task(heartbeat_wrapper(), name=f"{task_id}_heartbeat")
            sig_task = asyncio.create_task(signal_wrapper(), name=f"{task_id}_listener")

            try:
                done, pending = await asyncio.wait(
                    [main_task, sig_task, hb_task],
                    return_when=asyncio.FIRST_COMPLETED,
                )

                # One task completed -> cancel the others
                for t in pending:
                    t.cancel()

                # Several tasks may finish in the same loop iteration; pick the
                # outcome deterministically: main result first, then an explicit
                # signal, and a dead heartbeat last.
                completed = next(t for t in (main_task, sig_task, hb_task) if t in done)
                # Propagate exception/result from the finished task
                await completed

                if completed is main_task:
                    session.status = TaskStatus.COMPLETED
                elif completed is sig_task or (completed is hb_task and sig_task.done()):
                    logger.debug("Task cancelled due to signal sig_task=%r", sig_task)
                    session.status = TaskStatus.CANCELLED
                elif completed is hb_task:
                    session.status = TaskStatus.FAILED
                    msg = f"Heartbeat stopped for {task_id}"
                    raise RuntimeError(msg)  # noqa: TRY301

            except asyncio.CancelledError:
                session.status = TaskStatus.CANCELLED
                raise
            except Exception:
                session.status = TaskStatus.FAILED
                raise
            finally:
                session.completed_at = datetime.datetime.now(datetime.timezone.utc)
                # Ensure all tasks are cleaned up
                for t in (main_task, hb_task, sig_task):
                    if not t.done():
                        t.cancel()
                await asyncio.gather(main_task, hb_task, sig_task, return_exceptions=True)

        # Return the supervisor task to be awaited outside
        return asyncio.create_task(supervisor(), name=f"{task_id}_supervisor")

    async def create_task(
        self,
        task_id: str,
        mission_id: str,
        module: BaseModule,
        coro: Coroutine[Any, Any, None],
        heartbeat_interval: datetime.timedelta = datetime.timedelta(seconds=2),
        connection_timeout: datetime.timedelta = datetime.timedelta(seconds=5),
    ) -> None:
        """Create and start a new managed task.

        Args:
            task_id: Unique identifier for the new task.
            mission_id: Identifier of the mission owning the task.
            module: Module instance attached to the task session.
            coro: Coroutine to execute; it is closed if the task cannot be started.
            heartbeat_interval: Interval handed to the ``TaskSession``.
            connection_timeout: Timeout handed to the ``SurrealDBConnection``.

        Raises:
            ValueError: task_id duplicated
            RuntimeError: task overload
        """
        if task_id in self.tasks:
            # close Coroutine during runtime
            coro.close()
            logger.warning(
                "Task creation failed - task already exists: '%s'",
                task_id,
                extra={"mission_id": mission_id, "task_id": task_id},
            )
            msg = f"Task {task_id} already exists"
            raise ValueError(msg)

        if len(self.tasks) >= self.max_concurrent_tasks:
            coro.close()
            logger.error(
                "Task creation failed - max concurrent tasks reached: %d",
                self.max_concurrent_tasks,
                extra={
                    "mission_id": mission_id,
                    "task_id": task_id,
                    "current_count": len(self.tasks),
                    "max_concurrent": self.max_concurrent_tasks,
                },
            )
            msg = f"Maximum concurrent tasks ({self.max_concurrent_tasks}) reached"
            raise RuntimeError(msg)

        logger.info(
            "Creating new task: '%s'",
            task_id,
            extra={
                "mission_id": mission_id,
                "task_id": task_id,
                "heartbeat_interval": heartbeat_interval,
                "connection_timeout": connection_timeout,
            },
        )

        try:
            # Initialize components
            channel: SurrealDBConnection = SurrealDBConnection("task_manager", connection_timeout)
            await channel.init_surreal_instance()
            session = TaskSession(task_id, mission_id, channel, module, heartbeat_interval)

            self.tasks_sessions[task_id] = session

            # ``_task_wrapper`` returns the supervisor task. Store that task
            # itself (not a short-lived task wrapping the call) so that
            # ``running_tasks`` and ``cancel_task`` act on the real work.
            self.tasks[task_id] = await self._task_wrapper(
                task_id,
                mission_id,
                coro,
                session,
            )
        except Exception as e:
            logger.error(
                "Failed to create task: '%s'",
                task_id,
                extra={"mission_id": mission_id, "task_id": task_id, "error": str(e)},
                exc_info=True,
            )
            # The supervisor was never created, so nothing will ever await the
            # coroutine: close it, as the early-exit paths above do.
            coro.close()
            # Cleanup on failure
            # NOTE(review): a connection whose session was never registered is
            # not closed here - confirm whether SurrealDBConnection needs it.
            await self._cleanup_task(task_id, mission_id=mission_id)
            raise

        logger.info(
            "Task created successfully: '%s'",
            task_id,
            extra={
                "mission_id": mission_id,
                "task_id": task_id,
                "total_tasks": len(self.tasks),
            },
        )

    async def send_signal(self, task_id: str, mission_id: str, signal_type: str, payload: dict) -> bool:
        """Send signal to a specific task.

        Args:
            task_id: Identifier of the target task.
            mission_id: Identifier of the mission owning the task.
            signal_type: Name of the signal (e.g. "pause", "resume", "status").
            payload: Data attached to the signal.

        Returns:
            bool: True if the signal was sent, False if the task is unknown.
        """
        session = self.tasks_sessions.get(task_id)
        if session is None:
            logger.warning(
                "Cannot send signal - task not found: '%s'",
                task_id,
                extra={"mission_id": mission_id, "task_id": task_id, "signal_type": signal_type},
            )
            return False

        logger.info(
            "Sending signal '%s' to task: '%s'",
            signal_type,
            task_id,
            extra={"mission_id": mission_id, "task_id": task_id, "signal_type": signal_type, "payload": payload},
        )

        await self._signal_channel(session).update("tasks", signal_type, payload)
        return True

    async def cancel_task(self, task_id: str, mission_id: str, timeout: float | None = None) -> bool:
        """Cancel a task with graceful shutdown and fallback.

        The task is first given ``timeout`` seconds to finish on its own; after
        that it is cancelled forcibly.

        Args:
            task_id: Identifier of the task to cancel.
            mission_id: Identifier of the mission owning the task.
            timeout: Seconds to wait for a graceful stop; ``None`` uses
                ``default_timeout``. An explicit ``0`` forces cancellation at once.

        Returns:
            bool: True if the task ended, was cancelled, or is unknown (nothing
                to cancel); False if it ended with an unexpected error.
        """
        if task_id not in self.tasks:
            logger.warning(
                "Cannot cancel - task not found: '%s'", task_id, extra={"mission_id": mission_id, "task_id": task_id}
            )
            return True

        if timeout is None:
            timeout = self.default_timeout
        task = self.tasks[task_id]

        if task.cancelled():
            # Awaiting an already-cancelled task would raise CancelledError into
            # the caller; there is nothing left to cancel.
            logger.info(
                "Task already cancelled: '%s'", task_id, extra={"mission_id": mission_id, "task_id": task_id}
            )
            return True

        logger.info(
            "Initiating task cancellation: '%s', timeout: %.1fs",
            task_id,
            timeout,
            extra={"mission_id": mission_id, "task_id": task_id, "timeout": timeout},
        )

        try:
            # Phase 1: Cooperative cancellation
            # await self.send_signal(task_id, mission_id, "cancel")  # noqa: ERA001

            # Wait for graceful shutdown
            await asyncio.wait_for(task, timeout=timeout)

            logger.info(
                "Task cancelled gracefully: '%s'", task_id, extra={"mission_id": mission_id, "task_id": task_id}
            )

        except asyncio.TimeoutError:
            logger.warning(
                "Graceful cancellation timed out for task: '%s', forcing cancellation",
                task_id,
                extra={"mission_id": mission_id, "task_id": task_id, "timeout": timeout},
            )

            # Phase 2: Force cancellation
            task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await task

            logger.warning("Task force-cancelled: '%s'", task_id, extra={"mission_id": mission_id, "task_id": task_id})
            return True

        except Exception as e:
            logger.error(
                "Error during task cancellation: '%s'",
                task_id,
                extra={"mission_id": mission_id, "task_id": task_id, "error": str(e)},
                exc_info=True,
            )
            return False
        return True

    async def clean_session(self, task_id: str, mission_id: str) -> bool:
        """Stop the task's module, cancel the task and drop its session.

        Args:
            task_id: Identifier of the task whose session is removed.
            mission_id: Identifier of the mission owning the task.

        Returns:
            bool: True if the session was removed, False if the task is unknown.
        """
        if task_id not in self.tasks_sessions:
            logger.warning(
                "Cannot clean session - task not found: '%s'",
                task_id,
                extra={"mission_id": mission_id, "task_id": task_id},
            )
            return False

        await self.tasks_sessions[task_id].module.stop()
        await self.cancel_task(task_id, mission_id)

        logger.info("Cleaning up session for task: '%s'", task_id, extra={"mission_id": mission_id, "task_id": task_id})
        self.tasks_sessions.pop(task_id, None)
        self.tasks.pop(task_id, None)
        return True

    async def pause_task(self, task_id: str, mission_id: str) -> bool:
        """Pause a running task.

        Returns:
            bool: True if the pause signal was sent, False otherwise.
        """
        return await self.send_signal(task_id, mission_id, "pause", {})

    async def resume_task(self, task_id: str, mission_id: str) -> bool:
        """Resume a paused task.

        Returns:
            bool: True if the resume signal was sent, False otherwise.
        """
        return await self.send_signal(task_id, mission_id, "resume", {})

    async def get_task_status(self, task_id: str, mission_id: str) -> bool:
        """Request status from a task.

        Returns:
            bool: True if the status request was sent, False otherwise.
        """
        return await self.send_signal(task_id, mission_id, "status", {})

    async def cancel_all_tasks(self, mission_id: str, timeout: float | None = None) -> dict[str, bool]:
        """Cancel all running tasks.

        Tasks are cancelled one after another, each with the same timeout.

        Args:
            mission_id: Mission identifier used for logging.
            timeout: Per-task graceful timeout; ``None`` uses ``default_timeout``.

        Returns:
            dict[str, bool]: For each task ID, the result of ``cancel_task``.
        """
        if timeout is None:
            timeout = self.default_timeout
        task_ids = list(self.running_tasks)

        logger.info(
            "Cancelling all tasks: %d tasks",
            len(task_ids),
            extra={"mission_id": mission_id, "task_count": len(task_ids), "timeout": timeout},
        )

        results = {}
        for task_id in task_ids:
            results[task_id] = await self.cancel_task(task_id, mission_id, timeout)

        return results

    async def shutdown(self, mission_id: str, timeout: float = 30.0) -> None:
        """Graceful shutdown of all tasks.

        Args:
            mission_id: Mission identifier used for logging.
            timeout: Per-task graceful timeout, in seconds.
        """
        logger.info(
            "TaskManager shutdown initiated, timeout: %.1fs",
            timeout,
            extra={"mission_id": mission_id, "timeout": timeout, "active_tasks": len(self.running_tasks)},
        )

        self._shutdown_event.set()
        results = await self.cancel_all_tasks(mission_id, timeout)

        failed_tasks = [task_id for task_id, success in results.items() if not success]
        if failed_tasks:
            logger.error(
                "Failed to cancel %d tasks during shutdown: %s",
                len(failed_tasks),
                failed_tasks,
                extra={"mission_id": mission_id, "failed_tasks": failed_tasks, "failed_count": len(failed_tasks)},
            )

        cancelled_count = len(results) - len(failed_tasks)
        logger.info(
            "TaskManager shutdown completed, cancelled: %d, failed: %d",
            cancelled_count,
            len(failed_tasks),
            extra={
                "mission_id": mission_id,
                "cancelled_count": cancelled_count,
                "failed_count": len(failed_tasks),
            },
        )