digitalkin 0.2.25rc0__py3-none-any.whl → 0.3.2.dev14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- base_server/server_async_insecure.py +6 -5
- base_server/server_async_secure.py +6 -5
- base_server/server_sync_insecure.py +5 -4
- base_server/server_sync_secure.py +5 -4
- digitalkin/__version__.py +1 -1
- digitalkin/core/__init__.py +1 -0
- digitalkin/core/common/__init__.py +9 -0
- digitalkin/core/common/factories.py +156 -0
- digitalkin/core/job_manager/__init__.py +1 -0
- digitalkin/{modules → core}/job_manager/base_job_manager.py +138 -32
- digitalkin/core/job_manager/single_job_manager.py +373 -0
- digitalkin/{modules → core}/job_manager/taskiq_broker.py +121 -26
- digitalkin/core/job_manager/taskiq_job_manager.py +541 -0
- digitalkin/core/task_manager/__init__.py +1 -0
- digitalkin/core/task_manager/base_task_manager.py +539 -0
- digitalkin/core/task_manager/local_task_manager.py +108 -0
- digitalkin/core/task_manager/remote_task_manager.py +87 -0
- digitalkin/core/task_manager/surrealdb_repository.py +266 -0
- digitalkin/core/task_manager/task_executor.py +249 -0
- digitalkin/core/task_manager/task_session.py +368 -0
- digitalkin/grpc_servers/__init__.py +1 -19
- digitalkin/grpc_servers/_base_server.py +3 -3
- digitalkin/grpc_servers/module_server.py +120 -195
- digitalkin/grpc_servers/module_servicer.py +81 -44
- digitalkin/grpc_servers/utils/__init__.py +1 -0
- digitalkin/grpc_servers/utils/exceptions.py +0 -8
- digitalkin/grpc_servers/utils/grpc_client_wrapper.py +25 -9
- digitalkin/grpc_servers/utils/grpc_error_handler.py +53 -0
- digitalkin/grpc_servers/utils/utility_schema_extender.py +100 -0
- digitalkin/logger.py +64 -27
- digitalkin/mixins/__init__.py +19 -0
- digitalkin/mixins/base_mixin.py +10 -0
- digitalkin/mixins/callback_mixin.py +24 -0
- digitalkin/mixins/chat_history_mixin.py +110 -0
- digitalkin/mixins/cost_mixin.py +76 -0
- digitalkin/mixins/file_history_mixin.py +93 -0
- digitalkin/mixins/filesystem_mixin.py +46 -0
- digitalkin/mixins/logger_mixin.py +51 -0
- digitalkin/mixins/storage_mixin.py +79 -0
- digitalkin/models/__init__.py +1 -1
- digitalkin/models/core/__init__.py +1 -0
- digitalkin/{modules/job_manager → models/core}/job_manager_models.py +3 -11
- digitalkin/models/core/task_monitor.py +74 -0
- digitalkin/models/grpc_servers/__init__.py +1 -0
- digitalkin/{grpc_servers/utils → models/grpc_servers}/models.py +92 -7
- digitalkin/models/module/__init__.py +18 -11
- digitalkin/models/module/base_types.py +61 -0
- digitalkin/models/module/module.py +9 -1
- digitalkin/models/module/module_context.py +282 -6
- digitalkin/models/module/module_types.py +29 -105
- digitalkin/models/module/setup_types.py +490 -0
- digitalkin/models/module/tool_cache.py +68 -0
- digitalkin/models/module/tool_reference.py +117 -0
- digitalkin/models/module/utility.py +167 -0
- digitalkin/models/services/__init__.py +9 -0
- digitalkin/models/services/cost.py +1 -0
- digitalkin/models/services/registry.py +35 -0
- digitalkin/models/services/storage.py +39 -5
- digitalkin/modules/__init__.py +5 -1
- digitalkin/modules/_base_module.py +265 -167
- digitalkin/modules/archetype_module.py +6 -1
- digitalkin/modules/tool_module.py +16 -3
- digitalkin/modules/trigger_handler.py +7 -6
- digitalkin/modules/triggers/__init__.py +8 -0
- digitalkin/modules/triggers/healthcheck_ping_trigger.py +45 -0
- digitalkin/modules/triggers/healthcheck_services_trigger.py +63 -0
- digitalkin/modules/triggers/healthcheck_status_trigger.py +52 -0
- digitalkin/services/__init__.py +4 -0
- digitalkin/services/communication/__init__.py +7 -0
- digitalkin/services/communication/communication_strategy.py +76 -0
- digitalkin/services/communication/default_communication.py +101 -0
- digitalkin/services/communication/grpc_communication.py +234 -0
- digitalkin/services/cost/__init__.py +9 -2
- digitalkin/services/cost/grpc_cost.py +9 -42
- digitalkin/services/filesystem/default_filesystem.py +0 -2
- digitalkin/services/filesystem/grpc_filesystem.py +10 -39
- digitalkin/services/registry/__init__.py +22 -1
- digitalkin/services/registry/default_registry.py +135 -4
- digitalkin/services/registry/exceptions.py +47 -0
- digitalkin/services/registry/grpc_registry.py +306 -0
- digitalkin/services/registry/registry_models.py +15 -0
- digitalkin/services/registry/registry_strategy.py +88 -4
- digitalkin/services/services_config.py +25 -3
- digitalkin/services/services_models.py +5 -1
- digitalkin/services/setup/default_setup.py +6 -7
- digitalkin/services/setup/grpc_setup.py +52 -15
- digitalkin/services/storage/grpc_storage.py +4 -4
- digitalkin/services/user_profile/__init__.py +12 -0
- digitalkin/services/user_profile/default_user_profile.py +55 -0
- digitalkin/services/user_profile/grpc_user_profile.py +69 -0
- digitalkin/services/user_profile/user_profile_strategy.py +25 -0
- digitalkin/utils/__init__.py +28 -0
- digitalkin/utils/arg_parser.py +1 -1
- digitalkin/utils/development_mode_action.py +2 -2
- digitalkin/utils/dynamic_schema.py +483 -0
- digitalkin/utils/package_discover.py +1 -2
- digitalkin/utils/schema_splitter.py +207 -0
- {digitalkin-0.2.25rc0.dist-info → digitalkin-0.3.2.dev14.dist-info}/METADATA +11 -30
- digitalkin-0.3.2.dev14.dist-info/RECORD +143 -0
- {digitalkin-0.2.25rc0.dist-info → digitalkin-0.3.2.dev14.dist-info}/top_level.txt +1 -0
- modules/archetype_with_tools_module.py +244 -0
- modules/cpu_intensive_module.py +1 -1
- modules/dynamic_setup_module.py +338 -0
- modules/minimal_llm_module.py +1 -1
- modules/text_transform_module.py +1 -1
- monitoring/digitalkin_observability/__init__.py +46 -0
- monitoring/digitalkin_observability/http_server.py +150 -0
- monitoring/digitalkin_observability/interceptors.py +176 -0
- monitoring/digitalkin_observability/metrics.py +201 -0
- monitoring/digitalkin_observability/prometheus.py +137 -0
- monitoring/tests/test_metrics.py +172 -0
- services/filesystem_module.py +7 -5
- services/storage_module.py +4 -2
- digitalkin/grpc_servers/registry_server.py +0 -65
- digitalkin/grpc_servers/registry_servicer.py +0 -456
- digitalkin/grpc_servers/utils/factory.py +0 -180
- digitalkin/modules/job_manager/single_job_manager.py +0 -294
- digitalkin/modules/job_manager/taskiq_job_manager.py +0 -290
- digitalkin-0.2.25rc0.dist-info/RECORD +0 -89
- /digitalkin/{grpc_servers/utils → models/grpc_servers}/types.py +0 -0
- {digitalkin-0.2.25rc0.dist-info → digitalkin-0.3.2.dev14.dist-info}/WHEEL +0 -0
- {digitalkin-0.2.25rc0.dist-info → digitalkin-0.3.2.dev14.dist-info}/licenses/LICENSE +0 -0
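Most of the churn in this release comes from the job-manager stack moving out of `digitalkin.modules` and into a new `digitalkin.core` package (job managers, the Taskiq broker, and a new task-manager layer), alongside new `mixins`, healthcheck `triggers`, registry/communication/user-profile services, and a `monitoring/digitalkin_observability` package. A hedged import-migration sketch based on the renames listed above is shown below; the exact public re-exports of 0.3.2.dev14 are not verifiable from this file list alone.

```python
# Hedged sketch of the import-path move implied by the renames
# digitalkin/{modules -> core}/job_manager/...; verify against the
# installed package before relying on it.

# digitalkin 0.2.25rc0
# from digitalkin.modules.job_manager.single_job_manager import SingleJobManager
# from digitalkin.modules.job_manager.taskiq_broker import TASKIQ_BROKER

# digitalkin 0.3.2.dev14
from digitalkin.core.job_manager.single_job_manager import SingleJobManager
from digitalkin.core.job_manager.taskiq_broker import TASKIQ_BROKER
```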
digitalkin/core/job_manager/single_job_manager.py (new file)

@@ -0,0 +1,373 @@
+"""Background module manager with single instance."""
+
+import asyncio
+import datetime
+import uuid
+from collections.abc import AsyncGenerator, AsyncIterator
+from contextlib import asynccontextmanager
+from typing import Any
+
+import grpc
+
+from digitalkin.core.common import ConnectionFactory, ModuleFactory
+from digitalkin.core.job_manager.base_job_manager import BaseJobManager
+from digitalkin.core.task_manager.local_task_manager import LocalTaskManager
+from digitalkin.core.task_manager.task_session import TaskSession
+from digitalkin.logger import logger
+from digitalkin.models.core.task_monitor import TaskStatus
+from digitalkin.models.module.base_types import InputModelT, OutputModelT, SetupModelT
+from digitalkin.models.module.module import ModuleCodeModel
+from digitalkin.modules._base_module import BaseModule
+from digitalkin.services.services_models import ServicesMode
+
+
+class SingleJobManager(BaseJobManager[InputModelT, OutputModelT, SetupModelT]):
+    """Manages a single instance of a module job.
+
+    This class ensures that only one instance of a module job is active at a time.
+    It provides functionality to create, stop, and monitor module jobs, as well as
+    to handle their output data.
+    """
+
+    async def start(self) -> None:
+        """Start manager."""
+        self.channel = await ConnectionFactory.create_surreal_connection("task_manager", datetime.timedelta(seconds=5))
+
+    def __init__(
+        self,
+        module_class: type[BaseModule],
+        services_mode: ServicesMode,
+        default_timeout: float = 10.0,
+        max_concurrent_tasks: int = 100,
+    ) -> None:
+        """Initialize the job manager.
+
+        Args:
+            module_class: The class of the module to be managed.
+            services_mode: The mode of operation for the services (e.g., ASYNC or SYNC).
+            default_timeout: Default timeout for task operations
+            max_concurrent_tasks: Maximum number of concurrent tasks
+        """
+        # Create local task manager for same-process execution
+        task_manager = LocalTaskManager(default_timeout, max_concurrent_tasks)
+
+        # Initialize base job manager with task manager
+        super().__init__(module_class, services_mode, task_manager)
+
+        self._lock = asyncio.Lock()
+
+    async def generate_config_setup_module_response(self, job_id: str) -> SetupModelT | ModuleCodeModel:
+        """Generate a stream consumer for a module's output data.
+
+        This method creates an asynchronous generator that streams output data
+        from a specific module job. If the module does not exist, it generates
+        an error message.
+
+        Args:
+            job_id: The unique identifier of the job.
+
+        Returns:
+            SetupModelT | ModuleCodeModel: the SetupModelT object fully processed.
+        """
+        if (session := self.tasks_sessions.get(job_id, None)) is None:
+            return ModuleCodeModel(
+                code=str(grpc.StatusCode.NOT_FOUND),
+                message=f"Module {job_id} not found",
+            )
+
+        logger.debug("Module %s found: %s", job_id, session.module)
+        try:
+            # Add timeout to prevent indefinite blocking
+            return await asyncio.wait_for(session.queue.get(), timeout=30.0)
+        except asyncio.TimeoutError:
+            logger.error("Timeout waiting for config setup response from module %s", job_id)
+            return ModuleCodeModel(
+                code=str(grpc.StatusCode.DEADLINE_EXCEEDED),
+                message=f"Module {job_id} did not respond within 30 seconds",
+            )
+        finally:
+            logger.debug(
+                "Config setup response retrieved",
+                extra={"job_id": job_id, "queue_empty": session.queue.empty()},
+            )
+
+    async def create_config_setup_instance_job(
+        self,
+        config_setup_data: SetupModelT,
+        mission_id: str,
+        setup_id: str,
+        setup_version_id: str,
+    ) -> str:
+        """Create and start a new module setup configuration job.
+
+        This method initializes a new module job, assigns it a unique job ID,
+        and starts the config setup it in the background.
+
+        Args:
+            config_setup_data: The input data required to start the job.
+            mission_id: The mission ID associated with the job.
+            setup_id: The setup ID associated with the module.
+            setup_version_id: The setup ID.
+
+        Returns:
+            str: The unique identifier (job ID) of the created job.
+
+        Raises:
+            Exception: If the module fails to start.
+        """
+        job_id = str(uuid.uuid4())
+        # TODO: Ensure the job_id is unique.
+        module = ModuleFactory.create_module_instance(self.module_class, job_id, mission_id, setup_id, setup_version_id)
+        self.tasks_sessions[job_id] = TaskSession(job_id, mission_id, self.channel, module)
+
+        try:
+            await module.start_config_setup(
+                config_setup_data,
+                await self.job_specific_callback(self.add_to_queue, job_id),
+            )
+            logger.debug("Module %s (%s) started successfully", job_id, module.name)
+        except Exception:
+            # Remove the module from the manager in case of an error.
+            del self.tasks_sessions[job_id]
+            logger.exception("Failed to start module", extra={"job_id": job_id})
+            raise
+        else:
+            return job_id
+
+    async def add_to_queue(self, job_id: str, output_data: OutputModelT | ModuleCodeModel) -> None:
+        """Add output data to the queue for a specific job.
+
+        This method is used as a callback to handle output data generated by a module job.
+
+        Args:
+            job_id: The unique identifier of the job.
+            output_data: The output data produced by the job.
+        """
+        session = self.tasks_sessions[job_id]
+        await session.queue.put(output_data.model_dump())
+
+    @asynccontextmanager  # type: ignore
+    async def generate_stream_consumer(self, job_id: str) -> AsyncIterator[AsyncGenerator[dict[str, Any], None]]:  # type: ignore
+        """Generate a stream consumer for a module's output data.
+
+        This method creates an asynchronous generator that streams output data
+        from a specific module job. If the module does not exist, it generates
+        an error message.
+
+        Args:
+            job_id: The unique identifier of the job.
+
+        Yields:
+            AsyncGenerator: A stream of output data or error messages.
+        """
+        if (session := self.tasks_sessions.get(job_id, None)) is None:
+
+            async def _error_gen() -> AsyncGenerator[dict[str, Any], None]:  # noqa: RUF029
+                """Generate an error message for a non-existent module.
+
+                Yields:
+                    AsyncGenerator: A generator yielding an error message.
+                """
+                yield {
+                    "error": {
+                        "error_message": f"Module {job_id} not found",
+                        "code": grpc.StatusCode.NOT_FOUND,
+                    }
+                }
+
+            yield _error_gen()
+            return
+
+        logger.debug("Session: %s with Module %s", job_id, session.module)
+
+        async def _stream() -> AsyncGenerator[dict[str, Any], Any]:
+            """Stream output data from the module with simple blocking pattern.
+
+            This implementation uses a simple one-item-at-a-time pattern optimized
+            for local execution where we have direct access to session status:
+            1. Block waiting for each item
+            2. Check termination conditions after each item
+            3. Clean shutdown when task completes
+
+            This pattern provides:
+            - Immediate termination when task completes
+            - Direct session status monitoring
+            - Simple, predictable behavior for local tasks
+
+            Yields:
+                dict: Output data generated by the module.
+            """
+            while True:
+                # Block for next item - if queue is empty but producer not finished yet
+                msg = await session.queue.get()
+                try:
+                    yield msg
+                finally:
+                    # Always mark task as done, even if consumer raises exception
+                    session.queue.task_done()
+
+                # Check termination conditions after each message
+                # This allows immediate shutdown when the task completes
+                if (
+                    session.is_cancelled.is_set()
+                    or (session.status is TaskStatus.COMPLETED and session.queue.empty())
+                    or session.status is TaskStatus.FAILED
+                ):
+                    logger.debug(
+                        "Stream ending for job %s: cancelled=%s, status=%s, queue_empty=%s",
+                        job_id,
+                        session.is_cancelled.is_set(),
+                        session.status,
+                        session.queue.empty(),
+                    )
+                    break
+
+        yield _stream()
+
+    async def create_module_instance_job(
+        self,
+        input_data: InputModelT,
+        setup_data: SetupModelT,
+        mission_id: str,
+        setup_id: str,
+        setup_version_id: str,
+    ) -> str:
+        """Create and start a new module job.
+
+        This method initializes a new module job, assigns it a unique job ID,
+        and starts it in the background.
+
+        Args:
+            input_data: The input data required to start the job.
+            setup_data: The setup configuration for the module.
+            mission_id: The mission ID associated with the job.
+            setup_id: The setup ID associated with the module.
+            setup_version_id: The setup Version ID associated with the module.
+
+        Returns:
+            str: The unique identifier (job ID) of the created job.
+
+        Raises:
+            Exception: If the module fails to start.
+        """
+        job_id = str(uuid.uuid4())
+        module = ModuleFactory.create_module_instance(self.module_class, job_id, mission_id, setup_id, setup_version_id)
+        callback = await self.job_specific_callback(self.add_to_queue, job_id)
+
+        await self.create_task(
+            job_id,
+            mission_id,
+            module,
+            module.start(input_data, setup_data, callback, done_callback=None),
+        )
+        logger.info("Managed task started: '%s'", job_id, extra={"task_id": job_id})
+        return job_id
+
+    async def clean_session(self, task_id: str, mission_id: str) -> bool:
+        """Clean a task's session.
+
+        Args:
+            task_id: Unique identifier for the task.
+            mission_id: Mission identifier.
+
+        Returns:
+            bool: True if the task was successfully cleaned, False otherwise.
+        """
+        return await self._task_manager.clean_session(task_id, mission_id)
+
+    async def stop_module(self, job_id: str) -> bool:
+        """Stop a running module job.
+
+        Args:
+            job_id: The unique identifier of the job to stop.
+
+        Returns:
+            bool: True if the module was successfully stopped, False if it does not exist.
+
+        Raises:
+            Exception: If an error occurs while stopping the module.
+        """
+        logger.info("Stop module requested", extra={"job_id": job_id})
+
+        async with self._lock:
+            session = self.tasks_sessions.get(job_id)
+
+            if not session:
+                logger.warning("Session not found", extra={"job_id": job_id})
+                return False
+            try:
+                await session.module.stop()
+                await self.cancel_task(job_id, session.mission_id)
+                logger.debug(
+                    "Module stopped successfully",
+                    extra={"job_id": job_id, "mission_id": session.mission_id},
+                )
+            except Exception:
+                logger.exception("Error stopping module", extra={"job_id": job_id})
+                raise
+            else:
+                return True
+
+    async def get_module_status(self, job_id: str) -> TaskStatus:
+        """Retrieve the status of a module job.
+
+        Args:
+            job_id: The unique identifier of the job.
+
+        Returns:
+            ModuleStatus: The status of the module.
+        """
+        session = self.tasks_sessions.get(job_id, None)
+        return session.status if session is not None else TaskStatus.FAILED
+
+    async def wait_for_completion(self, job_id: str) -> None:
+        """Wait for a task to complete by awaiting its asyncio.Task.
+
+        Args:
+            job_id: The unique identifier of the job to wait for.
+
+        Raises:
+            KeyError: If the job_id is not found in tasks.
+        """
+        if job_id not in self._task_manager.tasks:
+            msg = f"Job {job_id} not found"
+            raise KeyError(msg)
+        await self._task_manager.tasks[job_id]
+
+    async def stop_all_modules(self) -> None:
+        """Stop all currently running module jobs.
+
+        This method ensures that all active jobs are gracefully terminated
+        and closes the SurrealDB connection.
+        """
+        # Snapshot job IDs while holding lock
+        async with self._lock:
+            job_ids = list(self.tasks_sessions.keys())
+
+        # Release lock before calling stop_module (which has its own lock)
+        if job_ids:
+            stop_tasks = [self.stop_module(job_id) for job_id in job_ids]
+            await asyncio.gather(*stop_tasks, return_exceptions=True)
+
+        # Close SurrealDB connection after stopping all modules
+        if hasattr(self, "channel"):
+            try:
+                await self.channel.close()
+                logger.info("SingleJobManager: SurrealDB connection closed")
+            except Exception as e:
+                logger.warning("Failed to close SurrealDB connection: %s", e)
+
+    async def list_modules(self) -> dict[str, dict[str, Any]]:
+        """List all modules along with their statuses.
+
+        Returns:
+            dict[str, dict[str, Any]]: A dictionary containing information about all modules and their statuses.
+        """
+        return {
+            job_id: {
+                "name": session.module.name,
+                "status": session.module.status,
+                "class": session.module.__class__.__name__,
+            }
+            for job_id, session in self.tasks_sessions.items()
+        }
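The new SingleJobManager above runs a module job in-process on a LocalTaskManager and streams its output through a per-job asyncio queue. A minimal usage sketch follows; it is assembled only from the signatures in the hunk above, and `my_module_class`, the concrete ServicesMode member, and the input/setup payloads are placeholders rather than digitalkin API guarantees.

```python
# Hedged usage sketch of SingleJobManager (0.3.2.dev14), based on the new
# file above. my_module_class, services_mode, input_data and setup_data are
# caller-supplied placeholders.
from digitalkin.core.job_manager.single_job_manager import SingleJobManager


async def run_once(my_module_class, services_mode, input_data, setup_data) -> None:
    manager = SingleJobManager(my_module_class, services_mode, default_timeout=10.0)
    await manager.start()  # opens the SurrealDB channel used by TaskSession

    job_id = await manager.create_module_instance_job(
        input_data,
        setup_data,
        mission_id="mission-1",
        setup_id="setup-1",
        setup_version_id="v1",
    )

    # Stream the module's output until the session completes, fails, or is cancelled.
    async with manager.generate_stream_consumer(job_id) as stream:
        async for message in stream:
            print(message)

    await manager.stop_all_modules()  # also closes the SurrealDB connection
```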
digitalkin/{modules → core}/job_manager/taskiq_broker.py

@@ -1,10 +1,12 @@
 """Taskiq broker & RSTREAM producer for the job manager."""
 
 import asyncio
+import datetime
 import json
 import logging
 import os
 import pickle  # noqa: S403
+from typing import Any
 
 from rstream import Producer
 from rstream.exceptions import PreconditionFailed
@@ -14,11 +16,15 @@ from taskiq.compat import model_validate
 from taskiq.message import BrokerMessage
 from taskiq_aio_pika import AioPikaBroker
 
+from digitalkin.core.common import ConnectionFactory, ModuleFactory
+from digitalkin.core.job_manager.base_job_manager import BaseJobManager
+from digitalkin.core.task_manager.task_executor import TaskExecutor
+from digitalkin.core.task_manager.task_session import TaskSession
 from digitalkin.logger import logger
-from digitalkin.models.module.
+from digitalkin.models.module.module import ModuleCodeModel
+from digitalkin.models.module.module_types import DataModel, OutputModelT
+from digitalkin.models.module.utility import EndOfStreamOutput
 from digitalkin.modules._base_module import BaseModule
-from digitalkin.modules.job_manager.base_job_manager import BaseJobManager
-from digitalkin.modules.job_manager.job_manager_models import StreamCodeModel
 from digitalkin.services.services_config import ServicesConfig
 from digitalkin.services.services_models import ServicesMode
 
@@ -118,7 +124,25 @@ RSTREAM_PRODUCER = define_producer()
 TASKIQ_BROKER = define_broker()
 
 
-async def
+async def cleanup_global_resources() -> None:
+    """Clean up global resources (producer and broker connections).
+
+    This should be called during shutdown to prevent connection leaks.
+    """
+    try:
+        await RSTREAM_PRODUCER.close()
+        logger.info("RStream producer closed successfully")
+    except Exception as e:
+        logger.warning("Failed to close RStream producer: %s", e)
+
+    try:
+        await TASKIQ_BROKER.shutdown()
+        logger.info("Taskiq broker shut down successfully")
+    except Exception as e:
+        logger.warning("Failed to shutdown Taskiq broker: %s", e)
+
+
+async def send_message_to_stream(job_id: str, output_data: OutputModelT | ModuleCodeModel) -> None:  # type: ignore[type-var]
     """Callback define to add a message frame to the Rstream.
 
     Args:
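The new `cleanup_global_resources` helper above closes the module-level RStream producer and shuts down the Taskiq broker. A minimal shutdown sketch, assuming the helper is safe to await once when a worker process exits:

```python
import asyncio

from digitalkin.core.job_manager.taskiq_broker import cleanup_global_resources


async def run_worker() -> None:
    try:
        ...  # worker main loop (placeholder)
    finally:
        # Release broker/producer connections to avoid leaks on shutdown.
        await cleanup_global_resources()


if __name__ == "__main__":
    asyncio.run(run_worker())
```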
@@ -152,27 +176,70 @@ async def run_start_module(
         setup_data: dict,
         context: Allow TaskIQ context access
     """
-    logger.
+    logger.info("Starting module with services_mode: %s", services_mode)
     services_config = ServicesConfig(
         services_config_strategies=module_class.services_config_strategies,
         services_config_params=module_class.services_config_params,
         mode=services_mode,
     )
     setattr(module_class, "services_config", services_config)
-    logger.
+    logger.debug("Services config: %s | Module config: %s", services_config, module_class.services_config)
+    module_class.discover()
 
     job_id = context.message.task_id
-    callback = await BaseJobManager.job_specific_callback(send_message_to_stream, job_id)
-    module = module_class
-
-
-
-
-
-    #
-
-
-
+    callback = await BaseJobManager.job_specific_callback(send_message_to_stream, job_id)  # type: ignore[type-var]
+    module = ModuleFactory.create_module_instance(module_class, job_id, mission_id, setup_id, setup_version_id)
+
+    channel = None
+    try:
+        # Create TaskExecutor and supporting components for worker execution
+        executor = TaskExecutor()
+        # SurrealDB env vars are expected to be set in env.
+        channel = await ConnectionFactory.create_surreal_connection("taskiq_worker", datetime.timedelta(seconds=5))
+        session = TaskSession(job_id, mission_id, channel, module, datetime.timedelta(seconds=2))
+
+        # Execute the task using TaskExecutor
+        # Create a proper done callback that handles errors
+        async def send_end_of_stream(_: Any) -> None:  # noqa: ANN401
+            try:
+                await callback(DataModel(root=EndOfStreamOutput()))
+            except Exception as e:
+                logger.error("Error sending end of stream: %s", e, exc_info=True)
+
+        # Reconstruct Pydantic models from dicts for type safety
+        try:
+            input_model = module_class.create_input_model(input_data)
+            setup_model = await module_class.create_setup_model(setup_data)
+        except Exception as e:
+            logger.error("Failed to reconstruct models for job %s: %s", job_id, e, exc_info=True)
+            raise
+
+        supervisor_task = await executor.execute_task(
+            task_id=job_id,
+            mission_id=mission_id,
+            coro=module.start(
+                input_model,
+                setup_model,
+                callback,
+                done_callback=lambda result: asyncio.ensure_future(send_end_of_stream(result)),
+            ),
+            session=session,
+            channel=channel,
+        )
+
+        # Wait for the supervisor task to complete
+        await supervisor_task
+        logger.info("Module task %s completed", job_id)
+    except Exception:
+        logger.exception("Error running module %s", job_id)
+        raise
+    finally:
+        # Cleanup channel
+        if channel is not None:
+            try:
+                await channel.close()
+            except Exception:
+                logger.exception("Error closing channel for job %s", job_id)
 
 
 @TASKIQ_BROKER.task
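In the `run_start_module` hunk above, completion is signalled by a synchronous `done_callback` that schedules the async `send_end_of_stream` coroutine with `asyncio.ensure_future`, since a plain callback cannot `await`. A generic, library-agnostic illustration of that pattern (not digitalkin code) is sketched here:

```python
import asyncio


async def notify_end_of_stream(result: object) -> None:
    # Stand-in for pushing an end-of-stream frame to the consumer.
    print("stream finished with:", result)


async def produce() -> str:
    await asyncio.sleep(0.1)
    return "done"


async def main() -> None:
    task = asyncio.create_task(produce())
    # A sync done callback schedules the async notifier instead of awaiting it.
    task.add_done_callback(lambda t: asyncio.ensure_future(notify_end_of_stream(t.result())))
    await task
    await asyncio.sleep(0.1)  # give the scheduled notifier time to run


if __name__ == "__main__":
    asyncio.run(main())
```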
@@ -194,23 +261,51 @@ async def run_config_module(
         module_class: type[BaseModule],
         services_mode: ServicesMode,
         config_setup_data: dict,
-        setup_data: dict,
         context: Allow TaskIQ context access
     """
-    logger.
+    logger.info("Starting config module with services_mode: %s", services_mode)
     services_config = ServicesConfig(
         services_config_strategies=module_class.services_config_strategies,
         services_config_params=module_class.services_config_params,
         mode=services_mode,
     )
     setattr(module_class, "services_config", services_config)
-    logger.
+    logger.debug("Services config: %s | Module config: %s", services_config, module_class.services_config)
 
     job_id = context.message.task_id
-    callback = await BaseJobManager.job_specific_callback(send_message_to_stream, job_id)
-    module = module_class
+    callback = await BaseJobManager.job_specific_callback(send_message_to_stream, job_id)  # type: ignore[type-var]
+    module = ModuleFactory.create_module_instance(module_class, job_id, mission_id, setup_id, setup_version_id)
 
-
-
-
-
+    # Override environment variables temporarily to use manager's SurrealDB
+    channel = None
+    try:
+        # Create TaskExecutor and supporting components for worker execution
+        executor = TaskExecutor()
+        # SurrealDB env vars are expected to be set in env.
+        channel = await ConnectionFactory.create_surreal_connection("taskiq_worker", datetime.timedelta(seconds=5))
+        session = TaskSession(job_id, mission_id, channel, module, datetime.timedelta(seconds=2))
+
+        # Create and run the config setup task with TaskExecutor
+        setup_model = module_class.create_config_setup_model(config_setup_data)
+
+        supervisor_task = await executor.execute_task(
+            task_id=job_id,
+            mission_id=mission_id,
+            coro=module.start_config_setup(setup_model, callback),
+            session=session,
+            channel=channel,
+        )
+
+        # Wait for the supervisor task to complete
+        await supervisor_task
+        logger.info("Config module task %s completed", job_id)
+    except Exception:
+        logger.exception("Error running config module %s", job_id)
+        raise
+    finally:
+        # Cleanup channel
+        if channel is not None:
+            try:
+                await channel.close()
+            except Exception:
+                logger.exception("Error closing channel for job %s", job_id)