diracx-db 0.0.1a19__py3-none-any.whl → 0.0.1a21__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,302 +0,0 @@
1
- import asyncio
2
- from datetime import datetime, timezone
3
- from unittest.mock import MagicMock
4
-
5
- from fastapi import BackgroundTasks
6
-
7
- from diracx.core.config.schema import Config
8
- from diracx.core.exceptions import JobNotFound
9
- from diracx.core.models import (
10
- JobStatus,
11
- JobStatusUpdate,
12
- ScalarSearchOperator,
13
- SetJobStatusReturn,
14
- )
15
-
16
- from .. import JobDB, JobLoggingDB, SandboxMetadataDB, TaskQueueDB
17
-
18
-
19
async def set_job_status(
    job_id: int,
    status: dict[datetime, JobStatusUpdate],
    job_db: JobDB,
    job_logging_db: JobLoggingDB,
    force: bool = False,
) -> SetJobStatusReturn:
    """Set various status fields for job specified by its jobId.

    Set only the last status in the JobDB, updating all the status
    logging information in the JobLoggingDB. The status dict has datetime
    as a key and status information dictionary as values.

    :param job_id: ID of the job to update
    :param status: mapping of update time -> JobStatusUpdate to apply
    :param job_db: JobDB used to read the current state and write new attributes
    :param job_logging_db: JobLoggingDB receiving one record per update time
    :param force: passed through to getNewStatus to bypass state-machine checks
    :raises: JobNotFound if the job is not found in one of the DBs
    """
    # DIRAC imports are kept function-local so importing this module does not
    # require a configured DIRAC installation.
    from DIRAC.Core.Utilities import TimeUtilities
    from DIRAC.Core.Utilities.ReturnValues import returnValueOrRaise
    from DIRAC.WorkloadManagementSystem.Utilities.JobStatusUtility import (
        getNewStatus,
        getStartAndEndTime,
    )

    # Transform JobStatusUpdate objects into dicts, dropping unset (None) fields.
    # NOTE: this means any key, including "Source", may be absent downstream.
    statusDict = {}
    for key, value in status.items():
        statusDict[key] = {k: v for k, v in value.model_dump().items() if v is not None}

    _, res = await job_db.search(
        parameters=["Status", "StartExecTime", "EndExecTime"],
        search=[
            {
                "parameter": "JobID",
                "operator": ScalarSearchOperator.EQUAL,
                "value": str(job_id),
            }
        ],
        sorts=[],
    )
    if not res:
        raise JobNotFound(job_id) from None

    currentStatus = res[0]["Status"]
    startTime = res[0]["StartExecTime"]
    endTime = res[0]["EndExecTime"]

    # If the current status is Stalled and we get an update, it should probably be "Running"
    if currentStatus == JobStatus.STALLED:
        currentStatus = JobStatus.RUNNING

    # Get the latest time stamps of major status updates
    result = await job_logging_db.get_wms_time_stamps(job_id)

    # This is more precise than "LastTime". timeStamps is a sorted list of
    # (epoch, status) tuples; the last entry is the most recent recorded update.
    timeStamps = sorted((float(t), s) for s, t in result.items())
    lastTime = TimeUtilities.fromEpoch(timeStamps[-1][0]).replace(tzinfo=timezone.utc)

    # Get chronological order of new updates
    updateTimes = sorted(statusDict)

    newStartTime, newEndTime = getStartAndEndTime(
        startTime, endTime, updateTimes, timeStamps, statusDict
    )

    job_data = {}
    # Only recompute the job's main status if the newest update is not older
    # than the last recorded one.
    if updateTimes[-1] >= lastTime:
        new_status, new_minor, new_application = returnValueOrRaise(
            getNewStatus(
                job_id,
                updateTimes,
                lastTime,
                statusDict,
                currentStatus,
                force,
                MagicMock(),
            )
        )

        if new_status:
            job_data["Status"] = new_status
            job_data["LastUpdateTime"] = datetime.now(timezone.utc)
        if new_minor:
            job_data["MinorStatus"] = new_minor
        if new_application:
            job_data["ApplicationStatus"] = new_application

        # TODO: implement elasticJobParametersDB ?
        # if cls.elasticJobParametersDB:
        #     result = cls.elasticJobParametersDB.setJobParameter(int(jobID), "Status", status)
        #     if not result["OK"]:
        #         return result

    for updTime in updateTimes:
        # BUGFIX: "Source" is dropped from statusDict entries when unset (see
        # the model_dump filter above), so use .get to avoid a KeyError here.
        if statusDict[updTime].get("Source", "").startswith("Job"):
            job_data["HeartBeatTime"] = updTime

    if not startTime and newStartTime:
        job_data["StartExecTime"] = newStartTime

    if not endTime and newEndTime:
        job_data["EndExecTime"] = newEndTime

    if job_data:
        await job_db.setJobAttributes(job_id, job_data)

    # Record every update in the logging DB, substituting the conventional
    # placeholders for missing fields.
    for updTime in updateTimes:
        sDict = statusDict[updTime]
        if not sDict.get("Status"):
            sDict["Status"] = "idem"
        if not sDict.get("MinorStatus"):
            sDict["MinorStatus"] = "idem"
        if not sDict.get("ApplicationStatus"):
            sDict["ApplicationStatus"] = "idem"
        if not sDict.get("Source"):
            sDict["Source"] = "Unknown"

        await job_logging_db.insert_record(
            job_id,
            sDict["Status"],
            sDict["MinorStatus"],
            sDict["ApplicationStatus"],
            updTime,
            sDict["Source"],
        )

    return SetJobStatusReturn(**job_data)
145
-
146
-
147
- class ForgivingTaskGroup(asyncio.TaskGroup):
148
- # Hacky way, check https://stackoverflow.com/questions/75250788/how-to-prevent-python3-11-taskgroup-from-canceling-all-the-tasks
149
- # Basically e're using this because we want to wait for all tasks to finish, even if one of them raises an exception
150
- def _abort(self):
151
- return None
152
-
153
-
154
async def delete_jobs(
    job_ids: list[int],
    config: Config,
    job_db: JobDB,
    job_logging_db: JobLoggingDB,
    task_queue_db: TaskQueueDB,
    background_task: BackgroundTasks,
):
    """Removing jobs from task queues, send a kill command and set status to DELETED.

    :raises: BaseExceptionGroup[JobNotFound] for every job that was not found.
    """
    await _remove_jobs_from_task_queue(job_ids, config, task_queue_db, background_task)
    # TODO: implement StorageManagerClient
    # returnValueOrRaise(StorageManagerClient().killTasksBySourceTaskID(job_ids))

    # ForgivingTaskGroup lets every job be processed even if some updates
    # fail; the failures are collected into a BaseExceptionGroup on exit.
    async with ForgivingTaskGroup() as task_group:
        for job_id in job_ids:
            task_group.create_task(job_db.set_job_command(job_id, "Kill"))

            deletion_update = {
                datetime.now(timezone.utc): JobStatusUpdate(
                    Status=JobStatus.DELETED,
                    MinorStatus="Checking accounting",
                    Source="job_manager",
                )
            }
            task_group.create_task(
                set_job_status(
                    job_id,
                    deletion_update,
                    job_db,
                    job_logging_db,
                    force=True,
                )
            )
189
-
190
-
191
async def kill_jobs(
    job_ids: list[int],
    config: Config,
    job_db: JobDB,
    job_logging_db: JobLoggingDB,
    task_queue_db: TaskQueueDB,
    background_task: BackgroundTasks,
):
    """Kill jobs by removing them from the task queues, set kill as a job command and setting the job status to KILLED.

    :raises: BaseExceptionGroup[JobNotFound] for every job that was not found.
    """
    await _remove_jobs_from_task_queue(job_ids, config, task_queue_db, background_task)
    # TODO: implement StorageManagerClient
    # returnValueOrRaise(StorageManagerClient().killTasksBySourceTaskID(job_ids))

    # ForgivingTaskGroup keeps processing the remaining jobs even when one of
    # them fails; all failures surface together as a BaseExceptionGroup.
    # TODO: consider a plain sequential loop collecting JobNotFound errors
    # instead — probably more stable but less performant.
    async with ForgivingTaskGroup() as task_group:
        for job_id in job_ids:
            task_group.create_task(job_db.set_job_command(job_id, "Kill"))

            kill_update = {
                datetime.now(timezone.utc): JobStatusUpdate(
                    Status=JobStatus.KILLED,
                    MinorStatus="Marked for termination",
                    Source="job_manager",
                )
            }
            task_group.create_task(
                set_job_status(
                    job_id,
                    kill_update,
                    job_db,
                    job_logging_db,
                    force=True,
                )
            )
248
-
249
-
250
async def remove_jobs(
    job_ids: list[int],
    config: Config,
    job_db: JobDB,
    job_logging_db: JobLoggingDB,
    sandbox_metadata_db: SandboxMetadataDB,
    task_queue_db: TaskQueueDB,
    background_task: BackgroundTasks,
):
    """Fully remove a job from the WMS databases.

    Unassigns sandboxes, then deletes the jobs from TaskQueueDB,
    JobLoggingDB and JobDB, in that order.

    :raises: nothing.
    """
    # Remove the staging task from the StorageManager
    # TODO: this was not done in the JobManagerHandler, but it was done in the kill method
    # I think it should be done here too
    # TODO: implement StorageManagerClient
    # returnValueOrRaise(StorageManagerClient().killTasksBySourceTaskID([job_id]))

    # TODO: this was also not done in the JobManagerHandler, but it was done in the JobCleaningAgent
    # I think it should be done here as well
    await sandbox_metadata_db.unassign_sandboxes_to_jobs(job_ids)

    # Remove the job from TaskQueueDB
    await _remove_jobs_from_task_queue(job_ids, config, task_queue_db, background_task)

    # Remove the job from JobLoggingDB
    await job_logging_db.delete_records(job_ids)

    # Remove the job from JobDB
    await job_db.delete_jobs(job_ids)
280
-
281
-
282
async def _remove_jobs_from_task_queue(
    job_ids: list[int],
    config: Config,
    task_queue_db: TaskQueueDB,
    background_task: BackgroundTasks,
):
    """Remove the job from TaskQueueDB."""
    tq_infos = await task_queue_db.get_tq_infos_for_jobs(job_ids)
    await task_queue_db.remove_jobs(job_ids)

    # Schedule deletion of any task queue left empty by the removal.
    for tq_id, owner, owner_group, vo in tq_infos:
        # TODO: move to Celery
        group_config = config.Registry[vo].Groups[owner_group]
        background_task.add_task(
            task_queue_db.delete_task_queue_if_empty,
            tq_id,
            owner,
            owner_group,
            group_config.JobShare,
            group_config.Properties,
            config.Operations[vo].Services.JobScheduling.EnableSharesCorrection,
            group_config.AllowBackgroundTQs,
        )