diracx-db 0.0.1a19__py3-none-any.whl → 0.0.1a21__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- diracx/db/sql/job/db.py +107 -261
- diracx/db/sql/job_logging/db.py +74 -0
- diracx/db/sql/utils/__init__.py +11 -3
- diracx/db/sql/utils/job.py +574 -0
- {diracx_db-0.0.1a19.dist-info → diracx_db-0.0.1a21.dist-info}/METADATA +2 -3
- {diracx_db-0.0.1a19.dist-info → diracx_db-0.0.1a21.dist-info}/RECORD +9 -9
- {diracx_db-0.0.1a19.dist-info → diracx_db-0.0.1a21.dist-info}/WHEEL +1 -1
- diracx/db/sql/utils/job_status.py +0 -302
- {diracx_db-0.0.1a19.dist-info → diracx_db-0.0.1a21.dist-info}/entry_points.txt +0 -0
- {diracx_db-0.0.1a19.dist-info → diracx_db-0.0.1a21.dist-info}/top_level.txt +0 -0
@@ -1,302 +0,0 @@
|
|
1
|
-
import asyncio
|
2
|
-
from datetime import datetime, timezone
|
3
|
-
from unittest.mock import MagicMock
|
4
|
-
|
5
|
-
from fastapi import BackgroundTasks
|
6
|
-
|
7
|
-
from diracx.core.config.schema import Config
|
8
|
-
from diracx.core.exceptions import JobNotFound
|
9
|
-
from diracx.core.models import (
|
10
|
-
JobStatus,
|
11
|
-
JobStatusUpdate,
|
12
|
-
ScalarSearchOperator,
|
13
|
-
SetJobStatusReturn,
|
14
|
-
)
|
15
|
-
|
16
|
-
from .. import JobDB, JobLoggingDB, SandboxMetadataDB, TaskQueueDB
|
17
|
-
|
18
|
-
|
19
|
-
async def set_job_status(
    job_id: int,
    status: dict[datetime, JobStatusUpdate],
    job_db: JobDB,
    job_logging_db: JobLoggingDB,
    force: bool = False,
) -> SetJobStatusReturn:
    """Set various status fields for job specified by its jobId.

    Set only the last status in the JobDB, updating all the status
    logging information in the JobLoggingDB. The status dict has datetime
    as a key and status information dictionary as values.

    :param job_id: ID of the job to update.
    :param status: mapping of update time to the status fields reported at
        that time; fields whose value is ``None`` are ignored. An empty
        mapping is a no-op (after the job has been looked up).
    :param job_db: JobDB used to read the current job state and to write the
        new attributes.
    :param job_logging_db: JobLoggingDB used to read the existing time stamps
        and to record every update.
    :param force: forwarded unchanged to DIRAC's ``getNewStatus``.
    :return: a ``SetJobStatusReturn`` built from the attributes written to the
        JobDB (built from no attributes when nothing had to be written).
    :raises: JobNotFound if the job is not found in one of the DBs
    """
    from DIRAC.Core.Utilities import TimeUtilities
    from DIRAC.Core.Utilities.ReturnValues import returnValueOrRaise
    from DIRAC.WorkloadManagementSystem.Utilities.JobStatusUtility import (
        getNewStatus,
        getStartAndEndTime,
    )

    # transform JobStateUpdate objects into dicts, dropping the unset fields
    statusDict = {
        key: {k: v for k, v in value.model_dump().items() if v is not None}
        for key, value in status.items()
    }

    _, res = await job_db.search(
        parameters=["Status", "StartExecTime", "EndExecTime"],
        search=[
            {
                "parameter": "JobID",
                "operator": ScalarSearchOperator.EQUAL,
                "value": str(job_id),
            }
        ],
        sorts=[],
    )
    if not res:
        raise JobNotFound(job_id) from None

    currentStatus = res[0]["Status"]
    startTime = res[0]["StartExecTime"]
    endTime = res[0]["EndExecTime"]

    # If the current status is Stalled and we get an update, it should probably be "Running"
    if currentStatus == JobStatus.STALLED:
        currentStatus = JobStatus.RUNNING

    # Get the latest time stamps of major status updates
    result = await job_logging_db.get_wms_time_stamps(job_id)

    # Nothing to apply: return before any code that indexes the (empty) list
    # of update times. Both lookups above have already run, so a missing job
    # is still reported.
    if not statusDict:
        return SetJobStatusReturn()

    #####################################################################################################

    # This is more precise than "LastTime". timeStamps is a sorted list of tuples...
    timeStamps = sorted((float(t), s) for s, t in result.items())
    # NOTE(review): lastTime is made timezone-aware (UTC) here, so the keys of
    # `status` presumably have to be timezone-aware as well - confirm with callers.
    lastTime = TimeUtilities.fromEpoch(timeStamps[-1][0]).replace(tzinfo=timezone.utc)

    # Get chronological order of new updates
    updateTimes = sorted(statusDict)

    newStartTime, newEndTime = getStartAndEndTime(
        startTime, endTime, updateTimes, timeStamps, statusDict
    )

    job_data = {}
    # Only touch the job's status fields when the newest update is not older
    # than what has already been logged.
    if updateTimes[-1] >= lastTime:
        new_status, new_minor, new_application = returnValueOrRaise(
            getNewStatus(
                job_id,
                updateTimes,
                lastTime,
                statusDict,
                currentStatus,
                force,
                # Stand-in for the last positional argument of getNewStatus
                # (presumably its logger - TODO confirm and pass a real one).
                MagicMock(),
            )
        )

        if new_status:
            job_data["Status"] = new_status
            job_data["LastUpdateTime"] = datetime.now(timezone.utc)
        if new_minor:
            job_data["MinorStatus"] = new_minor
        if new_application:
            job_data["ApplicationStatus"] = new_application

        # TODO: implement elasticJobParametersDB ?
        # if cls.elasticJobParametersDB:
        #     result = cls.elasticJobParametersDB.setJobParameter(int(jobID), "Status", status)
        #     if not result["OK"]:
        #         return result

    # The loop runs in chronological order, so HeartBeatTime ends up being the
    # time of the latest update whose Source starts with "Job". "Source" may
    # have been dropped above when it was None, hence the .get().
    for updTime in updateTimes:
        if statusDict[updTime].get("Source", "").startswith("Job"):
            job_data["HeartBeatTime"] = updTime

    if not startTime and newStartTime:
        job_data["StartExecTime"] = newStartTime

    if not endTime and newEndTime:
        job_data["EndExecTime"] = newEndTime

    if job_data:
        await job_db.setJobAttributes(job_id, job_data)

    # Record every update in the logging DB; missing fields are stored as the
    # "idem" placeholder and a missing source as "Unknown".
    for updTime in updateTimes:
        sDict = statusDict[updTime]
        if not sDict.get("Status"):
            sDict["Status"] = "idem"
        if not sDict.get("MinorStatus"):
            sDict["MinorStatus"] = "idem"
        if not sDict.get("ApplicationStatus"):
            sDict["ApplicationStatus"] = "idem"
        if not sDict.get("Source"):
            sDict["Source"] = "Unknown"

        await job_logging_db.insert_record(
            job_id,
            sDict["Status"],
            sDict["MinorStatus"],
            sDict["ApplicationStatus"],
            updTime,
            sDict["Source"],
        )

    return SetJobStatusReturn(**job_data)
|
145
|
-
|
146
|
-
|
147
|
-
class ForgivingTaskGroup(asyncio.TaskGroup):
|
148
|
-
# Hacky way, check https://stackoverflow.com/questions/75250788/how-to-prevent-python3-11-taskgroup-from-canceling-all-the-tasks
|
149
|
-
# Basically e're using this because we want to wait for all tasks to finish, even if one of them raises an exception
|
150
|
-
def _abort(self):
|
151
|
-
return None
|
152
|
-
|
153
|
-
|
154
|
-
async def delete_jobs(
    job_ids: list[int],
    config: Config,
    job_db: JobDB,
    job_logging_db: JobLoggingDB,
    task_queue_db: TaskQueueDB,
    background_task: BackgroundTasks,
):
    """Take the jobs out of their task queues, send them a "Kill" command and mark them DELETED.

    For every job two tasks are scheduled in a ``ForgivingTaskGroup``: one
    setting the "Kill" job command and one forcing the status update.

    :raises: BaseExceptionGroup[JobNotFound] for every job that was not found.
    """
    await _remove_jobs_from_task_queue(job_ids, config, task_queue_db, background_task)
    # TODO: implement StorageManagerClient
    # returnValueOrRaise(StorageManagerClient().killTasksBySourceTaskID(job_ids))

    async with ForgivingTaskGroup() as tasks:
        for job_id in job_ids:
            tasks.create_task(job_db.set_job_command(job_id, "Kill"))

            # The timestamp is taken per job, right before the status task is created.
            deleted_update = {
                datetime.now(timezone.utc): JobStatusUpdate(
                    Status=JobStatus.DELETED,
                    MinorStatus="Checking accounting",
                    Source="job_manager",
                )
            }
            tasks.create_task(
                set_job_status(
                    job_id,
                    deleted_update,
                    job_db,
                    job_logging_db,
                    force=True,
                )
            )
|
189
|
-
|
190
|
-
|
191
|
-
async def kill_jobs(
    job_ids: list[int],
    config: Config,
    job_db: JobDB,
    job_logging_db: JobLoggingDB,
    task_queue_db: TaskQueueDB,
    background_task: BackgroundTasks,
):
    """Take the jobs out of their task queues, send them a "Kill" command and mark them KILLED.

    For every job two tasks are scheduled in a ``ForgivingTaskGroup``: one
    setting the "Kill" job command and one forcing the status update.

    :raises: BaseExceptionGroup[JobNotFound] for every job that was not found.
    """
    await _remove_jobs_from_task_queue(job_ids, config, task_queue_db, background_task)
    # TODO: implement StorageManagerClient
    # returnValueOrRaise(StorageManagerClient().killTasksBySourceTaskID(job_ids))

    async with ForgivingTaskGroup() as tasks:
        for job_id in job_ids:
            tasks.create_task(job_db.set_job_command(job_id, "Kill"))

            # The timestamp is taken per job, right before the status task is created.
            killed_update = {
                datetime.now(timezone.utc): JobStatusUpdate(
                    Status=JobStatus.KILLED,
                    MinorStatus="Marked for termination",
                    Source="job_manager",
                )
            }
            tasks.create_task(
                set_job_status(
                    job_id,
                    killed_update,
                    job_db,
                    job_logging_db,
                    force=True,
                )
            )

    # TODO: Consider using the code below instead, probably more stable but less performant
    # errors = []
    # for job_id in job_ids:
    #     try:
    #         await job_db.set_job_command(job_id, "Kill")
    #         await set_job_status(
    #             job_id,
    #             {
    #                 datetime.now(timezone.utc): JobStatusUpdate(
    #                     Status=JobStatus.KILLED,
    #                     MinorStatus="Marked for termination",
    #                     Source="job_manager",
    #                 )
    #             },
    #             job_db,
    #             job_logging_db,
    #             force=True,
    #         )
    #     except JobNotFound as e:
    #         errors.append(e)

    # if errors:
    #     raise BaseExceptionGroup("Some job ids were not found", errors)
|
248
|
-
|
249
|
-
|
250
|
-
async def remove_jobs(
    job_ids: list[int],
    config: Config,
    job_db: JobDB,
    job_logging_db: JobLoggingDB,
    sandbox_metadata_db: SandboxMetadataDB,
    task_queue_db: TaskQueueDB,
    background_task: BackgroundTasks,
):
    """Wipe the given jobs from the WMS databases.

    The steps run one after the other: sandboxes are unassigned, the jobs are
    taken out of the task queues, their logging records are deleted and
    finally the jobs themselves are deleted from the JobDB.

    :raises: nothing by itself; exceptions coming from the awaited DB calls
        are not caught here.
    """
    # Staging tasks in the StorageManager are not cleaned up yet.
    # TODO: this was not done in the JobManagerHandler, but it was done in the kill method
    # I think it should be done here too
    # TODO: implement StorageManagerClient
    # returnValueOrRaise(StorageManagerClient().killTasksBySourceTaskID([job_id]))

    # Step 1: sandboxes.
    # TODO: this was also not done in the JobManagerHandler, but it was done in the JobCleaningAgent
    # I think it should be done here as well
    await sandbox_metadata_db.unassign_sandboxes_to_jobs(job_ids)

    # Step 2: TaskQueueDB.
    await _remove_jobs_from_task_queue(job_ids, config, task_queue_db, background_task)

    # Step 3: JobLoggingDB.
    await job_logging_db.delete_records(job_ids)

    # Step 4: JobDB.
    await job_db.delete_jobs(job_ids)
|
280
|
-
|
281
|
-
|
282
|
-
async def _remove_jobs_from_task_queue(
    job_ids: list[int],
    config: Config,
    task_queue_db: TaskQueueDB,
    background_task: BackgroundTasks,
):
    """Drop the jobs from the TaskQueueDB and schedule the clean-up of their task queues.

    The task queue information is read before the jobs are removed. For each
    entry returned, a ``delete_task_queue_if_empty`` call is then registered
    on ``background_task`` with the owner group's settings taken from
    ``config``.
    """
    queue_infos = await task_queue_db.get_tq_infos_for_jobs(job_ids)
    await task_queue_db.remove_jobs(job_ids)

    for tq_id, owner, owner_group, vo in queue_infos:
        group_settings = config.Registry[vo].Groups[owner_group]
        # TODO: move to Celery
        background_task.add_task(
            task_queue_db.delete_task_queue_if_empty,
            tq_id,
            owner,
            owner_group,
            group_settings.JobShare,
            group_settings.Properties,
            config.Operations[vo].Services.JobScheduling.EnableSharesCorrection,
            group_settings.AllowBackgroundTQs,
        )
|
File without changes
|
File without changes
|