diracx-db 0.0.1a20__py3-none-any.whl → 0.0.1a22__py3-none-any.whl
This diff compares the contents of two publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- diracx/db/sql/job/db.py +107 -261
- diracx/db/sql/job_logging/db.py +74 -0
- diracx/db/sql/utils/__init__.py +11 -3
- diracx/db/sql/utils/job.py +574 -0
- {diracx_db-0.0.1a20.dist-info → diracx_db-0.0.1a22.dist-info}/METADATA +2 -2
- {diracx_db-0.0.1a20.dist-info → diracx_db-0.0.1a22.dist-info}/RECORD +9 -9
- diracx/db/sql/utils/job_status.py +0 -302
- {diracx_db-0.0.1a20.dist-info → diracx_db-0.0.1a22.dist-info}/WHEEL +0 -0
- {diracx_db-0.0.1a20.dist-info → diracx_db-0.0.1a22.dist-info}/entry_points.txt +0 -0
- {diracx_db-0.0.1a20.dist-info → diracx_db-0.0.1a22.dist-info}/top_level.txt +0 -0
diracx/db/sql/utils/job_status.py (removed, +0 -302):

```diff
@@ -1,302 +0,0 @@
-import asyncio
-from datetime import datetime, timezone
-from unittest.mock import MagicMock
-
-from fastapi import BackgroundTasks
-
-from diracx.core.config.schema import Config
-from diracx.core.exceptions import JobNotFound
-from diracx.core.models import (
-    JobStatus,
-    JobStatusUpdate,
-    ScalarSearchOperator,
-    SetJobStatusReturn,
-)
-
-from .. import JobDB, JobLoggingDB, SandboxMetadataDB, TaskQueueDB
-
-
```
```diff
-async def set_job_status(
-    job_id: int,
-    status: dict[datetime, JobStatusUpdate],
-    job_db: JobDB,
-    job_logging_db: JobLoggingDB,
-    force: bool = False,
-) -> SetJobStatusReturn:
-    """Set various status fields for job specified by its jobId.
-    Set only the last status in the JobDB, updating all the status
-    logging information in the JobLoggingDB. The status dict has datetime
-    as a key and status information dictionary as values.
-
-    :raises: JobNotFound if the job is not found in one of the DBs
-    """
-    from DIRAC.Core.Utilities import TimeUtilities
-    from DIRAC.Core.Utilities.ReturnValues import returnValueOrRaise
-    from DIRAC.WorkloadManagementSystem.Utilities.JobStatusUtility import (
-        getNewStatus,
-        getStartAndEndTime,
-    )
-
-    # transform JobStateUpdate objects into dicts
-    statusDict = {}
-    for key, value in status.items():
-        statusDict[key] = {k: v for k, v in value.model_dump().items() if v is not None}
-
-    _, res = await job_db.search(
-        parameters=["Status", "StartExecTime", "EndExecTime"],
-        search=[
-            {
-                "parameter": "JobID",
-                "operator": ScalarSearchOperator.EQUAL,
-                "value": str(job_id),
-            }
-        ],
-        sorts=[],
-    )
-    if not res:
-        raise JobNotFound(job_id) from None
-
-    currentStatus = res[0]["Status"]
-    startTime = res[0]["StartExecTime"]
-    endTime = res[0]["EndExecTime"]
-
-    # If the current status is Stalled and we get an update, it should probably be "Running"
-    if currentStatus == JobStatus.STALLED:
-        currentStatus = JobStatus.RUNNING
-
-    # Get the latest time stamps of major status updates
-    result = await job_logging_db.get_wms_time_stamps(job_id)
-
-    #####################################################################################################
-
-    # This is more precise than "LastTime". timeStamps is a sorted list of tuples...
-    timeStamps = sorted((float(t), s) for s, t in result.items())
-    lastTime = TimeUtilities.fromEpoch(timeStamps[-1][0]).replace(tzinfo=timezone.utc)
-
-    # Get chronological order of new updates
-    updateTimes = sorted(statusDict)
-
-    newStartTime, newEndTime = getStartAndEndTime(
-        startTime, endTime, updateTimes, timeStamps, statusDict
-    )
-
-    job_data = {}
-    if updateTimes[-1] >= lastTime:
-        new_status, new_minor, new_application = returnValueOrRaise(
-            getNewStatus(
-                job_id,
-                updateTimes,
-                lastTime,
-                statusDict,
-                currentStatus,
-                force,
-                MagicMock(),
-            )
-        )
-
-        if new_status:
-            job_data["Status"] = new_status
-            job_data["LastUpdateTime"] = datetime.now(timezone.utc)
-        if new_minor:
-            job_data["MinorStatus"] = new_minor
-        if new_application:
-            job_data["ApplicationStatus"] = new_application
-
-        # TODO: implement elasticJobParametersDB ?
-        # if cls.elasticJobParametersDB:
-        #     result = cls.elasticJobParametersDB.setJobParameter(int(jobID), "Status", status)
-        #     if not result["OK"]:
-        #         return result
-
-    for updTime in updateTimes:
-        if statusDict[updTime]["Source"].startswith("Job"):
-            job_data["HeartBeatTime"] = updTime
-
-    if not startTime and newStartTime:
-        job_data["StartExecTime"] = newStartTime
-
-    if not endTime and newEndTime:
-        job_data["EndExecTime"] = newEndTime
-
-    if job_data:
-        await job_db.setJobAttributes(job_id, job_data)
-
-    for updTime in updateTimes:
-        sDict = statusDict[updTime]
-        if not sDict.get("Status"):
-            sDict["Status"] = "idem"
-        if not sDict.get("MinorStatus"):
-            sDict["MinorStatus"] = "idem"
-        if not sDict.get("ApplicationStatus"):
-            sDict["ApplicationStatus"] = "idem"
-        if not sDict.get("Source"):
-            sDict["Source"] = "Unknown"
-
-        await job_logging_db.insert_record(
-            job_id,
-            sDict["Status"],
-            sDict["MinorStatus"],
-            sDict["ApplicationStatus"],
-            updTime,
-            sDict["Source"],
-        )
-
-    return SetJobStatusReturn(**job_data)
-
-
```
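A note on the timestamp handling in `set_job_status` above: `get_wms_time_stamps` appears to return a mapping of status name to epoch seconds (as strings), which the function sorts to find the newest logged update. A minimal, self-contained sketch of that step, with made-up data and `datetime.fromtimestamp(..., tz=timezone.utc)` standing in for DIRAC's `TimeUtilities.fromEpoch(...).replace(tzinfo=timezone.utc)`:

```python
from datetime import datetime, timezone

# Assumed shape of the get_wms_time_stamps result: {status_name: epoch_seconds_as_str}
result = {"Received": "1700000000.0", "Running": "1700000100.5"}

# Mirrors: timeStamps = sorted((float(t), s) for s, t in result.items())
time_stamps = sorted((float(t), s) for s, t in result.items())

# fromtimestamp(..., tz=timezone.utc) yields the same aware datetime as
# TimeUtilities.fromEpoch(...).replace(tzinfo=timezone.utc)
last_time = datetime.fromtimestamp(time_stamps[-1][0], tz=timezone.utc)

print(time_stamps)  # [(1700000000.0, 'Received'), (1700000100.5, 'Running')]
print(last_time)    # 2023-11-14 22:15:00.500000+00:00
```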
```diff
-class ForgivingTaskGroup(asyncio.TaskGroup):
-    # Hacky way, check https://stackoverflow.com/questions/75250788/how-to-prevent-python3-11-taskgroup-from-canceling-all-the-tasks
-    # Basically we're using this because we want to wait for all tasks to finish, even if one of them raises an exception
-    def _abort(self):
-        return None
-
-
```
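Overriding `_abort` as above disables the step with which `asyncio.TaskGroup` cancels sibling tasks after the first failure, while the collected errors are still raised as a `BaseExceptionGroup` on exit. A runnable sketch of the effect (CPython 3.11+; `_abort` is a private `TaskGroup` detail, so this behaviour is not guaranteed across versions):

```python
import asyncio


class ForgivingTaskGroup(asyncio.TaskGroup):
    def _abort(self):
        # No-op: never cancel sibling tasks when one of them fails.
        return None


async def work(i: int) -> None:
    await asyncio.sleep(0.01 * i)
    if i % 2:
        raise ValueError(f"task {i} failed")
    print(f"task {i} finished")


async def main() -> None:
    try:
        async with ForgivingTaskGroup() as tg:
            for i in range(4):
                tg.create_task(work(i))
    except BaseExceptionGroup as eg:
        # Tasks 0 and 2 ran to completion despite tasks 1 and 3 failing.
        print("failures:", [str(e) for e in eg.exceptions])


asyncio.run(main())
```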
```diff
-async def delete_jobs(
-    job_ids: list[int],
-    config: Config,
-    job_db: JobDB,
-    job_logging_db: JobLoggingDB,
-    task_queue_db: TaskQueueDB,
-    background_task: BackgroundTasks,
-):
-    """Removing jobs from task queues, send a kill command and set status to DELETED.
-
-    :raises: BaseExceptionGroup[JobNotFound] for every job that was not found.
-    """
-    await _remove_jobs_from_task_queue(job_ids, config, task_queue_db, background_task)
-    # TODO: implement StorageManagerClient
-    # returnValueOrRaise(StorageManagerClient().killTasksBySourceTaskID(job_ids))
-
-    async with ForgivingTaskGroup() as task_group:
-        for job_id in job_ids:
-            task_group.create_task(job_db.set_job_command(job_id, "Kill"))
-
-            task_group.create_task(
-                set_job_status(
-                    job_id,
-                    {
-                        datetime.now(timezone.utc): JobStatusUpdate(
-                            Status=JobStatus.DELETED,
-                            MinorStatus="Checking accounting",
-                            Source="job_manager",
-                        )
-                    },
-                    job_db,
-                    job_logging_db,
-                    force=True,
-                )
-            )
-
-
```
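`delete_jobs` (and `kill_jobs` below) documents that missing jobs surface as a `BaseExceptionGroup` of `JobNotFound` errors. A self-contained sketch of how a caller might unpack such a group with `except*` (Python 3.11+); `JobNotFound` and `delete_many` here are local stand-ins, not the diracx API:

```python
class JobNotFound(Exception):
    """Local stand-in for diracx.core.exceptions.JobNotFound."""


def delete_many(job_ids: list[int]) -> None:
    # Pretend jobs 2 and 5 are missing from the database.
    errors = [JobNotFound(f"job {j} not found") for j in job_ids if j in (2, 5)]
    if errors:
        raise BaseExceptionGroup("Some job ids were not found", errors)


try:
    delete_many([1, 2, 3, 5])
except* JobNotFound as group:
    for err in group.exceptions:
        print("skipped:", err)  # prints one line each for jobs 2 and 5
```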
```diff
-async def kill_jobs(
-    job_ids: list[int],
-    config: Config,
-    job_db: JobDB,
-    job_logging_db: JobLoggingDB,
-    task_queue_db: TaskQueueDB,
-    background_task: BackgroundTasks,
-):
-    """Kill jobs by removing them from the task queues, set kill as a job command and setting the job status to KILLED.
-    :raises: BaseExceptionGroup[JobNotFound] for every job that was not found.
-    """
-    await _remove_jobs_from_task_queue(job_ids, config, task_queue_db, background_task)
-    # TODO: implement StorageManagerClient
-    # returnValueOrRaise(StorageManagerClient().killTasksBySourceTaskID(job_ids))
-
-    async with ForgivingTaskGroup() as task_group:
-        for job_id in job_ids:
-            task_group.create_task(job_db.set_job_command(job_id, "Kill"))
-            task_group.create_task(
-                set_job_status(
-                    job_id,
-                    {
-                        datetime.now(timezone.utc): JobStatusUpdate(
-                            Status=JobStatus.KILLED,
-                            MinorStatus="Marked for termination",
-                            Source="job_manager",
-                        )
-                    },
-                    job_db,
-                    job_logging_db,
-                    force=True,
-                )
-            )
-
-    # TODO: Consider using the code below instead, probably more stable but less performant
-    # errors = []
-    # for job_id in job_ids:
-    #     try:
-    #         await job_db.set_job_command(job_id, "Kill")
-    #         await set_job_status(
-    #             job_id,
-    #             {
-    #                 datetime.now(timezone.utc): JobStatusUpdate(
-    #                     Status=JobStatus.KILLED,
-    #                     MinorStatus="Marked for termination",
-    #                     Source="job_manager",
-    #                 )
-    #             },
-    #             job_db,
-    #             job_logging_db,
-    #             force=True,
-    #         )
-    #     except JobNotFound as e:
-    #         errors.append(e)
-
-    # if errors:
-    #     raise BaseExceptionGroup("Some job ids were not found", errors)
-
-
```
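In both `delete_jobs` and `kill_jobs`, each job gets a one-entry payload keyed by the current UTC time, and `set_job_status` then strips the unset (`None`) fields via `model_dump()`. A self-contained sketch of that payload shape and filtering, with a dataclass standing in for the pydantic `JobStatusUpdate` model so the snippet runs without diracx installed:

```python
from dataclasses import asdict, dataclass
from datetime import datetime, timezone


@dataclass
class JobStatusUpdate:
    """Stand-in for the pydantic model diracx.core.models.JobStatusUpdate."""

    Status: str | None = None
    MinorStatus: str | None = None
    ApplicationStatus: str | None = None
    Source: str | None = None


status = {
    datetime.now(timezone.utc): JobStatusUpdate(
        Status="Killed", MinorStatus="Marked for termination", Source="job_manager"
    )
}

# Mirrors: {k: v for k, v in value.model_dump().items() if v is not None}
status_dict = {
    when: {k: v for k, v in asdict(update).items() if v is not None}
    for when, update in status.items()
}
print(status_dict)  # ApplicationStatus is dropped because it was left as None
```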
```diff
-async def remove_jobs(
-    job_ids: list[int],
-    config: Config,
-    job_db: JobDB,
-    job_logging_db: JobLoggingDB,
-    sandbox_metadata_db: SandboxMetadataDB,
-    task_queue_db: TaskQueueDB,
-    background_task: BackgroundTasks,
-):
-    """Fully remove a job from the WMS databases.
-    :raises: nothing.
-    """
-    # Remove the staging task from the StorageManager
-    # TODO: this was not done in the JobManagerHandler, but it was done in the kill method
-    # I think it should be done here too
-    # TODO: implement StorageManagerClient
-    # returnValueOrRaise(StorageManagerClient().killTasksBySourceTaskID([job_id]))
-
-    # TODO: this was also not done in the JobManagerHandler, but it was done in the JobCleaningAgent
-    # I think it should be done here as well
-    await sandbox_metadata_db.unassign_sandboxes_to_jobs(job_ids)
-
-    # Remove the job from TaskQueueDB
-    await _remove_jobs_from_task_queue(job_ids, config, task_queue_db, background_task)
-
-    # Remove the job from JobLoggingDB
-    await job_logging_db.delete_records(job_ids)
-
-    # Remove the job from JobDB
-    await job_db.delete_jobs(job_ids)
-
-
```
```diff
-async def _remove_jobs_from_task_queue(
-    job_ids: list[int],
-    config: Config,
-    task_queue_db: TaskQueueDB,
-    background_task: BackgroundTasks,
-):
-    """Remove the job from TaskQueueDB."""
-    tq_infos = await task_queue_db.get_tq_infos_for_jobs(job_ids)
-    await task_queue_db.remove_jobs(job_ids)
-    for tq_id, owner, owner_group, vo in tq_infos:
-        # TODO: move to Celery
-        background_task.add_task(
-            task_queue_db.delete_task_queue_if_empty,
-            tq_id,
-            owner,
-            owner_group,
-            config.Registry[vo].Groups[owner_group].JobShare,
-            config.Registry[vo].Groups[owner_group].Properties,
-            config.Operations[vo].Services.JobScheduling.EnableSharesCorrection,
-            config.Registry[vo].Groups[owner_group].AllowBackgroundTQs,
-        )
```
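`_remove_jobs_from_task_queue` hands `delete_task_queue_if_empty` to FastAPI's `BackgroundTasks`, which only records the call; FastAPI awaits the recorded tasks after the response has been sent. A minimal sketch of that mechanism, with a made-up coroutine and the task runner awaited by hand so the snippet runs standalone:

```python
import asyncio

from fastapi import BackgroundTasks  # re-exported from starlette


async def delete_task_queue_if_empty(tq_id: int) -> None:
    # Made-up stand-in for TaskQueueDB.delete_task_queue_if_empty
    print(f"checking whether task queue {tq_id} is empty")


async def main() -> None:
    background_task = BackgroundTasks()
    for tq_id in (11, 12):
        # add_task only records the call; nothing runs yet
        background_task.add_task(delete_task_queue_if_empty, tq_id)
    # Inside FastAPI this happens automatically once the response has gone out
    await background_task()


asyncio.run(main())
```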