diracx-db 0.0.1a27__py3-none-any.whl → 0.0.1a28__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
@@ -1,578 +0,0 @@
-from __future__ import annotations
-
-import asyncio
-from collections import defaultdict
-from copy import deepcopy
-from datetime import datetime, timezone
-from typing import Any
-from unittest.mock import MagicMock
-
-from fastapi import BackgroundTasks
-from pydantic import BaseModel
-
-from diracx.core.config.schema import Config
-from diracx.core.models import (
-    JobMinorStatus,
-    JobStatus,
-    JobStatusUpdate,
-    SetJobStatusReturn,
-    VectorSearchOperator,
-    VectorSearchSpec,
-)
-from diracx.db.sql.job_logging.db import JobLoggingRecord
-
-from .. import JobDB, JobLoggingDB, SandboxMetadataDB, TaskQueueDB
-
-
-class JobSubmissionSpec(BaseModel):
-    jdl: str
-    owner: str
-    owner_group: str
-    initial_status: str
-    initial_minor_status: str
-    vo: str
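For orientation, a minimal sketch of building a JobSubmissionSpec for the submission helper defined below; every field value here is an illustrative placeholder, not something taken from this diff.

```python
# Hypothetical values for illustration only.
spec = JobSubmissionSpec(
    jdl='[Executable = "echo"; Arguments = "hello world";]',
    owner="some_user",
    owner_group="some_group",
    initial_status="Received",
    initial_minor_status="Job accepted",
    vo="some_vo",
)
```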
-
-
-async def submit_jobs_jdl(jobs: list[JobSubmissionSpec], job_db: JobDB):
-    from DIRAC.Core.Utilities.ClassAd.ClassAdLight import ClassAd
-    from DIRAC.Core.Utilities.ReturnValues import returnValueOrRaise
-    from DIRAC.WorkloadManagementSystem.DB.JobDBUtils import (
-        checkAndAddOwner,
-        createJDLWithInitialStatus,
-    )
-
-    jobs_to_insert = {}
-    jdls_to_update = {}
-    inputdata_to_insert = {}
-    original_jdls = []
-
-    # generate the jobIDs first
-    # TODO: should ForgivingTaskGroup be used?
-    async with asyncio.TaskGroup() as tg:
-        for job in jobs:
-            original_jdl = deepcopy(job.jdl)
-            job_manifest = returnValueOrRaise(
-                checkAndAddOwner(original_jdl, job.owner, job.owner_group)
-            )
-
-            # Fix possible lack of brackets
-            if original_jdl.strip()[0] != "[":
-                original_jdl = f"[{original_jdl}]"
-
-            original_jdls.append(
-                (
-                    original_jdl,
-                    job_manifest,
-                    tg.create_task(job_db.create_job(original_jdl)),
-                )
-            )
-
-    async with asyncio.TaskGroup() as tg:
-        for job, (original_jdl, job_manifest_, job_id_task) in zip(jobs, original_jdls):
-            job_id = job_id_task.result()
-            job_attrs = {
-                "JobID": job_id,
-                "LastUpdateTime": datetime.now(tz=timezone.utc),
-                "SubmissionTime": datetime.now(tz=timezone.utc),
-                "Owner": job.owner,
-                "OwnerGroup": job.owner_group,
-                "VO": job.vo,
-            }
-
-            job_manifest_.setOption("JobID", job_id)
-
-            # 2.- Check JDL and Prepare DIRAC JDL
-            job_jdl = job_manifest_.dumpAsJDL()
-
-            # Replace the JobID placeholder if any
-            if job_jdl.find("%j") != -1:
-                job_jdl = job_jdl.replace("%j", str(job_id))
-
-            class_ad_job = ClassAd(job_jdl)
-
-            class_ad_req = ClassAd("[]")
-            if not class_ad_job.isOK():
-                # Rollback the entire transaction
-                raise ValueError(f"Error in JDL syntax for job JDL: {original_jdl}")
-            # TODO: check if that is actually true
-            if class_ad_job.lookupAttribute("Parameters"):
-                raise NotImplementedError("Parameters in the JDL are not supported")
-
-            # TODO is this even needed?
-            class_ad_job.insertAttributeInt("JobID", job_id)
-
-            await job_db.check_and_prepare_job(
-                job_id,
-                class_ad_job,
-                class_ad_req,
-                job.owner,
-                job.owner_group,
-                job_attrs,
-                job.vo,
-            )
-            job_jdl = createJDLWithInitialStatus(
-                class_ad_job,
-                class_ad_req,
-                job_db.jdl_2_db_parameters,
-                job_attrs,
-                job.initial_status,
-                job.initial_minor_status,
-                modern=True,
-            )
-
-            jobs_to_insert[job_id] = job_attrs
-            jdls_to_update[job_id] = job_jdl
-
-            if class_ad_job.lookupAttribute("InputData"):
-                input_data = class_ad_job.getListFromExpression("InputData")
-                inputdata_to_insert[job_id] = [lfn for lfn in input_data if lfn]
-
-        tg.create_task(job_db.update_job_jdls(jdls_to_update))
-        tg.create_task(job_db.insert_job_attributes(jobs_to_insert))
-
-        if inputdata_to_insert:
-            tg.create_task(job_db.insert_input_data(inputdata_to_insert))
-
-    return list(jobs_to_insert.keys())
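A hedged sketch of driving submit_jobs_jdl from an async caller, assuming job_db is an already-opened JobDB from the surrounding application:

```python
# Sketch under assumptions: `job_db` is an open JobDB and `specs` is a
# list[JobSubmissionSpec] as illustrated above.
async def submit_example(job_db: JobDB, specs: list[JobSubmissionSpec]) -> list[int]:
    # Returns the newly created job IDs, in insertion order.
    return await submit_jobs_jdl(specs, job_db)
```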
-
-
-async def reschedule_jobs_bulk(
-    job_ids: list[int],
-    config: Config,
-    job_db: JobDB,
-    job_logging_db: JobLoggingDB,
-    task_queue_db: TaskQueueDB,
-    background_task: BackgroundTasks,
-    *,
-    reset_counter=False,
-) -> dict[str, Any]:
-    """Reschedule the given jobs."""
-    from DIRAC.Core.Utilities.ClassAd.ClassAdLight import ClassAd
-    from DIRAC.Core.Utilities.ReturnValues import SErrorException
-
-    failed = {}
-    reschedule_max = config.Operations[
-        "Defaults"
-    ].Services.JobScheduling.MaxRescheduling  # type: ignore
-
-    status_changes = {}
-    attribute_changes: dict[int, dict[str, str]] = defaultdict(dict)
-    jdl_changes = {}
-
-    _, results = await job_db.search(
-        parameters=[
-            "Status",
-            "MinorStatus",
-            "VerifiedFlag",
-            "RescheduleCounter",
-            "Owner",
-            "OwnerGroup",
-            "JobID",
-        ],
-        search=[
-            VectorSearchSpec(
-                parameter="JobID", operator=VectorSearchOperator.IN, values=job_ids
-            )
-        ],
-        sorts=[],
-    )
-    if not results:
-        for job_id in job_ids:
-            failed[job_id] = {"detail": "Not found"}
-
-    jobs_to_resched = {}
-
-    for job_attrs in results or []:
-        job_id = int(job_attrs["JobID"])
-
-        if "VerifiedFlag" not in job_attrs:
-            failed[job_id] = {"detail": "Not found: No verified flag"}
-            # Noop
-            continue
-
-        if not job_attrs["VerifiedFlag"]:
-            failed[job_id] = {
-                "detail": (
-                    f"VerifiedFlag is False: Status {job_attrs['Status']}, "
-                    f"Minor Status: {job_attrs['MinorStatus']}"
-                )
-            }
-            # Noop
-            continue
-
-        if reset_counter:
-            job_attrs["RescheduleCounter"] = 0
-        else:
-            job_attrs["RescheduleCounter"] = int(job_attrs["RescheduleCounter"]) + 1
-
-        if job_attrs["RescheduleCounter"] > reschedule_max:
-            status_changes[job_id] = {
-                datetime.now(tz=timezone.utc): JobStatusUpdate(
-                    Status=JobStatus.FAILED,
-                    MinorStatus=JobMinorStatus.MAX_RESCHEDULING,
-                    ApplicationStatus="Unknown",
-                )
-            }
-            failed[job_id] = {
-                "detail": f"Maximum number of reschedules exceeded ({reschedule_max})"
-            }
-            # DATABASE OPERATION (status change)
-            continue
-        jobs_to_resched[job_id] = job_attrs
-
-    surviving_job_ids = set(jobs_to_resched.keys())
-
-    # TODO: get the job parameters from JobMonitoringClient
-    # result = JobMonitoringClient().getJobParameters(jobID)
-    # if result["OK"]:
-    #     parDict = result["Value"]
-    #     for key, value in parDict.get(jobID, {}).items():
-    #         result = self.setAtticJobParameter(jobID, key, value, rescheduleCounter - 1)
-    #         if not result["OK"]:
-    #             break
-
-    # TODO: IF we keep JobParameters and OptimizerParameters: Delete job in those tables.
-    # await self.delete_job_parameters(job_id)
-    # await self.delete_job_optimizer_parameters(job_id)
-
-    def parse_jdl(job_id, job_jdl):
-        if not job_jdl.strip().startswith("["):
-            job_jdl = f"[{job_jdl}]"
-        class_ad_job = ClassAd(job_jdl)
-        class_ad_job.insertAttributeInt("JobID", job_id)
-        return class_ad_job
-
-    job_jdls = {
-        jobid: parse_jdl(jobid, jdl)
-        for jobid, jdl in (
-            (await job_db.get_job_jdls(surviving_job_ids, original=True)).items()
-        )
-    }
-
-    for job_id in surviving_job_ids:
-        class_ad_job = job_jdls[job_id]
-        class_ad_req = ClassAd("[]")
-        try:
-            await job_db.check_and_prepare_job(
-                job_id,
-                class_ad_job,
-                class_ad_req,
-                jobs_to_resched[job_id]["Owner"],
-                jobs_to_resched[job_id]["OwnerGroup"],
-                {"RescheduleCounter": jobs_to_resched[job_id]["RescheduleCounter"]},
-                class_ad_job.getAttributeString("VirtualOrganization"),
-            )
-        except SErrorException as e:
-            failed[job_id] = {"detail": str(e)}
-            # surviving_job_ids.remove(job_id)
-            continue
-
-        priority = class_ad_job.getAttributeInt("Priority")
-        if priority is None:
-            priority = 0
-
-        site_list = class_ad_job.getListFromExpression("Site")
-        if not site_list:
-            site = "ANY"
-        elif len(site_list) > 1:
-            site = "Multiple"
-        else:
-            site = site_list[0]
-
-        req_jdl = class_ad_req.asJDL()
-        class_ad_job.insertAttributeInt("JobRequirements", req_jdl)
-        job_jdl = class_ad_job.asJDL()
-        # Replace the JobID placeholder if any
-        job_jdl = job_jdl.replace("%j", str(job_id))
-
-        additional_attrs = {
-            "Site": site,
-            "UserPriority": priority,
-            "RescheduleTime": datetime.now(tz=timezone.utc),
-            "RescheduleCounter": jobs_to_resched[job_id]["RescheduleCounter"],
-        }
-
-        # set new JDL
-        jdl_changes[job_id] = job_jdl
-
-        # set new status
-        status_changes[job_id] = {
-            datetime.now(tz=timezone.utc): JobStatusUpdate(
-                Status=JobStatus.RECEIVED,
-                MinorStatus=JobMinorStatus.RESCHEDULED,
-                ApplicationStatus="Unknown",
-            )
-        }
-        # set new attributes
-        attribute_changes[job_id].update(additional_attrs)
-
-    if surviving_job_ids:
-        # BULK STATUS UPDATE
-        # DATABASE OPERATION
-        set_job_status_result = await set_job_status_bulk(
-            status_changes,
-            config,
-            job_db,
-            job_logging_db,
-            task_queue_db,
-            background_task,
-            additional_attributes=attribute_changes,
-        )
-
-        # BULK JDL UPDATE
-        # DATABASE OPERATION
-        await job_db.set_job_jdl_bulk(jdl_changes)
-
-        return {
-            "failed": failed,
-            "success": {
-                job_id: {
-                    "InputData": job_jdls.get(job_id, None),
-                    **attribute_changes[job_id],
-                    **set_status_result.model_dump(),
-                }
-                for job_id, set_status_result in set_job_status_result.success.items()
-                if job_id not in failed
-            },
-        }
-
-    return {
-        "success": [],
-        "failed": failed,
-    }
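To make the dual return shape above easier to scan, a sketch of a caller inspecting the result; the job IDs are placeholders and all handles are assumed to come from the surrounding application:

```python
# Sketch only: job IDs are placeholders; config and the DB/background handles
# are assumed to be provided by the caller.
async def reschedule_example(
    config: Config,
    job_db: JobDB,
    job_logging_db: JobLoggingDB,
    task_queue_db: TaskQueueDB,
    background_task: BackgroundTasks,
) -> None:
    result = await reschedule_jobs_bulk(
        [101, 102], config, job_db, job_logging_db, task_queue_db, background_task
    )
    # "failed" maps job ID -> {"detail": ...}; "success" maps job ID -> the new
    # attributes merged with the corresponding status-update result.
    for job_id, info in result["failed"].items():
        print(f"job {job_id} not rescheduled: {info['detail']}")
```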
-
-
-async def set_job_status_bulk(
-    status_changes: dict[int, dict[datetime, JobStatusUpdate]],
-    config: Config,
-    job_db: JobDB,
-    job_logging_db: JobLoggingDB,
-    task_queue_db: TaskQueueDB,
-    background_task: BackgroundTasks,
-    *,
-    force: bool = False,
-    additional_attributes: dict[int, dict[str, str]] = {},
-) -> SetJobStatusReturn:
-    """Set various status fields for the jobs specified by their job IDs.
-    Set only the last status in the JobDB, updating all the status
-    logging information in the JobLoggingDB. The status dict has datetime
-    as a key and status information dictionary as values.
-
-    :raises: JobNotFound if a job is not found in one of the DBs
-    """
-    from DIRAC.Core.Utilities import TimeUtilities
-    from DIRAC.Core.Utilities.ReturnValues import returnValueOrRaise
-    from DIRAC.WorkloadManagementSystem.Utilities.JobStatusUtility import (
-        getNewStatus,
-        getStartAndEndTime,
-    )
-
-    failed: dict[int, Any] = {}
-    deletable_killable_jobs = set()
-    job_attribute_updates: dict[int, dict[str, str]] = {}
-    job_logging_updates: list[JobLoggingRecord] = []
-    status_dicts: dict[int, dict[datetime, dict[str, str]]] = defaultdict(dict)
-
-    # transform JobStatusUpdate objects into dicts
-    status_dicts = {
-        job_id: {
-            key: {k: v for k, v in value.model_dump().items() if v is not None}
-            for key, value in status.items()
-        }
-        for job_id, status in status_changes.items()
-    }
-
-    # search all jobs at once
-    _, results = await job_db.search(
-        parameters=["Status", "StartExecTime", "EndExecTime", "JobID"],
-        search=[
-            {
-                "parameter": "JobID",
-                "operator": VectorSearchOperator.IN,
-                "values": list(set(status_changes.keys())),
-            }
-        ],
-        sorts=[],
-    )
-    if not results:
-        return SetJobStatusReturn(
-            success={},
-            failed={
-                int(job_id): {"detail": "Not found"} for job_id in status_changes.keys()
-            },
-        )
-
-    found_jobs = set(int(res["JobID"]) for res in results)
-    failed.update(
-        {
-            int(nf_job_id): {"detail": "Not found"}
-            for nf_job_id in set(status_changes.keys()) - found_jobs
-        }
-    )
-    # Get the latest time stamps of major status updates
-    wms_time_stamps = await job_logging_db.get_wms_time_stamps_bulk(found_jobs)
-
-    for res in results:
-        job_id = int(res["JobID"])
-        current_status = res["Status"]
-        start_time = res["StartExecTime"]
-        end_time = res["EndExecTime"]
-
-        # If the current status is Stalled and we get an update, it should probably be "Running"
-        if current_status == JobStatus.STALLED:
-            current_status = JobStatus.RUNNING
-
-        #####################################################################################################
-        status_dict = status_dicts[job_id]
-        # This is more precise than "LastTime". time_stamps is a sorted list of tuples...
-        time_stamps = sorted((float(t), s) for s, t in wms_time_stamps[job_id].items())
-        last_time = TimeUtilities.fromEpoch(time_stamps[-1][0]).replace(
-            tzinfo=timezone.utc
-        )
-
-        # Get chronological order of new updates
-        update_times = sorted(status_dict)
-
-        new_start_time, new_end_time = getStartAndEndTime(
-            start_time, end_time, update_times, time_stamps, status_dict
-        )
-
-        job_data: dict[str, str] = {}
-        new_status: str | None = None
-        if update_times[-1] >= last_time:
-            new_status, new_minor, new_application = (
-                returnValueOrRaise(  # TODO: Catch this
-                    getNewStatus(
-                        job_id,
-                        update_times,
-                        last_time,
-                        status_dict,
-                        current_status,
-                        force,
-                        MagicMock(),  # FIXME
-                    )
-                )
-            )
-
-            if new_status:
-                job_data.update(additional_attributes.get(job_id, {}))
-                job_data["Status"] = new_status
-                job_data["LastUpdateTime"] = str(datetime.now(timezone.utc))
-            if new_minor:
-                job_data["MinorStatus"] = new_minor
-            if new_application:
-                job_data["ApplicationStatus"] = new_application
-
-            # TODO: implement elasticJobParametersDB ?
-            # if cls.elasticJobParametersDB:
-            #     result = cls.elasticJobParametersDB.setJobParameter(int(jobID), "Status", status)
-            #     if not result["OK"]:
-            #         return result
-
-        for upd_time in update_times:
-            if status_dict[upd_time]["Source"].startswith("Job"):
-                job_data["HeartBeatTime"] = str(upd_time)
-
-        if not start_time and new_start_time:
-            job_data["StartExecTime"] = new_start_time
-
-        if not end_time and new_end_time:
-            job_data["EndExecTime"] = new_end_time
-
-        #####################################################################################################
-        # delete or kill job, if we transition to DELETED or KILLED state
-        if new_status in [JobStatus.DELETED, JobStatus.KILLED]:
-            deletable_killable_jobs.add(job_id)
-
-        # Update database tables
-        if job_data:
-            job_attribute_updates[job_id] = job_data
-
-        for upd_time in update_times:
-            s_dict = status_dict[upd_time]
-            job_logging_updates.append(
-                JobLoggingRecord(
-                    job_id=job_id,
-                    status=s_dict.get("Status", "idem"),
-                    minor_status=s_dict.get("MinorStatus", "idem"),
-                    application_status=s_dict.get("ApplicationStatus", "idem"),
-                    date=upd_time,
-                    source=s_dict.get("Source", "Unknown"),
-                )
-            )
-
-    await job_db.set_job_attributes_bulk(job_attribute_updates)
-
-    await remove_jobs_from_task_queue(
-        list(deletable_killable_jobs), config, task_queue_db, background_task
-    )
-
-    # TODO: implement StorageManagerClient
-    # returnValueOrRaise(StorageManagerClient().killTasksBySourceTaskID(job_ids))
-
-    if deletable_killable_jobs:
-        await job_db.set_job_command_bulk(
-            [(job_id, "Kill", "") for job_id in deletable_killable_jobs]
-        )
-
-    await job_logging_db.bulk_insert_record(job_logging_updates)
-
-    return SetJobStatusReturn(
-        success=job_attribute_updates,
-        failed=failed,
-    )
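The status_changes argument maps each job ID to updates keyed by timestamp. A minimal sketch of that shape, reusing the same enum values this module uses for rescheduling; the job ID is a placeholder:

```python
# Illustrative input shape for set_job_status_bulk; 12345 is a placeholder
# job ID. Relies on the datetime/model imports at the top of this module.
status_changes = {
    12345: {
        datetime.now(tz=timezone.utc): JobStatusUpdate(
            Status=JobStatus.RECEIVED,
            MinorStatus=JobMinorStatus.RESCHEDULED,
            ApplicationStatus="Unknown",
        )
    }
}
```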
-
-
-async def remove_jobs(
-    job_ids: list[int],
-    config: Config,
-    job_db: JobDB,
-    job_logging_db: JobLoggingDB,
-    sandbox_metadata_db: SandboxMetadataDB,
-    task_queue_db: TaskQueueDB,
-    background_task: BackgroundTasks,
-):
-    """Fully remove jobs from the WMS databases.
-    :raises: nothing.
-    """
-    # Remove the staging task from the StorageManager
-    # TODO: this was not done in the JobManagerHandler, but it was done in the kill method
-    # I think it should be done here too
-    # TODO: implement StorageManagerClient
-    # returnValueOrRaise(StorageManagerClient().killTasksBySourceTaskID([job_id]))
-
-    # TODO: this was also not done in the JobManagerHandler, but it was done in the JobCleaningAgent
-    # I think it should be done here as well
-    await sandbox_metadata_db.unassign_sandboxes_to_jobs(job_ids)
-
-    # Remove the jobs from TaskQueueDB
-    await remove_jobs_from_task_queue(job_ids, config, task_queue_db, background_task)
-
-    # Remove the jobs from JobLoggingDB
-    await job_logging_db.delete_records(job_ids)
-
-    # Remove the jobs from JobDB
-    await job_db.delete_jobs(job_ids)
-
-
-async def remove_jobs_from_task_queue(
-    job_ids: list[int],
-    config: Config,
-    task_queue_db: TaskQueueDB,
-    background_task: BackgroundTasks,
-):
-    """Remove the given jobs from the TaskQueueDB."""
-    tq_infos = await task_queue_db.get_tq_infos_for_jobs(job_ids)
-    await task_queue_db.remove_jobs(job_ids)
-    for tq_id, owner, owner_group, vo in tq_infos:
-        # TODO: move to Celery
-        background_task.add_task(
-            task_queue_db.delete_task_queue_if_empty,
-            tq_id,
-            owner,
-            owner_group,
-            config.Registry[vo].Groups[owner_group].JobShare,
-            config.Registry[vo].Groups[owner_group].Properties,
-            config.Operations[vo].Services.JobScheduling.EnableSharesCorrection,
-            config.Registry[vo].Groups[owner_group].AllowBackgroundTQs,
-        )
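Since the task-queue cleanup is deferred via FastAPI's BackgroundTasks, a sketch of how remove_jobs might be scheduled from a request handler; the route, and the config/DB names inside it, are assumptions rather than the actual diracx wiring:

```python
# Hypothetical wiring; the real diracx routers and dependency injection differ.
from fastapi import BackgroundTasks, FastAPI

app = FastAPI()


@app.delete("/jobs")
async def delete_jobs(job_ids: list[int], background_tasks: BackgroundTasks):
    # config, job_db, job_logging_db, sandbox_metadata_db and task_queue_db are
    # assumed to be provided by the application (e.g. via Depends), not shown here.
    await remove_jobs(
        job_ids,
        config,
        job_db,
        job_logging_db,
        sandbox_metadata_db,
        task_queue_db,
        background_tasks,
    )
    return {"removed": job_ids}
```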