DIRAC 9.0.0a64__py3-none-any.whl → 9.0.0a67__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- DIRAC/ConfigurationSystem/Client/LocalConfiguration.py +11 -8
- DIRAC/ConfigurationSystem/Client/VOMS2CSSynchronizer.py +1 -1
- DIRAC/Core/Security/IAMService.py +4 -3
- DIRAC/Core/Utilities/ClassAd/ClassAdLight.py +4 -290
- DIRAC/Core/Utilities/DErrno.py +5 -309
- DIRAC/Core/Utilities/JDL.py +1 -195
- DIRAC/Core/Utilities/List.py +1 -127
- DIRAC/Core/Utilities/ReturnValues.py +7 -252
- DIRAC/Core/Utilities/StateMachine.py +12 -178
- DIRAC/Core/Utilities/TimeUtilities.py +10 -253
- DIRAC/Core/Utilities/test/Test_JDL.py +0 -3
- DIRAC/Core/scripts/dirac_agent.py +1 -1
- DIRAC/DataManagementSystem/DB/FTS3DB.py +3 -0
- DIRAC/RequestManagementSystem/DB/test/RMSTestScenari.py +2 -0
- DIRAC/Resources/Catalog/RucioFileCatalogClient.py +1 -1
- DIRAC/Resources/Computing/test/Test_PoolComputingElement.py +2 -1
- DIRAC/TransformationSystem/Agent/TransformationCleaningAgent.py +1 -1
- DIRAC/Workflow/Modules/test/Test_Modules.py +5 -0
- DIRAC/WorkloadManagementSystem/Agent/JobCleaningAgent.py +1 -1
- DIRAC/WorkloadManagementSystem/Agent/StalledJobAgent.py +1 -1
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_JobAgent.py +2 -0
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_PushJobAgent.py +1 -0
- DIRAC/WorkloadManagementSystem/Client/JobState/JobManifest.py +32 -261
- DIRAC/WorkloadManagementSystem/Client/JobStatus.py +8 -93
- DIRAC/WorkloadManagementSystem/DB/JobDBUtils.py +18 -147
- DIRAC/WorkloadManagementSystem/DB/StatusUtils.py +125 -0
- DIRAC/WorkloadManagementSystem/DB/tests/Test_StatusUtils.py +28 -0
- DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapper.py +4 -2
- DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapper.py +21 -5
- DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapperTemplate.py +4 -0
- DIRAC/WorkloadManagementSystem/Service/JobManagerHandler.py +1 -1
- DIRAC/WorkloadManagementSystem/Utilities/JobModel.py +28 -199
- DIRAC/WorkloadManagementSystem/Utilities/JobStatusUtility.py +1 -63
- DIRAC/WorkloadManagementSystem/Utilities/ParametricJob.py +7 -171
- DIRAC/WorkloadManagementSystem/Utilities/jobAdministration.py +0 -123
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_JobModel.py +1 -5
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_ParametricJob.py +45 -128
- DIRAC/__init__.py +55 -54
- {dirac-9.0.0a64.dist-info → dirac-9.0.0a67.dist-info}/METADATA +2 -1
- {dirac-9.0.0a64.dist-info → dirac-9.0.0a67.dist-info}/RECORD +44 -45
- DIRAC/Core/Utilities/test/Test_List.py +0 -150
- DIRAC/Core/Utilities/test/Test_Time.py +0 -88
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_JobAdministration.py +0 -28
- {dirac-9.0.0a64.dist-info → dirac-9.0.0a67.dist-info}/WHEEL +0 -0
- {dirac-9.0.0a64.dist-info → dirac-9.0.0a67.dist-info}/entry_points.txt +0 -0
- {dirac-9.0.0a64.dist-info → dirac-9.0.0a67.dist-info}/licenses/LICENSE +0 -0
- {dirac-9.0.0a64.dist-info → dirac-9.0.0a67.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
from DIRAC import S_ERROR, S_OK, gLogger
|
|
2
|
+
from DIRAC.StorageManagementSystem.DB.StorageManagementDB import StorageManagementDB
|
|
3
|
+
from DIRAC.WorkloadManagementSystem.Client import JobStatus
|
|
4
|
+
from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
|
|
5
|
+
from DIRAC.WorkloadManagementSystem.DB.PilotAgentsDB import PilotAgentsDB
|
|
6
|
+
from DIRAC.WorkloadManagementSystem.DB.TaskQueueDB import TaskQueueDB
|
|
7
|
+
from DIRAC.WorkloadManagementSystem.Service.JobPolicy import RIGHT_DELETE, RIGHT_KILL
|
|
8
|
+
from DIRAC.WorkloadManagementSystem.Utilities.jobAdministration import _filterJobStateTransition
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _deleteJob(jobID, force=False):
    """Set the job status to "Deleted" and clean up the pilots that no longer have jobs.

    After the status change the job is removed from the TaskQueue (best effort:
    a failure there is only logged). Then, for every pilot returned by
    ``getPilotsForJobID``, the pilot is deleted from the PilotAgentsDB when
    ``getJobsForPilot`` reports no jobs for it.

    :param int jobID: job ID
    :param bool force: forwarded unchanged to ``JobDB.setJobStatus``
    :return: S_OK()/S_ERROR()
    """
    if not (result := JobDB().setJobStatus(jobID, JobStatus.DELETED, "Checking accounting", force=force))["OK"]:
        gLogger.warn("Failed to set job Deleted status", result["Message"])
        return result

    # Not fatal: the job is already marked Deleted, so only report the problem
    if not (result := TaskQueueDB().deleteJob(jobID))["OK"]:
        gLogger.warn("Failed to delete job from the TaskQueue", result["Message"])

    # One DB object is enough for all the pilot queries below
    pilotDB = PilotAgentsDB()

    # if it was the last job for the pilot
    pilotsResult = pilotDB.getPilotsForJobID(jobID)
    if not pilotsResult["OK"]:
        gLogger.error("Failed to get Pilots for JobID", pilotsResult["Message"])
        return pilotsResult

    for pilot in pilotsResult["Value"]:
        jobsResult = pilotDB.getJobsForPilot(pilot)
        if not jobsResult["OK"]:
            gLogger.error("Failed to get jobs for pilot", jobsResult["Message"])
            return jobsResult
        if jobsResult["Value"]:
            # The pilot still has jobs attached: keep it
            continue

        # if list of jobs for pilot is empty, delete pilot
        infoResult = pilotDB.getPilotInfo(pilotID=pilot)
        if not infoResult["OK"]:
            gLogger.error("Failed to get pilot info", infoResult["Message"])
            return infoResult
        deleteResult = pilotDB.deletePilot(infoResult["Value"]["PilotJobReference"])
        if not deleteResult["OK"]:
            gLogger.error("Failed to delete pilot from PilotAgentsDB", deleteResult["Message"])
            return deleteResult

    return S_OK()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _killJob(jobID, sendKillCommand=True, force=False):
    """Mark a single job as Killed.

    Only a failure to register the "Kill" command aborts the operation; a
    failure to set the Killed status or to remove the job from the TaskQueue
    is logged as a warning and the function still returns S_OK().

    :param int jobID: job ID
    :param bool sendKillCommand: when True, first register a "Kill" command for the job in the JobDB
    :param bool force: forwarded unchanged to ``JobDB.setJobStatus``

    :return: S_OK()/S_ERROR()
    """
    if sendKillCommand:
        commandResult = JobDB().setJobCommand(jobID, "Kill")
        if not commandResult["OK"]:
            gLogger.warn("Failed to set job Kill command", commandResult["Message"])
            return commandResult

    gLogger.info("Job marked for termination", jobID)

    statusResult = JobDB().setJobStatus(jobID, JobStatus.KILLED, "Marked for termination", force=force)
    if not statusResult["OK"]:
        gLogger.warn("Failed to set job Killed status", statusResult["Message"])

    queueResult = TaskQueueDB().deleteJob(jobID)
    if not queueResult["OK"]:
        gLogger.warn("Failed to delete job from the TaskQueue", queueResult["Message"])

    return S_OK()
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def kill_delete_jobs(right, validJobList, nonauthJobList=None, force=False):
    """Kill (== set the status to "KILLED") or delete (== set the status to "DELETED") jobs as necessary

    The current status of the jobs in ``validJobList`` is read from the JobDB.
    Jobs allowed to transition to Killed are killed; when ``right`` is
    RIGHT_DELETE, jobs allowed to transition to Deleted are deleted as well.
    Jobs found in the Staging status additionally get their stager tasks killed
    (a failure there is only logged).

    :param str right: RIGHT_KILL or RIGHT_DELETE
    :param validJobList: job IDs to process; nothing is queried when it is empty
    :param list nonauthJobList: job IDs that must not be touched; when non-empty the call
        returns S_ERROR carrying them under "NonauthorizedJobIDs"
    :param bool force: forwarded unchanged to the per-job kill/delete helpers

    :return: S_OK(list of the job IDs selected for the requested operation) or
        S_ERROR(), optionally with "NonauthorizedJobIDs" and "FailedJobIDs" entries
    """
    # None instead of a shared mutable default
    nonauthJobList = [] if nonauthJobList is None else nonauthJobList

    badIDs = []
    killJobList = []
    deleteJobList = []
    # Stays empty when there is nothing to process, so the steps below are no-ops
    jobStates = {}

    if validJobList:
        # The status lookup must be done on the jobs we were asked to process
        result = JobDB().getJobsAttributes(validJobList, ["Status"])
        if not result["OK"]:
            return result
        jobStates = result["Value"]

        # Get the jobs allowed to transition to the Killed state
        killJobList.extend(_filterJobStateTransition(jobStates, JobStatus.KILLED))

        if right == RIGHT_DELETE:
            # Get the jobs allowed to transition to the Deleted state
            deleteJobList.extend(_filterJobStateTransition(jobStates, JobStatus.DELETED))

    for jobID in killJobList:
        result = _killJob(jobID, force=force)
        if not result["OK"]:
            badIDs.append(jobID)

    for jobID in deleteJobList:
        result = _deleteJob(jobID, force=force)
        if not result["OK"]:
            badIDs.append(jobID)

    # Look for jobs that are in the Staging state to send kill signal to the stager
    stagingJobList = [jobID for jobID, sDict in jobStates.items() if sDict["Status"] == JobStatus.STAGING]

    if stagingJobList:
        stagerDB = StorageManagementDB()
        gLogger.info("Going to send killing signal to stager as well!")
        result = stagerDB.killTasksBySourceTaskID(stagingJobList)
        if not result["OK"]:
            gLogger.warn("Failed to kill some Stager tasks", result["Message"])

    if nonauthJobList or badIDs:
        result = S_ERROR("Some jobs failed deletion")
        if nonauthJobList:
            gLogger.warn("Non-authorized JobIDs won't be deleted", str(nonauthJobList))
            result["NonauthorizedJobIDs"] = nonauthJobList
        if badIDs:
            gLogger.warn("JobIDs failed to be deleted", str(badIDs))
            result["FailedJobIDs"] = badIDs
        return result

    jobsList = killJobList if right == RIGHT_KILL else deleteJobList
    return S_OK(jobsList)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
""" unit test (pytest) of StatusUtils module
|
|
2
|
+
"""
|
|
3
|
+
|
|
4
|
+
from unittest.mock import MagicMock
|
|
5
|
+
|
|
6
|
+
import pytest
|
|
7
|
+
|
|
8
|
+
# sut
|
|
9
|
+
from DIRAC.WorkloadManagementSystem.DB.StatusUtils import kill_delete_jobs
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@pytest.mark.parametrize(
    "jobIDs_list, right",
    [
        ([], "Kill"),
        ([], "Delete"),
        (1, "Kill"),
        ([1, 2], "Kill"),
    ],
)
def test___kill_delete_jobs(mocker, jobIDs_list, right):
    """kill_delete_jobs must return an OK result when every database class it uses is mocked."""
    # Replace each DB class referenced by the module under test with a MagicMock
    for dbClassName in ("JobDB", "TaskQueueDB", "PilotAgentsDB", "StorageManagementDB"):
        mocker.patch(f"DIRAC.WorkloadManagementSystem.DB.StatusUtils.{dbClassName}", MagicMock())

    outcome = kill_delete_jobs(right, jobIDs_list)

    assert outcome["OK"]
|
|
@@ -55,6 +55,8 @@ from DIRAC.WorkloadManagementSystem.Client.JobStateUpdateClient import JobStateU
|
|
|
55
55
|
from DIRAC.WorkloadManagementSystem.Client.SandboxStoreClient import SandboxStoreClient
|
|
56
56
|
from DIRAC.WorkloadManagementSystem.JobWrapper.Watchdog import Watchdog
|
|
57
57
|
|
|
58
|
+
CHILD_PID_POLL_INTERVALS = list(range(5, 40, 5))
|
|
59
|
+
|
|
58
60
|
|
|
59
61
|
class JobWrapper:
|
|
60
62
|
"""The only user of the JobWrapper is the JobWrapperTemplate"""
|
|
@@ -430,14 +432,14 @@ class JobWrapper:
|
|
|
430
432
|
)
|
|
431
433
|
exeThread.start()
|
|
432
434
|
payloadPID = None
|
|
433
|
-
for seconds in
|
|
435
|
+
for seconds in CHILD_PID_POLL_INTERVALS:
|
|
434
436
|
time.sleep(seconds)
|
|
435
437
|
payloadPID = spObject.getChildPID()
|
|
436
438
|
if payloadPID:
|
|
437
439
|
self.__setJobParam("PayloadPID", payloadPID)
|
|
438
440
|
break
|
|
439
441
|
if not payloadPID:
|
|
440
|
-
return S_ERROR("Payload process could not start after
|
|
442
|
+
return S_ERROR(f"Payload process could not start after {sum(CHILD_PID_POLL_INTERVALS)} seconds")
|
|
441
443
|
|
|
442
444
|
watchdog = Watchdog(
|
|
443
445
|
pid=self.currentPID,
|
|
@@ -344,24 +344,40 @@ def test_processQuickExecutionNoWatchdog(mocker):
|
|
|
344
344
|
|
|
345
345
|
|
|
346
346
|
@pytest.mark.slow
|
|
347
|
-
|
|
348
|
-
|
|
347
|
+
@pytest.mark.parametrize("expect_failure", [True, False])
|
|
348
|
+
def test_processSubprocessFailureNoPid(mocker, monkeypatch, expect_failure):
|
|
349
|
+
"""Test the process method of the JobWrapper class: the subprocess fails and no PID is returned.
|
|
350
|
+
|
|
351
|
+
expect_failure is used to ensure that the JobWrapper is functioning correctly even with the other patching
|
|
352
|
+
that is applied in the test (e.g. CHILD_PID_POLL_INTERVALS).
|
|
353
|
+
"""
|
|
349
354
|
# Test failure in starting the payload process
|
|
350
355
|
jw = JobWrapper()
|
|
351
356
|
jw.jobArgs = {}
|
|
352
357
|
|
|
353
358
|
mocker.patch.object(jw, "_JobWrapper__report")
|
|
354
359
|
mocker.patch.object(jw, "_JobWrapper__setJobParam")
|
|
360
|
+
monkeypatch.setattr(
|
|
361
|
+
"DIRAC.WorkloadManagementSystem.JobWrapper.JobWrapper.CHILD_PID_POLL_INTERVALS", [0.1, 0.2, 0.3, 0.4, 0.5]
|
|
362
|
+
)
|
|
363
|
+
|
|
355
364
|
mock_exeThread = mocker.Mock()
|
|
356
365
|
mock_exeThread.start.side_effect = lambda: time.sleep(0.1)
|
|
357
|
-
|
|
366
|
+
if expect_failure:
|
|
367
|
+
mocker.patch(
|
|
368
|
+
"DIRAC.WorkloadManagementSystem.JobWrapper.JobWrapper.ExecutionThread", return_value=mock_exeThread
|
|
369
|
+
)
|
|
358
370
|
|
|
359
371
|
with tempfile.NamedTemporaryFile(delete=True) as std_out, tempfile.NamedTemporaryFile(delete=True) as std_err:
|
|
360
372
|
jw.outputFile = std_out.name
|
|
361
373
|
jw.errorFile = std_err.name
|
|
362
374
|
result = jw.process(command="mock_command", env={})
|
|
363
|
-
|
|
364
|
-
|
|
375
|
+
|
|
376
|
+
if expect_failure:
|
|
377
|
+
assert not result["OK"]
|
|
378
|
+
assert "Payload process could not start after 1.5 seconds" in result["Message"]
|
|
379
|
+
else:
|
|
380
|
+
assert result["OK"]
|
|
365
381
|
|
|
366
382
|
|
|
367
383
|
# -------------------------------------------------------------------------------------------------
|
|
@@ -72,6 +72,7 @@ def extraOptions():
|
|
|
72
72
|
os.remove(extraOptions)
|
|
73
73
|
|
|
74
74
|
|
|
75
|
+
@pytest.mark.slow
|
|
75
76
|
def test_createAndExecuteJobWrapperTemplate_success(extraOptions):
|
|
76
77
|
"""Test the creation of a classical job wrapper and its execution:
|
|
77
78
|
There is an extra option cfg file to be passed to the job wrapper.
|
|
@@ -144,6 +145,7 @@ def test_createAndExecuteJobWrapperTemplate_success(extraOptions):
|
|
|
144
145
|
shutil.rmtree(os.path.join(os.getcwd(), "job"))
|
|
145
146
|
|
|
146
147
|
|
|
148
|
+
@pytest.mark.slow
|
|
147
149
|
def test_createAndExecuteJobWrapperTemplate_missingExtraOptions():
|
|
148
150
|
"""Test the creation of a classical job wrapper and its execution:
|
|
149
151
|
There is no extra options to be passed to the job wrapper.
|
|
@@ -205,6 +207,7 @@ def test_createAndExecuteJobWrapperTemplate_missingExtraOptions():
|
|
|
205
207
|
shutil.rmtree(os.path.join(os.getcwd(), "job"))
|
|
206
208
|
|
|
207
209
|
|
|
210
|
+
@pytest.mark.slow
|
|
208
211
|
def test_createAndExecuteRelocatedJobWrapperTemplate_success(extraOptions):
|
|
209
212
|
"""Test the creation of a relocated job wrapper and its execution:
|
|
210
213
|
This is generally used when containers are involved (SingularityCE).
|
|
@@ -325,6 +328,7 @@ def test_createAndExecuteRelocatedJobWrapperTemplate_success(extraOptions):
|
|
|
325
328
|
shutil.rmtree(wrapperPath)
|
|
326
329
|
|
|
327
330
|
|
|
331
|
+
@pytest.mark.slow
|
|
328
332
|
def test_createAndExecuteJobWrapperOfflineTemplate_success(extraOptions):
|
|
329
333
|
"""Test the creation of an offline job wrapper and its execution:
|
|
330
334
|
This is generally used when pre/post processing operations are executed locally,
|
|
@@ -30,7 +30,7 @@ from DIRAC.WorkloadManagementSystem.Service.JobPolicy import (
|
|
|
30
30
|
RIGHT_SUBMIT,
|
|
31
31
|
JobPolicy,
|
|
32
32
|
)
|
|
33
|
-
from DIRAC.WorkloadManagementSystem.
|
|
33
|
+
from DIRAC.WorkloadManagementSystem.DB.StatusUtils import kill_delete_jobs
|
|
34
34
|
from DIRAC.WorkloadManagementSystem.Utilities.JobModel import JobDescriptionModel
|
|
35
35
|
from DIRAC.WorkloadManagementSystem.Utilities.ParametricJob import generateParametricJobs, getParameterVectorLength
|
|
36
36
|
from DIRAC.WorkloadManagementSystem.Utilities.Utils import rescheduleJobs
|
|
@@ -1,209 +1,38 @@
|
|
|
1
|
-
|
|
1
|
+
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
from
|
|
6
|
-
from typing import Any, Annotated, TypeAlias, Self
|
|
7
|
-
|
|
8
|
-
from pydantic import BaseModel, BeforeValidator, model_validator, field_validator, ConfigDict
|
|
3
|
+
from typing import ClassVar
|
|
4
|
+
from pydantic import PrivateAttr
|
|
5
|
+
from DIRACCommon.WorkloadManagementSystem.Utilities.JobModel import * # noqa: F401, F403
|
|
9
6
|
|
|
10
7
|
from DIRAC import gLogger
|
|
11
|
-
from DIRAC.ConfigurationSystem.Client.Helpers.
|
|
12
|
-
from DIRAC.ConfigurationSystem.Client.Helpers.Resources import getDIRACPlatforms, getSites
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
# HACK: Convert appropriate iterables into sets
|
|
16
|
-
def default_set_validator(value):
|
|
17
|
-
if value is None:
|
|
18
|
-
return set()
|
|
19
|
-
elif not isinstance(value, Iterable):
|
|
20
|
-
return value
|
|
21
|
-
elif isinstance(value, (str, bytes, bytearray)):
|
|
22
|
-
return value
|
|
23
|
-
else:
|
|
24
|
-
return set(value)
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
CoercibleSetStr: TypeAlias = Annotated[set[str], BeforeValidator(default_set_validator)]
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
class BaseJobDescriptionModel(BaseModel):
|
|
31
|
-
"""Base model for the job description (not parametric)"""
|
|
32
|
-
|
|
33
|
-
model_config = ConfigDict(validate_assignment=True)
|
|
34
|
-
|
|
35
|
-
arguments: str = ""
|
|
36
|
-
bannedSites: CoercibleSetStr = set()
|
|
37
|
-
# TODO: This should use a field factory
|
|
38
|
-
cpuTime: int = Operations().getValue("JobDescription/DefaultCPUTime", 86400)
|
|
39
|
-
executable: str
|
|
40
|
-
executionEnvironment: dict = None
|
|
41
|
-
gridCE: str = ""
|
|
42
|
-
inputSandbox: CoercibleSetStr = set()
|
|
43
|
-
inputData: CoercibleSetStr = set()
|
|
44
|
-
inputDataPolicy: str = ""
|
|
45
|
-
jobConfigArgs: str = ""
|
|
46
|
-
jobGroup: str = ""
|
|
47
|
-
jobType: str = "User"
|
|
48
|
-
jobName: str = "Name"
|
|
49
|
-
# TODO: This should be an StrEnum
|
|
50
|
-
logLevel: str = "INFO"
|
|
51
|
-
# TODO: This can't be None with this type hint
|
|
52
|
-
maxNumberOfProcessors: int = None
|
|
53
|
-
minNumberOfProcessors: int = 1
|
|
54
|
-
outputData: CoercibleSetStr = set()
|
|
55
|
-
outputPath: str = ""
|
|
56
|
-
outputSandbox: CoercibleSetStr = set()
|
|
57
|
-
outputSE: str = ""
|
|
58
|
-
platform: str = ""
|
|
59
|
-
# TODO: This should use a field factory
|
|
60
|
-
priority: int = Operations().getValue("JobDescription/DefaultPriority", 1)
|
|
61
|
-
sites: CoercibleSetStr = set()
|
|
62
|
-
stderr: str = "std.err"
|
|
63
|
-
stdout: str = "std.out"
|
|
64
|
-
tags: CoercibleSetStr = set()
|
|
65
|
-
extraFields: dict[str, Any] = {}
|
|
66
|
-
|
|
67
|
-
@field_validator("cpuTime")
|
|
68
|
-
def checkCPUTimeBounds(cls, v):
|
|
69
|
-
minCPUTime = Operations().getValue("JobDescription/MinCPUTime", 100)
|
|
70
|
-
maxCPUTime = Operations().getValue("JobDescription/MaxCPUTime", 500000)
|
|
71
|
-
if not minCPUTime <= v <= maxCPUTime:
|
|
72
|
-
raise ValueError(f"cpuTime out of bounds (must be between {minCPUTime} and {maxCPUTime})")
|
|
73
|
-
return v
|
|
74
|
-
|
|
75
|
-
@field_validator("executable")
|
|
76
|
-
def checkExecutableIsNotAnEmptyString(cls, v: str):
|
|
77
|
-
if not v:
|
|
78
|
-
raise ValueError("executable must not be an empty string")
|
|
79
|
-
return v
|
|
80
|
-
|
|
81
|
-
@field_validator("jobType")
|
|
82
|
-
def checkJobTypeIsAllowed(cls, v: str):
|
|
83
|
-
jobTypes = Operations().getValue("JobDescription/AllowedJobTypes", ["User", "Test", "Hospital"])
|
|
84
|
-
transformationTypes = Operations().getValue("Transformations/DataProcessing", [])
|
|
85
|
-
allowedTypes = jobTypes + transformationTypes
|
|
86
|
-
if v not in allowedTypes:
|
|
87
|
-
raise ValueError(f"jobType '{v}' is not allowed for this kind of user (must be in {allowedTypes})")
|
|
88
|
-
return v
|
|
89
|
-
|
|
90
|
-
@field_validator("inputData")
|
|
91
|
-
def checkInputDataDoesntContainDoubleSlashes(cls, v):
|
|
92
|
-
if v:
|
|
93
|
-
for lfn in v:
|
|
94
|
-
if lfn.find("//") > -1:
|
|
95
|
-
raise ValueError("Input data contains //")
|
|
96
|
-
return v
|
|
97
|
-
|
|
98
|
-
@field_validator("inputData")
|
|
99
|
-
def addLFNPrefixIfStringStartsWithASlash(cls, v: set[str]):
|
|
100
|
-
if v:
|
|
101
|
-
v = {lfn.strip() for lfn in v if lfn.strip()}
|
|
102
|
-
v = {f"LFN:{lfn}" if lfn.startswith("/") else lfn for lfn in v}
|
|
103
|
-
|
|
104
|
-
for lfn in v:
|
|
105
|
-
if not lfn.startswith("LFN:/"):
|
|
106
|
-
raise ValueError("Input data files must start with LFN:/")
|
|
107
|
-
return v
|
|
108
|
-
|
|
109
|
-
@model_validator(mode="after")
|
|
110
|
-
def checkNumberOfInputDataFiles(self) -> Self:
|
|
111
|
-
if self.inputData:
|
|
112
|
-
maxInputDataFiles = Operations().getValue("JobDescription/MaxInputData", 500)
|
|
113
|
-
if self.jobType == "User" and len(self.inputData) >= maxInputDataFiles:
|
|
114
|
-
raise ValueError(f"inputData contains too many files (must contain at most {maxInputDataFiles})")
|
|
115
|
-
return self
|
|
116
|
-
|
|
117
|
-
@field_validator("inputSandbox")
|
|
118
|
-
def checkLFNSandboxesAreWellFormated(cls, v: set[str]):
|
|
119
|
-
for inputSandbox in v:
|
|
120
|
-
if inputSandbox.startswith("LFN:") and not inputSandbox.startswith("LFN:/"):
|
|
121
|
-
raise ValueError("LFN files must start by LFN:/")
|
|
122
|
-
return v
|
|
123
|
-
|
|
124
|
-
@field_validator("logLevel")
|
|
125
|
-
def checkLogLevelIsValid(cls, v: str):
|
|
126
|
-
v = v.upper()
|
|
127
|
-
possibleLogLevels = gLogger.getAllPossibleLevels()
|
|
128
|
-
if v not in possibleLogLevels:
|
|
129
|
-
raise ValueError(f"Log level {v} not in {possibleLogLevels}")
|
|
130
|
-
return v
|
|
131
|
-
|
|
132
|
-
@field_validator("minNumberOfProcessors")
|
|
133
|
-
def checkMinNumberOfProcessorsBounds(cls, v):
|
|
134
|
-
minNumberOfProcessors = Operations().getValue("JobDescription/MinNumberOfProcessors", 1)
|
|
135
|
-
maxNumberOfProcessors = Operations().getValue("JobDescription/MaxNumberOfProcessors", 1024)
|
|
136
|
-
if not minNumberOfProcessors <= v <= maxNumberOfProcessors:
|
|
137
|
-
raise ValueError(
|
|
138
|
-
f"minNumberOfProcessors out of bounds (must be between {minNumberOfProcessors} and {maxNumberOfProcessors})"
|
|
139
|
-
)
|
|
140
|
-
return v
|
|
141
|
-
|
|
142
|
-
@field_validator("maxNumberOfProcessors")
|
|
143
|
-
def checkMaxNumberOfProcessorsBounds(cls, v):
|
|
144
|
-
minNumberOfProcessors = Operations().getValue("JobDescription/MinNumberOfProcessors", 1)
|
|
145
|
-
maxNumberOfProcessors = Operations().getValue("JobDescription/MaxNumberOfProcessors", 1024)
|
|
146
|
-
if not minNumberOfProcessors <= v <= maxNumberOfProcessors:
|
|
147
|
-
raise ValueError(
|
|
148
|
-
f"minNumberOfProcessors out of bounds (must be between {minNumberOfProcessors} and {maxNumberOfProcessors})"
|
|
149
|
-
)
|
|
150
|
-
return v
|
|
151
|
-
|
|
152
|
-
@model_validator(mode="after")
|
|
153
|
-
def checkThatMaxNumberOfProcessorsIsGreaterThanMinNumberOfProcessors(self) -> Self:
|
|
154
|
-
if self.maxNumberOfProcessors:
|
|
155
|
-
if self.maxNumberOfProcessors < self.minNumberOfProcessors:
|
|
156
|
-
raise ValueError("maxNumberOfProcessors must be greater than minNumberOfProcessors")
|
|
157
|
-
return self
|
|
158
|
-
|
|
159
|
-
@model_validator(mode="after")
|
|
160
|
-
def addTagsDependingOnNumberOfProcessors(self) -> Self:
|
|
161
|
-
if self.minNumberOfProcessors == self.maxNumberOfProcessors:
|
|
162
|
-
self.tags.add(f"{self.minNumberOfProcessors}Processors")
|
|
163
|
-
if self.minNumberOfProcessors > 1:
|
|
164
|
-
self.tags.add("MultiProcessor")
|
|
165
|
-
return self
|
|
8
|
+
from DIRAC.ConfigurationSystem.Client.Helpers.Resources import getSites
|
|
166
9
|
|
|
167
|
-
@field_validator("sites")
|
|
168
|
-
def checkSites(cls, v: set[str]):
|
|
169
|
-
if v:
|
|
170
|
-
res = getSites()
|
|
171
|
-
if not res["OK"]:
|
|
172
|
-
raise ValueError(res["Message"])
|
|
173
|
-
invalidSites = v - set(res["Value"]).union({"ANY"})
|
|
174
|
-
if invalidSites:
|
|
175
|
-
raise ValueError(f"Invalid sites: {' '.join(invalidSites)}")
|
|
176
|
-
return v
|
|
177
10
|
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
if self.sites and self.bannedSites:
|
|
181
|
-
while self.bannedSites:
|
|
182
|
-
self.sites.discard(self.bannedSites.pop())
|
|
183
|
-
if not self.sites:
|
|
184
|
-
raise ValueError("sites and bannedSites are mutually exclusive")
|
|
185
|
-
return self
|
|
11
|
+
def _make_model_config(cls=None) -> BaseJobDescriptionModelConfg:
    """Gather the job description defaults and limits from the DIRAC configuration.

    Every value is read through ``Operations().getValue`` with the fallback shown
    below, except the log levels (taken from ``gLogger``) and ``sites``, which holds
    whatever ``getSites()`` returns, unmodified.

    :param cls: not used by this function
    :return: dictionary with the configuration values
    """
    # Imported here rather than at module level, as in the original code
    from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations

    operations = Operations()

    # The allowed job types are the configured ones plus the data processing transformation types
    jobTypes = operations.getValue("JobDescription/AllowedJobTypes", ["User", "Test", "Hospital"])
    jobTypes += operations.getValue("Transformations/DataProcessing", [])

    config = {}
    config["cpuTime"] = operations.getValue("JobDescription/DefaultCPUTime", 86400)
    config["priority"] = operations.getValue("JobDescription/DefaultPriority", 1)
    config["minCPUTime"] = operations.getValue("JobDescription/MinCPUTime", 100)
    config["maxCPUTime"] = operations.getValue("JobDescription/MaxCPUTime", 500000)
    config["allowedJobTypes"] = jobTypes
    config["maxInputDataFiles"] = operations.getValue("JobDescription/MaxInputData", 500)
    config["minNumberOfProcessors"] = operations.getValue("JobDescription/MinNumberOfProcessors", 1)
    config["maxNumberOfProcessors"] = operations.getValue("JobDescription/MaxNumberOfProcessors", 1024)
    config["minPriority"] = operations.getValue("JobDescription/MinPriority", 0)
    config["maxPriority"] = operations.getValue("JobDescription/MaxPriority", 10)
    config["possibleLogLevels"] = gLogger.getAllPossibleLevels()
    config["sites"] = getSites()
    return config
|
|
194
31
|
|
|
195
32
|
|
|
196
|
-
class
|
|
197
|
-
|
|
33
|
+
class BaseJobDescriptionModel(BaseJobDescriptionModel):  # noqa: F405 pylint: disable=function-redefined
    # Deliberately shadows the star-imported class of the same name: the only
    # difference is that `_config_builder` points at this module's
    # `_make_model_config`, which reads its values from the DIRAC configuration.
    # NOTE(review): presumably the parent class calls `_config_builder` to obtain
    # its defaults and limits - confirm against the DIRACCommon implementation.
    _config_builder: ClassVar = _make_model_config
|
|
198
35
|
|
|
199
|
-
owner: str
|
|
200
|
-
ownerGroup: str
|
|
201
|
-
vo: str
|
|
202
36
|
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
if self.inputData:
|
|
206
|
-
for lfn in self.inputData:
|
|
207
|
-
if not lfn.startswith(f"LFN:/{self.vo}/"):
|
|
208
|
-
raise ValueError(f"Input data not correctly specified (must start with LFN:/{self.vo}/)")
|
|
209
|
-
return self
|
|
37
|
+
class JobDescriptionModel(JobDescriptionModel):  # noqa: F405 pylint: disable=function-redefined
    # Deliberately shadows the star-imported class of the same name: the only
    # difference is that `_config_builder` points at this module's
    # `_make_model_config`, which reads its values from the DIRAC configuration.
    # NOTE(review): presumably the parent class calls `_config_builder` to obtain
    # its defaults and limits - confirm against the DIRACCommon implementation.
    _config_builder: ClassVar = _make_model_config
|
|
@@ -9,6 +9,7 @@ from DIRAC import S_ERROR, S_OK, gLogger
|
|
|
9
9
|
from DIRAC.Core.Utilities import TimeUtilities
|
|
10
10
|
from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader
|
|
11
11
|
from DIRAC.WorkloadManagementSystem.Client import JobStatus
|
|
12
|
+
from DIRACCommon.WorkloadManagementSystem.Utilities.JobStatusUtility import getStartAndEndTime, getNewStatus
|
|
12
13
|
|
|
13
14
|
if TYPE_CHECKING:
|
|
14
15
|
from DIRAC.WorkloadManagementSystem.DB.JobLoggingDB import JobLoggingDB
|
|
@@ -180,66 +181,3 @@ class JobStatusUtility:
|
|
|
180
181
|
return result
|
|
181
182
|
|
|
182
183
|
return S_OK((attrNames, attrValues))
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
def getStartAndEndTime(startTime, endTime, updateTimes, timeStamps, statusDict):
|
|
186
|
-
newStat = ""
|
|
187
|
-
firstUpdate = TimeUtilities.toEpoch(TimeUtilities.fromString(updateTimes[0]))
|
|
188
|
-
for ts, st in timeStamps:
|
|
189
|
-
if firstUpdate >= ts:
|
|
190
|
-
newStat = st
|
|
191
|
-
# Pick up start and end times from all updates
|
|
192
|
-
for updTime in updateTimes:
|
|
193
|
-
sDict = statusDict[updTime]
|
|
194
|
-
newStat = sDict.get("Status", newStat)
|
|
195
|
-
|
|
196
|
-
if not startTime and newStat == JobStatus.RUNNING:
|
|
197
|
-
# Pick up the start date when the job starts running if not existing
|
|
198
|
-
startTime = updTime
|
|
199
|
-
elif not endTime and newStat in JobStatus.JOB_FINAL_STATES:
|
|
200
|
-
# Pick up the end time when the job is in a final status
|
|
201
|
-
endTime = updTime
|
|
202
|
-
|
|
203
|
-
return startTime, endTime
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
def getNewStatus(
|
|
207
|
-
jobID: int,
|
|
208
|
-
updateTimes: list[datetime],
|
|
209
|
-
lastTime: datetime,
|
|
210
|
-
statusDict: dict[datetime, Any],
|
|
211
|
-
currentStatus,
|
|
212
|
-
force: bool,
|
|
213
|
-
log,
|
|
214
|
-
):
|
|
215
|
-
status = ""
|
|
216
|
-
minor = ""
|
|
217
|
-
application = ""
|
|
218
|
-
# Get the last status values looping on the most recent upupdateTimes in chronological order
|
|
219
|
-
for updTime in [dt for dt in updateTimes if dt >= lastTime]:
|
|
220
|
-
sDict = statusDict[updTime]
|
|
221
|
-
log.debug(f"\tTime {updTime} - Statuses {str(sDict)}")
|
|
222
|
-
status = sDict.get("Status", currentStatus)
|
|
223
|
-
# evaluate the state machine if the status is changing
|
|
224
|
-
if not force and status != currentStatus:
|
|
225
|
-
res = JobStatus.JobsStateMachine(currentStatus).getNextState(status)
|
|
226
|
-
if not res["OK"]:
|
|
227
|
-
return res
|
|
228
|
-
newStat = res["Value"]
|
|
229
|
-
# If the JobsStateMachine does not accept the candidate, don't update
|
|
230
|
-
if newStat != status:
|
|
231
|
-
# keeping the same status
|
|
232
|
-
log.error(
|
|
233
|
-
f"Job Status Error: {jobID} can't move from {currentStatus} to {status}: using {newStat}",
|
|
234
|
-
)
|
|
235
|
-
status = newStat
|
|
236
|
-
sDict["Status"] = newStat
|
|
237
|
-
# Change the source to indicate this is not what was requested
|
|
238
|
-
source = sDict.get("Source", "")
|
|
239
|
-
sDict["Source"] = source + "(SM)"
|
|
240
|
-
# at this stage status == newStat. Set currentStatus to this new status
|
|
241
|
-
currentStatus = newStat
|
|
242
|
-
|
|
243
|
-
minor = sDict.get("MinorStatus", minor)
|
|
244
|
-
application = sDict.get("ApplicationStatus", application)
|
|
245
|
-
return S_OK((status, minor, application))
|