DIRAC 9.0.0a68__py3-none-any.whl → 9.0.0a70__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- DIRAC/AccountingSystem/Client/Types/Network.py +8 -8
- DIRAC/AccountingSystem/Client/Types/PilotSubmission.py +3 -3
- DIRAC/ConfigurationSystem/Client/CSAPI.py +11 -1
- DIRAC/ConfigurationSystem/Client/Helpers/CSGlobals.py +0 -9
- DIRAC/ConfigurationSystem/Client/Helpers/Registry.py +3 -29
- DIRAC/ConfigurationSystem/Client/SyncPlugins/CERNLDAPSyncPlugin.py +4 -1
- DIRAC/ConfigurationSystem/ConfigTemplate.cfg +3 -0
- DIRAC/ConfigurationSystem/private/Modificator.py +11 -3
- DIRAC/ConfigurationSystem/private/RefresherBase.py +4 -2
- DIRAC/Core/DISET/ServiceReactor.py +11 -3
- DIRAC/Core/DISET/private/Transports/M2SSLTransport.py +9 -7
- DIRAC/Core/Security/DiracX.py +11 -6
- DIRAC/Core/Security/test/test_diracx_token_from_pem.py +161 -0
- DIRAC/Core/Tornado/Server/TornadoService.py +1 -1
- DIRAC/Core/Utilities/ElasticSearchDB.py +1 -2
- DIRAC/Core/Utilities/Subprocess.py +66 -57
- DIRAC/Core/Utilities/test/Test_Profiler.py +20 -20
- DIRAC/Core/Utilities/test/Test_Subprocess.py +58 -8
- DIRAC/Core/scripts/dirac_apptainer_exec.py +8 -8
- DIRAC/DataManagementSystem/Agent/FTS3Agent.py +8 -7
- DIRAC/DataManagementSystem/Client/DataManager.py +6 -7
- DIRAC/DataManagementSystem/Client/FTS3Job.py +125 -34
- DIRAC/DataManagementSystem/Client/test/Test_FTS3Objects.py +1 -0
- DIRAC/DataManagementSystem/Client/test/Test_scitag.py +69 -0
- DIRAC/DataManagementSystem/DB/FileCatalogComponents/DatasetManager/DatasetManager.py +1 -1
- DIRAC/DataManagementSystem/scripts/dirac_dms_create_moving_request.py +2 -0
- DIRAC/FrameworkSystem/DB/InstalledComponentsDB.py +3 -2
- DIRAC/FrameworkSystem/DB/ProxyDB.py +9 -5
- DIRAC/FrameworkSystem/Utilities/MonitoringUtilities.py +1 -0
- DIRAC/FrameworkSystem/Utilities/TokenManagementUtilities.py +3 -2
- DIRAC/FrameworkSystem/Utilities/diracx.py +41 -10
- DIRAC/FrameworkSystem/scripts/dirac_login.py +2 -2
- DIRAC/FrameworkSystem/scripts/dirac_proxy_init.py +1 -1
- DIRAC/FrameworkSystem/scripts/dirac_uninstall_component.py +1 -0
- DIRAC/Interfaces/API/Dirac.py +3 -6
- DIRAC/Interfaces/Utilities/DConfigCache.py +2 -0
- DIRAC/Interfaces/scripts/dirac_wms_job_parameters.py +0 -1
- DIRAC/MonitoringSystem/DB/MonitoringDB.py +6 -5
- DIRAC/MonitoringSystem/Service/WebAppHandler.py +25 -6
- DIRAC/MonitoringSystem/private/MainReporter.py +0 -3
- DIRAC/RequestManagementSystem/Agent/RequestExecutingAgent.py +8 -6
- DIRAC/RequestManagementSystem/ConfigTemplate.cfg +6 -6
- DIRAC/ResourceStatusSystem/Command/FreeDiskSpaceCommand.py +3 -1
- DIRAC/Resources/Computing/AREXComputingElement.py +18 -2
- DIRAC/Resources/Computing/BatchSystems/Condor.py +0 -3
- DIRAC/Resources/Computing/BatchSystems/executeBatch.py +15 -7
- DIRAC/Resources/Computing/LocalComputingElement.py +0 -2
- DIRAC/Resources/Computing/SSHComputingElement.py +61 -38
- DIRAC/Resources/IdProvider/CheckInIdProvider.py +13 -0
- DIRAC/Resources/IdProvider/IdProviderFactory.py +13 -3
- DIRAC/Resources/IdProvider/tests/Test_IdProviderFactory.py +7 -0
- DIRAC/Resources/Storage/FileStorage.py +121 -2
- DIRAC/TransformationSystem/Agent/InputDataAgent.py +4 -1
- DIRAC/TransformationSystem/Agent/MCExtensionAgent.py +5 -2
- DIRAC/TransformationSystem/Agent/TaskManagerAgentBase.py +3 -4
- DIRAC/TransformationSystem/Agent/TransformationCleaningAgent.py +44 -9
- DIRAC/TransformationSystem/Agent/ValidateOutputDataAgent.py +4 -2
- DIRAC/TransformationSystem/Client/TransformationClient.py +9 -1
- DIRAC/TransformationSystem/Client/Utilities.py +6 -3
- DIRAC/TransformationSystem/DB/TransformationDB.py +105 -43
- DIRAC/TransformationSystem/Utilities/ReplicationCLIParameters.py +3 -3
- DIRAC/TransformationSystem/scripts/dirac_production_runjoblocal.py +2 -4
- DIRAC/TransformationSystem/test/Test_replicationTransformation.py +5 -6
- DIRAC/WorkloadManagementSystem/Agent/JobAgent.py +1 -5
- DIRAC/WorkloadManagementSystem/Agent/PilotSyncAgent.py +4 -3
- DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py +0 -4
- DIRAC/WorkloadManagementSystem/Agent/SiteDirector.py +8 -11
- DIRAC/WorkloadManagementSystem/Agent/StalledJobAgent.py +39 -7
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_SiteDirector.py +8 -2
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_StalledJobAgent.py +24 -4
- DIRAC/WorkloadManagementSystem/Client/DownloadInputData.py +4 -3
- DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg +3 -3
- DIRAC/WorkloadManagementSystem/DB/JobParametersDB.py +8 -8
- DIRAC/WorkloadManagementSystem/DB/SandboxMetadataDB.py +1 -1
- DIRAC/WorkloadManagementSystem/DB/StatusUtils.py +48 -21
- DIRAC/WorkloadManagementSystem/DB/tests/Test_StatusUtils.py +19 -4
- DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapper.py +3 -4
- DIRAC/WorkloadManagementSystem/JobWrapper/Watchdog.py +16 -45
- DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapper.py +18 -9
- DIRAC/WorkloadManagementSystem/Service/JobManagerHandler.py +25 -2
- DIRAC/WorkloadManagementSystem/Service/WMSAdministratorHandler.py +18 -31
- DIRAC/WorkloadManagementSystem/Utilities/PilotCStoJSONSynchronizer.py +73 -7
- {dirac-9.0.0a68.dist-info → dirac-9.0.0a70.dist-info}/METADATA +6 -5
- {dirac-9.0.0a68.dist-info → dirac-9.0.0a70.dist-info}/RECORD +88 -86
- {dirac-9.0.0a68.dist-info → dirac-9.0.0a70.dist-info}/WHEEL +0 -0
- {dirac-9.0.0a68.dist-info → dirac-9.0.0a70.dist-info}/entry_points.txt +0 -0
- {dirac-9.0.0a68.dist-info → dirac-9.0.0a70.dist-info}/licenses/LICENSE +0 -0
- {dirac-9.0.0a68.dist-info → dirac-9.0.0a70.dist-info}/top_level.txt +0 -0
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""The Site Director is an agent performing pilot job submission to particular sites/Computing Elements.
|
|
2
2
|
|
|
3
3
|
.. literalinclude:: ../ConfigTemplate.cfg
|
|
4
4
|
:start-after: ##BEGIN SiteDirector
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
:caption: SiteDirector options
|
|
8
8
|
|
|
9
9
|
"""
|
|
10
|
+
|
|
10
11
|
import datetime
|
|
11
12
|
import os
|
|
12
13
|
from collections import defaultdict
|
|
@@ -147,10 +148,10 @@ class SiteDirector(AgentModule):
|
|
|
147
148
|
self.sendSubmissionAccounting = True
|
|
148
149
|
|
|
149
150
|
# Get the site description dictionary
|
|
150
|
-
siteNames = self.am_getOption("Site", [])
|
|
151
|
-
ceTypes = self.am_getOption("CETypes", [])
|
|
152
|
-
ces = self.am_getOption("CEs", [])
|
|
153
|
-
tags = self.am_getOption("Tags", [])
|
|
151
|
+
siteNames = self.am_getOption("Site", []) or None
|
|
152
|
+
ceTypes = self.am_getOption("CETypes", []) or None
|
|
153
|
+
ces = self.am_getOption("CEs", []) or None
|
|
154
|
+
tags = self.am_getOption("Tags", []) or None
|
|
154
155
|
|
|
155
156
|
# Display options used
|
|
156
157
|
self.log.always("VO:", self.vo)
|
|
@@ -229,12 +230,8 @@ class SiteDirector(AgentModule):
|
|
|
229
230
|
site = self.queueDict[queueName]["Site"]
|
|
230
231
|
ce = self.queueDict[queueName]["CEName"]
|
|
231
232
|
|
|
232
|
-
# Check the status of the Site
|
|
233
|
-
if site in siteMaskList:
|
|
234
|
-
continue
|
|
235
|
-
|
|
236
|
-
# Check the status of the CE (only for RSS=Active)
|
|
237
|
-
if ce not in ceMaskList:
|
|
233
|
+
# Check the status of the Site and CE
|
|
234
|
+
if site in siteMaskList and ce in ceMaskList:
|
|
238
235
|
continue
|
|
239
236
|
|
|
240
237
|
self.log.warn("Queue not considered because not usable:", queueName)
|
|
@@ -17,11 +17,9 @@ from DIRAC.ConfigurationSystem.Client.Helpers import cfgPath
|
|
|
17
17
|
from DIRAC.Core.Base.AgentModule import AgentModule
|
|
18
18
|
from DIRAC.Core.Utilities import DErrno
|
|
19
19
|
from DIRAC.Core.Utilities.ClassAd.ClassAdLight import ClassAd
|
|
20
|
+
from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader
|
|
20
21
|
from DIRAC.Core.Utilities.TimeUtilities import fromString, second, toEpoch
|
|
21
22
|
from DIRAC.WorkloadManagementSystem.Client import JobMinorStatus, JobStatus
|
|
22
|
-
from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
|
|
23
|
-
from DIRAC.WorkloadManagementSystem.DB.JobLoggingDB import JobLoggingDB
|
|
24
|
-
from DIRAC.WorkloadManagementSystem.DB.PilotAgentsDB import PilotAgentsDB
|
|
25
23
|
from DIRAC.WorkloadManagementSystem.Service.JobPolicy import RIGHT_KILL
|
|
26
24
|
from DIRAC.WorkloadManagementSystem.DB.StatusUtils import kill_delete_jobs
|
|
27
25
|
from DIRAC.WorkloadManagementSystem.Utilities.JobParameters import getJobParameters
|
|
@@ -40,6 +38,9 @@ class StalledJobAgent(AgentModule):
|
|
|
40
38
|
|
|
41
39
|
self.jobDB = None
|
|
42
40
|
self.logDB = None
|
|
41
|
+
self.taskQueueDB = None
|
|
42
|
+
self.pilotAgentsDB = None
|
|
43
|
+
self.storageManagementDB = None
|
|
43
44
|
self.matchedTime = 7200
|
|
44
45
|
self.rescheduledTime = 600
|
|
45
46
|
self.submittingTime = 300
|
|
@@ -51,8 +52,30 @@ class StalledJobAgent(AgentModule):
|
|
|
51
52
|
#############################################################################
|
|
52
53
|
def initialize(self):
|
|
53
54
|
"""Sets default parameters."""
|
|
54
|
-
|
|
55
|
-
|
|
55
|
+
result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.JobDB", "JobDB")
|
|
56
|
+
if not result["OK"]:
|
|
57
|
+
return result
|
|
58
|
+
self.jobDB = result["Value"]()
|
|
59
|
+
|
|
60
|
+
result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.JobLoggingDB", "JobLoggingDB")
|
|
61
|
+
if not result["OK"]:
|
|
62
|
+
return result
|
|
63
|
+
self.logDB = result["Value"]()
|
|
64
|
+
|
|
65
|
+
result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.TaskQueueDB", "TaskQueueDB")
|
|
66
|
+
if not result["OK"]:
|
|
67
|
+
return result
|
|
68
|
+
self.taskQueueDB = result["Value"]()
|
|
69
|
+
|
|
70
|
+
result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.PilotAgentsDB", "PilotAgentsDB")
|
|
71
|
+
if not result["OK"]:
|
|
72
|
+
return result
|
|
73
|
+
self.pilotAgentsDB = result["Value"]()
|
|
74
|
+
|
|
75
|
+
result = ObjectLoader().loadObject("StorageManagementSystem.DB.StorageManagementDB", "StorageManagementDB")
|
|
76
|
+
if not result["OK"]:
|
|
77
|
+
return result
|
|
78
|
+
self.storageManagementDB = result["Value"]()
|
|
56
79
|
|
|
57
80
|
# getting parameters
|
|
58
81
|
|
|
@@ -235,7 +258,16 @@ class StalledJobAgent(AgentModule):
|
|
|
235
258
|
# Set the jobs Failed, send them a kill signal in case they are not really dead
|
|
236
259
|
# and send accounting info
|
|
237
260
|
if setFailed:
|
|
238
|
-
res = kill_delete_jobs(
|
|
261
|
+
res = kill_delete_jobs(
|
|
262
|
+
RIGHT_KILL,
|
|
263
|
+
[jobID],
|
|
264
|
+
nonauthJobList=[],
|
|
265
|
+
force=True,
|
|
266
|
+
jobdb=self.jobDB,
|
|
267
|
+
taskqueuedb=self.taskQueueDB,
|
|
268
|
+
pilotagentsdb=self.pilotAgentsDB,
|
|
269
|
+
storagemanagementdb=self.storageManagementDB,
|
|
270
|
+
)
|
|
239
271
|
if not res["OK"]:
|
|
240
272
|
self.log.error("Failed to kill job", jobID)
|
|
241
273
|
|
|
@@ -262,7 +294,7 @@ class StalledJobAgent(AgentModule):
|
|
|
262
294
|
# There is no pilot reference, hence its status is unknown
|
|
263
295
|
return S_OK("NoPilot")
|
|
264
296
|
|
|
265
|
-
result =
|
|
297
|
+
result = self.pilotAgentsDB.getPilotInfo(pilotReference)
|
|
266
298
|
if not result["OK"]:
|
|
267
299
|
if DErrno.cmpError(result, DErrno.EWMSNOPILOT):
|
|
268
300
|
self.log.warn("No pilot found", f"for job {jobID}: {result['Message']}")
|
|
@@ -169,10 +169,16 @@ def sd(mocker, config):
|
|
|
169
169
|
gConfig.getSections("Resources/Sites/LCG")["Value"] + gConfig.getSections("Resources/Sites/DIRAC")["Value"]
|
|
170
170
|
)
|
|
171
171
|
mocker.patch(
|
|
172
|
-
"DIRAC.WorkloadManagementSystem.Agent.SiteDirector.SiteStatus.getUsableSites",
|
|
172
|
+
"DIRAC.WorkloadManagementSystem.Agent.SiteDirector.SiteStatus.getUsableSites", return_value=S_OK(usableSites)
|
|
173
173
|
)
|
|
174
|
+
|
|
175
|
+
# Mock getElementStatus to return a properly formatted dictionary
|
|
176
|
+
def mock_getElementStatus(ceNamesList, *args, **kwargs):
|
|
177
|
+
return S_OK({ceName: {"all": "Active"} for ceName in ceNamesList})
|
|
178
|
+
|
|
174
179
|
mocker.patch(
|
|
175
|
-
"DIRAC.WorkloadManagementSystem.Agent.SiteDirector.ResourceStatus.getElementStatus",
|
|
180
|
+
"DIRAC.WorkloadManagementSystem.Agent.SiteDirector.ResourceStatus.getElementStatus",
|
|
181
|
+
side_effect=mock_getElementStatus,
|
|
176
182
|
)
|
|
177
183
|
mocker.patch(
|
|
178
184
|
"DIRAC.WorkloadManagementSystem.Agent.SiteDirector.gProxyManager.downloadProxy", side_effect=mockPMProxyReply
|
|
@@ -23,10 +23,31 @@ def sja(mocker):
|
|
|
23
23
|
side_effect=lambda x, y=None: y,
|
|
24
24
|
create=True,
|
|
25
25
|
)
|
|
26
|
-
|
|
27
|
-
|
|
26
|
+
|
|
27
|
+
# Mock ObjectLoader to return mock DB instances
|
|
28
|
+
mockJobDB = MagicMock()
|
|
29
|
+
mockJobDB.log = gLogger
|
|
30
|
+
mockJobLoggingDB = MagicMock()
|
|
31
|
+
mockTaskQueueDB = MagicMock()
|
|
32
|
+
mockPilotAgentsDB = MagicMock()
|
|
33
|
+
mockStorageManagementDB = MagicMock()
|
|
34
|
+
|
|
35
|
+
def mock_load_object(module_path, class_name):
|
|
36
|
+
mocks = {
|
|
37
|
+
"JobDB": mockJobDB,
|
|
38
|
+
"JobLoggingDB": mockJobLoggingDB,
|
|
39
|
+
"TaskQueueDB": mockTaskQueueDB,
|
|
40
|
+
"PilotAgentsDB": mockPilotAgentsDB,
|
|
41
|
+
"StorageManagementDB": mockStorageManagementDB,
|
|
42
|
+
}
|
|
43
|
+
return {"OK": True, "Value": lambda: mocks[class_name]}
|
|
44
|
+
|
|
45
|
+
mocker.patch(
|
|
46
|
+
"DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.ObjectLoader.loadObject",
|
|
47
|
+
side_effect=mock_load_object,
|
|
48
|
+
)
|
|
49
|
+
|
|
28
50
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.rescheduleJobs", return_value=MagicMock())
|
|
29
|
-
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.PilotAgentsDB", return_value=MagicMock())
|
|
30
51
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.getJobParameters", return_value=MagicMock())
|
|
31
52
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.kill_delete_jobs", return_value=MagicMock())
|
|
32
53
|
|
|
@@ -34,7 +55,6 @@ def sja(mocker):
|
|
|
34
55
|
stalledJobAgent._AgentModule__configDefaults = mockAM
|
|
35
56
|
stalledJobAgent.log = gLogger
|
|
36
57
|
stalledJobAgent.initialize()
|
|
37
|
-
stalledJobAgent.jobDB.log = gLogger
|
|
38
58
|
stalledJobAgent.log.setLevel("DEBUG")
|
|
39
59
|
stalledJobAgent.stalledTime = 120
|
|
40
60
|
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
1
|
+
"""The Download Input Data module wraps around the Replica Management
|
|
2
|
+
components to provide access to datasets by downloading locally
|
|
3
3
|
"""
|
|
4
|
+
|
|
4
5
|
import os
|
|
5
6
|
import random
|
|
6
7
|
import tempfile
|
|
@@ -226,7 +227,7 @@ class DownloadInputData:
|
|
|
226
227
|
diskSpace = getDiskSpace(self.__getDownloadDir(False)) # MB
|
|
227
228
|
availableBytes = diskSpace * 1024 * 1024 # bytes
|
|
228
229
|
bufferGBs = gConfig.getValue(
|
|
229
|
-
os.path.join("/Systems/WorkloadManagement/JobWrapper", "
|
|
230
|
+
os.path.join("/Systems/WorkloadManagement/JobWrapper", "MinOutputDataBufferGB"), 5.0
|
|
230
231
|
)
|
|
231
232
|
data = bufferGBs * 1024 * 1024 * 1024 # bufferGBs in bytes
|
|
232
233
|
if (data + totalSize) < availableBytes:
|
|
@@ -263,11 +263,11 @@ Agents
|
|
|
263
263
|
# the DN of the certificate proxy used to submit pilots. If not found here, what is in Operations/Pilot section of the CS will be used
|
|
264
264
|
PilotDN =
|
|
265
265
|
|
|
266
|
-
# List of
|
|
266
|
+
# List of Sites that will be treated by this SiteDirector (No value can refer to any CE defined in the CS)
|
|
267
267
|
Site =
|
|
268
|
-
# List of CEs that will be treated by this SiteDirector (No value can refer to any CE defined in the CS)
|
|
268
|
+
# List of CEs that will be treated by this SiteDirector (No value can refer to any type of CE defined in the CS)
|
|
269
269
|
CEs =
|
|
270
|
-
# List of
|
|
270
|
+
# List of CETypes that are required to be present in the CE/Queue definition
|
|
271
271
|
CETypes =
|
|
272
272
|
# List of Tags that are required to be present in the CE/Queue definition
|
|
273
273
|
Tags =
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
1
|
+
"""Module containing a front-end to the OpenSearch-based JobParametersDB.
|
|
2
|
+
This is a drop-in replacement for MySQL-based table JobDB.JobParameters.
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
4
|
+
The following class methods are provided for public usage
|
|
5
|
+
- getJobParameters()
|
|
6
|
+
- setJobParameter()
|
|
7
|
+
- deleteJobParameters()
|
|
8
8
|
"""
|
|
9
9
|
|
|
10
10
|
from DIRAC import S_ERROR, S_OK
|
|
@@ -37,11 +37,11 @@ class JobParametersDB(ElasticDB):
|
|
|
37
37
|
def __init__(self, parentLogger=None):
|
|
38
38
|
"""Standard Constructor"""
|
|
39
39
|
|
|
40
|
-
self.fullname = "WorkloadManagement/
|
|
40
|
+
self.fullname = "WorkloadManagement/JobParametersDB"
|
|
41
41
|
self.index_name = self.getCSOption("index_name", "job_parameters")
|
|
42
42
|
|
|
43
43
|
try:
|
|
44
|
-
# Connecting to the
|
|
44
|
+
# Connecting to the OpenSearch cluster
|
|
45
45
|
super().__init__(self.fullname, self.index_name, parentLogger=parentLogger)
|
|
46
46
|
except Exception:
|
|
47
47
|
RuntimeError("Can't connect to JobParameters index")
|
|
@@ -64,7 +64,7 @@ class SandboxMetadataDB(DB):
|
|
|
64
64
|
"Type": "VARCHAR(64) NOT NULL",
|
|
65
65
|
},
|
|
66
66
|
"Indexes": {"Entity": ["EntityId"], "SBIndex": ["SBId"]},
|
|
67
|
-
"
|
|
67
|
+
"PrimaryKey": ["SBId", "EntityId", "Type"],
|
|
68
68
|
}
|
|
69
69
|
|
|
70
70
|
for tableName in self.__tablesDesc:
|
|
@@ -1,43 +1,40 @@
|
|
|
1
1
|
from DIRAC import S_ERROR, S_OK, gLogger
|
|
2
|
-
from DIRAC.StorageManagementSystem.DB.StorageManagementDB import StorageManagementDB
|
|
3
2
|
from DIRAC.WorkloadManagementSystem.Client import JobStatus
|
|
4
|
-
from DIRAC.
|
|
5
|
-
from DIRAC.WorkloadManagementSystem.DB.PilotAgentsDB import PilotAgentsDB
|
|
6
|
-
from DIRAC.WorkloadManagementSystem.DB.TaskQueueDB import TaskQueueDB
|
|
3
|
+
from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader
|
|
7
4
|
from DIRAC.WorkloadManagementSystem.Service.JobPolicy import RIGHT_DELETE, RIGHT_KILL
|
|
8
5
|
from DIRAC.WorkloadManagementSystem.Utilities.jobAdministration import _filterJobStateTransition
|
|
9
6
|
|
|
10
7
|
|
|
11
|
-
def _deleteJob(jobID, force=False):
|
|
8
|
+
def _deleteJob(jobID, force=False, *, jobdb, taskqueuedb, pilotagentsdb):
|
|
12
9
|
"""Set the job status to "Deleted"
|
|
13
10
|
and remove the pilot that ran and its logging info if the pilot is finished.
|
|
14
11
|
|
|
15
12
|
:param int jobID: job ID
|
|
16
13
|
:return: S_OK()/S_ERROR()
|
|
17
14
|
"""
|
|
18
|
-
if not (result :=
|
|
15
|
+
if not (result := jobdb.setJobStatus(jobID, JobStatus.DELETED, "Checking accounting", force=force))["OK"]:
|
|
19
16
|
gLogger.warn("Failed to set job Deleted status", result["Message"])
|
|
20
17
|
return result
|
|
21
18
|
|
|
22
|
-
if not (result :=
|
|
19
|
+
if not (result := taskqueuedb.deleteJob(jobID))["OK"]:
|
|
23
20
|
gLogger.warn("Failed to delete job from the TaskQueue")
|
|
24
21
|
|
|
25
22
|
# if it was the last job for the pilot
|
|
26
|
-
result =
|
|
23
|
+
result = pilotagentsdb.getPilotsForJobID(jobID)
|
|
27
24
|
if not result["OK"]:
|
|
28
25
|
gLogger.error("Failed to get Pilots for JobID", result["Message"])
|
|
29
26
|
return result
|
|
30
27
|
for pilot in result["Value"]:
|
|
31
|
-
res =
|
|
28
|
+
res = pilotagentsdb.getJobsForPilot(pilot)
|
|
32
29
|
if not res["OK"]:
|
|
33
30
|
gLogger.error("Failed to get jobs for pilot", res["Message"])
|
|
34
31
|
return res
|
|
35
32
|
if not res["Value"]: # if list of jobs for pilot is empty, delete pilot
|
|
36
|
-
result =
|
|
33
|
+
result = pilotagentsdb.getPilotInfo(pilotID=pilot)
|
|
37
34
|
if not result["OK"]:
|
|
38
35
|
gLogger.error("Failed to get pilot info", result["Message"])
|
|
39
36
|
return result
|
|
40
|
-
ret =
|
|
37
|
+
ret = pilotagentsdb.deletePilot(result["Value"]["PilotJobReference"])
|
|
41
38
|
if not ret["OK"]:
|
|
42
39
|
gLogger.error("Failed to delete pilot from PilotAgentsDB", ret["Message"])
|
|
43
40
|
return ret
|
|
@@ -45,7 +42,7 @@ def _deleteJob(jobID, force=False):
|
|
|
45
42
|
return S_OK()
|
|
46
43
|
|
|
47
44
|
|
|
48
|
-
def _killJob(jobID, sendKillCommand=True, force=False):
|
|
45
|
+
def _killJob(jobID, sendKillCommand=True, force=False, *, jobdb, taskqueuedb):
|
|
49
46
|
"""Kill one job
|
|
50
47
|
|
|
51
48
|
:param int jobID: job ID
|
|
@@ -54,32 +51,63 @@ def _killJob(jobID, sendKillCommand=True, force=False):
|
|
|
54
51
|
:return: S_OK()/S_ERROR()
|
|
55
52
|
"""
|
|
56
53
|
if sendKillCommand:
|
|
57
|
-
if not (result :=
|
|
54
|
+
if not (result := jobdb.setJobCommand(jobID, "Kill"))["OK"]:
|
|
58
55
|
gLogger.warn("Failed to set job Kill command", result["Message"])
|
|
59
56
|
return result
|
|
60
57
|
|
|
61
58
|
gLogger.info("Job marked for termination", jobID)
|
|
62
|
-
if not (result :=
|
|
59
|
+
if not (result := jobdb.setJobStatus(jobID, JobStatus.KILLED, "Marked for termination", force=force))["OK"]:
|
|
63
60
|
gLogger.warn("Failed to set job Killed status", result["Message"])
|
|
64
|
-
if not (result :=
|
|
61
|
+
if not (result := taskqueuedb.deleteJob(jobID))["OK"]:
|
|
65
62
|
gLogger.warn("Failed to delete job from the TaskQueue", result["Message"])
|
|
66
63
|
|
|
67
64
|
return S_OK()
|
|
68
65
|
|
|
69
66
|
|
|
70
|
-
def kill_delete_jobs(
|
|
67
|
+
def kill_delete_jobs(
|
|
68
|
+
right,
|
|
69
|
+
validJobList,
|
|
70
|
+
nonauthJobList=[],
|
|
71
|
+
force=False,
|
|
72
|
+
*,
|
|
73
|
+
jobdb=None,
|
|
74
|
+
taskqueuedb=None,
|
|
75
|
+
pilotagentsdb=None,
|
|
76
|
+
storagemanagementdb=None,
|
|
77
|
+
):
|
|
71
78
|
"""Kill (== set the status to "KILLED") or delete (== set the status to "DELETED") jobs as necessary
|
|
72
79
|
|
|
73
80
|
:param str right: RIGHT_KILL or RIGHT_DELETE
|
|
74
81
|
|
|
75
82
|
:return: S_OK()/S_ERROR()
|
|
76
83
|
"""
|
|
84
|
+
if jobdb is None:
|
|
85
|
+
result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.JobDB", "JobDB")
|
|
86
|
+
if not result["OK"]:
|
|
87
|
+
return result
|
|
88
|
+
jobdb = result["Value"]()
|
|
89
|
+
if taskqueuedb is None:
|
|
90
|
+
result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.TaskQueueDB", "TaskQueueDB")
|
|
91
|
+
if not result["OK"]:
|
|
92
|
+
return result
|
|
93
|
+
taskqueuedb = result["Value"]()
|
|
94
|
+
if pilotagentsdb is None:
|
|
95
|
+
result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.PilotAgentsDB", "PilotAgentsDB")
|
|
96
|
+
if not result["OK"]:
|
|
97
|
+
return result
|
|
98
|
+
pilotagentsdb = result["Value"]()
|
|
99
|
+
if storagemanagementdb is None:
|
|
100
|
+
result = ObjectLoader().loadObject("StorageManagementSystem.DB.StorageManagementDB", "StorageManagementDB")
|
|
101
|
+
if not result["OK"]:
|
|
102
|
+
return result
|
|
103
|
+
storagemanagementdb = result["Value"]()
|
|
104
|
+
|
|
77
105
|
badIDs = []
|
|
78
106
|
|
|
79
107
|
killJobList = []
|
|
80
108
|
deleteJobList = []
|
|
81
109
|
if validJobList:
|
|
82
|
-
result =
|
|
110
|
+
result = jobdb.getJobsAttributes(validJobList, ["Status"])
|
|
83
111
|
if not result["OK"]:
|
|
84
112
|
return result
|
|
85
113
|
jobStates = result["Value"]
|
|
@@ -92,12 +120,12 @@ def kill_delete_jobs(right, validJobList, nonauthJobList=[], force=False):
|
|
|
92
120
|
deleteJobList.extend(_filterJobStateTransition(jobStates, JobStatus.DELETED))
|
|
93
121
|
|
|
94
122
|
for jobID in killJobList:
|
|
95
|
-
result = _killJob(jobID, force=force)
|
|
123
|
+
result = _killJob(jobID, force=force, jobdb=jobdb, taskqueuedb=taskqueuedb)
|
|
96
124
|
if not result["OK"]:
|
|
97
125
|
badIDs.append(jobID)
|
|
98
126
|
|
|
99
127
|
for jobID in deleteJobList:
|
|
100
|
-
result = _deleteJob(jobID, force=force)
|
|
128
|
+
result = _deleteJob(jobID, force=force, jobdb=jobdb, taskqueuedb=taskqueuedb, pilotagentsdb=pilotagentsdb)
|
|
101
129
|
if not result["OK"]:
|
|
102
130
|
badIDs.append(jobID)
|
|
103
131
|
|
|
@@ -105,9 +133,8 @@ def kill_delete_jobs(right, validJobList, nonauthJobList=[], force=False):
|
|
|
105
133
|
stagingJobList = [jobID for jobID, sDict in jobStates.items() if sDict["Status"] == JobStatus.STAGING]
|
|
106
134
|
|
|
107
135
|
if stagingJobList:
|
|
108
|
-
stagerDB = StorageManagementDB()
|
|
109
136
|
gLogger.info("Going to send killing signal to stager as well!")
|
|
110
|
-
result =
|
|
137
|
+
result = storagemanagementdb.killTasksBySourceTaskID(stagingJobList)
|
|
111
138
|
if not result["OK"]:
|
|
112
139
|
gLogger.warn("Failed to kill some Stager tasks", result["Message"])
|
|
113
140
|
|
|
@@ -19,10 +19,25 @@ from DIRAC.WorkloadManagementSystem.DB.StatusUtils import kill_delete_jobs
|
|
|
19
19
|
],
|
|
20
20
|
)
|
|
21
21
|
def test___kill_delete_jobs(mocker, jobIDs_list, right):
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
22
|
+
# Mock ObjectLoader to return mock DB instances
|
|
23
|
+
mockJobDB = MagicMock()
|
|
24
|
+
mockTaskQueueDB = MagicMock()
|
|
25
|
+
mockPilotAgentsDB = MagicMock()
|
|
26
|
+
mockStorageManagementDB = MagicMock()
|
|
27
|
+
|
|
28
|
+
def mock_load_object(module_path, class_name):
|
|
29
|
+
mocks = {
|
|
30
|
+
"JobDB": mockJobDB,
|
|
31
|
+
"TaskQueueDB": mockTaskQueueDB,
|
|
32
|
+
"PilotAgentsDB": mockPilotAgentsDB,
|
|
33
|
+
"StorageManagementDB": mockStorageManagementDB,
|
|
34
|
+
}
|
|
35
|
+
return {"OK": True, "Value": lambda: mocks[class_name]}
|
|
36
|
+
|
|
37
|
+
mocker.patch(
|
|
38
|
+
"DIRAC.WorkloadManagementSystem.DB.StatusUtils.ObjectLoader.loadObject",
|
|
39
|
+
side_effect=mock_load_object,
|
|
40
|
+
)
|
|
26
41
|
|
|
27
42
|
res = kill_delete_jobs(right, jobIDs_list)
|
|
28
43
|
assert res["OK"]
|
|
@@ -16,7 +16,6 @@ import datetime
|
|
|
16
16
|
import glob
|
|
17
17
|
import json
|
|
18
18
|
import os
|
|
19
|
-
from pathlib import Path
|
|
20
19
|
import re
|
|
21
20
|
import shutil
|
|
22
21
|
import stat
|
|
@@ -24,12 +23,12 @@ import sys
|
|
|
24
23
|
import tarfile
|
|
25
24
|
import threading
|
|
26
25
|
import time
|
|
26
|
+
from pathlib import Path
|
|
27
27
|
from urllib.parse import unquote
|
|
28
28
|
|
|
29
29
|
import DIRAC
|
|
30
30
|
from DIRAC import S_ERROR, S_OK, gConfig, gLogger
|
|
31
31
|
from DIRAC.AccountingSystem.Client.Types.Job import Job as AccountingJob
|
|
32
|
-
|
|
33
32
|
from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
|
|
34
33
|
from DIRAC.ConfigurationSystem.Client.Helpers.Registry import getVOForGroup
|
|
35
34
|
from DIRAC.Core.Utilities import DEncode, DErrno, List
|
|
@@ -1021,7 +1020,7 @@ class JobWrapper:
|
|
|
1021
1020
|
|
|
1022
1021
|
for i in outputSandbox:
|
|
1023
1022
|
if i not in okFiles:
|
|
1024
|
-
if
|
|
1023
|
+
if f"{i}.tar" not in okFiles:
|
|
1025
1024
|
if not re.search(r"\*", i):
|
|
1026
1025
|
if i not in missing:
|
|
1027
1026
|
missing.append(i)
|
|
@@ -1215,8 +1214,8 @@ class JobWrapper:
|
|
|
1215
1214
|
lfn = str(basePath / outputPath / os.path.basename(localfile))
|
|
1216
1215
|
else:
|
|
1217
1216
|
# if LFN is given, take it as it is
|
|
1218
|
-
localfile = str(self.jobIDPath / outputFile.replace("LFN:", ""))
|
|
1219
1217
|
lfn = outputFile.replace("LFN:", "")
|
|
1218
|
+
localfile = str(self.jobIDPath / os.path.basename(lfn))
|
|
1220
1219
|
|
|
1221
1220
|
return (lfn, localfile)
|
|
1222
1221
|
|
|
@@ -1,22 +1,22 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
1
|
+
"""The Watchdog class is used by the Job Wrapper to resolve and monitor
|
|
2
|
+
the system resource consumption. The Watchdog can determine if
|
|
3
|
+
a running job is stalled and indicate this to the Job Wrapper.
|
|
4
|
+
Furthermore, the Watchdog will identify when the Job CPU limit has been
|
|
5
|
+
exceeded and fail jobs meaningfully.
|
|
6
|
+
|
|
7
|
+
Information is returned to the WMS via the heart-beat mechanism. This
|
|
8
|
+
also interprets control signals from the WMS e.g. to kill a running
|
|
9
|
+
job.
|
|
10
|
+
|
|
11
|
+
- Still to implement:
|
|
12
|
+
- CPU normalization for correct comparison with job limit
|
|
13
13
|
"""
|
|
14
|
+
|
|
14
15
|
import datetime
|
|
15
16
|
import errno
|
|
16
17
|
import getpass
|
|
17
18
|
import math
|
|
18
19
|
import os
|
|
19
|
-
import signal
|
|
20
20
|
import socket
|
|
21
21
|
import time
|
|
22
22
|
from pathlib import Path
|
|
@@ -32,28 +32,6 @@ from DIRAC.WorkloadManagementSystem.Client import JobMinorStatus
|
|
|
32
32
|
from DIRAC.WorkloadManagementSystem.Client.JobStateUpdateClient import JobStateUpdateClient
|
|
33
33
|
|
|
34
34
|
|
|
35
|
-
def kill_proc_tree(pid, sig=signal.SIGTERM, includeParent=True):
|
|
36
|
-
"""Kill a process tree (including grandchildren) with signal
|
|
37
|
-
"sig" and return a (gone, still_alive) tuple.
|
|
38
|
-
called as soon as a child terminates.
|
|
39
|
-
|
|
40
|
-
Taken from https://psutil.readthedocs.io/en/latest/index.html#kill-process-tree
|
|
41
|
-
"""
|
|
42
|
-
assert pid != os.getpid(), "won't kill myself"
|
|
43
|
-
parent = psutil.Process(pid)
|
|
44
|
-
children = parent.children(recursive=True)
|
|
45
|
-
if includeParent:
|
|
46
|
-
children.append(parent)
|
|
47
|
-
for p in children:
|
|
48
|
-
try:
|
|
49
|
-
p.send_signal(sig)
|
|
50
|
-
except psutil.NoSuchProcess:
|
|
51
|
-
pass
|
|
52
|
-
_gone, alive = psutil.wait_procs(children, timeout=10)
|
|
53
|
-
for p in alive:
|
|
54
|
-
p.kill()
|
|
55
|
-
|
|
56
|
-
|
|
57
35
|
class Watchdog:
|
|
58
36
|
#############################################################################
|
|
59
37
|
def __init__(self, pid, exeThread, spObject, jobCPUTime, memoryLimit=0, processors=1, jobArgs={}):
|
|
@@ -212,7 +190,7 @@ class Watchdog:
|
|
|
212
190
|
if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1:
|
|
213
191
|
self.checkError = JobMinorStatus.JOB_EXCEEDED_CPU
|
|
214
192
|
self.log.error(self.checkError, self.timeLeft)
|
|
215
|
-
self.
|
|
193
|
+
self.spObject.killChild()
|
|
216
194
|
return S_OK()
|
|
217
195
|
|
|
218
196
|
self.littleTimeLeftCount -= 1
|
|
@@ -321,7 +299,7 @@ class Watchdog:
|
|
|
321
299
|
|
|
322
300
|
self.log.info("=================END=================")
|
|
323
301
|
|
|
324
|
-
self.
|
|
302
|
+
self.spObject.killChild()
|
|
325
303
|
return S_OK()
|
|
326
304
|
|
|
327
305
|
recentStdOut = "None"
|
|
@@ -408,7 +386,7 @@ class Watchdog:
|
|
|
408
386
|
if "Kill" in signalDict:
|
|
409
387
|
self.log.info("Received Kill signal, stopping job via control signal")
|
|
410
388
|
self.checkError = JobMinorStatus.RECEIVED_KILL_SIGNAL
|
|
411
|
-
self.
|
|
389
|
+
self.spObject.killChild()
|
|
412
390
|
else:
|
|
413
391
|
self.log.info("The following control signal was sent but not understood by the watchdog:")
|
|
414
392
|
self.log.info(signalDict)
|
|
@@ -862,13 +840,6 @@ class Watchdog:
|
|
|
862
840
|
|
|
863
841
|
return result
|
|
864
842
|
|
|
865
|
-
#############################################################################
|
|
866
|
-
def __killRunningThread(self):
|
|
867
|
-
"""Will kill the running thread process and any child processes."""
|
|
868
|
-
self.log.info("Sending kill signal to application PID", self.spObject.getChildPID())
|
|
869
|
-
self.spObject.killChild()
|
|
870
|
-
return S_OK("Thread killed")
|
|
871
|
-
|
|
872
843
|
#############################################################################
|
|
873
844
|
def __sendSignOfLife(self, jobID, heartBeatDict, staticParamDict):
|
|
874
845
|
"""Sends sign of life 'heartbeat' signal and triggers control signal
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
"""
|
|
2
|
-
|
|
1
|
+
"""Test class for JobWrapper"""
|
|
2
|
+
|
|
3
3
|
import os
|
|
4
4
|
import shutil
|
|
5
5
|
import tempfile
|
|
@@ -314,8 +314,7 @@ def test_processKilledSubprocess(mocker):
|
|
|
314
314
|
result = jw.process("sleep 20", {})
|
|
315
315
|
|
|
316
316
|
assert result["OK"]
|
|
317
|
-
assert result["Value"]["payloadStatus"]
|
|
318
|
-
assert not result["Value"]["payloadOutput"]
|
|
317
|
+
assert result["Value"]["payloadStatus"] is None
|
|
319
318
|
assert not result["Value"]["payloadExecutorError"]
|
|
320
319
|
assert result["Value"]["watchdogError"] == "Job is stalled!" # Error message from the watchdog
|
|
321
320
|
|
|
@@ -664,6 +663,7 @@ def jobIDPath():
|
|
|
664
663
|
# Output data files
|
|
665
664
|
(p / "00232454_00000244_1.sim").touch()
|
|
666
665
|
(p / "1720442808testFileUpload.txt").touch()
|
|
666
|
+
(p / "testFileUploadFullLFN.txt").touch()
|
|
667
667
|
|
|
668
668
|
with open(p / "pool_xml_catalog.xml", "w") as f:
|
|
669
669
|
f.write(
|
|
@@ -863,7 +863,11 @@ def test_processJobOutputs_output_data_upload(mocker, setup_another_job_wrapper)
|
|
|
863
863
|
# BTW, isn't the concept of pool_xml_catalog.xml from lhcbdirac?
|
|
864
864
|
jw.jobArgs = {
|
|
865
865
|
"OutputSandbox": [],
|
|
866
|
-
"OutputData": [
|
|
866
|
+
"OutputData": [
|
|
867
|
+
"1720442808testFileUpload.txt",
|
|
868
|
+
"LFN:00232454_00000244_1.sim",
|
|
869
|
+
"LFN:/dirac/user/u/unknown/testFileUploadFullLFN.txt",
|
|
870
|
+
],
|
|
867
871
|
"Owner": "Jane Doe",
|
|
868
872
|
}
|
|
869
873
|
|
|
@@ -879,10 +883,15 @@ def test_processJobOutputs_output_data_upload(mocker, setup_another_job_wrapper)
|
|
|
879
883
|
assert jw.jobReport.jobStatusInfo[1][:-1] == ("", JobMinorStatus.UPLOADING_OUTPUT_DATA)
|
|
880
884
|
assert jw.jobReport.jobStatusInfo[2][:-1] == (JobStatus.COMPLETING, JobMinorStatus.OUTPUT_DATA_UPLOADED)
|
|
881
885
|
assert len(jw.jobReport.jobParameters) == 1
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
"00232454_00000244_1.sim
|
|
885
|
-
|
|
886
|
+
|
|
887
|
+
expected_files = {
|
|
888
|
+
"00232454_00000244_1.sim",
|
|
889
|
+
"/dirac/user/u/unknown/0/123/1720442808testFileUpload.txt",
|
|
890
|
+
"/dirac/user/u/unknown/testFileUploadFullLFN.txt",
|
|
891
|
+
}
|
|
892
|
+
assert jw.jobReport.jobParameters[0][0] == "UploadedOutputData"
|
|
893
|
+
uploaded_files = set(jw.jobReport.jobParameters[0][1].split(", "))
|
|
894
|
+
assert uploaded_files == expected_files
|
|
886
895
|
|
|
887
896
|
|
|
888
897
|
# -------------------------------------------------------------------------------------------------
|