DIRAC 9.0.0a42__py3-none-any.whl → 9.0.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- DIRAC/AccountingSystem/Client/AccountingCLI.py +0 -140
- DIRAC/AccountingSystem/Client/DataStoreClient.py +0 -13
- DIRAC/AccountingSystem/Client/Types/BaseAccountingType.py +0 -7
- DIRAC/AccountingSystem/ConfigTemplate.cfg +0 -5
- DIRAC/AccountingSystem/Service/DataStoreHandler.py +0 -72
- DIRAC/ConfigurationSystem/Client/Helpers/CSGlobals.py +0 -9
- DIRAC/ConfigurationSystem/Client/Helpers/Registry.py +38 -26
- DIRAC/ConfigurationSystem/Client/Helpers/Resources.py +11 -43
- DIRAC/ConfigurationSystem/Client/Helpers/test/Test_Helpers.py +0 -16
- DIRAC/ConfigurationSystem/Client/LocalConfiguration.py +14 -8
- DIRAC/ConfigurationSystem/Client/PathFinder.py +47 -8
- DIRAC/ConfigurationSystem/Client/SyncPlugins/CERNLDAPSyncPlugin.py +4 -1
- DIRAC/ConfigurationSystem/Client/VOMS2CSSynchronizer.py +32 -19
- DIRAC/ConfigurationSystem/Client/test/Test_PathFinder.py +41 -1
- DIRAC/ConfigurationSystem/private/RefresherBase.py +4 -2
- DIRAC/Core/Base/API.py +4 -7
- DIRAC/Core/Base/SQLAlchemyDB.py +1 -0
- DIRAC/Core/DISET/ServiceReactor.py +11 -3
- DIRAC/Core/DISET/private/BaseClient.py +1 -2
- DIRAC/Core/DISET/private/Transports/M2SSLTransport.py +9 -7
- DIRAC/Core/DISET/private/Transports/SSL/M2Utils.py +3 -1
- DIRAC/Core/LCG/GOCDBClient.py +5 -7
- DIRAC/Core/Security/DiracX.py +31 -17
- DIRAC/Core/Security/IAMService.py +5 -10
- DIRAC/Core/Security/Locations.py +27 -18
- DIRAC/Core/Security/ProxyInfo.py +9 -5
- DIRAC/Core/Security/VOMSService.py +2 -4
- DIRAC/Core/Security/m2crypto/X509Certificate.py +4 -6
- DIRAC/Core/Security/m2crypto/asn1_utils.py +17 -5
- DIRAC/Core/Security/test/test_diracx_token_from_pem.py +161 -0
- DIRAC/Core/Tornado/Client/ClientSelector.py +4 -1
- DIRAC/Core/Tornado/Server/TornadoService.py +1 -1
- DIRAC/Core/Utilities/CGroups2.py +328 -0
- DIRAC/Core/Utilities/ClassAd/ClassAdLight.py +4 -290
- DIRAC/Core/Utilities/DErrno.py +5 -309
- DIRAC/Core/Utilities/Extensions.py +10 -1
- DIRAC/Core/Utilities/File.py +1 -1
- DIRAC/Core/Utilities/Graphs/GraphData.py +1 -1
- DIRAC/Core/Utilities/Graphs/GraphUtilities.py +6 -1
- DIRAC/Core/Utilities/JDL.py +1 -195
- DIRAC/Core/Utilities/List.py +1 -124
- DIRAC/Core/Utilities/MySQL.py +103 -99
- DIRAC/Core/Utilities/Os.py +32 -1
- DIRAC/Core/Utilities/Platform.py +2 -107
- DIRAC/Core/Utilities/Proxy.py +0 -4
- DIRAC/Core/Utilities/ReturnValues.py +7 -252
- DIRAC/Core/Utilities/StateMachine.py +12 -178
- DIRAC/Core/Utilities/Subprocess.py +35 -14
- DIRAC/Core/Utilities/TimeUtilities.py +10 -253
- DIRAC/Core/Utilities/test/Test_JDL.py +0 -3
- DIRAC/Core/Utilities/test/Test_Profiler.py +20 -20
- DIRAC/Core/scripts/dirac_agent.py +1 -1
- DIRAC/Core/scripts/dirac_apptainer_exec.py +72 -46
- DIRAC/Core/scripts/dirac_configure.py +1 -3
- DIRAC/Core/scripts/dirac_install_db.py +24 -6
- DIRAC/Core/scripts/dirac_platform.py +1 -92
- DIRAC/DataManagementSystem/Agent/FTS3Agent.py +8 -7
- DIRAC/DataManagementSystem/Agent/RequestOperations/RemoveFile.py +7 -6
- DIRAC/DataManagementSystem/Client/FTS3Job.py +71 -34
- DIRAC/DataManagementSystem/DB/FTS3DB.py +7 -3
- DIRAC/DataManagementSystem/DB/FileCatalogComponents/DatasetManager/DatasetManager.py +1 -1
- DIRAC/DataManagementSystem/DB/FileCatalogDB.sql +9 -9
- DIRAC/DataManagementSystem/DB/FileCatalogWithFkAndPsDB.sql +9 -9
- DIRAC/DataManagementSystem/Utilities/DMSHelpers.py +6 -2
- DIRAC/DataManagementSystem/scripts/dirac_admin_allow_se.py +13 -8
- DIRAC/DataManagementSystem/scripts/dirac_admin_ban_se.py +13 -8
- DIRAC/DataManagementSystem/scripts/dirac_dms_create_moving_request.py +2 -0
- DIRAC/DataManagementSystem/scripts/dirac_dms_protocol_matrix.py +0 -1
- DIRAC/FrameworkSystem/Client/BundleDeliveryClient.py +2 -7
- DIRAC/FrameworkSystem/Client/ComponentInstaller.py +9 -4
- DIRAC/FrameworkSystem/Client/ProxyManagerClient.py +5 -2
- DIRAC/FrameworkSystem/Client/SystemAdministratorClientCLI.py +11 -6
- DIRAC/FrameworkSystem/ConfigTemplate.cfg +2 -0
- DIRAC/FrameworkSystem/DB/AuthDB.py +3 -3
- DIRAC/FrameworkSystem/DB/InstalledComponentsDB.py +4 -4
- DIRAC/FrameworkSystem/DB/ProxyDB.py +11 -3
- DIRAC/FrameworkSystem/DB/TokenDB.py +1 -1
- DIRAC/FrameworkSystem/Service/ProxyManagerHandler.py +8 -6
- DIRAC/FrameworkSystem/Utilities/MonitoringUtilities.py +2 -19
- DIRAC/FrameworkSystem/Utilities/TokenManagementUtilities.py +3 -2
- DIRAC/FrameworkSystem/Utilities/diracx.py +36 -14
- DIRAC/FrameworkSystem/private/authorization/AuthServer.py +2 -2
- DIRAC/FrameworkSystem/scripts/dirac_admin_update_pilot.py +18 -11
- DIRAC/FrameworkSystem/scripts/dirac_login.py +2 -2
- DIRAC/FrameworkSystem/scripts/dirac_proxy_init.py +7 -8
- DIRAC/Interfaces/API/Dirac.py +27 -15
- DIRAC/Interfaces/API/DiracAdmin.py +45 -17
- DIRAC/Interfaces/API/Job.py +9 -13
- DIRAC/Interfaces/scripts/dirac_admin_allow_site.py +12 -18
- DIRAC/Interfaces/scripts/dirac_admin_ban_site.py +12 -10
- DIRAC/Interfaces/scripts/dirac_admin_get_site_mask.py +4 -13
- DIRAC/Interfaces/scripts/dirac_admin_reset_job.py +3 -6
- DIRAC/Interfaces/scripts/dirac_wms_job_parameters.py +0 -1
- DIRAC/MonitoringSystem/Client/Types/WMSHistory.py +4 -0
- DIRAC/MonitoringSystem/Client/WebAppClient.py +26 -0
- DIRAC/MonitoringSystem/ConfigTemplate.cfg +9 -0
- DIRAC/MonitoringSystem/DB/MonitoringDB.py +6 -25
- DIRAC/MonitoringSystem/Service/MonitoringHandler.py +0 -33
- DIRAC/MonitoringSystem/Service/WebAppHandler.py +599 -0
- DIRAC/MonitoringSystem/private/MainReporter.py +0 -3
- DIRAC/ProductionSystem/DB/ProductionDB.sql +4 -4
- DIRAC/ProductionSystem/scripts/dirac_prod_get.py +2 -2
- DIRAC/ProductionSystem/scripts/dirac_prod_get_all.py +2 -2
- DIRAC/ProductionSystem/scripts/dirac_prod_get_trans.py +2 -3
- DIRAC/RequestManagementSystem/Agent/RequestExecutingAgent.py +8 -6
- DIRAC/RequestManagementSystem/Agent/RequestOperations/ForwardDISET.py +2 -14
- DIRAC/RequestManagementSystem/Client/ReqClient.py +66 -13
- DIRAC/RequestManagementSystem/ConfigTemplate.cfg +6 -6
- DIRAC/RequestManagementSystem/DB/RequestDB.py +10 -5
- DIRAC/RequestManagementSystem/DB/test/RMSTestScenari.py +2 -0
- DIRAC/RequestManagementSystem/private/RequestValidator.py +40 -46
- DIRAC/ResourceStatusSystem/Client/SiteStatus.py +4 -2
- DIRAC/ResourceStatusSystem/Command/FreeDiskSpaceCommand.py +3 -1
- DIRAC/ResourceStatusSystem/DB/ResourceManagementDB.py +8 -8
- DIRAC/ResourceStatusSystem/DB/ResourceStatusDB.py +2 -2
- DIRAC/ResourceStatusSystem/Utilities/CSHelpers.py +2 -31
- DIRAC/ResourceStatusSystem/scripts/dirac_rss_set_status.py +30 -12
- DIRAC/Resources/Catalog/RucioFileCatalogClient.py +195 -1
- DIRAC/Resources/Catalog/test/Test_RucioFileCatalogClient.py +181 -0
- DIRAC/Resources/Computing/AREXComputingElement.py +25 -8
- DIRAC/Resources/Computing/BatchSystems/Condor.py +126 -108
- DIRAC/Resources/Computing/BatchSystems/SLURM.py +5 -1
- DIRAC/Resources/Computing/BatchSystems/test/Test_SLURM.py +46 -0
- DIRAC/Resources/Computing/ComputingElement.py +1 -1
- DIRAC/Resources/Computing/HTCondorCEComputingElement.py +44 -44
- DIRAC/Resources/Computing/InProcessComputingElement.py +4 -2
- DIRAC/Resources/Computing/LocalComputingElement.py +1 -18
- DIRAC/Resources/Computing/SSHBatchComputingElement.py +1 -17
- DIRAC/Resources/Computing/SSHComputingElement.py +1 -18
- DIRAC/Resources/Computing/SingularityComputingElement.py +19 -5
- DIRAC/Resources/Computing/test/Test_HTCondorCEComputingElement.py +67 -49
- DIRAC/Resources/Computing/test/Test_PoolComputingElement.py +2 -1
- DIRAC/Resources/IdProvider/CheckInIdProvider.py +13 -0
- DIRAC/Resources/IdProvider/IdProviderFactory.py +11 -3
- DIRAC/Resources/MessageQueue/StompMQConnector.py +1 -1
- DIRAC/Resources/Storage/GFAL2_StorageBase.py +24 -15
- DIRAC/Resources/Storage/OccupancyPlugins/WLCGAccountingHTTPJson.py +1 -3
- DIRAC/Resources/Storage/StorageBase.py +4 -2
- DIRAC/Resources/Storage/StorageElement.py +6 -7
- DIRAC/StorageManagementSystem/DB/StorageManagementDB.sql +2 -2
- DIRAC/TransformationSystem/Agent/TaskManagerAgentBase.py +10 -16
- DIRAC/TransformationSystem/Agent/TransformationAgent.py +22 -1
- DIRAC/TransformationSystem/Agent/TransformationCleaningAgent.py +16 -16
- DIRAC/TransformationSystem/Client/TaskManager.py +2 -4
- DIRAC/TransformationSystem/Client/Transformation.py +6 -7
- DIRAC/TransformationSystem/Client/TransformationClient.py +21 -11
- DIRAC/TransformationSystem/Client/Utilities.py +9 -0
- DIRAC/TransformationSystem/DB/TransformationDB.py +11 -14
- DIRAC/TransformationSystem/DB/TransformationDB.sql +9 -9
- DIRAC/TransformationSystem/Service/TransformationManagerHandler.py +0 -333
- DIRAC/TransformationSystem/Utilities/ReplicationCLIParameters.py +3 -3
- DIRAC/TransformationSystem/Utilities/TransformationInfo.py +7 -5
- DIRAC/TransformationSystem/scripts/dirac_production_runjoblocal.py +2 -4
- DIRAC/TransformationSystem/test/Test_TransformationInfo.py +22 -15
- DIRAC/TransformationSystem/test/Test_replicationTransformation.py +5 -6
- DIRAC/Workflow/Modules/test/Test_Modules.py +5 -0
- DIRAC/WorkloadManagementSystem/Agent/JobAgent.py +38 -26
- DIRAC/WorkloadManagementSystem/Agent/JobCleaningAgent.py +12 -8
- DIRAC/WorkloadManagementSystem/Agent/PilotSyncAgent.py +4 -3
- DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py +13 -13
- DIRAC/WorkloadManagementSystem/Agent/SiteDirector.py +18 -14
- DIRAC/WorkloadManagementSystem/Agent/StalledJobAgent.py +18 -51
- DIRAC/WorkloadManagementSystem/Agent/StatesAccountingAgent.py +41 -1
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_JobAgent.py +45 -4
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_JobCleaningAgent.py +7 -9
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_PushJobAgent.py +1 -0
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_SiteDirector.py +9 -2
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_StalledJobAgent.py +4 -5
- DIRAC/WorkloadManagementSystem/Client/DownloadInputData.py +9 -9
- DIRAC/WorkloadManagementSystem/Client/InputDataResolution.py +6 -6
- DIRAC/WorkloadManagementSystem/Client/JobMonitoringClient.py +10 -11
- DIRAC/WorkloadManagementSystem/Client/JobReport.py +1 -1
- DIRAC/WorkloadManagementSystem/Client/JobState/CachedJobState.py +3 -0
- DIRAC/WorkloadManagementSystem/Client/JobState/JobManifest.py +32 -261
- DIRAC/WorkloadManagementSystem/Client/JobState/JobState.py +6 -0
- DIRAC/WorkloadManagementSystem/Client/JobStateUpdateClient.py +3 -0
- DIRAC/WorkloadManagementSystem/Client/JobStatus.py +8 -152
- DIRAC/WorkloadManagementSystem/Client/PoolXMLSlice.py +12 -19
- DIRAC/WorkloadManagementSystem/Client/SandboxStoreClient.py +25 -38
- DIRAC/WorkloadManagementSystem/Client/WMSClient.py +2 -3
- DIRAC/WorkloadManagementSystem/Client/test/Test_Client_DownloadInputData.py +29 -0
- DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg +4 -8
- DIRAC/WorkloadManagementSystem/DB/JobDB.py +89 -132
- DIRAC/WorkloadManagementSystem/DB/JobDB.sql +8 -8
- DIRAC/WorkloadManagementSystem/DB/JobDBUtils.py +18 -147
- DIRAC/WorkloadManagementSystem/DB/JobLoggingDB.py +19 -6
- DIRAC/WorkloadManagementSystem/DB/JobParametersDB.py +9 -9
- DIRAC/WorkloadManagementSystem/DB/PilotAgentsDB.py +16 -5
- DIRAC/WorkloadManagementSystem/DB/PilotAgentsDB.sql +3 -3
- DIRAC/WorkloadManagementSystem/DB/SandboxMetadataDB.py +44 -82
- DIRAC/WorkloadManagementSystem/DB/StatusUtils.py +125 -0
- DIRAC/WorkloadManagementSystem/DB/tests/Test_JobDB.py +1 -1
- DIRAC/WorkloadManagementSystem/DB/tests/Test_StatusUtils.py +28 -0
- DIRAC/WorkloadManagementSystem/Executor/JobSanity.py +5 -4
- DIRAC/WorkloadManagementSystem/Executor/JobScheduling.py +4 -0
- DIRAC/WorkloadManagementSystem/FutureClient/JobStateUpdateClient.py +75 -33
- DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapper.py +22 -11
- DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py +9 -10
- DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapper.py +60 -10
- DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapperTemplate.py +4 -0
- DIRAC/WorkloadManagementSystem/Service/JobManagerHandler.py +33 -154
- DIRAC/WorkloadManagementSystem/Service/JobMonitoringHandler.py +5 -323
- DIRAC/WorkloadManagementSystem/Service/JobStateUpdateHandler.py +0 -16
- DIRAC/WorkloadManagementSystem/Service/PilotManagerHandler.py +6 -103
- DIRAC/WorkloadManagementSystem/Service/SandboxStoreHandler.py +7 -53
- DIRAC/WorkloadManagementSystem/Service/WMSAdministratorHandler.py +16 -79
- DIRAC/WorkloadManagementSystem/Service/WMSUtilities.py +4 -18
- DIRAC/WorkloadManagementSystem/Utilities/JobModel.py +28 -209
- DIRAC/WorkloadManagementSystem/Utilities/JobParameters.py +65 -3
- DIRAC/WorkloadManagementSystem/Utilities/JobStatusUtility.py +2 -64
- DIRAC/WorkloadManagementSystem/Utilities/ParametricJob.py +7 -171
- DIRAC/WorkloadManagementSystem/Utilities/PilotCStoJSONSynchronizer.py +73 -7
- DIRAC/WorkloadManagementSystem/Utilities/PilotWrapper.py +41 -11
- DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py +16 -0
- DIRAC/WorkloadManagementSystem/Utilities/Utils.py +36 -1
- DIRAC/WorkloadManagementSystem/Utilities/jobAdministration.py +15 -0
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_JobModel.py +1 -15
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_ParametricJob.py +45 -128
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_PilotWrapper.py +16 -0
- DIRAC/WorkloadManagementSystem/scripts/dirac_jobexec.py +7 -2
- DIRAC/WorkloadManagementSystem/scripts/dirac_wms_pilot_job_info.py +1 -1
- DIRAC/__init__.py +62 -60
- DIRAC/tests/Utilities/testJobDefinitions.py +22 -28
- {DIRAC-9.0.0a42.dist-info → dirac-9.0.7.dist-info}/METADATA +8 -5
- {DIRAC-9.0.0a42.dist-info → dirac-9.0.7.dist-info}/RECORD +229 -228
- {DIRAC-9.0.0a42.dist-info → dirac-9.0.7.dist-info}/WHEEL +1 -1
- {DIRAC-9.0.0a42.dist-info → dirac-9.0.7.dist-info}/entry_points.txt +0 -3
- DIRAC/Core/Utilities/test/Test_List.py +0 -150
- DIRAC/Core/Utilities/test/Test_Time.py +0 -88
- DIRAC/Resources/Computing/PilotBundle.py +0 -70
- DIRAC/TransformationSystem/scripts/dirac_transformation_archive.py +0 -30
- DIRAC/TransformationSystem/scripts/dirac_transformation_clean.py +0 -30
- DIRAC/TransformationSystem/scripts/dirac_transformation_remove_output.py +0 -30
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_JobManager.py +0 -58
- {DIRAC-9.0.0a42.dist-info → dirac-9.0.7.dist-info/licenses}/LICENSE +0 -0
- {DIRAC-9.0.0a42.dist-info → dirac-9.0.7.dist-info}/top_level.txt +0 -0
|
@@ -5,34 +5,36 @@
|
|
|
5
5
|
and the current resource status that is used for matching.
|
|
6
6
|
"""
|
|
7
7
|
import os
|
|
8
|
-
import sys
|
|
9
8
|
import re
|
|
9
|
+
import sys
|
|
10
10
|
import time
|
|
11
|
+
from pathlib import Path
|
|
11
12
|
|
|
12
13
|
from diraccfg import CFG
|
|
13
14
|
|
|
14
|
-
from DIRAC import
|
|
15
|
+
from DIRAC import S_ERROR, S_OK, gConfig, rootPath, siteName
|
|
15
16
|
from DIRAC.ConfigurationSystem.Client.Helpers.Registry import getDNForUsername
|
|
16
|
-
from DIRAC.Core.Utilities.ClassAd.ClassAdLight import ClassAd
|
|
17
17
|
from DIRAC.Core.Base.AgentModule import AgentModule
|
|
18
|
-
from DIRAC.Core.Security.ProxyInfo import getProxyInfo
|
|
19
18
|
from DIRAC.Core.Security import Properties
|
|
19
|
+
from DIRAC.Core.Security.ProxyFile import writeChainToTemporaryFile
|
|
20
|
+
from DIRAC.Core.Security.ProxyInfo import getProxyInfo
|
|
20
21
|
from DIRAC.Core.Utilities import DErrno
|
|
22
|
+
from DIRAC.Core.Utilities.CGroups2 import CG2Manager
|
|
23
|
+
from DIRAC.Core.Utilities.ClassAd.ClassAdLight import ClassAd
|
|
21
24
|
from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader
|
|
22
25
|
from DIRAC.FrameworkSystem.Client.ProxyManagerClient import gProxyManager
|
|
23
|
-
from DIRAC.Resources.Computing.BatchSystems.TimeLeft.TimeLeft import TimeLeft
|
|
24
|
-
from DIRAC.Resources.Computing.ComputingElementFactory import ComputingElementFactory
|
|
25
|
-
from DIRAC.RequestManagementSystem.Client.Request import Request
|
|
26
26
|
from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient
|
|
27
|
+
from DIRAC.RequestManagementSystem.Client.Request import Request
|
|
27
28
|
from DIRAC.RequestManagementSystem.private.RequestValidator import RequestValidator
|
|
28
|
-
from DIRAC.
|
|
29
|
-
from DIRAC.
|
|
29
|
+
from DIRAC.Resources.Computing.BatchSystems.TimeLeft.TimeLeft import TimeLeft
|
|
30
|
+
from DIRAC.Resources.Computing.ComputingElementFactory import ComputingElementFactory
|
|
31
|
+
from DIRAC.WorkloadManagementSystem.Client import JobStatus, PilotStatus
|
|
30
32
|
from DIRAC.WorkloadManagementSystem.Client.JobManagerClient import JobManagerClient
|
|
31
33
|
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
|
|
32
34
|
from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport
|
|
33
|
-
from DIRAC.WorkloadManagementSystem.Client import
|
|
35
|
+
from DIRAC.WorkloadManagementSystem.Client.MatcherClient import MatcherClient
|
|
36
|
+
from DIRAC.WorkloadManagementSystem.Client.PilotManagerClient import PilotManagerClient
|
|
34
37
|
from DIRAC.WorkloadManagementSystem.Utilities.Utils import createJobWrapper
|
|
35
|
-
from DIRAC.WorkloadManagementSystem.Client import PilotStatus
|
|
36
38
|
|
|
37
39
|
|
|
38
40
|
class JobAgent(AgentModule):
|
|
@@ -134,6 +136,14 @@ class JobAgent(AgentModule):
|
|
|
134
136
|
|
|
135
137
|
# Utilities
|
|
136
138
|
self.timeLeftUtil = TimeLeft()
|
|
139
|
+
|
|
140
|
+
# Some innerCEs may want to make use of CGroup2 support, so we prepare it globally here
|
|
141
|
+
res = CG2Manager().setUp()
|
|
142
|
+
if res["OK"]:
|
|
143
|
+
self.log.info("CGroup2 support configured successfully.")
|
|
144
|
+
else:
|
|
145
|
+
self.log.info("CGroup2 support unavailable:", res["Message"])
|
|
146
|
+
|
|
137
147
|
return S_OK()
|
|
138
148
|
|
|
139
149
|
def _initializeComputingElement(self, localCE):
|
|
@@ -226,7 +236,6 @@ class JobAgent(AgentModule):
|
|
|
226
236
|
jobGroup = matcherInfo["Group"]
|
|
227
237
|
owner = matcherInfo["Owner"]
|
|
228
238
|
ceDict = matcherInfo["CEDict"]
|
|
229
|
-
matchTime = matcherInfo["matchTime"]
|
|
230
239
|
|
|
231
240
|
optimizerParams = {}
|
|
232
241
|
for key in matcherInfo:
|
|
@@ -253,9 +262,6 @@ class JobAgent(AgentModule):
|
|
|
253
262
|
self.log.verbose("Job request successful: \n", jobRequest["Value"])
|
|
254
263
|
self.log.info("Received", f"JobID={jobID}, JobType={jobType}, Owner={owner}, JobGroup={jobGroup}")
|
|
255
264
|
self.jobCount += 1
|
|
256
|
-
self.jobs[jobID]["JobReport"].setJobParameter(
|
|
257
|
-
par_name="MatcherServiceTime", par_value=str(matchTime), sendFlag=False
|
|
258
|
-
)
|
|
259
265
|
|
|
260
266
|
self.jobs[jobID]["JobReport"].setJobStatus(minorStatus="Job Received by Agent", sendFlag=False)
|
|
261
267
|
result_setupProxy = self._setupProxy(owner, jobGroup)
|
|
@@ -476,8 +482,6 @@ class JobAgent(AgentModule):
|
|
|
476
482
|
|
|
477
483
|
proxyChain = ret["Value"]["chain"]
|
|
478
484
|
if "groupProperties" not in ret["Value"]:
|
|
479
|
-
print(ret["Value"])
|
|
480
|
-
print(proxyChain.dumpAllToString())
|
|
481
485
|
self.log.error("Invalid Proxy", "Group has no properties defined")
|
|
482
486
|
return S_ERROR("Proxy has no group properties defined")
|
|
483
487
|
|
|
@@ -539,7 +543,7 @@ class JobAgent(AgentModule):
|
|
|
539
543
|
jobRequest = MatcherClient().requestJob(ceDict)
|
|
540
544
|
matchTime = time.time() - start
|
|
541
545
|
|
|
542
|
-
self.log.
|
|
546
|
+
self.log.verbose("MatcherTime", f"= {matchTime:.2f} (s)")
|
|
543
547
|
if jobRequest["OK"]:
|
|
544
548
|
jobRequest["Value"]["matchTime"] = matchTime
|
|
545
549
|
jobRequest["Value"]["CEDict"] = ceDict
|
|
@@ -626,13 +630,15 @@ class JobAgent(AgentModule):
|
|
|
626
630
|
|
|
627
631
|
self.log.info("Submitting JobWrapper", f"{os.path.basename(wrapperFile)} to {self.ceName}CE")
|
|
628
632
|
|
|
629
|
-
# Pass proxy to the CE
|
|
630
|
-
|
|
631
|
-
if not
|
|
632
|
-
self.log.error("Invalid proxy",
|
|
633
|
-
return S_ERROR("
|
|
633
|
+
# Pass proxy to the CE, writing it to a temporary file to ensure the DiracX token is included
|
|
634
|
+
retVal = writeChainToTemporaryFile(proxyChain)
|
|
635
|
+
if not retVal["OK"]:
|
|
636
|
+
self.log.error("Invalid proxy", retVal["Message"])
|
|
637
|
+
return S_ERROR("Failed to write proxy to temporary file")
|
|
638
|
+
proxyLocation = Path(retVal["Value"])
|
|
639
|
+
payloadProxy = proxyLocation.read_text()
|
|
640
|
+
proxyLocation.unlink()
|
|
634
641
|
|
|
635
|
-
payloadProxy = proxy["Value"]
|
|
636
642
|
try:
|
|
637
643
|
result = self.computingElement.submitJob(
|
|
638
644
|
wrapperFile,
|
|
@@ -651,7 +657,7 @@ class JobAgent(AgentModule):
|
|
|
651
657
|
self.log.exception("Exception occurred when submitting", f"JobID: {jobID}")
|
|
652
658
|
taskID = 0
|
|
653
659
|
# We create a S_ERROR from the exception to compute it as a normal error
|
|
654
|
-
self.computingElement.taskResults[taskID] = S_ERROR(unexpectedSubmitException)
|
|
660
|
+
self.computingElement.taskResults[taskID] = S_ERROR(str(unexpectedSubmitException))
|
|
655
661
|
self.jobs[jobID]["TaskID"] = taskID
|
|
656
662
|
return S_OK()
|
|
657
663
|
|
|
@@ -683,7 +689,13 @@ class JobAgent(AgentModule):
|
|
|
683
689
|
# Here we iterate over a copy of the keys because we are modifying the dictionary within the loop
|
|
684
690
|
for jobID in list(self.jobs.keys()):
|
|
685
691
|
taskID = self.jobs[jobID].get("TaskID")
|
|
686
|
-
if taskID is None
|
|
692
|
+
if taskID is None:
|
|
693
|
+
# This generally means that there was an error before the submission
|
|
694
|
+
# and the TaskID was not set and will never be.
|
|
695
|
+
self.log.info("No taskID found for job", jobID)
|
|
696
|
+
del self.jobs[jobID]
|
|
697
|
+
continue
|
|
698
|
+
if taskID not in self.computingElement.taskResults:
|
|
687
699
|
continue
|
|
688
700
|
|
|
689
701
|
result = self.computingElement.taskResults[taskID]
|
|
@@ -35,10 +35,12 @@ from DIRAC.RequestManagementSystem.Client.Operation import Operation
|
|
|
35
35
|
from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient
|
|
36
36
|
from DIRAC.RequestManagementSystem.Client.Request import Request
|
|
37
37
|
from DIRAC.WorkloadManagementSystem.Client import JobStatus
|
|
38
|
-
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
|
|
39
|
-
from DIRAC.WorkloadManagementSystem.Client.SandboxStoreClient import SandboxStoreClient
|
|
40
38
|
from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
|
|
41
39
|
from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
|
|
40
|
+
from DIRAC.WorkloadManagementSystem.DB.SandboxMetadataDB import SandboxMetadataDB
|
|
41
|
+
from DIRAC.WorkloadManagementSystem.Service.JobPolicy import RIGHT_DELETE
|
|
42
|
+
from DIRAC.WorkloadManagementSystem.DB.StatusUtils import kill_delete_jobs
|
|
43
|
+
from DIRAC.WorkloadManagementSystem.Utilities.JobParameters import getJobParameters
|
|
42
44
|
|
|
43
45
|
|
|
44
46
|
class JobCleaningAgent(AgentModule):
|
|
@@ -152,8 +154,9 @@ class JobCleaningAgent(AgentModule):
|
|
|
152
154
|
return S_OK()
|
|
153
155
|
|
|
154
156
|
self.log.info("Unassigning sandboxes from soon to be deleted jobs", f"({len(jobList)})")
|
|
155
|
-
|
|
156
|
-
|
|
157
|
+
|
|
158
|
+
entitiesList = [f"Job:{jobId}" for jobId in jobList]
|
|
159
|
+
if not (result := SandboxMetadataDB().unassignEntities(entitiesList))["OK"]:
|
|
157
160
|
self.log.error("Cannot unassign jobs to sandboxes", result["Message"])
|
|
158
161
|
return result
|
|
159
162
|
|
|
@@ -229,14 +232,14 @@ class JobCleaningAgent(AgentModule):
|
|
|
229
232
|
if not res["OK"]:
|
|
230
233
|
self.log.error("No DN found", f"for {user}")
|
|
231
234
|
return res
|
|
232
|
-
wmsClient = WMSClient(useCertificates=True, delegatedDN=res["Value"][0], delegatedGroup=ownerGroup)
|
|
233
235
|
if remove:
|
|
236
|
+
wmsClient = WMSClient(useCertificates=True, delegatedDN=res["Value"][0], delegatedGroup=ownerGroup)
|
|
234
237
|
result = wmsClient.removeJob(jobsList)
|
|
235
238
|
else:
|
|
236
|
-
result =
|
|
239
|
+
result = kill_delete_jobs(RIGHT_DELETE, jobsList)
|
|
237
240
|
if not result["OK"]:
|
|
238
241
|
self.log.error(
|
|
239
|
-
"Could not {'remove' if remove else 'delete'} jobs",
|
|
242
|
+
f"Could not {'remove' if remove else 'delete'} jobs",
|
|
240
243
|
f"for {user} : {ownerGroup} (n={len(jobsList)}) : {result['Message']}",
|
|
241
244
|
)
|
|
242
245
|
fail = True
|
|
@@ -293,7 +296,8 @@ class JobCleaningAgent(AgentModule):
|
|
|
293
296
|
failed = {}
|
|
294
297
|
successful = {}
|
|
295
298
|
|
|
296
|
-
|
|
299
|
+
jobIDs = [int(jobID) for jobID in jobIDList]
|
|
300
|
+
result = getJobParameters(jobIDs, "OutputSandboxLFN")
|
|
297
301
|
if not result["OK"]:
|
|
298
302
|
return result
|
|
299
303
|
osLFNDict = result["Value"]
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""This agent syncs CS and pilot files to a web server of your choice
|
|
2
2
|
|
|
3
3
|
.. literalinclude:: ../ConfigTemplate.cfg
|
|
4
4
|
:start-after: ##BEGIN PilotSyncAgent
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
:caption: PilotsSyncAgent options
|
|
8
8
|
|
|
9
9
|
"""
|
|
10
|
+
|
|
10
11
|
import os
|
|
11
12
|
import json
|
|
12
13
|
import shutil
|
|
@@ -38,8 +39,8 @@ class PilotSyncAgent(AgentModule):
|
|
|
38
39
|
self.workingDirectory = self.am_getOption("WorkDirectory")
|
|
39
40
|
self.saveDir = self.am_getOption("SaveDirectory", self.saveDir)
|
|
40
41
|
self.uploadLocations = self.am_getOption("UploadLocations", self.uploadLocations)
|
|
41
|
-
includeMasterCS = self.am_getOption("IncludeMasterCS", self.includeMasterCS)
|
|
42
|
-
if isinstance(includeMasterCS, str) and includeMasterCS.lower() in ["n", "no", "false"]:
|
|
42
|
+
self.includeMasterCS = self.am_getOption("IncludeMasterCS", self.includeMasterCS)
|
|
43
|
+
if isinstance(self.includeMasterCS, str) and self.includeMasterCS.lower() in ["n", "no", "false"]:
|
|
43
44
|
self.includeMasterCS = False
|
|
44
45
|
|
|
45
46
|
self.certAndKeyLocation = getHostCertificateAndKeyLocation()
|
|
@@ -12,16 +12,14 @@
|
|
|
12
12
|
import hashlib
|
|
13
13
|
import json
|
|
14
14
|
import os
|
|
15
|
-
from pathlib import Path
|
|
16
15
|
import random
|
|
17
16
|
import shutil
|
|
18
17
|
import sys
|
|
19
|
-
from collections import defaultdict
|
|
20
18
|
import time
|
|
19
|
+
from collections import defaultdict
|
|
20
|
+
from pathlib import Path
|
|
21
21
|
|
|
22
|
-
from
|
|
23
|
-
|
|
24
|
-
from DIRAC import gConfig, S_OK, S_ERROR
|
|
22
|
+
from DIRAC import S_ERROR, S_OK, gConfig
|
|
25
23
|
from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
|
|
26
24
|
from DIRAC.ConfigurationSystem.Client.Helpers.Resources import getQueues
|
|
27
25
|
from DIRAC.Core.Utilities import DErrno
|
|
@@ -30,6 +28,7 @@ from DIRAC.Core.Utilities.Proxy import executeWithUserProxy
|
|
|
30
28
|
from DIRAC.Core.Utilities.Version import getVersion
|
|
31
29
|
from DIRAC.FrameworkSystem.Client.ProxyManagerClient import gProxyManager
|
|
32
30
|
from DIRAC.Resources.Computing import ComputingElement
|
|
31
|
+
from DIRAC.WorkloadManagementSystem.Agent.JobAgent import JobAgent
|
|
33
32
|
from DIRAC.WorkloadManagementSystem.Client import JobMinorStatus, JobStatus, PilotStatus
|
|
34
33
|
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
|
|
35
34
|
from DIRAC.WorkloadManagementSystem.Client.JobReport import JobReport
|
|
@@ -41,11 +40,10 @@ from DIRAC.WorkloadManagementSystem.JobWrapper.JobWrapperUtilities import (
|
|
|
41
40
|
resolveInputData,
|
|
42
41
|
transferInputSandbox,
|
|
43
42
|
)
|
|
44
|
-
from DIRAC.WorkloadManagementSystem.Utilities.QueueUtilities import getQueuesResolved
|
|
45
|
-
from DIRAC.WorkloadManagementSystem.Agent.JobAgent import JobAgent
|
|
46
|
-
from DIRAC.WorkloadManagementSystem.Utilities.Utils import createJobWrapper
|
|
47
43
|
from DIRAC.WorkloadManagementSystem.private.ConfigHelper import findGenericPilotCredentials
|
|
44
|
+
from DIRAC.WorkloadManagementSystem.Utilities.JobParameters import getJobParameters
|
|
48
45
|
from DIRAC.WorkloadManagementSystem.Utilities.QueueUtilities import getQueuesResolved
|
|
46
|
+
from DIRAC.WorkloadManagementSystem.Utilities.Utils import createJobWrapper
|
|
49
47
|
|
|
50
48
|
MAX_JOBS_MANAGED = 100
|
|
51
49
|
|
|
@@ -230,6 +228,12 @@ class PushJobAgent(JobAgent):
|
|
|
230
228
|
return result
|
|
231
229
|
pilotProxy = result["Value"]
|
|
232
230
|
|
|
231
|
+
# Dump the proxy to a file to get DiracX token (it's later used by DiracX)
|
|
232
|
+
result = gProxyManager.dumpProxyToFile(pilotProxy)
|
|
233
|
+
if not result["OK"]:
|
|
234
|
+
return result
|
|
235
|
+
os.environ["X509_USER_PROXY"] = result["Value"]
|
|
236
|
+
|
|
233
237
|
for queueName, queueDictionary in queueDictItems:
|
|
234
238
|
# Make sure there is no problem with the queue before trying to submit
|
|
235
239
|
if not self._allowedToSubmit(queueName):
|
|
@@ -281,7 +285,6 @@ class PushJobAgent(JobAgent):
|
|
|
281
285
|
jobGroup = matcherInfo["Group"]
|
|
282
286
|
owner = matcherInfo["Owner"]
|
|
283
287
|
ceDict = matcherInfo["CEDict"]
|
|
284
|
-
matchTime = matcherInfo["matchTime"]
|
|
285
288
|
|
|
286
289
|
optimizerParams = {}
|
|
287
290
|
for key in matcherInfo:
|
|
@@ -309,9 +312,6 @@ class PushJobAgent(JobAgent):
|
|
|
309
312
|
self.log.verbose("Job request successful: \n", jobRequest["Value"])
|
|
310
313
|
self.log.info("Received", f"JobID={jobID}, JobType={jobType}, Owner={owner}, JobGroup={jobGroup}")
|
|
311
314
|
|
|
312
|
-
self.jobs[jobID]["JobReport"].setJobParameter(
|
|
313
|
-
par_name="MatcherServiceTime", par_value=str(matchTime), sendFlag=False
|
|
314
|
-
)
|
|
315
315
|
self.jobs[jobID]["JobReport"].setJobStatus(
|
|
316
316
|
status=JobStatus.MATCHED, minorStatus="Job Received by Agent", sendFlag=False
|
|
317
317
|
)
|
|
@@ -734,7 +734,7 @@ class PushJobAgent(JobAgent):
|
|
|
734
734
|
return S_OK()
|
|
735
735
|
|
|
736
736
|
# Get their parameters
|
|
737
|
-
if not (result :=
|
|
737
|
+
if not (result := getJobParameters(jobs, ["GridCE", "TaskID", "Stamp"]))["OK"]:
|
|
738
738
|
self.log.error("Failed to get the list of taskIDs", result["Message"])
|
|
739
739
|
return result
|
|
740
740
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""The Site Director is an agent performing pilot job submission to particular sites/Computing Elements.
|
|
2
2
|
|
|
3
3
|
.. literalinclude:: ../ConfigTemplate.cfg
|
|
4
4
|
:start-after: ##BEGIN SiteDirector
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
:caption: SiteDirector options
|
|
8
8
|
|
|
9
9
|
"""
|
|
10
|
+
|
|
10
11
|
import datetime
|
|
11
12
|
import os
|
|
12
13
|
from collections import defaultdict
|
|
@@ -14,7 +15,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
14
15
|
from typing import Any
|
|
15
16
|
|
|
16
17
|
import DIRAC
|
|
17
|
-
from DIRAC import S_ERROR, S_OK
|
|
18
|
+
from DIRAC import S_ERROR, S_OK
|
|
18
19
|
from DIRAC.AccountingSystem.Client.DataStoreClient import gDataStoreClient
|
|
19
20
|
from DIRAC.AccountingSystem.Client.Types.Pilot import Pilot as PilotAccounting
|
|
20
21
|
from DIRAC.AccountingSystem.Client.Types.PilotSubmission import (
|
|
@@ -147,10 +148,10 @@ class SiteDirector(AgentModule):
|
|
|
147
148
|
self.sendSubmissionAccounting = True
|
|
148
149
|
|
|
149
150
|
# Get the site description dictionary
|
|
150
|
-
siteNames = self.am_getOption("Site"
|
|
151
|
-
ceTypes = self.am_getOption("CETypes"
|
|
152
|
-
ces = self.am_getOption("CEs"
|
|
153
|
-
tags = self.am_getOption("Tags"
|
|
151
|
+
siteNames = self.am_getOption("Site")
|
|
152
|
+
ceTypes = self.am_getOption("CETypes")
|
|
153
|
+
ces = self.am_getOption("CEs")
|
|
154
|
+
tags = self.am_getOption("Tags")
|
|
154
155
|
|
|
155
156
|
# Display options used
|
|
156
157
|
self.log.always("VO:", self.vo)
|
|
@@ -168,7 +169,7 @@ class SiteDirector(AgentModule):
|
|
|
168
169
|
self.log.always("MaxPilotsToSubmit:", self.maxPilotsToSubmit)
|
|
169
170
|
|
|
170
171
|
# Build the dictionary of queues that are going to be used: self.queueDict
|
|
171
|
-
if not (result := self._buildQueueDict(siteNames,
|
|
172
|
+
if not (result := self._buildQueueDict(siteNames, ces, ceTypes, tags))["OK"]:
|
|
172
173
|
return result
|
|
173
174
|
|
|
174
175
|
# Stop the execution if there is no usable queue
|
|
@@ -229,12 +230,8 @@ class SiteDirector(AgentModule):
|
|
|
229
230
|
site = self.queueDict[queueName]["Site"]
|
|
230
231
|
ce = self.queueDict[queueName]["CEName"]
|
|
231
232
|
|
|
232
|
-
# Check the status of the Site
|
|
233
|
-
if site in siteMaskList:
|
|
234
|
-
continue
|
|
235
|
-
|
|
236
|
-
# Check the status of the CE (only for RSS=Active)
|
|
237
|
-
if ce not in ceMaskList:
|
|
233
|
+
# Check the status of the Site and CE
|
|
234
|
+
if site in siteMaskList and ce in ceMaskList:
|
|
238
235
|
continue
|
|
239
236
|
|
|
240
237
|
self.log.warn("Queue not considered because not usable:", queueName)
|
|
@@ -580,7 +577,7 @@ class SiteDirector(AgentModule):
|
|
|
580
577
|
pilotOptions = []
|
|
581
578
|
|
|
582
579
|
pilotOptions = " ".join(pilotOptions)
|
|
583
|
-
self.log.verbose(f"
|
|
580
|
+
self.log.verbose(f"{pilotOptions=}")
|
|
584
581
|
|
|
585
582
|
# if a global workingDirectory is defined for the CEType (like HTCondor)
|
|
586
583
|
# use it (otherwise the __cleanup done by HTCondor will be in the wrong folder !)
|
|
@@ -624,6 +621,11 @@ class SiteDirector(AgentModule):
|
|
|
624
621
|
else:
|
|
625
622
|
self.log.info("DIRAC project will be installed by pilots")
|
|
626
623
|
|
|
624
|
+
# Architecture script to use
|
|
625
|
+
architectureScript = opsHelper.getValue("Pilot/ArchitectureScript", "")
|
|
626
|
+
if architectureScript:
|
|
627
|
+
pilotOptions.append(f"--architectureScript={architectureScript}")
|
|
628
|
+
|
|
627
629
|
# Preinstalled environment or list of CVMFS locations defined ?
|
|
628
630
|
preinstalledEnv = opsHelper.getValue("Pilot/PreinstalledEnv", "")
|
|
629
631
|
preinstalledEnvPrefix = opsHelper.getValue("Pilot/PreinstalledEnvPrefix", "")
|
|
@@ -695,6 +697,8 @@ class SiteDirector(AgentModule):
|
|
|
695
697
|
:returns: file name of the pilot wrapper created
|
|
696
698
|
"""
|
|
697
699
|
|
|
700
|
+
pilotFilesCompressedEncodedDict = None
|
|
701
|
+
|
|
698
702
|
try:
|
|
699
703
|
pilotFilesCompressedEncodedDict = getPilotFilesCompressedEncodedDict([], proxy)
|
|
700
704
|
except Exception as be:
|
|
@@ -14,18 +14,18 @@ import datetime
|
|
|
14
14
|
from DIRAC import S_ERROR, S_OK, gConfig
|
|
15
15
|
from DIRAC.AccountingSystem.Client.Types.Job import Job
|
|
16
16
|
from DIRAC.ConfigurationSystem.Client.Helpers import cfgPath
|
|
17
|
-
from DIRAC.ConfigurationSystem.Client.Helpers.Registry import getDNForUsername
|
|
18
17
|
from DIRAC.Core.Base.AgentModule import AgentModule
|
|
19
18
|
from DIRAC.Core.Utilities import DErrno
|
|
20
19
|
from DIRAC.Core.Utilities.ClassAd.ClassAdLight import ClassAd
|
|
21
20
|
from DIRAC.Core.Utilities.TimeUtilities import fromString, second, toEpoch
|
|
22
21
|
from DIRAC.WorkloadManagementSystem.Client import JobMinorStatus, JobStatus
|
|
23
|
-
from DIRAC.WorkloadManagementSystem.Client.JobManagerClient import JobManagerClient
|
|
24
|
-
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
|
|
25
|
-
from DIRAC.WorkloadManagementSystem.Client.PilotManagerClient import PilotManagerClient
|
|
26
|
-
from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
|
|
27
22
|
from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
|
|
28
23
|
from DIRAC.WorkloadManagementSystem.DB.JobLoggingDB import JobLoggingDB
|
|
24
|
+
from DIRAC.WorkloadManagementSystem.DB.PilotAgentsDB import PilotAgentsDB
|
|
25
|
+
from DIRAC.WorkloadManagementSystem.Service.JobPolicy import RIGHT_KILL
|
|
26
|
+
from DIRAC.WorkloadManagementSystem.DB.StatusUtils import kill_delete_jobs
|
|
27
|
+
from DIRAC.WorkloadManagementSystem.Utilities.JobParameters import getJobParameters
|
|
28
|
+
from DIRAC.WorkloadManagementSystem.Utilities.Utils import rescheduleJobs
|
|
29
29
|
|
|
30
30
|
|
|
31
31
|
class StalledJobAgent(AgentModule):
|
|
@@ -235,7 +235,7 @@ class StalledJobAgent(AgentModule):
|
|
|
235
235
|
# Set the jobs Failed, send them a kill signal in case they are not really dead
|
|
236
236
|
# and send accounting info
|
|
237
237
|
if setFailed:
|
|
238
|
-
res =
|
|
238
|
+
res = kill_delete_jobs(RIGHT_KILL, [jobID], nonauthJobList=[], force=True)
|
|
239
239
|
if not res["OK"]:
|
|
240
240
|
self.log.error("Failed to kill job", jobID)
|
|
241
241
|
|
|
@@ -254,15 +254,15 @@ class StalledJobAgent(AgentModule):
|
|
|
254
254
|
|
|
255
255
|
def _getJobPilotStatus(self, jobID):
|
|
256
256
|
"""Get the job pilot status."""
|
|
257
|
-
result =
|
|
257
|
+
result = getJobParameters([jobID], "Pilot_Reference")
|
|
258
258
|
if not result["OK"]:
|
|
259
259
|
return result
|
|
260
|
-
pilotReference = result["Value"].get("Pilot_Reference"
|
|
261
|
-
if pilotReference
|
|
260
|
+
pilotReference = result["Value"].get("Pilot_Reference")
|
|
261
|
+
if not pilotReference:
|
|
262
262
|
# There is no pilot reference, hence its status is unknown
|
|
263
263
|
return S_OK("NoPilot")
|
|
264
264
|
|
|
265
|
-
result =
|
|
265
|
+
result = PilotAgentsDB().getPilotInfo(pilotReference)
|
|
266
266
|
if not result["OK"]:
|
|
267
267
|
if DErrno.cmpError(result, DErrno.EWMSNOPILOT):
|
|
268
268
|
self.log.warn("No pilot found", f"for job {jobID}: {result['Message']}")
|
|
@@ -389,11 +389,11 @@ class StalledJobAgent(AgentModule):
|
|
|
389
389
|
if lastHeartBeatTime is not None and lastHeartBeatTime > endTime:
|
|
390
390
|
endTime = lastHeartBeatTime
|
|
391
391
|
|
|
392
|
-
result =
|
|
393
|
-
if not result["OK"] or not result["Value"]:
|
|
392
|
+
result = getJobParameters([jobID], "CPUNormalizationFactor")
|
|
393
|
+
if not result["OK"] or not result["Value"] or not result["Value"].get("CPUNormalizationFactor"):
|
|
394
394
|
self.log.error(
|
|
395
395
|
"Error getting Job Parameter CPUNormalizationFactor, setting 0",
|
|
396
|
-
result.get("Message"
|
|
396
|
+
result.get("Message"),
|
|
397
397
|
)
|
|
398
398
|
cpuNormalization = 0.0
|
|
399
399
|
else:
|
|
@@ -518,8 +518,7 @@ class StalledJobAgent(AgentModule):
|
|
|
518
518
|
return startTime, endTime
|
|
519
519
|
|
|
520
520
|
def _kickStuckJobs(self):
|
|
521
|
-
"""Reschedule jobs stuck in initialization status Rescheduled,
|
|
522
|
-
Matched."""
|
|
521
|
+
"""Reschedule jobs stuck in initialization status Rescheduled, Matched."""
|
|
523
522
|
|
|
524
523
|
message = ""
|
|
525
524
|
|
|
@@ -530,17 +529,12 @@ class StalledJobAgent(AgentModule):
|
|
|
530
529
|
return result
|
|
531
530
|
|
|
532
531
|
jobIDs = result["Value"]
|
|
533
|
-
jobManagerClient = JobManagerClient()
|
|
534
532
|
if jobIDs:
|
|
535
533
|
self.log.info(f"Rescheduling {len(jobIDs)} jobs stuck in {JobStatus.MATCHED} status")
|
|
536
|
-
result =
|
|
534
|
+
result = rescheduleJobs(jobIDs)
|
|
537
535
|
if not result["OK"]:
|
|
538
536
|
message = f"Failed to reschedule jobs stuck in {JobStatus.MATCHED} status"
|
|
539
537
|
message += "\n" + result["Message"]
|
|
540
|
-
if "InvalidJobIDs" in result:
|
|
541
|
-
message += "\n" + "\tInvalid job IDs: " + str(result["InvalidJobIDs"])
|
|
542
|
-
if "NonauthorizedJobIDs" in result:
|
|
543
|
-
message += "\n" + "\tNon authorized job IDs: " + str(result["NonauthorizedJobIDs"])
|
|
544
538
|
|
|
545
539
|
checkTime = datetime.datetime.utcnow() - self.rescheduledTime * second
|
|
546
540
|
result = self.jobDB.selectJobs({"Status": JobStatus.RESCHEDULED}, older=checkTime)
|
|
@@ -550,18 +544,14 @@ class StalledJobAgent(AgentModule):
|
|
|
550
544
|
|
|
551
545
|
jobIDs = result["Value"]
|
|
552
546
|
if jobIDs:
|
|
553
|
-
self.log.info(f"Rescheduling {len(jobIDs)} jobs stuck in
|
|
554
|
-
result =
|
|
547
|
+
self.log.info(f"Rescheduling {len(jobIDs)} jobs stuck in {JobStatus.RESCHEDULED} status")
|
|
548
|
+
result = rescheduleJobs(jobIDs)
|
|
555
549
|
if not result["OK"]:
|
|
556
550
|
message = f"Failed to reschedule jobs stuck in {JobStatus.RESCHEDULED} status"
|
|
557
551
|
message += "\n" + result["Message"]
|
|
558
|
-
if "InvalidJobIDs" in result:
|
|
559
|
-
message += "\n" + "\tInvalid job IDs: " + str(result["InvalidJobIDs"])
|
|
560
|
-
if "NonauthorizedJobIDs" in result:
|
|
561
|
-
message += "\n" + "\tNon authorized job IDs: " + str(result["NonauthorizedJobIDs"])
|
|
562
552
|
|
|
563
553
|
if message:
|
|
564
|
-
|
|
554
|
+
self.log.error(message)
|
|
565
555
|
return S_OK()
|
|
566
556
|
|
|
567
557
|
def _failSubmittingJobs(self):
|
|
@@ -584,26 +574,3 @@ class StalledJobAgent(AgentModule):
|
|
|
584
574
|
continue
|
|
585
575
|
|
|
586
576
|
return S_OK()
|
|
587
|
-
|
|
588
|
-
def _sendKillCommand(self, job):
|
|
589
|
-
"""Send a kill signal to the job such that it cannot continue running.
|
|
590
|
-
|
|
591
|
-
:param int job: ID of job to send kill command
|
|
592
|
-
"""
|
|
593
|
-
|
|
594
|
-
res = self.jobDB.getJobAttribute(job, "Owner")
|
|
595
|
-
if not res["OK"]:
|
|
596
|
-
return res
|
|
597
|
-
owner = res["Value"]
|
|
598
|
-
|
|
599
|
-
res = self.jobDB.getJobAttribute(job, "OwnerGroup")
|
|
600
|
-
if not res["OK"]:
|
|
601
|
-
return res
|
|
602
|
-
ownerGroup = res["Value"]
|
|
603
|
-
|
|
604
|
-
wmsClient = WMSClient(
|
|
605
|
-
useCertificates=True,
|
|
606
|
-
delegatedDN=getDNForUsername(owner)["Value"][0] if owner else None,
|
|
607
|
-
delegatedGroup=ownerGroup,
|
|
608
|
-
)
|
|
609
|
-
return wmsClient.killJob(job)
|
|
@@ -9,10 +9,11 @@
|
|
|
9
9
|
"""
|
|
10
10
|
import datetime
|
|
11
11
|
|
|
12
|
-
from DIRAC import S_ERROR, S_OK
|
|
12
|
+
from DIRAC import S_ERROR, S_OK, gConfig
|
|
13
13
|
from DIRAC.AccountingSystem.Client.DataStoreClient import DataStoreClient
|
|
14
14
|
from DIRAC.AccountingSystem.Client.Types.WMSHistory import WMSHistory
|
|
15
15
|
from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
|
|
16
|
+
from DIRAC.ConfigurationSystem.Client.Helpers.Resources import getSites
|
|
16
17
|
from DIRAC.Core.Base.AgentModule import AgentModule
|
|
17
18
|
from DIRAC.Core.Utilities import TimeUtilities
|
|
18
19
|
from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter
|
|
@@ -77,6 +78,8 @@ class StatesAccountingAgent(AgentModule):
|
|
|
77
78
|
def execute(self):
|
|
78
79
|
"""Main execution method"""
|
|
79
80
|
|
|
81
|
+
site_metadata = self._getSitesMetadata()
|
|
82
|
+
|
|
80
83
|
# on the first iteration of the agent, do nothing in order to avoid double committing after a restart
|
|
81
84
|
if self.am_getModuleParam("cyclesDone") == 0:
|
|
82
85
|
self.log.notice("Skipping the first iteration of the agent")
|
|
@@ -131,6 +134,16 @@ class StatesAccountingAgent(AgentModule):
|
|
|
131
134
|
|
|
132
135
|
for backend in self.datastores:
|
|
133
136
|
if backend.lower() == "monitoring":
|
|
137
|
+
site_name = rD["Site"]
|
|
138
|
+
if site_name not in site_metadata:
|
|
139
|
+
self.log.warn(
|
|
140
|
+
f"Site {site_name} not found in site metadata, using default values",
|
|
141
|
+
)
|
|
142
|
+
rD["Tier"] = "4"
|
|
143
|
+
rD["Type"] = site_name.split(".")[0]
|
|
144
|
+
else:
|
|
145
|
+
rD["Tier"] = site_metadata[site_name]["Tier"]
|
|
146
|
+
rD["Type"] = site_metadata[site_name]["Type"]
|
|
134
147
|
rD["timestamp"] = int(TimeUtilities.toEpochMilliSeconds(now))
|
|
135
148
|
self.datastores["Monitoring"].addRecord(rD)
|
|
136
149
|
|
|
@@ -154,3 +167,30 @@ class StatesAccountingAgent(AgentModule):
|
|
|
154
167
|
self.log.verbose(f"Done committing WMSHistory to {backend} backend")
|
|
155
168
|
|
|
156
169
|
return S_OK()
|
|
170
|
+
|
|
171
|
+
def _getSitesMetadata(self):
|
|
172
|
+
"""Get the metadata for the sites"""
|
|
173
|
+
res = getSites()
|
|
174
|
+
if not res["OK"]:
|
|
175
|
+
return res
|
|
176
|
+
sites = res["Value"]
|
|
177
|
+
site_metadata = {}
|
|
178
|
+
|
|
179
|
+
for site in sites:
|
|
180
|
+
site_metadata[site] = {}
|
|
181
|
+
|
|
182
|
+
# Get the site metadata from the Configuration System
|
|
183
|
+
grid = site.split(".")[0]
|
|
184
|
+
res = gConfig.getOptionsDict(f"Resources/Sites/{grid}/{site}")
|
|
185
|
+
if not res["OK"]:
|
|
186
|
+
self.log.error("Failure getting options dict for site", f"{site}: {res['Message']}")
|
|
187
|
+
continue
|
|
188
|
+
siteInfoCS = res["Value"]
|
|
189
|
+
|
|
190
|
+
# The site tier is normally 1 or 2. Few VOs may define tier 3.
|
|
191
|
+
# If the tier is not defined, we assume it is 4, with 4 meaning "not pledged" (opportunistic).
|
|
192
|
+
site_metadata[site]["Tier"] = siteInfoCS.get("MoUTierLevel", "4")
|
|
193
|
+
# The site type is defined by the first part of the site name.
|
|
194
|
+
# It needs to be interpreted at the Monitoring side (e.g. in Grafana).
|
|
195
|
+
site_metadata[site]["Type"] = site.split(".")[0]
|
|
196
|
+
return site_metadata
|