DIRAC 9.0.0a42 (py3-none-any.whl) → 9.0.7 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- DIRAC/AccountingSystem/Client/AccountingCLI.py +0 -140
- DIRAC/AccountingSystem/Client/DataStoreClient.py +0 -13
- DIRAC/AccountingSystem/Client/Types/BaseAccountingType.py +0 -7
- DIRAC/AccountingSystem/ConfigTemplate.cfg +0 -5
- DIRAC/AccountingSystem/Service/DataStoreHandler.py +0 -72
- DIRAC/ConfigurationSystem/Client/Helpers/CSGlobals.py +0 -9
- DIRAC/ConfigurationSystem/Client/Helpers/Registry.py +38 -26
- DIRAC/ConfigurationSystem/Client/Helpers/Resources.py +11 -43
- DIRAC/ConfigurationSystem/Client/Helpers/test/Test_Helpers.py +0 -16
- DIRAC/ConfigurationSystem/Client/LocalConfiguration.py +14 -8
- DIRAC/ConfigurationSystem/Client/PathFinder.py +47 -8
- DIRAC/ConfigurationSystem/Client/SyncPlugins/CERNLDAPSyncPlugin.py +4 -1
- DIRAC/ConfigurationSystem/Client/VOMS2CSSynchronizer.py +32 -19
- DIRAC/ConfigurationSystem/Client/test/Test_PathFinder.py +41 -1
- DIRAC/ConfigurationSystem/private/RefresherBase.py +4 -2
- DIRAC/Core/Base/API.py +4 -7
- DIRAC/Core/Base/SQLAlchemyDB.py +1 -0
- DIRAC/Core/DISET/ServiceReactor.py +11 -3
- DIRAC/Core/DISET/private/BaseClient.py +1 -2
- DIRAC/Core/DISET/private/Transports/M2SSLTransport.py +9 -7
- DIRAC/Core/DISET/private/Transports/SSL/M2Utils.py +3 -1
- DIRAC/Core/LCG/GOCDBClient.py +5 -7
- DIRAC/Core/Security/DiracX.py +31 -17
- DIRAC/Core/Security/IAMService.py +5 -10
- DIRAC/Core/Security/Locations.py +27 -18
- DIRAC/Core/Security/ProxyInfo.py +9 -5
- DIRAC/Core/Security/VOMSService.py +2 -4
- DIRAC/Core/Security/m2crypto/X509Certificate.py +4 -6
- DIRAC/Core/Security/m2crypto/asn1_utils.py +17 -5
- DIRAC/Core/Security/test/test_diracx_token_from_pem.py +161 -0
- DIRAC/Core/Tornado/Client/ClientSelector.py +4 -1
- DIRAC/Core/Tornado/Server/TornadoService.py +1 -1
- DIRAC/Core/Utilities/CGroups2.py +328 -0
- DIRAC/Core/Utilities/ClassAd/ClassAdLight.py +4 -290
- DIRAC/Core/Utilities/DErrno.py +5 -309
- DIRAC/Core/Utilities/Extensions.py +10 -1
- DIRAC/Core/Utilities/File.py +1 -1
- DIRAC/Core/Utilities/Graphs/GraphData.py +1 -1
- DIRAC/Core/Utilities/Graphs/GraphUtilities.py +6 -1
- DIRAC/Core/Utilities/JDL.py +1 -195
- DIRAC/Core/Utilities/List.py +1 -124
- DIRAC/Core/Utilities/MySQL.py +103 -99
- DIRAC/Core/Utilities/Os.py +32 -1
- DIRAC/Core/Utilities/Platform.py +2 -107
- DIRAC/Core/Utilities/Proxy.py +0 -4
- DIRAC/Core/Utilities/ReturnValues.py +7 -252
- DIRAC/Core/Utilities/StateMachine.py +12 -178
- DIRAC/Core/Utilities/Subprocess.py +35 -14
- DIRAC/Core/Utilities/TimeUtilities.py +10 -253
- DIRAC/Core/Utilities/test/Test_JDL.py +0 -3
- DIRAC/Core/Utilities/test/Test_Profiler.py +20 -20
- DIRAC/Core/scripts/dirac_agent.py +1 -1
- DIRAC/Core/scripts/dirac_apptainer_exec.py +72 -46
- DIRAC/Core/scripts/dirac_configure.py +1 -3
- DIRAC/Core/scripts/dirac_install_db.py +24 -6
- DIRAC/Core/scripts/dirac_platform.py +1 -92
- DIRAC/DataManagementSystem/Agent/FTS3Agent.py +8 -7
- DIRAC/DataManagementSystem/Agent/RequestOperations/RemoveFile.py +7 -6
- DIRAC/DataManagementSystem/Client/FTS3Job.py +71 -34
- DIRAC/DataManagementSystem/DB/FTS3DB.py +7 -3
- DIRAC/DataManagementSystem/DB/FileCatalogComponents/DatasetManager/DatasetManager.py +1 -1
- DIRAC/DataManagementSystem/DB/FileCatalogDB.sql +9 -9
- DIRAC/DataManagementSystem/DB/FileCatalogWithFkAndPsDB.sql +9 -9
- DIRAC/DataManagementSystem/Utilities/DMSHelpers.py +6 -2
- DIRAC/DataManagementSystem/scripts/dirac_admin_allow_se.py +13 -8
- DIRAC/DataManagementSystem/scripts/dirac_admin_ban_se.py +13 -8
- DIRAC/DataManagementSystem/scripts/dirac_dms_create_moving_request.py +2 -0
- DIRAC/DataManagementSystem/scripts/dirac_dms_protocol_matrix.py +0 -1
- DIRAC/FrameworkSystem/Client/BundleDeliveryClient.py +2 -7
- DIRAC/FrameworkSystem/Client/ComponentInstaller.py +9 -4
- DIRAC/FrameworkSystem/Client/ProxyManagerClient.py +5 -2
- DIRAC/FrameworkSystem/Client/SystemAdministratorClientCLI.py +11 -6
- DIRAC/FrameworkSystem/ConfigTemplate.cfg +2 -0
- DIRAC/FrameworkSystem/DB/AuthDB.py +3 -3
- DIRAC/FrameworkSystem/DB/InstalledComponentsDB.py +4 -4
- DIRAC/FrameworkSystem/DB/ProxyDB.py +11 -3
- DIRAC/FrameworkSystem/DB/TokenDB.py +1 -1
- DIRAC/FrameworkSystem/Service/ProxyManagerHandler.py +8 -6
- DIRAC/FrameworkSystem/Utilities/MonitoringUtilities.py +2 -19
- DIRAC/FrameworkSystem/Utilities/TokenManagementUtilities.py +3 -2
- DIRAC/FrameworkSystem/Utilities/diracx.py +36 -14
- DIRAC/FrameworkSystem/private/authorization/AuthServer.py +2 -2
- DIRAC/FrameworkSystem/scripts/dirac_admin_update_pilot.py +18 -11
- DIRAC/FrameworkSystem/scripts/dirac_login.py +2 -2
- DIRAC/FrameworkSystem/scripts/dirac_proxy_init.py +7 -8
- DIRAC/Interfaces/API/Dirac.py +27 -15
- DIRAC/Interfaces/API/DiracAdmin.py +45 -17
- DIRAC/Interfaces/API/Job.py +9 -13
- DIRAC/Interfaces/scripts/dirac_admin_allow_site.py +12 -18
- DIRAC/Interfaces/scripts/dirac_admin_ban_site.py +12 -10
- DIRAC/Interfaces/scripts/dirac_admin_get_site_mask.py +4 -13
- DIRAC/Interfaces/scripts/dirac_admin_reset_job.py +3 -6
- DIRAC/Interfaces/scripts/dirac_wms_job_parameters.py +0 -1
- DIRAC/MonitoringSystem/Client/Types/WMSHistory.py +4 -0
- DIRAC/MonitoringSystem/Client/WebAppClient.py +26 -0
- DIRAC/MonitoringSystem/ConfigTemplate.cfg +9 -0
- DIRAC/MonitoringSystem/DB/MonitoringDB.py +6 -25
- DIRAC/MonitoringSystem/Service/MonitoringHandler.py +0 -33
- DIRAC/MonitoringSystem/Service/WebAppHandler.py +599 -0
- DIRAC/MonitoringSystem/private/MainReporter.py +0 -3
- DIRAC/ProductionSystem/DB/ProductionDB.sql +4 -4
- DIRAC/ProductionSystem/scripts/dirac_prod_get.py +2 -2
- DIRAC/ProductionSystem/scripts/dirac_prod_get_all.py +2 -2
- DIRAC/ProductionSystem/scripts/dirac_prod_get_trans.py +2 -3
- DIRAC/RequestManagementSystem/Agent/RequestExecutingAgent.py +8 -6
- DIRAC/RequestManagementSystem/Agent/RequestOperations/ForwardDISET.py +2 -14
- DIRAC/RequestManagementSystem/Client/ReqClient.py +66 -13
- DIRAC/RequestManagementSystem/ConfigTemplate.cfg +6 -6
- DIRAC/RequestManagementSystem/DB/RequestDB.py +10 -5
- DIRAC/RequestManagementSystem/DB/test/RMSTestScenari.py +2 -0
- DIRAC/RequestManagementSystem/private/RequestValidator.py +40 -46
- DIRAC/ResourceStatusSystem/Client/SiteStatus.py +4 -2
- DIRAC/ResourceStatusSystem/Command/FreeDiskSpaceCommand.py +3 -1
- DIRAC/ResourceStatusSystem/DB/ResourceManagementDB.py +8 -8
- DIRAC/ResourceStatusSystem/DB/ResourceStatusDB.py +2 -2
- DIRAC/ResourceStatusSystem/Utilities/CSHelpers.py +2 -31
- DIRAC/ResourceStatusSystem/scripts/dirac_rss_set_status.py +30 -12
- DIRAC/Resources/Catalog/RucioFileCatalogClient.py +195 -1
- DIRAC/Resources/Catalog/test/Test_RucioFileCatalogClient.py +181 -0
- DIRAC/Resources/Computing/AREXComputingElement.py +25 -8
- DIRAC/Resources/Computing/BatchSystems/Condor.py +126 -108
- DIRAC/Resources/Computing/BatchSystems/SLURM.py +5 -1
- DIRAC/Resources/Computing/BatchSystems/test/Test_SLURM.py +46 -0
- DIRAC/Resources/Computing/ComputingElement.py +1 -1
- DIRAC/Resources/Computing/HTCondorCEComputingElement.py +44 -44
- DIRAC/Resources/Computing/InProcessComputingElement.py +4 -2
- DIRAC/Resources/Computing/LocalComputingElement.py +1 -18
- DIRAC/Resources/Computing/SSHBatchComputingElement.py +1 -17
- DIRAC/Resources/Computing/SSHComputingElement.py +1 -18
- DIRAC/Resources/Computing/SingularityComputingElement.py +19 -5
- DIRAC/Resources/Computing/test/Test_HTCondorCEComputingElement.py +67 -49
- DIRAC/Resources/Computing/test/Test_PoolComputingElement.py +2 -1
- DIRAC/Resources/IdProvider/CheckInIdProvider.py +13 -0
- DIRAC/Resources/IdProvider/IdProviderFactory.py +11 -3
- DIRAC/Resources/MessageQueue/StompMQConnector.py +1 -1
- DIRAC/Resources/Storage/GFAL2_StorageBase.py +24 -15
- DIRAC/Resources/Storage/OccupancyPlugins/WLCGAccountingHTTPJson.py +1 -3
- DIRAC/Resources/Storage/StorageBase.py +4 -2
- DIRAC/Resources/Storage/StorageElement.py +6 -7
- DIRAC/StorageManagementSystem/DB/StorageManagementDB.sql +2 -2
- DIRAC/TransformationSystem/Agent/TaskManagerAgentBase.py +10 -16
- DIRAC/TransformationSystem/Agent/TransformationAgent.py +22 -1
- DIRAC/TransformationSystem/Agent/TransformationCleaningAgent.py +16 -16
- DIRAC/TransformationSystem/Client/TaskManager.py +2 -4
- DIRAC/TransformationSystem/Client/Transformation.py +6 -7
- DIRAC/TransformationSystem/Client/TransformationClient.py +21 -11
- DIRAC/TransformationSystem/Client/Utilities.py +9 -0
- DIRAC/TransformationSystem/DB/TransformationDB.py +11 -14
- DIRAC/TransformationSystem/DB/TransformationDB.sql +9 -9
- DIRAC/TransformationSystem/Service/TransformationManagerHandler.py +0 -333
- DIRAC/TransformationSystem/Utilities/ReplicationCLIParameters.py +3 -3
- DIRAC/TransformationSystem/Utilities/TransformationInfo.py +7 -5
- DIRAC/TransformationSystem/scripts/dirac_production_runjoblocal.py +2 -4
- DIRAC/TransformationSystem/test/Test_TransformationInfo.py +22 -15
- DIRAC/TransformationSystem/test/Test_replicationTransformation.py +5 -6
- DIRAC/Workflow/Modules/test/Test_Modules.py +5 -0
- DIRAC/WorkloadManagementSystem/Agent/JobAgent.py +38 -26
- DIRAC/WorkloadManagementSystem/Agent/JobCleaningAgent.py +12 -8
- DIRAC/WorkloadManagementSystem/Agent/PilotSyncAgent.py +4 -3
- DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py +13 -13
- DIRAC/WorkloadManagementSystem/Agent/SiteDirector.py +18 -14
- DIRAC/WorkloadManagementSystem/Agent/StalledJobAgent.py +18 -51
- DIRAC/WorkloadManagementSystem/Agent/StatesAccountingAgent.py +41 -1
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_JobAgent.py +45 -4
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_JobCleaningAgent.py +7 -9
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_PushJobAgent.py +1 -0
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_SiteDirector.py +9 -2
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_StalledJobAgent.py +4 -5
- DIRAC/WorkloadManagementSystem/Client/DownloadInputData.py +9 -9
- DIRAC/WorkloadManagementSystem/Client/InputDataResolution.py +6 -6
- DIRAC/WorkloadManagementSystem/Client/JobMonitoringClient.py +10 -11
- DIRAC/WorkloadManagementSystem/Client/JobReport.py +1 -1
- DIRAC/WorkloadManagementSystem/Client/JobState/CachedJobState.py +3 -0
- DIRAC/WorkloadManagementSystem/Client/JobState/JobManifest.py +32 -261
- DIRAC/WorkloadManagementSystem/Client/JobState/JobState.py +6 -0
- DIRAC/WorkloadManagementSystem/Client/JobStateUpdateClient.py +3 -0
- DIRAC/WorkloadManagementSystem/Client/JobStatus.py +8 -152
- DIRAC/WorkloadManagementSystem/Client/PoolXMLSlice.py +12 -19
- DIRAC/WorkloadManagementSystem/Client/SandboxStoreClient.py +25 -38
- DIRAC/WorkloadManagementSystem/Client/WMSClient.py +2 -3
- DIRAC/WorkloadManagementSystem/Client/test/Test_Client_DownloadInputData.py +29 -0
- DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg +4 -8
- DIRAC/WorkloadManagementSystem/DB/JobDB.py +89 -132
- DIRAC/WorkloadManagementSystem/DB/JobDB.sql +8 -8
- DIRAC/WorkloadManagementSystem/DB/JobDBUtils.py +18 -147
- DIRAC/WorkloadManagementSystem/DB/JobLoggingDB.py +19 -6
- DIRAC/WorkloadManagementSystem/DB/JobParametersDB.py +9 -9
- DIRAC/WorkloadManagementSystem/DB/PilotAgentsDB.py +16 -5
- DIRAC/WorkloadManagementSystem/DB/PilotAgentsDB.sql +3 -3
- DIRAC/WorkloadManagementSystem/DB/SandboxMetadataDB.py +44 -82
- DIRAC/WorkloadManagementSystem/DB/StatusUtils.py +125 -0
- DIRAC/WorkloadManagementSystem/DB/tests/Test_JobDB.py +1 -1
- DIRAC/WorkloadManagementSystem/DB/tests/Test_StatusUtils.py +28 -0
- DIRAC/WorkloadManagementSystem/Executor/JobSanity.py +5 -4
- DIRAC/WorkloadManagementSystem/Executor/JobScheduling.py +4 -0
- DIRAC/WorkloadManagementSystem/FutureClient/JobStateUpdateClient.py +75 -33
- DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapper.py +22 -11
- DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py +9 -10
- DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapper.py +60 -10
- DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapperTemplate.py +4 -0
- DIRAC/WorkloadManagementSystem/Service/JobManagerHandler.py +33 -154
- DIRAC/WorkloadManagementSystem/Service/JobMonitoringHandler.py +5 -323
- DIRAC/WorkloadManagementSystem/Service/JobStateUpdateHandler.py +0 -16
- DIRAC/WorkloadManagementSystem/Service/PilotManagerHandler.py +6 -103
- DIRAC/WorkloadManagementSystem/Service/SandboxStoreHandler.py +7 -53
- DIRAC/WorkloadManagementSystem/Service/WMSAdministratorHandler.py +16 -79
- DIRAC/WorkloadManagementSystem/Service/WMSUtilities.py +4 -18
- DIRAC/WorkloadManagementSystem/Utilities/JobModel.py +28 -209
- DIRAC/WorkloadManagementSystem/Utilities/JobParameters.py +65 -3
- DIRAC/WorkloadManagementSystem/Utilities/JobStatusUtility.py +2 -64
- DIRAC/WorkloadManagementSystem/Utilities/ParametricJob.py +7 -171
- DIRAC/WorkloadManagementSystem/Utilities/PilotCStoJSONSynchronizer.py +73 -7
- DIRAC/WorkloadManagementSystem/Utilities/PilotWrapper.py +41 -11
- DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py +16 -0
- DIRAC/WorkloadManagementSystem/Utilities/Utils.py +36 -1
- DIRAC/WorkloadManagementSystem/Utilities/jobAdministration.py +15 -0
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_JobModel.py +1 -15
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_ParametricJob.py +45 -128
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_PilotWrapper.py +16 -0
- DIRAC/WorkloadManagementSystem/scripts/dirac_jobexec.py +7 -2
- DIRAC/WorkloadManagementSystem/scripts/dirac_wms_pilot_job_info.py +1 -1
- DIRAC/__init__.py +62 -60
- DIRAC/tests/Utilities/testJobDefinitions.py +22 -28
- {DIRAC-9.0.0a42.dist-info → dirac-9.0.7.dist-info}/METADATA +8 -5
- {DIRAC-9.0.0a42.dist-info → dirac-9.0.7.dist-info}/RECORD +229 -228
- {DIRAC-9.0.0a42.dist-info → dirac-9.0.7.dist-info}/WHEEL +1 -1
- {DIRAC-9.0.0a42.dist-info → dirac-9.0.7.dist-info}/entry_points.txt +0 -3
- DIRAC/Core/Utilities/test/Test_List.py +0 -150
- DIRAC/Core/Utilities/test/Test_Time.py +0 -88
- DIRAC/Resources/Computing/PilotBundle.py +0 -70
- DIRAC/TransformationSystem/scripts/dirac_transformation_archive.py +0 -30
- DIRAC/TransformationSystem/scripts/dirac_transformation_clean.py +0 -30
- DIRAC/TransformationSystem/scripts/dirac_transformation_remove_output.py +0 -30
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_JobManager.py +0 -58
- {DIRAC-9.0.0a42.dist-info → dirac-9.0.7.dist-info/licenses}/LICENSE +0 -0
- {DIRAC-9.0.0a42.dist-info → dirac-9.0.7.dist-info}/top_level.txt +0 -0

DIRAC/Resources/Computing/BatchSystems/Condor.py:

@@ -6,6 +6,7 @@
 from __future__ import print_function
 from __future__ import absolute_import
 from __future__ import division
+import json
 import re
 import tempfile
 import subprocess

@@ -25,6 +26,8 @@ STATES_MAP = {

 HOLD_REASON_SUBCODE = "55"

+STATE_ATTRIBUTES = "ClusterId,ProcId,JobStatus,HoldReasonCode,HoldReasonSubCode,HoldReason"
+
 subTemplate = """
 # Environment
 # -----------

@@ -62,6 +65,7 @@ environment = "DIRAC_PILOT_STAMP=$(stamp) %(environment)s"
 # Requirements
 # ------------
 request_cpus = %(processors)s
+requirements = NumJobStarts == 0

 # Exit options
 # ------------

@@ -73,7 +77,8 @@ on_exit_hold = ExitCode =!= 0
 # A subcode of our choice to identify who put the job on hold
 on_exit_hold_subcode = %(holdReasonSubcode)s
 # Jobs are then deleted from the system after N days if they are not idle or running
-periodic_remove = (JobStatus
+periodic_remove = ((JobStatus == 1) && (NumJobStarts > 0)) || \
+    ((JobStatus != 1) && (JobStatus != 2) && ((time() - EnteredCurrentStatus) > (%(daysToKeepRemoteLogs)s * 24 * 3600)))

 # Specific options
 # ----------------

@@ -87,63 +92,34 @@ Queue stamp in %(pilotStampList)s
 """


-def
+def getCondorStatus(jobMetadata):
     """parse the condor_q or condor_history output for the job status

-    :param
-    :type
-    :param str jobID: jobID of condor job, e.g.: 123.53
+    :param jobMetadata: dict with job metadata
+    :type jobMetadata: dict[str, str | int]
     :returns: Status as known by DIRAC, and a reason if the job is being held
     """
-
-
-
-
-    for
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # A job can be held for various reasons,
-    # we need to further investigate with the holdReasonCode & holdReasonSubCode
-    # Details in:
-    # https://htcondor.readthedocs.io/en/latest/classad-attributes/job-classad-attributes.html#HoldReasonCode
-
-    # By default, a held (5) job is defined as Aborted in STATES_MAP, but there might be some exceptions
-    status = 3
-    try:
-        holdReasonCode = l[2]
-        holdReasonSubcode = l[3]
-        holdReason = " ".join(l[4:])
-    except IndexError:
-        # This should not happen in theory
-        # Just set the status to unknown such as
-        status = None
-        holdReasonCode = "undefined"
-        holdReasonSubcode = "undefined"
-        break
-
-    # If holdReasonCode is 3 (The PERIODIC_HOLD expression evaluated to True. Or, ON_EXIT_HOLD was true)
-    # And subcode is HOLD_REASON_SUBCODE, then it means the job failed by itself, it needs to be marked as Failed
-    if holdReasonCode == "3" and holdReasonSubcode == HOLD_REASON_SUBCODE:
-        status = 5
-    # If holdReasonCode is 16 (Input files are being spooled), the job should be marked as Waiting
-    elif holdReasonCode == "16":
-        status = 1
-
-    return (STATES_MAP.get(status, "Unknown"), holdReason)
+    if jobMetadata["JobStatus"] != 5:
+        # If the job is not held, we can return the status directly
+        return (STATES_MAP.get(jobMetadata["JobStatus"], "Unknown"), "")
+
+    # A job can be held for various reasons,
+    # we need to further investigate with the holdReasonCode & holdReasonSubCode
+    # Details in:
+    # https://htcondor.readthedocs.io/en/latest/classad-attributes/job-classad-attributes.html#HoldReasonCode
+
+    # By default, a held (5) job is defined as Aborted in STATES_MAP, but there might be some exceptions
+    status = 3
+
+    # If holdReasonCode is 3 (The PERIODIC_HOLD expression evaluated to True. Or, ON_EXIT_HOLD was true)
+    # And subcode is HOLD_REASON_SUBCODE, then it means the job failed by itself, it needs to be marked as Failed
+    if jobMetadata["HoldReasonCode"] == 3 and jobMetadata["HoldReasonSubCode"] == HOLD_REASON_SUBCODE:
+        status = 5
+    # If holdReasonCode is 16 (Input files are being spooled), the job should be marked as Waiting
+    elif jobMetadata["HoldReasonCode"] == 16:
+        status = 1
+
+    return (STATES_MAP.get(status, "Unknown"), jobMetadata["HoldReason"])


 class Condor(object):
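
Note: the hunk above replaces the old line-oriented status parsing with a getCondorStatus(jobMetadata) helper that consumes one JSON record per job, as produced by condor_q/condor_history with -attributes ... -json. The standalone sketch below illustrates the record shape and the held-job branching; it is not code from the package, and the STATES_MAP values used here are an assumption for illustration (the real mapping lives in Condor.py and is not part of this hunk).

# Illustrative sketch only -- STATES_MAP values are assumed, not taken from this diff.
STATES_MAP = {1: "Waiting", 2: "Running", 3: "Aborted", 4: "Done", 5: "Failed"}
HOLD_REASON_SUBCODE = "55"

def classify(record):
    """record: one dict parsed from `condor_q ... -attributes ... -json` output."""
    if record["JobStatus"] != 5:
        # Not held: the numeric JobStatus maps directly to a DIRAC state.
        return STATES_MAP.get(record["JobStatus"], "Unknown"), ""
    status = 3  # held jobs default to Aborted
    if record["HoldReasonCode"] == 3 and record["HoldReasonSubCode"] == HOLD_REASON_SUBCODE:
        status = 5  # our own on_exit_hold fired: the payload itself failed
    elif record["HoldReasonCode"] == 16:
        status = 1  # input files still being spooled: effectively waiting
    return STATES_MAP.get(status, "Unknown"), record["HoldReason"]

print(classify({"ClusterId": 1245, "ProcId": 0, "JobStatus": 5,
                "HoldReasonCode": 16, "HoldReasonSubCode": 0,
                "HoldReason": "Spooling input data files"}))
# -> ('Waiting', 'Spooling input data files')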

@@ -171,8 +147,6 @@ class Condor(object):
         preamble = kwargs.get("Preamble")

         jdlFile = tempfile.NamedTemporaryFile(dir=outputDir, suffix=".jdl", mode="wt")
-        scheddOptions = 'requirements = OpSys == "LINUX"\n'
-        scheddOptions += "gentenv = False"
         jdlFile.write(
             subTemplate
             % dict(

@@ -185,7 +159,7 @@
                 holdReasonSubcode=HOLD_REASON_SUBCODE,
                 daysToKeepRemoteLogs=1,
                 scheddOptions="",
-                extraString=
+                extraString=submitOptions,
                 pilotStampList=",".join(stamps),
             )
         )

@@ -193,7 +167,7 @@
         jdlFile.flush()

         cmd = "%s; " % preamble if preamble else ""
-        cmd += "condor_submit
+        cmd += "condor_submit -spool %s" % jdlFile.name
         sp = subprocess.Popen(
             cmd,
             shell=True,

@@ -283,7 +257,6 @@

     def getJobStatus(self, **kwargs):
         """Get status of the jobs in the given list"""
-
         resultDict = {}

         MANDATORY_PARAMETERS = ["JobIDList"]

@@ -299,15 +272,11 @@
             resultDict["Message"] = "Empty job list"
             return resultDict

-
-
-        user = os.environ.get("USER")
-        if not user:
-            resultDict["Status"] = -1
-            resultDict["Message"] = "No user name"
-            return resultDict
+        # Prepare the command to get the status of the jobs
+        cmdJobs = " ".join(str(jobID) for jobID in jobIDList)

-
+        # Get the status of the jobs currently active
+        cmd = "condor_q %s -attributes %s -json" % (cmdJobs, STATE_ATTRIBUTES)
         sp = subprocess.Popen(
             shlex.split(cmd),
             stdout=subprocess.PIPE,

@@ -321,12 +290,13 @@
             resultDict["Status"] = status
             resultDict["Message"] = error
             return resultDict
+        if not output:
+            output = "[]"

-
+        jobsMetadata = json.loads(output)

-
-
-        )
+        # Get the status of the jobs in the history
+        condorHistCall = "condor_history %s -attributes %s -json" % (cmdJobs, STATE_ATTRIBUTES)
         sp = subprocess.Popen(
             shlex.split(condorHistCall),
             stdout=subprocess.PIPE,

@@ -335,15 +305,28 @@
         )
         output, _ = sp.communicate()
         status = sp.returncode
-
-
-
+
+        if status != 0:
+            resultDict["Status"] = status
+            resultDict["Message"] = error
+            return resultDict
+        if not output:
+            output = "[]"
+
+        jobsMetadata += json.loads(output)

         statusDict = {}
-
-
-
-
+        # Build a set of job IDs found in jobsMetadata
+        foundJobIDs = set()
+        for jobDict in jobsMetadata:
+            jobID = "%s.%s" % (jobDict["ClusterId"], jobDict["ProcId"])
+            statusDict[jobID], _ = getCondorStatus(jobDict)
+            foundJobIDs.add(jobID)
+
+        # For job IDs not found, set status to "Unknown"
+        for jobID in jobIDList:
+            if str(jobID) not in foundJobIDs:
+                statusDict[str(jobID)] = "Unknown"

         # Final output
         status = 0
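
Note: Condor.getJobStatus() above now builds one space-separated job-ID string, queries condor_q and condor_history with -attributes/-json, and merges the two JSON result sets before classifying each record. A minimal standalone sketch of that merge pattern follows; the command strings and attribute list are copied from the hunk, error handling is trimmed, and a working condor CLI is assumed.

import json
import shlex
import subprocess

STATE_ATTRIBUTES = "ClusterId,ProcId,JobStatus,HoldReasonCode,HoldReasonSubCode,HoldReason"

def query_condor_json(tool, job_ids):
    """Run condor_q or condor_history for the given ids and return the parsed JSON records."""
    cmd = "%s %s -attributes %s -json" % (tool, " ".join(job_ids), STATE_ATTRIBUTES)
    output = subprocess.run(shlex.split(cmd), capture_output=True, text=True).stdout
    return json.loads(output) if output else []  # condor prints nothing when there are no matches

def collect_metadata(job_ids):
    """Merge active (condor_q) and finished (condor_history) records, keyed by ClusterId.ProcId."""
    records = query_condor_json("condor_q", job_ids) + query_condor_json("condor_history", job_ids)
    return {"%s.%s" % (r["ClusterId"], r["ProcId"]): r for r in records}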

@@ -355,19 +338,30 @@
         """Get the overall status of the CE"""
         resultDict = {}

-
-
-
-
+        cmd = "condor_q -totals -json"
+        sp = subprocess.Popen(
+            shlex.split(cmd),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            universal_newlines=True,
+        )
+        output, error = sp.communicate()
+        status = sp.returncode
+
+        if status != 0 or not output:
             resultDict["Status"] = -1
-            resultDict["Message"] =
+            resultDict["Message"] = error
             return resultDict

-
-
+        jresult = json.loads(output)
+        resultDict["Status"] = 0
+        resultDict["Waiting"] = jresult[0]["Idle"]
+        resultDict["Running"] = jresult[0]["Running"]

+        # We also need to check the hold jobs, some of them are actually waiting (e.g. for input files)
+        cmd = 'condor_q -json -constraint "JobStatus == 5" -attributes HoldReasonCode'
         sp = subprocess.Popen(
-            shlex.split(
+            shlex.split(cmd),
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             universal_newlines=True,

@@ -376,33 +370,57 @@
         status = sp.returncode

         if status != 0:
-
-            resultDict["Status"] = 0
-            resultDict["Waiting"] = waitingJobs
-            resultDict["Running"] = runningJobs
-            return resultDict
-            resultDict["Status"] = status
+            resultDict["Status"] = -1
             resultDict["Message"] = error
             return resultDict

-
-
-        resultDict["Waiting"] = waitingJobs
-        resultDict["Running"] = runningJobs
+        # If there are no held jobs, we can return the result
+        if not output:
             return resultDict

-
-
-
-
-
-
-
-
-
+        jresult = json.loads(output)
+        for job_metadata in jresult:
+            if job_metadata["HoldReasonCode"] == 16:
+                resultDict["Waiting"] += 1
+
+        return resultDict
+
+    def getJobOutputFiles(self, **kwargs):
+        """Get output file names and templates for the specific CE"""
+        resultDict = {}
+
+        MANDATORY_PARAMETERS = ["JobIDList", "OutputDir", "ErrorDir"]
+        for argument in MANDATORY_PARAMETERS:
+            if argument not in kwargs:
+                resultDict["Status"] = -1
+                resultDict["Message"] = "No %s" % argument
+                return resultDict
+
+        outputDir = kwargs["OutputDir"]
+        errorDir = kwargs["ErrorDir"]
+        jobIDList = kwargs["JobIDList"]
+
+        jobDict = {}
+        for jobID in jobIDList:
+            jobDict[jobID] = {}
+
+            cmd = "condor_transfer_data %s" % jobID
+            sp = subprocess.Popen(
+                shlex.split(cmd),
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                universal_newlines=True,
+            )
+            _, error = sp.communicate()
+            status = sp.returncode
+            if status != 0:
+                resultDict["Status"] = -1
+                resultDict["Message"] = error
+                return resultDict
+
+            jobDict[jobID]["Output"] = "%s/%s.out" % (outputDir, jobID)
+            jobDict[jobID]["Error"] = "%s/%s.err" % (errorDir, jobID)

-        # Final output
         resultDict["Status"] = 0
-        resultDict["
-        resultDict["Running"] = runningJobs
+        resultDict["Jobs"] = jobDict
         return resultDict
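
Note: getCEStatus() above now reads the aggregate counters from `condor_q -totals -json` and then re-counts held jobs whose hold reason is input spooling (HoldReasonCode 16) as waiting, while the new getJobOutputFiles() stages outputs back with condor_transfer_data before pointing at <OutputDir>/<jobID>.out and <ErrorDir>/<jobID>.err. A small sketch of the totals bookkeeping; the Idle/Running/HoldReasonCode field names come from the hunk, but the sample values below are invented for illustration.

import json

# One-element list of aggregate counters, as `condor_q -totals -json` returns it.
totals = json.loads('[{"Idle": 12, "Running": 7}]')  # sample values, not real output
waiting, running = totals[0]["Idle"], totals[0]["Running"]

# One record per held job from the constrained second query (JobStatus == 5).
held = json.loads('[{"HoldReasonCode": 16}, {"HoldReasonCode": 3}]')  # sample values
waiting += sum(1 for record in held if record["HoldReasonCode"] == 16)

print({"Status": 0, "Waiting": waiting, "Running": running})  # -> Waiting 13, Running 7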

DIRAC/Resources/Computing/BatchSystems/SLURM.py:

@@ -40,6 +40,7 @@ class SLURM(object):
         executable = kwargs["Executable"]
         account = kwargs.get("Account", "")
         numberOfProcessors = kwargs.get("NumberOfProcessors", 1)
+        wholeNode = kwargs.get("WholeNode", False)
         # numberOfNodes is treated as a string as it can contain values such as "2-4"
         # where 2 would represent the minimum number of nodes to allocate, and 4 the maximum
         numberOfNodes = kwargs.get("NumberOfNodes", "1")

@@ -72,7 +73,10 @@ class SLURM(object):
         # One pilot (task) per node, allocating a certain number of processors
         cmd += "--ntasks-per-node=1 "
         cmd += "--nodes=%s " % numberOfNodes
-
+        if wholeNode:
+            cmd += "--exclusive "
+        else:
+            cmd += "--cpus-per-task=%d " % numberOfProcessors
         if numberOfGPUs:
             cmd += "--gpus-per-task=%d " % int(numberOfGPUs)
         # Additional options
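
Note: with the SLURM change above, a WholeNode request now translates into `--exclusive` instead of a fixed `--cpus-per-task`. A standalone sketch of the resulting allocation flags (parameter names mirror the kwargs used in the hunk; illustration only, not the package code):

def slurm_allocation_options(whole_node=False, number_of_processors=1, number_of_gpus=0):
    """Roughly the allocation flags submitJob() appends to the sbatch command line."""
    opts = ["--ntasks-per-node=1"]
    if whole_node:
        opts.append("--exclusive")  # claim the full node and let SLURM size the CPUs
    else:
        opts.append("--cpus-per-task=%d" % number_of_processors)
    if number_of_gpus:
        opts.append("--gpus-per-task=%d" % int(number_of_gpus))
    return " ".join(opts)

print(slurm_allocation_options(whole_node=True, number_of_gpus=1))
# -> --ntasks-per-node=1 --exclusive --gpus-per-task=1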

DIRAC/Resources/Computing/BatchSystems/test/Test_SLURM.py:

@@ -198,3 +198,49 @@ def test_getJobOutputFiles(numberOfNodes, outputContent, expectedContent):

     os.remove(outputFile)
     os.remove(errorFile)
+
+
+def test_submitJob_cmd_generation(mocker):
+    """Test submitJob() command string generation for various kwargs"""
+    slurm = SLURM()
+    # Mock subprocess.Popen to capture the command
+    popen_mock = mocker.patch("subprocess.Popen")
+    process_mock = popen_mock.return_value
+    process_mock.communicate.return_value = ("Submitted batch job 1234\n", "")
+    process_mock.returncode = 0
+
+    # Minimal kwargs
+    kwargs = {
+        "Executable": "/bin/echo",
+        "OutputDir": "/tmp",
+        "ErrorDir": "/tmp",
+        "Queue": "testq",
+        "SubmitOptions": "",
+        "JobStamps": ["stamp1"],
+        "NJobs": 1,
+    }
+    # Test default (WholeNode False)
+    slurm.submitJob(**kwargs)
+    cmd = popen_mock.call_args[0][0]
+    assert "--cpus-per-task=1" in cmd
+    assert "--exclusive" not in cmd
+
+    # Test WholeNode True disables --cpus-per-task and adds --exclusive
+    kwargs["WholeNode"] = True
+    slurm.submitJob(**kwargs)
+    cmd = popen_mock.call_args[0][0]
+    assert "--exclusive" in cmd
+    assert "--cpus-per-task" not in cmd
+
+    # Test NumberOfProcessors
+    kwargs["WholeNode"] = False
+    kwargs["NumberOfProcessors"] = 8
+    slurm.submitJob(**kwargs)
+    cmd = popen_mock.call_args[0][0]
+    assert "--cpus-per-task=8" in cmd
+
+    # Test NumberOfGPUs
+    kwargs["NumberOfGPUs"] = 2
+    slurm.submitJob(**kwargs)
+    cmd = popen_mock.call_args[0][0]
+    assert "--gpus-per-task=2" in cmd

DIRAC/Resources/Computing/ComputingElement.py:

@@ -105,7 +105,7 @@ class ComputingElement:
     def _prepareProxy(self):
         """Set the environment variable X509_USER_PROXY"""
         if self.proxy:
-            result = gProxyManager.dumpProxyToFile(self.proxy, requiredTimeLeft=self.minProxyTime)
+            result = gProxyManager.dumpProxyToFile(self.proxy, requiredTimeLeft=self.minProxyTime, includeToken=False)
             if not result["OK"]:
                 return result
             os.environ["X509_USER_PROXY"] = result["Value"]

DIRAC/Resources/Computing/HTCondorCEComputingElement.py:

@@ -50,6 +50,7 @@ When using a local condor_schedd look at the HTCondor documentation for enabling

 import datetime
 import errno
+import json
 import os
 import subprocess
 import tempfile

@@ -63,10 +64,14 @@ from DIRAC.Core.Utilities.File import mkDir
 from DIRAC.Core.Utilities.List import breakListIntoChunks
 from DIRAC.Core.Utilities.Subprocess import systemCall
 from DIRAC.FrameworkSystem.private.authorization.utils.Tokens import writeToTokenFile
-from DIRAC.Resources.Computing.BatchSystems.Condor import
+from DIRAC.Resources.Computing.BatchSystems.Condor import (
+    HOLD_REASON_SUBCODE,
+    STATE_ATTRIBUTES,
+    getCondorStatus,
+    subTemplate,
+)
 from DIRAC.Resources.Computing.ComputingElement import ComputingElement
 from DIRAC.WorkloadManagementSystem.Client import PilotStatus
-from DIRAC.WorkloadManagementSystem.Client.PilotManagerClient import PilotManagerClient

 MANDATORY_PARAMETERS = ["Queue"]
 DEFAULT_WORKINGDIRECTORY = "/opt/dirac/pro/runit/WorkloadManagement/SiteDirectorHT"

@@ -170,7 +175,9 @@ class HTCondorCEComputingElement(ComputingElement):

         executable = os.path.join(self.workingDirectory, executable)

-
+        # For now, we still need to include a proxy in the submit file
+        # HTCondor extracts VOMS attribute from it for the sites
+        useCredentials = "use_x509userproxy = true"
         # If tokenFile is present, then we transfer it to the worker node
         if tokenFile:
             useCredentials += textwrap.dedent(

@@ -271,6 +278,10 @@
         htcEnv = {
             "_CONDOR_SEC_CLIENT_AUTHENTICATION_METHODS": "SCITOKENS",
             "_CONDOR_SCITOKENS_FILE": self.tokenFile.name,
+            # This options is needed because we are still passing the proxy in the JDL (see use_x509userproxy)
+            # In condor v24.4, there is a bug preventing us from delegating the proxy, so we have to set
+            # it to false: https://opensciencegrid.atlassian.net/browse/HTCONDOR-2904
+            "_CONDOR_DELEGATE_JOB_GSI_CREDENTIALS": "false",
         }
         if cas := getCAsLocation():
             htcEnv["_CONDOR_AUTH_SSL_CLIENT_CADIR"] = cas
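
Note: the `_CONDOR_*` entries above are handed to the condor client tools as environment overrides rather than written to a configuration file; the new `_CONDOR_DELEGATE_JOB_GSI_CREDENTIALS=false` entry works around the delegation bug referenced in the hunk while the proxy still travels via `use_x509userproxy`. A hedged sketch of that invocation pattern (the helper and the token path below are illustrative, not the DIRAC implementation):

import os
import subprocess

def run_condor_tool(cmd, token_file, ca_dir=None):
    """Run a condor CLI command authenticating with a SciToken, mirroring the env overrides above."""
    env = dict(os.environ)
    env["_CONDOR_SEC_CLIENT_AUTHENTICATION_METHODS"] = "SCITOKENS"
    env["_CONDOR_SCITOKENS_FILE"] = token_file
    # The proxy still travels in the JDL, so delegation is switched off explicitly.
    env["_CONDOR_DELEGATE_JOB_GSI_CREDENTIALS"] = "false"
    if ca_dir:
        env["_CONDOR_AUTH_SSL_CLIENT_CADIR"] = ca_dir
    return subprocess.run(cmd, env=env, capture_output=True, text=True)

# e.g. run_condor_tool(["condor_q", "-totals", "-json"], "/tmp/bt_token")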

@@ -380,33 +391,10 @@ class HTCondorCEComputingElement(ComputingElement):

     #############################################################################
     def getCEStatus(self):
-        """Method to return information on running and pending jobs.
-
-
-
-        """
-        result = S_OK()
-        result["SubmittedJobs"] = 0
-        result["RunningJobs"] = 0
-        result["WaitingJobs"] = 0
-
-        # getWaitingPilots
-        condDict = {"DestinationSite": self.ceName, "Status": PilotStatus.PILOT_WAITING_STATES}
-        res = PilotManagerClient().countPilots(condDict)
-        if res["OK"]:
-            result["WaitingJobs"] = int(res["Value"])
-        else:
-            self.log.warn(f"Failure getting pilot count for {self.ceName}: {res['Message']} ")
-
-        # getRunningPilots
-        condDict = {"DestinationSite": self.ceName, "Status": PilotStatus.RUNNING}
-        res = PilotManagerClient().countPilots(condDict)
-        if res["OK"]:
-            result["RunningJobs"] = int(res["Value"])
-        else:
-            self.log.warn(f"Failure getting pilot count for {self.ceName}: {res['Message']} ")
-
-        return result
+        """Method to return information on running and pending jobs."""
+        return S_ERROR(
+            "getCEStatus() not supported for HTCondorCEComputingElement: HTCondor does not expose this information"
+        )

     def getJobStatus(self, jobIDList):
         """Get the status information for the given list of jobs"""

@@ -418,45 +406,57 @@ class HTCondorCEComputingElement(ComputingElement):
         if isinstance(jobIDList, str):
             jobIDList = [jobIDList]

+        self.tokenFile = None
         resultDict = {}
         condorIDs = {}
         # Get all condorIDs so we can just call condor_q and condor_history once
         for jobReference in jobIDList:
             jobReference = jobReference.split(":::")[0]
-            condorIDs[jobReference] =
+            condorIDs[self._jobReferenceToCondorID(jobReference)] = jobReference

-
-
-        qList = []
-        for _condorIDs in breakListIntoChunks(condorIDs.values(), 100):
-            # This will return a list of 1245.75 3 undefined undefined undefined
+        jobsMetadata = []
+        for _condorIDs in breakListIntoChunks(condorIDs.keys(), 100):
             cmd = ["condor_q"]
             cmd.extend(self.remoteScheddOptions.strip().split(" "))
             cmd.extend(_condorIDs)
-            cmd.extend(["-
+            cmd.extend(["-attributes", STATE_ATTRIBUTES])
+            cmd.extend(["-json"])
             result = self._executeCondorCommand(cmd, keepTokenFile=True)
             if not result["OK"]:
                 return result

-
+            if result["Value"]:
+                jobsMetadata.extend(json.loads(result["Value"]))

             condorHistCall = ["condor_history"]
             condorHistCall.extend(self.remoteScheddOptions.strip().split(" "))
             condorHistCall.extend(_condorIDs)
-            condorHistCall.extend(["-
+            condorHistCall.extend(["-attributes", STATE_ATTRIBUTES])
+            condorHistCall.extend(["-json"])
             result = self._executeCondorCommand(cmd, keepTokenFile=True)
             if not result["OK"]:
                 return result

-
+            if result["Value"]:
+                jobsMetadata.extend(json.loads(result["Value"]))

-
-
+        foundJobIDs = set()
+        for jobDict in jobsMetadata:
+            jobStatus, reason = getCondorStatus(jobDict)
+            condorId = f"{jobDict['ClusterId']}.{jobDict['ProcId']}"
+            jobReference = condorIDs.get(condorId)

             if jobStatus == PilotStatus.ABORTED:
-                self.log.verbose("Job", f"{
+                self.log.verbose("Job", f"{jobReference} held: {reason}")
+
+            resultDict[jobReference] = jobStatus
+            foundJobIDs.add(jobReference)

-
+        # Check if we have any jobs that were not found in the condor_q or condor_history
+        for jobReference in condorIDs.values():
+            if jobReference not in foundJobIDs:
+                self.log.verbose("Job", f"{jobReference} not found in condor_q or condor_history")
+                resultDict[jobReference] = PilotStatus.UNKNOWN

         self.tokenFile = None

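
Note: HTCondorCEComputingElement.getJobStatus() above now keeps a reverse map from the batch ID ("ClusterId.ProcId") back to the original pilot reference, classifies every JSON record with getCondorStatus, and explicitly marks anything missing from both condor_q and condor_history as unknown. A standalone sketch of that bookkeeping follows (no condor calls; `records` stands in for the parsed JSON, and the reference value in the docstring is only a placeholder):

def map_statuses(condor_ids, records, classify):
    """condor_ids: {"1245.0": "<pilot reference>"}; records: parsed condor JSON dicts;
    classify: a function like getCondorStatus returning (status, reason)."""
    result, found = {}, set()
    for record in records:
        condor_id = "%s.%s" % (record["ClusterId"], record["ProcId"])
        reference = condor_ids.get(condor_id)
        if reference is None:
            continue  # a record we did not ask about
        status, _reason = classify(record)
        result[reference] = status
        found.add(reference)
    # Jobs that have vanished from both condor_q and condor_history get an explicit Unknown.
    for reference in condor_ids.values():
        if reference not in found:
            result[reference] = "Unknown"
    return result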

DIRAC/Resources/Computing/InProcessComputingElement.py:

@@ -7,8 +7,8 @@ import os
 import stat

 from DIRAC import S_OK, S_ERROR
-from DIRAC.Core.Utilities.Subprocess import systemCall
 from DIRAC.Core.Utilities.ThreadScheduler import gThreadScheduler
+from DIRAC.Core.Utilities.CGroups2 import CG2Manager

 from DIRAC.Resources.Computing.ComputingElement import ComputingElement


@@ -61,7 +61,9 @@ class InProcessComputingElement(ComputingElement):
         os.chmod(executableFile, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)
         cmd = os.path.abspath(executableFile)
         self.log.verbose("CE submission command:", cmd)
-        result = systemCall(
+        result = CG2Manager().systemCall(
+            0, cmd, callbackFunction=self.sendOutput, env=payloadEnv, ceParameters=self.ceParameters
+        )
         if payloadProxy:
             os.unlink(payloadProxy)


DIRAC/Resources/Computing/LocalComputingElement.py:

@@ -43,7 +43,6 @@ from urllib.parse import urlparse
 from DIRAC import S_OK, S_ERROR

 from DIRAC.Resources.Computing.ComputingElement import ComputingElement
-from DIRAC.Resources.Computing.PilotBundle import bundleProxy, writeScript
 from DIRAC.Core.Utilities.List import uniqueElements
 from DIRAC.Core.Utilities.Subprocess import systemCall


@@ -153,26 +152,12 @@ class LocalComputingElement(ComputingElement):
         if not os.access(executableFile, 5):
             os.chmod(executableFile, stat.S_IRWXU | stat.S_IRGRP | stat.S_IXGRP | stat.S_IROTH | stat.S_IXOTH)

-        # if no proxy is supplied, the executable can be submitted directly
-        # otherwise a wrapper script is needed to get the proxy to the execution node
-        # The wrapper script makes debugging more complicated and thus it is
-        # recommended to transfer a proxy inside the executable if possible.
-        if self.proxy and not proxy:
-            proxy = self.proxy
-        if proxy:
-            self.log.verbose("Setting up proxy for payload")
-            wrapperContent = bundleProxy(executableFile, proxy)
-            name = writeScript(wrapperContent, os.getcwd())
-            submitFile = name
-        else: # no proxy
-            submitFile = executableFile
-
         jobStamps = []
         for _i in range(numberOfJobs):
             jobStamps.append(uuid.uuid4().hex)

         batchDict = {
-            "Executable":
+            "Executable": executableFile,
             "NJobs": numberOfJobs,
             "OutputDir": self.batchOutput,
             "ErrorDir": self.batchError,

@@ -186,8 +171,6 @@
             "NumberOfGPUs": self.numberOfGPUs,
         }
         resultSubmit = self.batchSystem.submitJob(**batchDict)
-        if proxy:
-            os.remove(submitFile)

         if resultSubmit["Status"] == 0:
             self.submittedJobs += len(resultSubmit["Jobs"])