DIRAC 9.0.0a54__py3-none-any.whl → 9.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (166)
  1. DIRAC/AccountingSystem/Client/AccountingCLI.py +0 -140
  2. DIRAC/AccountingSystem/Client/DataStoreClient.py +0 -13
  3. DIRAC/AccountingSystem/Client/Types/BaseAccountingType.py +0 -7
  4. DIRAC/AccountingSystem/ConfigTemplate.cfg +0 -5
  5. DIRAC/AccountingSystem/Service/DataStoreHandler.py +0 -72
  6. DIRAC/ConfigurationSystem/Client/Helpers/CSGlobals.py +0 -9
  7. DIRAC/ConfigurationSystem/Client/Helpers/Registry.py +34 -32
  8. DIRAC/ConfigurationSystem/Client/Helpers/Resources.py +11 -43
  9. DIRAC/ConfigurationSystem/Client/Helpers/test/Test_Helpers.py +0 -16
  10. DIRAC/ConfigurationSystem/Client/LocalConfiguration.py +14 -8
  11. DIRAC/ConfigurationSystem/Client/PathFinder.py +47 -8
  12. DIRAC/ConfigurationSystem/Client/SyncPlugins/CERNLDAPSyncPlugin.py +4 -1
  13. DIRAC/ConfigurationSystem/Client/VOMS2CSSynchronizer.py +9 -2
  14. DIRAC/ConfigurationSystem/Client/test/Test_PathFinder.py +41 -1
  15. DIRAC/ConfigurationSystem/private/RefresherBase.py +4 -2
  16. DIRAC/Core/DISET/ServiceReactor.py +11 -3
  17. DIRAC/Core/DISET/private/BaseClient.py +1 -2
  18. DIRAC/Core/DISET/private/Transports/M2SSLTransport.py +9 -7
  19. DIRAC/Core/Security/DiracX.py +12 -7
  20. DIRAC/Core/Security/IAMService.py +4 -3
  21. DIRAC/Core/Security/ProxyInfo.py +9 -5
  22. DIRAC/Core/Security/test/test_diracx_token_from_pem.py +161 -0
  23. DIRAC/Core/Tornado/Client/ClientSelector.py +4 -1
  24. DIRAC/Core/Tornado/Server/TornadoService.py +1 -1
  25. DIRAC/Core/Utilities/ClassAd/ClassAdLight.py +4 -290
  26. DIRAC/Core/Utilities/DErrno.py +5 -309
  27. DIRAC/Core/Utilities/Extensions.py +10 -1
  28. DIRAC/Core/Utilities/Graphs/GraphData.py +1 -1
  29. DIRAC/Core/Utilities/JDL.py +1 -195
  30. DIRAC/Core/Utilities/List.py +1 -124
  31. DIRAC/Core/Utilities/MySQL.py +101 -97
  32. DIRAC/Core/Utilities/Os.py +32 -1
  33. DIRAC/Core/Utilities/Platform.py +2 -107
  34. DIRAC/Core/Utilities/ReturnValues.py +7 -252
  35. DIRAC/Core/Utilities/StateMachine.py +12 -178
  36. DIRAC/Core/Utilities/TimeUtilities.py +10 -253
  37. DIRAC/Core/Utilities/test/Test_JDL.py +0 -3
  38. DIRAC/Core/Utilities/test/Test_Profiler.py +20 -20
  39. DIRAC/Core/scripts/dirac_agent.py +1 -1
  40. DIRAC/Core/scripts/dirac_apptainer_exec.py +16 -7
  41. DIRAC/Core/scripts/dirac_platform.py +1 -92
  42. DIRAC/DataManagementSystem/Agent/FTS3Agent.py +8 -7
  43. DIRAC/DataManagementSystem/Agent/RequestOperations/RemoveFile.py +7 -6
  44. DIRAC/DataManagementSystem/Client/FTS3Job.py +71 -34
  45. DIRAC/DataManagementSystem/DB/FTS3DB.py +3 -0
  46. DIRAC/DataManagementSystem/DB/FileCatalogComponents/DatasetManager/DatasetManager.py +1 -1
  47. DIRAC/DataManagementSystem/Utilities/DMSHelpers.py +6 -2
  48. DIRAC/DataManagementSystem/scripts/dirac_dms_create_moving_request.py +2 -0
  49. DIRAC/DataManagementSystem/scripts/dirac_dms_protocol_matrix.py +0 -1
  50. DIRAC/FrameworkSystem/Client/ComponentInstaller.py +4 -2
  51. DIRAC/FrameworkSystem/DB/ProxyDB.py +9 -5
  52. DIRAC/FrameworkSystem/Utilities/TokenManagementUtilities.py +3 -2
  53. DIRAC/FrameworkSystem/Utilities/diracx.py +2 -74
  54. DIRAC/FrameworkSystem/private/authorization/AuthServer.py +2 -2
  55. DIRAC/FrameworkSystem/scripts/dirac_login.py +2 -2
  56. DIRAC/FrameworkSystem/scripts/dirac_proxy_init.py +1 -1
  57. DIRAC/Interfaces/API/Dirac.py +27 -13
  58. DIRAC/Interfaces/API/DiracAdmin.py +42 -7
  59. DIRAC/Interfaces/API/Job.py +1 -0
  60. DIRAC/Interfaces/scripts/dirac_admin_allow_site.py +7 -1
  61. DIRAC/Interfaces/scripts/dirac_admin_ban_site.py +7 -1
  62. DIRAC/Interfaces/scripts/dirac_wms_job_parameters.py +0 -1
  63. DIRAC/MonitoringSystem/Client/Types/WMSHistory.py +4 -0
  64. DIRAC/MonitoringSystem/Client/WebAppClient.py +26 -0
  65. DIRAC/MonitoringSystem/ConfigTemplate.cfg +9 -0
  66. DIRAC/MonitoringSystem/DB/MonitoringDB.py +6 -25
  67. DIRAC/MonitoringSystem/Service/MonitoringHandler.py +0 -33
  68. DIRAC/MonitoringSystem/Service/WebAppHandler.py +599 -0
  69. DIRAC/MonitoringSystem/private/MainReporter.py +0 -3
  70. DIRAC/ProductionSystem/scripts/dirac_prod_get_trans.py +2 -3
  71. DIRAC/RequestManagementSystem/Agent/RequestExecutingAgent.py +8 -6
  72. DIRAC/RequestManagementSystem/ConfigTemplate.cfg +6 -6
  73. DIRAC/RequestManagementSystem/DB/test/RMSTestScenari.py +2 -0
  74. DIRAC/ResourceStatusSystem/Client/SiteStatus.py +4 -2
  75. DIRAC/ResourceStatusSystem/Command/FreeDiskSpaceCommand.py +3 -1
  76. DIRAC/ResourceStatusSystem/Utilities/CSHelpers.py +2 -31
  77. DIRAC/ResourceStatusSystem/scripts/dirac_rss_set_status.py +18 -4
  78. DIRAC/Resources/Catalog/RucioFileCatalogClient.py +1 -1
  79. DIRAC/Resources/Computing/AREXComputingElement.py +19 -3
  80. DIRAC/Resources/Computing/BatchSystems/Condor.py +126 -108
  81. DIRAC/Resources/Computing/BatchSystems/SLURM.py +5 -1
  82. DIRAC/Resources/Computing/BatchSystems/test/Test_SLURM.py +46 -0
  83. DIRAC/Resources/Computing/HTCondorCEComputingElement.py +37 -43
  84. DIRAC/Resources/Computing/SingularityComputingElement.py +6 -1
  85. DIRAC/Resources/Computing/test/Test_HTCondorCEComputingElement.py +67 -49
  86. DIRAC/Resources/Computing/test/Test_PoolComputingElement.py +2 -1
  87. DIRAC/Resources/IdProvider/CheckInIdProvider.py +13 -0
  88. DIRAC/Resources/IdProvider/IdProviderFactory.py +11 -3
  89. DIRAC/Resources/Storage/StorageBase.py +4 -2
  90. DIRAC/Resources/Storage/StorageElement.py +4 -4
  91. DIRAC/TransformationSystem/Agent/TaskManagerAgentBase.py +10 -16
  92. DIRAC/TransformationSystem/Agent/TransformationAgent.py +22 -1
  93. DIRAC/TransformationSystem/Agent/TransformationCleaningAgent.py +15 -15
  94. DIRAC/TransformationSystem/Client/Transformation.py +2 -1
  95. DIRAC/TransformationSystem/Client/TransformationClient.py +0 -7
  96. DIRAC/TransformationSystem/Client/Utilities.py +9 -0
  97. DIRAC/TransformationSystem/Service/TransformationManagerHandler.py +0 -336
  98. DIRAC/TransformationSystem/Utilities/ReplicationCLIParameters.py +3 -3
  99. DIRAC/TransformationSystem/scripts/dirac_production_runjoblocal.py +2 -4
  100. DIRAC/TransformationSystem/test/Test_replicationTransformation.py +5 -6
  101. DIRAC/Workflow/Modules/test/Test_Modules.py +5 -0
  102. DIRAC/WorkloadManagementSystem/Agent/JobAgent.py +1 -5
  103. DIRAC/WorkloadManagementSystem/Agent/JobCleaningAgent.py +11 -7
  104. DIRAC/WorkloadManagementSystem/Agent/PilotSyncAgent.py +4 -3
  105. DIRAC/WorkloadManagementSystem/Agent/PushJobAgent.py +13 -13
  106. DIRAC/WorkloadManagementSystem/Agent/SiteDirector.py +10 -13
  107. DIRAC/WorkloadManagementSystem/Agent/StalledJobAgent.py +18 -51
  108. DIRAC/WorkloadManagementSystem/Agent/StatesAccountingAgent.py +41 -1
  109. DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_JobAgent.py +2 -0
  110. DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_JobCleaningAgent.py +7 -9
  111. DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_PushJobAgent.py +1 -0
  112. DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_SiteDirector.py +8 -2
  113. DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_StalledJobAgent.py +4 -5
  114. DIRAC/WorkloadManagementSystem/Client/DownloadInputData.py +7 -5
  115. DIRAC/WorkloadManagementSystem/Client/JobMonitoringClient.py +10 -11
  116. DIRAC/WorkloadManagementSystem/Client/JobState/JobManifest.py +32 -261
  117. DIRAC/WorkloadManagementSystem/Client/JobStateUpdateClient.py +3 -0
  118. DIRAC/WorkloadManagementSystem/Client/JobStatus.py +8 -152
  119. DIRAC/WorkloadManagementSystem/Client/SandboxStoreClient.py +25 -38
  120. DIRAC/WorkloadManagementSystem/Client/WMSClient.py +2 -3
  121. DIRAC/WorkloadManagementSystem/Client/test/Test_Client_DownloadInputData.py +29 -0
  122. DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg +4 -8
  123. DIRAC/WorkloadManagementSystem/DB/JobDB.py +40 -69
  124. DIRAC/WorkloadManagementSystem/DB/JobDBUtils.py +18 -147
  125. DIRAC/WorkloadManagementSystem/DB/JobParametersDB.py +9 -9
  126. DIRAC/WorkloadManagementSystem/DB/PilotAgentsDB.py +3 -2
  127. DIRAC/WorkloadManagementSystem/DB/SandboxMetadataDB.py +28 -39
  128. DIRAC/WorkloadManagementSystem/DB/StatusUtils.py +125 -0
  129. DIRAC/WorkloadManagementSystem/DB/tests/Test_JobDB.py +1 -1
  130. DIRAC/WorkloadManagementSystem/DB/tests/Test_StatusUtils.py +28 -0
  131. DIRAC/WorkloadManagementSystem/Executor/JobSanity.py +3 -3
  132. DIRAC/WorkloadManagementSystem/FutureClient/JobStateUpdateClient.py +2 -14
  133. DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapper.py +14 -9
  134. DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapper.py +36 -10
  135. DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapperTemplate.py +4 -0
  136. DIRAC/WorkloadManagementSystem/Service/JobManagerHandler.py +33 -154
  137. DIRAC/WorkloadManagementSystem/Service/JobMonitoringHandler.py +5 -323
  138. DIRAC/WorkloadManagementSystem/Service/JobStateUpdateHandler.py +0 -16
  139. DIRAC/WorkloadManagementSystem/Service/PilotManagerHandler.py +6 -102
  140. DIRAC/WorkloadManagementSystem/Service/SandboxStoreHandler.py +5 -51
  141. DIRAC/WorkloadManagementSystem/Service/WMSAdministratorHandler.py +16 -79
  142. DIRAC/WorkloadManagementSystem/Utilities/JobModel.py +28 -199
  143. DIRAC/WorkloadManagementSystem/Utilities/JobParameters.py +65 -3
  144. DIRAC/WorkloadManagementSystem/Utilities/JobStatusUtility.py +2 -64
  145. DIRAC/WorkloadManagementSystem/Utilities/ParametricJob.py +7 -171
  146. DIRAC/WorkloadManagementSystem/Utilities/PilotCStoJSONSynchronizer.py +73 -7
  147. DIRAC/WorkloadManagementSystem/Utilities/PilotWrapper.py +2 -0
  148. DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py +16 -0
  149. DIRAC/WorkloadManagementSystem/Utilities/Utils.py +36 -1
  150. DIRAC/WorkloadManagementSystem/Utilities/jobAdministration.py +15 -0
  151. DIRAC/WorkloadManagementSystem/Utilities/test/Test_JobModel.py +1 -5
  152. DIRAC/WorkloadManagementSystem/Utilities/test/Test_ParametricJob.py +45 -128
  153. DIRAC/WorkloadManagementSystem/Utilities/test/Test_PilotWrapper.py +16 -0
  154. DIRAC/__init__.py +55 -54
  155. {dirac-9.0.0a54.dist-info → dirac-9.0.7.dist-info}/METADATA +6 -4
  156. {dirac-9.0.0a54.dist-info → dirac-9.0.7.dist-info}/RECORD +160 -160
  157. {dirac-9.0.0a54.dist-info → dirac-9.0.7.dist-info}/WHEEL +1 -1
  158. {dirac-9.0.0a54.dist-info → dirac-9.0.7.dist-info}/entry_points.txt +0 -3
  159. DIRAC/Core/Utilities/test/Test_List.py +0 -150
  160. DIRAC/Core/Utilities/test/Test_Time.py +0 -88
  161. DIRAC/TransformationSystem/scripts/dirac_transformation_archive.py +0 -30
  162. DIRAC/TransformationSystem/scripts/dirac_transformation_clean.py +0 -30
  163. DIRAC/TransformationSystem/scripts/dirac_transformation_remove_output.py +0 -30
  164. DIRAC/WorkloadManagementSystem/Utilities/test/Test_JobManager.py +0 -58
  165. {dirac-9.0.0a54.dist-info → dirac-9.0.7.dist-info}/licenses/LICENSE +0 -0
  166. {dirac-9.0.0a54.dist-info → dirac-9.0.7.dist-info}/top_level.txt +0 -0
DIRAC/RequestManagementSystem/ConfigTemplate.cfg
@@ -44,19 +44,19 @@ Agents
   {
     PollingTime = 60
     # number of Requests to execute per cycle
-    RequestsPerCycle = 100
+    RequestsPerCycle = 300
     # minimum number of workers process in the ProcessPool
-    MinProcess = 20
+    MinProcess = 50
     # maximum number of workers process in the ProcessPool; recommended to set it to the same value as MinProcess
-    MaxProcess = 20
+    MaxProcess = 50
     # queue depth of the ProcessPool
-    ProcessPoolQueueSize = 20
+    ProcessPoolQueueSize = 100
     # timeout for the ProcessPool finalization
     ProcessPoolTimeout = 900
     # sleep time before retrying to get a free slot in the ProcessPool
-    ProcessPoolSleep = 5
+    ProcessPoolSleep = 1
     # If a positive integer n is given, we fetch n requests at once from the DB. Otherwise, one by one
-    BulkRequest = 0
+    BulkRequest = 300
     OperationHandlers
     {
       ForwardDISET
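To illustrate how the retuned ProcessPool options are consumed, here is a minimal, hypothetical sketch of an agent reading them. It assumes DIRAC's standard AgentModule.am_getOption accessor; the class name MyRequestAgent is invented and this is not the actual RequestExecutingAgent code, the defaults simply mirror the new template values.

    from DIRAC import S_OK
    from DIRAC.Core.Base.AgentModule import AgentModule


    class MyRequestAgent(AgentModule):  # hypothetical agent, for illustration only
        def initialize(self):
            # Defaults mirror the new template values above
            self.requestsPerCycle = self.am_getOption("RequestsPerCycle", 300)
            self.minProcess = self.am_getOption("MinProcess", 50)
            self.maxProcess = self.am_getOption("MaxProcess", 50)
            self.queueSize = self.am_getOption("ProcessPoolQueueSize", 100)
            self.bulkRequest = self.am_getOption("BulkRequest", 300)
            return S_OK()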
DIRAC/RequestManagementSystem/DB/test/RMSTestScenari.py
@@ -6,6 +6,7 @@ integration tests (Test_ReqDB.py)
 # pylint: disable=invalid-name,wrong-import-position
 import time
 
+import pytest
 
 from DIRAC.RequestManagementSystem.Client.Request import Request
 from DIRAC.RequestManagementSystem.Client.Operation import Operation
@@ -42,6 +43,7 @@ def test_stress(reqDB):
     assert delete["OK"], delete
 
 
+@pytest.mark.slow
 def test_stressBulk(reqDB):
     """stress test bulk"""
 
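The new @pytest.mark.slow marker only takes effect if the marker is known to pytest; a minimal, assumed conftest.py registration is sketched below (DIRAC may already register it elsewhere), after which the stress test can be deselected with pytest -m "not slow".

    # Sketch of a conftest.py marker registration (assumption: not necessarily how DIRAC does it)
    def pytest_configure(config):
        config.addinivalue_line("markers", "slow: marks stress tests that take a long time to run")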
DIRAC/ResourceStatusSystem/Client/SiteStatus.py
@@ -1,4 +1,4 @@
-""" SiteStatus helper
+"""SiteStatus helper
 
 Module that acts as a helper for knowing the status of a site.
 It takes care of switching between the CS and the RSS.
@@ -195,7 +195,7 @@ class SiteStatus(metaclass=DIRACSingleton):
 
         return S_OK(siteList)
 
-    def setSiteStatus(self, site, status, comment="No comment"):
+    def setSiteStatus(self, site, status, comment="No comment", expiry=None):
         """
         Set the status of a site in the 'SiteStatus' table of RSS
 
@@ -231,6 +231,8 @@ class SiteStatus(metaclass=DIRACSingleton):
            return S_ERROR(f"Unable to get user proxy info {result['Message']} ")
 
        tokenExpiration = datetime.utcnow() + timedelta(days=1)
+       if expiry:
+           tokenExpiration = expiry
 
        self.rssCache.acquireLock()
        try:
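A short, illustrative use of the new expiry argument of setSiteStatus; the site name and dates are invented, only the signature comes from the hunk above.

    from datetime import datetime, timedelta

    from DIRAC.ResourceStatusSystem.Client.SiteStatus import SiteStatus

    siteStatus = SiteStatus()
    # Without expiry, the status token still expires after one day (previous behaviour)
    siteStatus.setSiteStatus("LCG.Example.org", "Active", "back in production")
    # With expiry, the caller controls the token expiration explicitly
    siteStatus.setSiteStatus(
        "LCG.Example.org", "Banned", "scheduled downtime", expiry=datetime.utcnow() + timedelta(days=7)
    )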
DIRAC/ResourceStatusSystem/Command/FreeDiskSpaceCommand.py
@@ -121,7 +121,9 @@ class FreeDiskSpaceCommand(Command):
                 "Site": siteRes["Value"] if siteRes["Value"] else "unassigned",
             }
 
-            results["Used"] = results["Total"] - results["Free"]
+            # There are sometimes small discrepencies which can lead to negative
+            # used values.
+            results["Used"] = max(0, results["Total"] - results["Free"])
 
             for sType in ["Total", "Free", "Used"]:
                 spaceTokenAccounting = StorageOccupancy()
DIRAC/ResourceStatusSystem/Utilities/CSHelpers.py
@@ -3,11 +3,10 @@ Module containing functions interacting with the CS and useful for the RSS
 modules.
 """
 
-from DIRAC import gConfig, gLogger, S_OK
+from DIRAC import S_OK, gConfig, gLogger
+from DIRAC.ConfigurationSystem.Client.Helpers.Resources import getQueues
 from DIRAC.Core.Utilities.SiteSEMapping import getSEParameters
-from DIRAC.ConfigurationSystem.Client.Helpers.Resources import getQueues, getCESiteMapping
 from DIRAC.DataManagementSystem.Utilities.DMSHelpers import DMSHelpers
-from DIRAC.ResourceStatusSystem.Utilities import Utils
 
 
 def warmUp():
@@ -19,28 +18,6 @@ def warmUp():
     gRefresher.refreshConfigurationIfNeeded()
 
 
-def getResources():
-    """
-    Gets all resources
-    """
-
-    resources = DMSHelpers().getStorageElements()
-
-    fts = getFTS()
-    if fts["OK"]:
-        resources = resources + fts["Value"]
-
-    fc = getFileCatalogs()
-    if fc["OK"]:
-        resources = resources + fc["Value"]
-
-    res = getCESiteMapping()
-    if res["OK"]:
-        resources = resources + list(res["Value"])
-
-    return S_OK(resources)
-
-
 def getStorageElementEndpoint(seName):
     """Get endpoints of a StorageElement
 
@@ -86,12 +63,6 @@ def getFTS():
     return S_OK([])
 
 
-def getSpaceTokenEndpoints():
-    """Get Space Token Endpoints"""
-
-    return Utils.getCSTree("Shares/Disk")
-
-
 def getFileCatalogs():
     """
     Gets all storage elements from /Resources/FileCatalogs
DIRAC/ResourceStatusSystem/scripts/dirac_rss_set_status.py
@@ -29,6 +29,7 @@ def registerSwitches():
         ("reason=", "Reason to set the Status"),
         ("VO=", "VO to change a status for. When omitted, status will be changed for all VOs"),
         ("tokenOwner=", "Owner of the token"),
+        ("days=", "Number of days the token is valid for. Default is 1 day. 0 or less days denotes forever."),
     )
 
     for switch in switches:
@@ -50,6 +51,7 @@ def parseSwitches():
     switches = dict(Script.getUnprocessedSwitches())
     switches.setdefault("statusType", None)
     switches.setdefault("VO", None)
+    switches.setdefault("days", 1)
 
     for key in ("element", "name", "status", "reason"):
         if key not in switches:
@@ -183,7 +185,11 @@ def setStatus(switchDict, tokenOwner):
         )
         return S_OK()
 
-    tomorrow = datetime.utcnow().replace(microsecond=0) + timedelta(days=1)
+    tokenLifetime = int(switchDict["days"])
+    if tokenLifetime <= 0:
+        tokenExpiration = datetime.max
+    else:
+        tokenExpiration = datetime.utcnow().replace(microsecond=0) + timedelta(days=tokenLifetime)
 
     for status, statusType in elements:
         gLogger.debug(f"{status} {statusType}")
@@ -193,8 +199,16 @@ def setStatus(switchDict, tokenOwner):
            continue
 
        gLogger.debug(
-           "About to set status %s -> %s for %s, statusType: %s, VO: %s, reason: %s"
-           % (status, switchDict["status"], switchDict["name"], statusType, switchDict["VO"], switchDict["reason"])
+           "About to set status %s -> %s for %s, statusType: %s, VO: %s, reason: %s, days: %s"
+           % (
+               status,
+               switchDict["status"],
+               switchDict["name"],
+               statusType,
+               switchDict["VO"],
+               switchDict["reason"],
+               switchDict["days"],
+           )
        )
        result = rssClient.modifyStatusElement(
            switchDict["element"],
@@ -205,7 +219,7 @@
            reason=switchDict["reason"],
            vO=switchDict["VO"],
            tokenOwner=tokenOwner,
-           tokenExpiration=tomorrow,
+           tokenExpiration=tokenExpiration,
        )
        if not result["OK"]:
            return result
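The new token lifetime rule, restated in isolation as a small illustrative helper (not part of the script itself): a "days" value of 0 or less means the token never expires, otherwise it expires after that many days.

    from datetime import datetime, timedelta

    def tokenExpirationFromDays(days):
        """Illustrative re-statement of the logic added above."""
        days = int(days)
        if days <= 0:
            return datetime.max  # denotes "forever"
        return datetime.utcnow().replace(microsecond=0) + timedelta(days=days)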
DIRAC/Resources/Catalog/RucioFileCatalogClient.py
@@ -151,7 +151,7 @@ class RucioFileCatalogClient(FileCatalogClientBase):
         self.authHost = options.get("AuthHost", None)
         self.caCertPath = Locations.getCAsLocation()
         try:
-            sLog.info(f"Logging in with a proxy located at: {self.proxyPath}")
+            sLog.debug(f"Logging in with a proxy located at: {self.proxyPath}")
             sLog.debug("account: ", self.username)
             sLog.debug("rucio host: ", self.rucioHost)
             sLog.debug("auth host: ", self.authHost)
DIRAC/Resources/Computing/AREXComputingElement.py
@@ -1,4 +1,4 @@
-""" AREX Computing Element (ARC REST interface)
+"""AREX Computing Element (ARC REST interface)
 
 Allows interacting with ARC AREX services via a REST interface.
 
@@ -807,7 +807,23 @@ class AREXComputingElement(ComputingElement):
             return S_ERROR(f"Failed decoding the status of the CE")
 
         # Look only in the relevant section out of the headache
-        queueInfo = ceData["Domains"]["AdminDomain"]["Services"]["ComputingService"]["ComputingShare"]
+        # This "safe_get" function allows to go down the dictionary
+        # even if some elements are lists instead of dictionaries
+        # and returns None if any element is not found
+        # FIXME: this is a temporary measure to be removed after https://github.com/DIRACGrid/DIRAC/issues/8354
+        def safe_get(d, *keys):
+            for k in keys:
+                if isinstance(d, list):
+                    d = d[0]  # assume first element
+                d = d.get(k) if isinstance(d, dict) else None
+                if d is None:
+                    break
+            return d
+
+        queueInfo = safe_get(ceData, "Domains", "AdminDomain", "Services", "ComputingService", "ComputingShare")
+        if queueInfo is None:
+            self.log.error("Failed to extract queue info")
+
         if not isinstance(queueInfo, list):
             queueInfo = [queueInfo]
 
@@ -819,7 +835,7 @@ class AREXComputingElement(ComputingElement):
         for qi in queueInfo:
             if qi["ID"].endswith(magic):
                 result["RunningJobs"] = int(qi["RunningJobs"])
-                result["WaitingJobs"] = int(qi["WaitingJobs"])
+                result["WaitingJobs"] = int(qi["WaitingJobs"]) + int(qi["StagingJobs"]) + int(qi["PreLRMSWaitingJobs"])
                 break  # Pick the first (should be only ...) matching queue + VO
         else:
             return S_ERROR(f"Could not find the queue {self.queue} associated to VO {vo}")
DIRAC/Resources/Computing/BatchSystems/Condor.py
@@ -6,6 +6,7 @@
 from __future__ import print_function
 from __future__ import absolute_import
 from __future__ import division
+import json
 import re
 import tempfile
 import subprocess
@@ -25,6 +26,8 @@ STATES_MAP = {
 
 HOLD_REASON_SUBCODE = "55"
 
+STATE_ATTRIBUTES = "ClusterId,ProcId,JobStatus,HoldReasonCode,HoldReasonSubCode,HoldReason"
+
 subTemplate = """
 # Environment
 # -----------
@@ -62,6 +65,7 @@ environment = "DIRAC_PILOT_STAMP=$(stamp) %(environment)s"
 
 # Requirements
 # ------------
 request_cpus = %(processors)s
+requirements = NumJobStarts == 0
 
 # Exit options
@@ -73,7 +77,8 @@ on_exit_hold = ExitCode =!= 0
 # A subcode of our choice to identify who put the job on hold
 on_exit_hold_subcode = %(holdReasonSubcode)s
 # Jobs are then deleted from the system after N days if they are not idle or running
-periodic_remove = (JobStatus != 1) && (JobStatus != 2) && ((time() - EnteredCurrentStatus) > (%(daysToKeepRemoteLogs)s * 24 * 3600))
+periodic_remove = ((JobStatus == 1) && (NumJobStarts > 0)) || \
+                  ((JobStatus != 1) && (JobStatus != 2) && ((time() - EnteredCurrentStatus) > (%(daysToKeepRemoteLogs)s * 24 * 3600)))
 
 # Specific options
 # ----------------
@@ -87,63 +92,34 @@ Queue stamp in %(pilotStampList)s
 """
 
 
-def parseCondorStatus(lines, jobID):
+def getCondorStatus(jobMetadata):
     """parse the condor_q or condor_history output for the job status
 
-    :param lines: list of lines from the output of the condor commands, each line is a tuple of jobID, statusID, and holdReasonCode
-    :type lines: python:list
-    :param str jobID: jobID of condor job, e.g.: 123.53
+    :param jobMetadata: dict with job metadata
+    :type jobMetadata: dict[str, str | int]
     :returns: Status as known by DIRAC, and a reason if the job is being held
     """
-    jobID = str(jobID)
-
-    holdReason = ""
-    status = None
-    for line in lines:
-        l = line.strip().split()
-
-        # Make sure the job ID exists
-        if len(l) < 1 or l[0] != jobID:
-            continue
-
-        # Make sure the status is present and is an integer
-        try:
-            status = int(l[1])
-        except (ValueError, IndexError):
-            break
-
-        # Stop here if the status is not held (5): result should be found in STATES_MAP
-        if status != 5:
-            break
-
-        # A job can be held for various reasons,
-        # we need to further investigate with the holdReasonCode & holdReasonSubCode
-        # Details in:
-        # https://htcondor.readthedocs.io/en/latest/classad-attributes/job-classad-attributes.html#HoldReasonCode
-
-        # By default, a held (5) job is defined as Aborted in STATES_MAP, but there might be some exceptions
-        status = 3
-        try:
-            holdReasonCode = l[2]
-            holdReasonSubcode = l[3]
-            holdReason = " ".join(l[4:])
-        except IndexError:
-            # This should not happen in theory
-            # Just set the status to unknown such as
-            status = None
-            holdReasonCode = "undefined"
-            holdReasonSubcode = "undefined"
-            break
-
-        # If holdReasonCode is 3 (The PERIODIC_HOLD expression evaluated to True. Or, ON_EXIT_HOLD was true)
-        # And subcode is HOLD_REASON_SUBCODE, then it means the job failed by itself, it needs to be marked as Failed
-        if holdReasonCode == "3" and holdReasonSubcode == HOLD_REASON_SUBCODE:
-            status = 5
-        # If holdReasonCode is 16 (Input files are being spooled), the job should be marked as Waiting
-        elif holdReasonCode == "16":
-            status = 1
-
-    return (STATES_MAP.get(status, "Unknown"), holdReason)
+    if jobMetadata["JobStatus"] != 5:
+        # If the job is not held, we can return the status directly
+        return (STATES_MAP.get(jobMetadata["JobStatus"], "Unknown"), "")
+
+    # A job can be held for various reasons,
+    # we need to further investigate with the holdReasonCode & holdReasonSubCode
+    # Details in:
+    # https://htcondor.readthedocs.io/en/latest/classad-attributes/job-classad-attributes.html#HoldReasonCode
+
+    # By default, a held (5) job is defined as Aborted in STATES_MAP, but there might be some exceptions
+    status = 3
+
+    # If holdReasonCode is 3 (The PERIODIC_HOLD expression evaluated to True. Or, ON_EXIT_HOLD was true)
+    # And subcode is HOLD_REASON_SUBCODE, then it means the job failed by itself, it needs to be marked as Failed
+    if jobMetadata["HoldReasonCode"] == 3 and jobMetadata["HoldReasonSubCode"] == HOLD_REASON_SUBCODE:
+        status = 5
+    # If holdReasonCode is 16 (Input files are being spooled), the job should be marked as Waiting
+    elif jobMetadata["HoldReasonCode"] == 16:
+        status = 1
+
+    return (STATES_MAP.get(status, "Unknown"), jobMetadata["HoldReason"])
 
 
 class Condor(object):
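Illustrative inputs for the new getCondorStatus(), using the classad attribute names from STATE_ATTRIBUTES; the dictionaries and hold reasons are invented, and the returned DIRAC status strings come from the module's STATES_MAP (per the comments above: a held job defaults to Aborted, hold code 16 means Waiting, hold code 3 with subcode 55 means Failed).

    running = {"JobStatus": 2, "HoldReasonCode": 0, "HoldReasonSubCode": "0", "HoldReason": ""}
    spooling = {"JobStatus": 5, "HoldReasonCode": 16, "HoldReasonSubCode": "0", "HoldReason": "Spooling input data files"}
    failed = {"JobStatus": 5, "HoldReasonCode": 3, "HoldReasonSubCode": "55", "HoldReason": "Job raised a non-zero exit code"}

    # getCondorStatus(running)  -> (STATES_MAP[2], "")                                 # not held: mapped directly
    # getCondorStatus(spooling) -> (STATES_MAP[1], "Spooling input data files")        # held, code 16: Waiting
    # getCondorStatus(failed)   -> (STATES_MAP[5], "Job raised a non-zero exit code")  # held, code 3 / subcode 55: Failed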
@@ -171,8 +147,6 @@ class Condor(object):
         preamble = kwargs.get("Preamble")
 
         jdlFile = tempfile.NamedTemporaryFile(dir=outputDir, suffix=".jdl", mode="wt")
-        scheddOptions = 'requirements = OpSys == "LINUX"\n'
-        scheddOptions += "gentenv = False"
         jdlFile.write(
             subTemplate
             % dict(
@@ -185,7 +159,7 @@
                 holdReasonSubcode=HOLD_REASON_SUBCODE,
                 daysToKeepRemoteLogs=1,
                 scheddOptions="",
-                extraString="",
+                extraString=submitOptions,
                 pilotStampList=",".join(stamps),
             )
         )
@@ -193,7 +167,7 @@
         jdlFile.flush()
 
         cmd = "%s; " % preamble if preamble else ""
-        cmd += "condor_submit %s %s" % (submitOptions, jdlFile.name)
+        cmd += "condor_submit -spool %s" % jdlFile.name
         sp = subprocess.Popen(
             cmd,
             shell=True,
@@ -283,7 +257,6 @@
 
     def getJobStatus(self, **kwargs):
         """Get status of the jobs in the given list"""
-
         resultDict = {}
 
         MANDATORY_PARAMETERS = ["JobIDList"]
@@ -299,15 +272,11 @@
             resultDict["Message"] = "Empty job list"
             return resultDict
 
-        user = kwargs.get("User")
-        if not user:
-            user = os.environ.get("USER")
-        if not user:
-            resultDict["Status"] = -1
-            resultDict["Message"] = "No user name"
-            return resultDict
+        # Prepare the command to get the status of the jobs
+        cmdJobs = " ".join(str(jobID) for jobID in jobIDList)
 
-        cmd = "condor_q -submitter %s -af:j JobStatus HoldReasonCode HoldReasonSubCode HoldReason" % user
+        # Get the status of the jobs currently active
+        cmd = "condor_q %s -attributes %s -json" % (cmdJobs, STATE_ATTRIBUTES)
         sp = subprocess.Popen(
             shlex.split(cmd),
             stdout=subprocess.PIPE,
@@ -321,12 +290,13 @@
             resultDict["Status"] = status
             resultDict["Message"] = error
             return resultDict
+        if not output:
+            output = "[]"
 
-        qList = output.strip().split("\n")
+        jobsMetadata = json.loads(output)
 
-        condorHistCall = (
-            "condor_history -af:j JobStatus HoldReasonCode HoldReasonSubCode HoldReason -submitter %s" % user
-        )
+        # Get the status of the jobs in the history
+        condorHistCall = "condor_history %s -attributes %s -json" % (cmdJobs, STATE_ATTRIBUTES)
         sp = subprocess.Popen(
             shlex.split(condorHistCall),
             stdout=subprocess.PIPE,
@@ -335,15 +305,28 @@
         )
         output, _ = sp.communicate()
         status = sp.returncode
-        if status == 0:
-            for line in output.split("\n"):
-                qList.append(line)
+
+        if status != 0:
+            resultDict["Status"] = status
+            resultDict["Message"] = error
+            return resultDict
+        if not output:
+            output = "[]"
+
+        jobsMetadata += json.loads(output)
 
         statusDict = {}
-        if len(qList):
-            for job in jobIDList:
-                job = str(job)
-                statusDict[job], _ = parseCondorStatus(qList, job)
+        # Build a set of job IDs found in jobsMetadata
+        foundJobIDs = set()
+        for jobDict in jobsMetadata:
+            jobID = "%s.%s" % (jobDict["ClusterId"], jobDict["ProcId"])
+            statusDict[jobID], _ = getCondorStatus(jobDict)
+            foundJobIDs.add(jobID)
+
+        # For job IDs not found, set status to "Unknown"
+        for jobID in jobIDList:
+            if str(jobID) not in foundJobIDs:
+                statusDict[str(jobID)] = "Unknown"
 
         # Final output
         status = 0
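For context, this is the shape of condor_q/condor_history -json output that the rewritten getJobStatus() assumes: a JSON list of classads restricted to STATE_ATTRIBUTES, keyed back to "ClusterId.ProcId" job references. The values below are invented for illustration.

    import json

    output = """[
      {"ClusterId": 1234, "ProcId": 0, "JobStatus": 2, "HoldReasonCode": 0, "HoldReasonSubCode": 0, "HoldReason": ""},
      {"ClusterId": 1234, "ProcId": 1, "JobStatus": 5, "HoldReasonCode": 16, "HoldReasonSubCode": 0, "HoldReason": "Spooling input data files"}
    ]"""

    for jobDict in json.loads(output):
        jobID = "%s.%s" % (jobDict["ClusterId"], jobDict["ProcId"])  # e.g. "1234.0"
        # statusDict[jobID], _ = getCondorStatus(jobDict)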
@@ -355,19 +338,30 @@
         """Get the overall status of the CE"""
         resultDict = {}
 
-        user = kwargs.get("User")
-        if not user:
-            user = os.environ.get("USER")
-        if not user:
+        cmd = "condor_q -totals -json"
+        sp = subprocess.Popen(
+            shlex.split(cmd),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            universal_newlines=True,
+        )
+        output, error = sp.communicate()
+        status = sp.returncode
+
+        if status != 0 or not output:
             resultDict["Status"] = -1
-            resultDict["Message"] = "No user name"
+            resultDict["Message"] = error
             return resultDict
 
-        waitingJobs = 0
-        runningJobs = 0
+        jresult = json.loads(output)
+        resultDict["Status"] = 0
+        resultDict["Waiting"] = jresult[0]["Idle"]
+        resultDict["Running"] = jresult[0]["Running"]
 
+        # We also need to check the hold jobs, some of them are actually waiting (e.g. for input files)
+        cmd = 'condor_q -json -constraint "JobStatus == 5" -attributes HoldReasonCode'
         sp = subprocess.Popen(
-            shlex.split("condor_q -submitter %s" % user),
+            shlex.split(cmd),
             stdout=subprocess.PIPE,
             stderr=subprocess.PIPE,
             universal_newlines=True,
@@ -376,33 +370,57 @@
         status = sp.returncode
 
         if status != 0:
-            if "no record" in output:
-                resultDict["Status"] = 0
-                resultDict["Waiting"] = waitingJobs
-                resultDict["Running"] = runningJobs
-                return resultDict
-            resultDict["Status"] = status
+            resultDict["Status"] = -1
             resultDict["Message"] = error
             return resultDict
 
-        if "no record" in output:
-            resultDict["Status"] = 0
-            resultDict["Waiting"] = waitingJobs
-            resultDict["Running"] = runningJobs
+        # If there are no held jobs, we can return the result
+        if not output:
             return resultDict
 
-        if output:
-            lines = output.split("\n")
-            for line in lines:
-                if not line.strip():
-                    continue
-                if " I " in line:
-                    waitingJobs += 1
-                elif " R " in line:
-                    runningJobs += 1
+        jresult = json.loads(output)
+        for job_metadata in jresult:
+            if job_metadata["HoldReasonCode"] == 16:
+                resultDict["Waiting"] += 1
+
+        return resultDict
+
+    def getJobOutputFiles(self, **kwargs):
+        """Get output file names and templates for the specific CE"""
+        resultDict = {}
+
+        MANDATORY_PARAMETERS = ["JobIDList", "OutputDir", "ErrorDir"]
+        for argument in MANDATORY_PARAMETERS:
+            if argument not in kwargs:
+                resultDict["Status"] = -1
+                resultDict["Message"] = "No %s" % argument
+                return resultDict
+
+        outputDir = kwargs["OutputDir"]
+        errorDir = kwargs["ErrorDir"]
+        jobIDList = kwargs["JobIDList"]
+
+        jobDict = {}
+        for jobID in jobIDList:
+            jobDict[jobID] = {}
+
+            cmd = "condor_transfer_data %s" % jobID
+            sp = subprocess.Popen(
+                shlex.split(cmd),
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                universal_newlines=True,
+            )
+            _, error = sp.communicate()
+            status = sp.returncode
+            if status != 0:
+                resultDict["Status"] = -1
+                resultDict["Message"] = error
+                return resultDict
+
+            jobDict[jobID]["Output"] = "%s/%s.out" % (outputDir, jobID)
+            jobDict[jobID]["Error"] = "%s/%s.err" % (errorDir, jobID)
 
-        # Final output
         resultDict["Status"] = 0
-        resultDict["Waiting"] = waitingJobs
-        resultDict["Running"] = runningJobs
+        resultDict["Jobs"] = jobDict
         return resultDict
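A hypothetical call of the new getJobOutputFiles(); the directories and job IDs are made up, condor_transfer_data must be available on the submit host, and instantiating Condor() directly mirrors what the batch-system unit tests do.

    condor = Condor()
    result = condor.getJobOutputFiles(
        JobIDList=["1234.0", "1234.1"],
        OutputDir="/path/to/output",
        ErrorDir="/path/to/error",
    )
    if result["Status"] == 0:
        for jobID, files in result["Jobs"].items():
            print(jobID, files["Output"], files["Error"])  # e.g. /path/to/output/1234.0.out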
DIRAC/Resources/Computing/BatchSystems/SLURM.py
@@ -40,6 +40,7 @@ class SLURM(object):
         executable = kwargs["Executable"]
         account = kwargs.get("Account", "")
         numberOfProcessors = kwargs.get("NumberOfProcessors", 1)
+        wholeNode = kwargs.get("WholeNode", False)
         # numberOfNodes is treated as a string as it can contain values such as "2-4"
         # where 2 would represent the minimum number of nodes to allocate, and 4 the maximum
         numberOfNodes = kwargs.get("NumberOfNodes", "1")
@@ -72,7 +73,10 @@
         # One pilot (task) per node, allocating a certain number of processors
         cmd += "--ntasks-per-node=1 "
         cmd += "--nodes=%s " % numberOfNodes
-        cmd += "--cpus-per-task=%d " % numberOfProcessors
+        if wholeNode:
+            cmd += "--exclusive "
+        else:
+            cmd += "--cpus-per-task=%d " % numberOfProcessors
         if numberOfGPUs:
             cmd += "--gpus-per-task=%d " % int(numberOfGPUs)
         # Additional options
DIRAC/Resources/Computing/BatchSystems/test/Test_SLURM.py
@@ -198,3 +198,49 @@ def test_getJobOutputFiles(numberOfNodes, outputContent, expectedContent):
 
     os.remove(outputFile)
     os.remove(errorFile)
+
+
+def test_submitJob_cmd_generation(mocker):
+    """Test submitJob() command string generation for various kwargs"""
+    slurm = SLURM()
+    # Mock subprocess.Popen to capture the command
+    popen_mock = mocker.patch("subprocess.Popen")
+    process_mock = popen_mock.return_value
+    process_mock.communicate.return_value = ("Submitted batch job 1234\n", "")
+    process_mock.returncode = 0
+
+    # Minimal kwargs
+    kwargs = {
+        "Executable": "/bin/echo",
+        "OutputDir": "/tmp",
+        "ErrorDir": "/tmp",
+        "Queue": "testq",
+        "SubmitOptions": "",
+        "JobStamps": ["stamp1"],
+        "NJobs": 1,
+    }
+    # Test default (WholeNode False)
+    slurm.submitJob(**kwargs)
+    cmd = popen_mock.call_args[0][0]
+    assert "--cpus-per-task=1" in cmd
+    assert "--exclusive" not in cmd
+
+    # Test WholeNode True disables --cpus-per-task and adds --exclusive
+    kwargs["WholeNode"] = True
+    slurm.submitJob(**kwargs)
+    cmd = popen_mock.call_args[0][0]
+    assert "--exclusive" in cmd
+    assert "--cpus-per-task" not in cmd
+
+    # Test NumberOfProcessors
+    kwargs["WholeNode"] = False
+    kwargs["NumberOfProcessors"] = 8
+    slurm.submitJob(**kwargs)
+    cmd = popen_mock.call_args[0][0]
+    assert "--cpus-per-task=8" in cmd
+
+    # Test NumberOfGPUs
+    kwargs["NumberOfGPUs"] = 2
+    slurm.submitJob(**kwargs)
+    cmd = popen_mock.call_args[0][0]
+    assert "--gpus-per-task=2" in cmd