DIRAC 9.0.0a61__py3-none-any.whl → 9.0.0a63__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. DIRAC/ConfigurationSystem/Client/Helpers/Registry.py +35 -7
  2. DIRAC/ConfigurationSystem/Client/LocalConfiguration.py +3 -0
  3. DIRAC/ConfigurationSystem/Client/VOMS2CSSynchronizer.py +8 -1
  4. DIRAC/Core/Security/DiracX.py +1 -1
  5. DIRAC/Core/Security/ProxyInfo.py +9 -5
  6. DIRAC/Core/Tornado/Client/ClientSelector.py +4 -1
  7. DIRAC/Core/Utilities/Extensions.py +10 -1
  8. DIRAC/Core/Utilities/Os.py +32 -1
  9. DIRAC/Core/scripts/dirac_apptainer_exec.py +10 -3
  10. DIRAC/Interfaces/API/Dirac.py +22 -13
  11. DIRAC/Interfaces/API/DiracAdmin.py +17 -5
  12. DIRAC/Interfaces/scripts/dirac_admin_allow_site.py +7 -1
  13. DIRAC/Interfaces/scripts/dirac_admin_ban_site.py +7 -1
  14. DIRAC/MonitoringSystem/Client/Types/WMSHistory.py +4 -0
  15. DIRAC/MonitoringSystem/Service/WebAppHandler.py +68 -1
  16. DIRAC/ResourceStatusSystem/Client/SiteStatus.py +4 -2
  17. DIRAC/ResourceStatusSystem/Utilities/CSHelpers.py +2 -31
  18. DIRAC/ResourceStatusSystem/scripts/dirac_rss_set_status.py +18 -4
  19. DIRAC/Resources/Computing/BatchSystems/Condor.py +23 -4
  20. DIRAC/TransformationSystem/Agent/TaskManagerAgentBase.py +10 -13
  21. DIRAC/TransformationSystem/Agent/TransformationAgent.py +22 -1
  22. DIRAC/TransformationSystem/Agent/TransformationCleaningAgent.py +15 -15
  23. DIRAC/TransformationSystem/Client/Utilities.py +6 -0
  24. DIRAC/WorkloadManagementSystem/Agent/JobCleaningAgent.py +11 -7
  25. DIRAC/WorkloadManagementSystem/Agent/StalledJobAgent.py +3 -26
  26. DIRAC/WorkloadManagementSystem/Agent/StatesAccountingAgent.py +41 -1
  27. DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_JobCleaningAgent.py +7 -9
  28. DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_StalledJobAgent.py +1 -2
  29. DIRAC/WorkloadManagementSystem/Client/JobMonitoringClient.py +4 -11
  30. DIRAC/WorkloadManagementSystem/Client/JobStatus.py +0 -59
  31. DIRAC/WorkloadManagementSystem/Client/SandboxStoreClient.py +25 -38
  32. DIRAC/WorkloadManagementSystem/Client/WMSClient.py +2 -3
  33. DIRAC/WorkloadManagementSystem/DB/JobDB.py +0 -58
  34. DIRAC/WorkloadManagementSystem/DB/SandboxMetadataDB.py +25 -37
  35. DIRAC/WorkloadManagementSystem/Executor/JobSanity.py +3 -3
  36. DIRAC/WorkloadManagementSystem/FutureClient/JobStateUpdateClient.py +2 -14
  37. DIRAC/WorkloadManagementSystem/Service/JobManagerHandler.py +27 -138
  38. DIRAC/WorkloadManagementSystem/Service/JobMonitoringHandler.py +0 -126
  39. DIRAC/WorkloadManagementSystem/Service/JobStateUpdateHandler.py +0 -16
  40. DIRAC/WorkloadManagementSystem/Service/SandboxStoreHandler.py +5 -51
  41. DIRAC/WorkloadManagementSystem/Utilities/JobParameters.py +1 -1
  42. DIRAC/WorkloadManagementSystem/Utilities/PilotWrapper.py +2 -0
  43. DIRAC/WorkloadManagementSystem/Utilities/jobAdministration.py +138 -0
  44. DIRAC/WorkloadManagementSystem/Utilities/test/Test_JobAdministration.py +28 -0
  45. {dirac-9.0.0a61.dist-info → dirac-9.0.0a63.dist-info}/METADATA +2 -1
  46. {dirac-9.0.0a61.dist-info → dirac-9.0.0a63.dist-info}/RECORD +50 -52
  47. {dirac-9.0.0a61.dist-info → dirac-9.0.0a63.dist-info}/entry_points.txt +0 -3
  48. DIRAC/TransformationSystem/scripts/dirac_transformation_archive.py +0 -30
  49. DIRAC/TransformationSystem/scripts/dirac_transformation_clean.py +0 -30
  50. DIRAC/TransformationSystem/scripts/dirac_transformation_remove_output.py +0 -30
  51. DIRAC/WorkloadManagementSystem/Utilities/test/Test_JobManager.py +0 -58
  52. {dirac-9.0.0a61.dist-info → dirac-9.0.0a63.dist-info}/WHEEL +0 -0
  53. {dirac-9.0.0a61.dist-info → dirac-9.0.0a63.dist-info}/licenses/LICENSE +0 -0
  54. {dirac-9.0.0a61.dist-info → dirac-9.0.0a63.dist-info}/top_level.txt +0 -0
@@ -29,6 +29,7 @@ def registerSwitches():
29
29
  ("reason=", "Reason to set the Status"),
30
30
  ("VO=", "VO to change a status for. When omitted, status will be changed for all VOs"),
31
31
  ("tokenOwner=", "Owner of the token"),
32
+ ("days=", "Number of days the token is valid for. Default is 1 day. 0 or less days denotes forever."),
32
33
  )
33
34
 
34
35
  for switch in switches:
@@ -50,6 +51,7 @@ def parseSwitches():
50
51
  switches = dict(Script.getUnprocessedSwitches())
51
52
  switches.setdefault("statusType", None)
52
53
  switches.setdefault("VO", None)
54
+ switches.setdefault("days", 1)
53
55
 
54
56
  for key in ("element", "name", "status", "reason"):
55
57
  if key not in switches:
@@ -183,7 +185,11 @@ def setStatus(switchDict, tokenOwner):
183
185
  )
184
186
  return S_OK()
185
187
 
186
- tomorrow = datetime.utcnow().replace(microsecond=0) + timedelta(days=1)
188
+ tokenLifetime = int(switchDict["days"])
189
+ if tokenLifetime <= 0:
190
+ tokenExpiration = datetime.max
191
+ else:
192
+ tokenExpiration = datetime.utcnow().replace(microsecond=0) + timedelta(days=tokenLifetime)
187
193
 
188
194
  for status, statusType in elements:
189
195
  gLogger.debug(f"{status} {statusType}")
@@ -193,8 +199,16 @@ def setStatus(switchDict, tokenOwner):
193
199
  continue
194
200
 
195
201
  gLogger.debug(
196
- "About to set status %s -> %s for %s, statusType: %s, VO: %s, reason: %s"
197
- % (status, switchDict["status"], switchDict["name"], statusType, switchDict["VO"], switchDict["reason"])
202
+ "About to set status %s -> %s for %s, statusType: %s, VO: %s, reason: %s, days: %s"
203
+ % (
204
+ status,
205
+ switchDict["status"],
206
+ switchDict["name"],
207
+ statusType,
208
+ switchDict["VO"],
209
+ switchDict["reason"],
210
+ switchDict["days"],
211
+ )
198
212
  )
199
213
  result = rssClient.modifyStatusElement(
200
214
  switchDict["element"],
@@ -205,7 +219,7 @@ def setStatus(switchDict, tokenOwner):
205
219
  reason=switchDict["reason"],
206
220
  vO=switchDict["VO"],
207
221
  tokenOwner=tokenOwner,
208
- tokenExpiration=tomorrow,
222
+ tokenExpiration=tokenExpiration,
209
223
  )
210
224
  if not result["OK"]:
211
225
  return result
@@ -78,7 +78,7 @@ on_exit_hold = ExitCode =!= 0
78
78
  on_exit_hold_subcode = %(holdReasonSubcode)s
79
79
  # Jobs are then deleted from the system after N days if they are not idle or running
80
80
  periodic_remove = ((JobStatus == 1) && (NumJobStarts > 0)) || \
81
- ((JobStatus != 1) && (JobStatus != 2) && ((time() - EnteredCurrentStatus) > (%(daysToKeepRemoteLogs)s * 24 * 3600))
81
+ ((JobStatus != 1) && (JobStatus != 2) && ((time() - EnteredCurrentStatus) > (%(daysToKeepRemoteLogs)s * 24 * 3600)))
82
82
 
83
83
  # Specific options
84
84
  # ----------------
@@ -167,7 +167,7 @@ class Condor(object):
167
167
  jdlFile.flush()
168
168
 
169
169
  cmd = "%s; " % preamble if preamble else ""
170
- cmd += "condor_submit %s" % jdlFile.name
170
+ cmd += "condor_submit -spool %s" % jdlFile.name
171
171
  sp = subprocess.Popen(
172
172
  cmd,
173
173
  shell=True,
@@ -286,10 +286,12 @@ class Condor(object):
286
286
  output, error = sp.communicate()
287
287
  status = sp.returncode
288
288
 
289
- if status != 0 or not output:
289
+ if status != 0:
290
290
  resultDict["Status"] = status
291
291
  resultDict["Message"] = error
292
292
  return resultDict
293
+ if not output:
294
+ output = "[]"
293
295
 
294
296
  jobsMetadata = json.loads(output)
295
297
 
@@ -304,10 +306,12 @@ class Condor(object):
304
306
  output, _ = sp.communicate()
305
307
  status = sp.returncode
306
308
 
307
- if status != 0 or not output:
309
+ if status != 0:
308
310
  resultDict["Status"] = status
309
311
  resultDict["Message"] = error
310
312
  return resultDict
313
+ if not output:
314
+ output = "[]"
311
315
 
312
316
  jobsMetadata += json.loads(output)
313
317
 
@@ -399,6 +403,21 @@ class Condor(object):
399
403
  jobDict = {}
400
404
  for jobID in jobIDList:
401
405
  jobDict[jobID] = {}
406
+
407
+ cmd = "condor_transfer_data %s" % jobID
408
+ sp = subprocess.Popen(
409
+ shlex.split(cmd),
410
+ stdout=subprocess.PIPE,
411
+ stderr=subprocess.PIPE,
412
+ universal_newlines=True,
413
+ )
414
+ _, error = sp.communicate()
415
+ status = sp.returncode
416
+ if status != 0:
417
+ resultDict["Status"] = -1
418
+ resultDict["Message"] = error
419
+ return resultDict
420
+
402
421
  jobDict[jobID]["Output"] = "%s/%s.out" % (outputDir, jobID)
403
422
  jobDict[jobID]["Error"] = "%s/%s.err" % (errorDir, jobID)
404
423
 
@@ -7,21 +7,20 @@
7
7
  In case you want to further extend it you are required to follow the note on the
8
8
  initialize method and on the _getClients method.
9
9
  """
10
- import time
11
- import datetime
12
10
  import concurrent.futures
11
+ import datetime
12
+ import time
13
13
 
14
- from DIRAC import S_OK
15
-
14
+ from DIRAC import S_OK, gConfig
15
+ from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
16
16
  from DIRAC.Core.Base.AgentModule import AgentModule
17
17
  from DIRAC.Core.Security.ProxyInfo import getProxyInfo
18
- from DIRAC.Core.Utilities.List import breakListIntoChunks
19
18
  from DIRAC.Core.Utilities.Dictionaries import breakDictionaryIntoChunks
20
- from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
19
+ from DIRAC.Core.Utilities.List import breakListIntoChunks
20
+ from DIRAC.TransformationSystem.Agent.TransformationAgentsUtilities import TransformationAgentsUtilities
21
21
  from DIRAC.TransformationSystem.Client.FileReport import FileReport
22
- from DIRAC.TransformationSystem.Client.WorkflowTasks import WorkflowTasks
23
22
  from DIRAC.TransformationSystem.Client.TransformationClient import TransformationClient
24
- from DIRAC.TransformationSystem.Agent.TransformationAgentsUtilities import TransformationAgentsUtilities
23
+ from DIRAC.TransformationSystem.Client.WorkflowTasks import WorkflowTasks
25
24
  from DIRAC.WorkloadManagementSystem.Client import JobStatus
26
25
  from DIRAC.WorkloadManagementSystem.Client.JobManagerClient import JobManagerClient
27
26
 
@@ -193,11 +192,9 @@ class TaskManagerAgentBase(AgentModule, TransformationAgentsUtilities):
193
192
  else:
194
193
  # Get the transformations which should be submitted
195
194
  self.tasksPerLoop = self.am_getOption("TasksPerLoop", self.tasksPerLoop)
196
- res = self.jobManagerClient.getMaxParametricJobs()
197
- if not res["OK"]:
198
- self.log.warn("Could not get the maxParametricJobs from JobManager", res["Message"])
199
- else:
200
- self.maxParametricJobs = res["Value"]
195
+ self.maxParametricJobs = gConfig.getValue(
196
+ "/Systems/WorkloadManagement/Services/JobManager/MaxParametricJobs", self.maxParametricJobs
197
+ )
201
198
 
202
199
  self._addOperationForTransformations(
203
200
  self.operationsOnTransformationDict,
@@ -1,4 +1,4 @@
1
- """ TransformationAgent processes transformations found in the transformation database.
1
+ """TransformationAgent processes transformations found in the transformation database.
2
2
 
3
3
  The following options can be set for the TransformationAgent.
4
4
 
@@ -8,6 +8,7 @@ The following options can be set for the TransformationAgent.
8
8
  :dedent: 2
9
9
  :caption: TransformationAgent options
10
10
  """
11
+
11
12
  from importlib import import_module
12
13
 
13
14
  import time
@@ -15,6 +16,7 @@ import os
15
16
  import datetime
16
17
  import pickle
17
18
  import concurrent.futures
19
+ from pathlib import Path
18
20
 
19
21
  from DIRAC import S_OK, S_ERROR
20
22
  from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
@@ -127,6 +129,9 @@ class TransformationAgent(AgentModule, TransformationAgentsUtilities):
127
129
  if not res["OK"]:
128
130
  self._logError("Failed to obtain transformations:", res["Message"])
129
131
  return S_OK()
132
+
133
+ active_trans_ids = [t["TransformationID"] for t in res["Value"]]
134
+ self.cleanOldTransformationCache(active_trans_ids)
130
135
  # Process the transformations
131
136
  count = 0
132
137
  future_to_transID = {}
@@ -164,6 +169,22 @@ class TransformationAgent(AgentModule, TransformationAgentsUtilities):
164
169
 
165
170
  return S_OK()
166
171
 
172
+ def cleanOldTransformationCache(self, active_trans_ids: list[int]):
173
+ cache_filenames = {Path(self.__cacheFile(tid)) for tid in active_trans_ids}
174
+ existing_caches = set(Path(self.workDirectory).glob("*.pkl"))
175
+ useless_cache_files = existing_caches - cache_filenames
176
+
177
+ if useless_cache_files:
178
+ self._logInfo(f"Found potentially {len(useless_cache_files)} useless cache files")
179
+
180
+ # Since idle transformations aren't in active_trans_ids, let's filter it more
181
+ # and take only files that haven't been touched for 2 month
182
+ last_update_threshold = (datetime.datetime.utcnow() - datetime.timedelta(days=60)).timestamp()
183
+
184
+ for cache_file in useless_cache_files:
185
+ if Path(cache_file).stat().st_mtime < last_update_threshold:
186
+ cache_file.unlink()
187
+
167
188
  def getTransformations(self):
168
189
  """Obtain the transformations to be executed - this is executed at the start of every loop (it's really the
169
190
  only real thing in the execute()
@@ -16,14 +16,12 @@ from datetime import datetime, timedelta
16
16
 
17
17
  # # from DIRAC
18
18
  from DIRAC import S_ERROR, S_OK
19
- from DIRAC.ConfigurationSystem.Client.ConfigurationData import gConfigurationData
20
19
  from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
21
20
  from DIRAC.Core.Base.AgentModule import AgentModule
22
21
  from DIRAC.Core.Utilities.DErrno import cmpError
23
22
  from DIRAC.Core.Utilities.List import breakListIntoChunks
24
23
  from DIRAC.Core.Utilities.Proxy import executeWithUserProxy
25
24
  from DIRAC.Core.Utilities.ReturnValues import returnSingleResult
26
- from DIRAC.DataManagementSystem.Client.DataManager import DataManager
27
25
  from DIRAC.RequestManagementSystem.Client.File import File
28
26
  from DIRAC.RequestManagementSystem.Client.Operation import Operation
29
27
  from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient
@@ -34,8 +32,12 @@ from DIRAC.Resources.Catalog.FileCatalogClient import FileCatalogClient
34
32
  from DIRAC.Resources.Storage.StorageElement import StorageElement
35
33
  from DIRAC.TransformationSystem.Client import TransformationStatus
36
34
  from DIRAC.TransformationSystem.Client.TransformationClient import TransformationClient
37
- from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
38
- from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
35
+ from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
36
+ from DIRAC.WorkloadManagementSystem.Service.JobPolicy import (
37
+ RIGHT_DELETE,
38
+ RIGHT_KILL,
39
+ )
40
+ from DIRAC.WorkloadManagementSystem.Utilities.jobAdministration import kill_delete_jobs
39
41
 
40
42
  # # agent's name
41
43
  AGENT_NAME = "Transformation/TransformationCleaningAgent"
@@ -59,12 +61,12 @@ class TransformationCleaningAgent(AgentModule):
59
61
 
60
62
  # # transformation client
61
63
  self.transClient = None
62
- # # wms client
63
- self.wmsClient = None
64
64
  # # request client
65
65
  self.reqClient = None
66
66
  # # file catalog client
67
67
  self.metadataClient = None
68
+ # # JobDB
69
+ self.jobDB = None
68
70
 
69
71
  # # transformations types
70
72
  self.transformationTypes = None
@@ -119,14 +121,12 @@ class TransformationCleaningAgent(AgentModule):
119
121
 
120
122
  # # transformation client
121
123
  self.transClient = TransformationClient()
122
- # # wms client
123
- self.wmsClient = WMSClient()
124
124
  # # request client
125
125
  self.reqClient = ReqClient()
126
126
  # # file catalog client
127
127
  self.metadataClient = FileCatalogClient()
128
- # # job monitoring client
129
- self.jobMonitoringClient = JobMonitoringClient()
128
+ # # job DB
129
+ self.jobDB = JobDB()
130
130
 
131
131
  return S_OK()
132
132
 
@@ -224,7 +224,7 @@ class TransformationCleaningAgent(AgentModule):
224
224
  So, we should just clean from time to time.
225
225
  What I added here is done only when the agent finalize, and it's quite light-ish operation anyway.
226
226
  """
227
- res = self.jobMonitoringClient.getJobGroups(None, datetime.utcnow() - timedelta(days=365))
227
+ res = self.jobDB.getDistinctJobAttributes("JobGroup", None, datetime.utcnow() - timedelta(days=365))
228
228
  if not res["OK"]:
229
229
  self.log.error("Failed to get job groups", res["Message"])
230
230
  return res
@@ -268,7 +268,7 @@ class TransformationCleaningAgent(AgentModule):
268
268
 
269
269
  # Remove JobIDs that were unknown to the TransformationSystem
270
270
  jobGroupsToCheck = [str(transDict["TransformationID"]).zfill(8) for transDict in toClean + toArchive]
271
- res = self.jobMonitoringClient.getJobs({"JobGroup": jobGroupsToCheck})
271
+ res = self.jobDB.selectJobs({"JobGroup": jobGroupsToCheck})
272
272
  if not res["OK"]:
273
273
  return res
274
274
  jobIDsToRemove = [int(jobID) for jobID in res["Value"]]
@@ -610,8 +610,8 @@ class TransformationCleaningAgent(AgentModule):
610
610
  # Prevent 0 job IDs
611
611
  jobIDs = [int(j) for j in transJobIDs if int(j)]
612
612
  allRemove = True
613
- for jobList in breakListIntoChunks(jobIDs, 500):
614
- res = self.wmsClient.killJob(jobList, force=True)
613
+ for jobList in breakListIntoChunks(jobIDs, 1000):
614
+ res = kill_delete_jobs(RIGHT_KILL, jobList, force=True)
615
615
  if res["OK"]:
616
616
  self.log.info(f"Successfully killed {len(jobList)} jobs from WMS")
617
617
  elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res):
@@ -623,7 +623,7 @@ class TransformationCleaningAgent(AgentModule):
623
623
  self.log.error("Failed to kill jobs", f"(n={len(res['FailedJobIDs'])})")
624
624
  allRemove = False
625
625
 
626
- res = self.wmsClient.deleteJob(jobList)
626
+ res = kill_delete_jobs(RIGHT_DELETE, jobList, force=True)
627
627
  if res["OK"]:
628
628
  self.log.info("Successfully deleted jobs from WMS", f"(n={len(jobList)})")
629
629
  elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res):
@@ -9,6 +9,8 @@ Utilities for Transformation system
9
9
  import ast
10
10
  import random
11
11
 
12
+ from cachetools import LRUCache, cached
13
+ from cachetools.keys import hashkey
12
14
  from DIRAC import S_OK, S_ERROR, gLogger
13
15
 
14
16
  from DIRAC.Core.Utilities.List import breakListIntoChunks
@@ -400,6 +402,10 @@ class PluginUtilities:
400
402
 
401
403
  return StorageElement(se1).isSameSE(StorageElement(se2))
402
404
 
405
+ @cached(
406
+ LRUCache(maxsize=1024),
407
+ key=lambda _, a, b: hashkey(a, *sorted(b)),
408
+ )
403
409
  def isSameSEInList(self, se1, seList):
404
410
  """Check if an SE is the same as any in a list"""
405
411
  if se1 in seList:
@@ -35,10 +35,12 @@ from DIRAC.RequestManagementSystem.Client.Operation import Operation
35
35
  from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient
36
36
  from DIRAC.RequestManagementSystem.Client.Request import Request
37
37
  from DIRAC.WorkloadManagementSystem.Client import JobStatus
38
- from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
39
- from DIRAC.WorkloadManagementSystem.Client.SandboxStoreClient import SandboxStoreClient
40
38
  from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
41
39
  from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
40
+ from DIRAC.WorkloadManagementSystem.DB.SandboxMetadataDB import SandboxMetadataDB
41
+ from DIRAC.WorkloadManagementSystem.Service.JobPolicy import RIGHT_DELETE
42
+ from DIRAC.WorkloadManagementSystem.Utilities.jobAdministration import kill_delete_jobs
43
+ from DIRAC.WorkloadManagementSystem.Utilities.JobParameters import getJobParameters
42
44
 
43
45
 
44
46
  class JobCleaningAgent(AgentModule):
@@ -152,8 +154,9 @@ class JobCleaningAgent(AgentModule):
152
154
  return S_OK()
153
155
 
154
156
  self.log.info("Unassigning sandboxes from soon to be deleted jobs", f"({len(jobList)})")
155
- result = SandboxStoreClient(useCertificates=True).unassignJobs(jobList)
156
- if not result["OK"]:
157
+
158
+ entitiesList = [f"Job:{jobId}" for jobId in jobList]
159
+ if not (result := SandboxMetadataDB().unassignEntities(entitiesList))["OK"]:
157
160
  self.log.error("Cannot unassign jobs to sandboxes", result["Message"])
158
161
  return result
159
162
 
@@ -229,11 +232,11 @@ class JobCleaningAgent(AgentModule):
229
232
  if not res["OK"]:
230
233
  self.log.error("No DN found", f"for {user}")
231
234
  return res
232
- wmsClient = WMSClient(useCertificates=True, delegatedDN=res["Value"][0], delegatedGroup=ownerGroup)
233
235
  if remove:
236
+ wmsClient = WMSClient(useCertificates=True, delegatedDN=res["Value"][0], delegatedGroup=ownerGroup)
234
237
  result = wmsClient.removeJob(jobsList)
235
238
  else:
236
- result = wmsClient.deleteJob(jobsList)
239
+ result = kill_delete_jobs(RIGHT_DELETE, jobsList)
237
240
  if not result["OK"]:
238
241
  self.log.error(
239
242
  f"Could not {'remove' if remove else 'delete'} jobs",
@@ -293,7 +296,8 @@ class JobCleaningAgent(AgentModule):
293
296
  failed = {}
294
297
  successful = {}
295
298
 
296
- result = JobMonitoringClient().getJobParameters(jobIDList, ["OutputSandboxLFN"])
299
+ jobIDs = [int(jobID) for jobID in jobIDList]
300
+ result = getJobParameters(jobIDs, "OutputSandboxLFN")
297
301
  if not result["OK"]:
298
302
  return result
299
303
  osLFNDict = result["Value"]
@@ -14,16 +14,16 @@ import datetime
14
14
  from DIRAC import S_ERROR, S_OK, gConfig
15
15
  from DIRAC.AccountingSystem.Client.Types.Job import Job
16
16
  from DIRAC.ConfigurationSystem.Client.Helpers import cfgPath
17
- from DIRAC.ConfigurationSystem.Client.Helpers.Registry import getDNForUsername
18
17
  from DIRAC.Core.Base.AgentModule import AgentModule
19
18
  from DIRAC.Core.Utilities import DErrno
20
19
  from DIRAC.Core.Utilities.ClassAd.ClassAdLight import ClassAd
21
20
  from DIRAC.Core.Utilities.TimeUtilities import fromString, second, toEpoch
22
21
  from DIRAC.WorkloadManagementSystem.Client import JobMinorStatus, JobStatus
23
- from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
24
22
  from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
25
23
  from DIRAC.WorkloadManagementSystem.DB.JobLoggingDB import JobLoggingDB
26
24
  from DIRAC.WorkloadManagementSystem.DB.PilotAgentsDB import PilotAgentsDB
25
+ from DIRAC.WorkloadManagementSystem.Service.JobPolicy import RIGHT_KILL
26
+ from DIRAC.WorkloadManagementSystem.Utilities.jobAdministration import kill_delete_jobs
27
27
  from DIRAC.WorkloadManagementSystem.Utilities.JobParameters import getJobParameters
28
28
  from DIRAC.WorkloadManagementSystem.Utilities.Utils import rescheduleJobs
29
29
 
@@ -235,7 +235,7 @@ class StalledJobAgent(AgentModule):
235
235
  # Set the jobs Failed, send them a kill signal in case they are not really dead
236
236
  # and send accounting info
237
237
  if setFailed:
238
- res = self._sendKillCommand(jobID)
238
+ res = kill_delete_jobs(RIGHT_KILL, [jobID], nonauthJobList=[], force=True)
239
239
  if not res["OK"]:
240
240
  self.log.error("Failed to kill job", jobID)
241
241
 
@@ -574,26 +574,3 @@ class StalledJobAgent(AgentModule):
574
574
  continue
575
575
 
576
576
  return S_OK()
577
-
578
- def _sendKillCommand(self, job):
579
- """Send a kill signal to the job such that it cannot continue running.
580
-
581
- :param int job: ID of job to send kill command
582
- """
583
-
584
- res = self.jobDB.getJobAttribute(job, "Owner")
585
- if not res["OK"]:
586
- return res
587
- owner = res["Value"]
588
-
589
- res = self.jobDB.getJobAttribute(job, "OwnerGroup")
590
- if not res["OK"]:
591
- return res
592
- ownerGroup = res["Value"]
593
-
594
- wmsClient = WMSClient(
595
- useCertificates=True,
596
- delegatedDN=getDNForUsername(owner)["Value"][0] if owner else None,
597
- delegatedGroup=ownerGroup,
598
- )
599
- return wmsClient.killJob(job)
@@ -9,10 +9,11 @@
9
9
  """
10
10
  import datetime
11
11
 
12
- from DIRAC import S_ERROR, S_OK
12
+ from DIRAC import S_ERROR, S_OK, gConfig
13
13
  from DIRAC.AccountingSystem.Client.DataStoreClient import DataStoreClient
14
14
  from DIRAC.AccountingSystem.Client.Types.WMSHistory import WMSHistory
15
15
  from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
16
+ from DIRAC.ConfigurationSystem.Client.Helpers.Resources import getSites
16
17
  from DIRAC.Core.Base.AgentModule import AgentModule
17
18
  from DIRAC.Core.Utilities import TimeUtilities
18
19
  from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter
@@ -77,6 +78,8 @@ class StatesAccountingAgent(AgentModule):
77
78
  def execute(self):
78
79
  """Main execution method"""
79
80
 
81
+ site_metadata = self._getSitesMetadata()
82
+
80
83
  # on the first iteration of the agent, do nothing in order to avoid double committing after a restart
81
84
  if self.am_getModuleParam("cyclesDone") == 0:
82
85
  self.log.notice("Skipping the first iteration of the agent")
@@ -131,6 +134,16 @@ class StatesAccountingAgent(AgentModule):
131
134
 
132
135
  for backend in self.datastores:
133
136
  if backend.lower() == "monitoring":
137
+ site_name = rD["Site"]
138
+ if site_name not in site_metadata:
139
+ self.log.warn(
140
+ f"Site {site_name} not found in site metadata, using default values",
141
+ )
142
+ rD["Tier"] = "4"
143
+ rD["Type"] = site_name.split(".")[0]
144
+ else:
145
+ rD["Tier"] = site_metadata[site_name]["Tier"]
146
+ rD["Type"] = site_metadata[site_name]["Type"]
134
147
  rD["timestamp"] = int(TimeUtilities.toEpochMilliSeconds(now))
135
148
  self.datastores["Monitoring"].addRecord(rD)
136
149
 
@@ -154,3 +167,30 @@ class StatesAccountingAgent(AgentModule):
154
167
  self.log.verbose(f"Done committing WMSHistory to {backend} backend")
155
168
 
156
169
  return S_OK()
170
+
171
+ def _getSitesMetadata(self):
172
+ """Get the metadata for the sites"""
173
+ res = getSites()
174
+ if not res["OK"]:
175
+ return res
176
+ sites = res["Value"]
177
+ site_metadata = {}
178
+
179
+ for site in sites:
180
+ site_metadata[site] = {}
181
+
182
+ # Get the site metadata from the Configuration System
183
+ grid = site.split(".")[0]
184
+ res = gConfig.getOptionsDict(f"Resources/Sites/{grid}/{site}")
185
+ if not res["OK"]:
186
+ self.log.error("Failure getting options dict for site", f"{site}: {res['Message']}")
187
+ continue
188
+ siteInfoCS = res["Value"]
189
+
190
+ # The site tier is normally 1 or 2. Few VOs may define tier 3.
191
+ # If the tier is not defined, we assume it is 4, with 4 meaning "not pledged" (opportunistic).
192
+ site_metadata[site]["Tier"] = siteInfoCS.get("MoUTierLevel", "4")
193
+ # The site type is defined by the first part of the site name.
194
+ # It needs to be interpreted at the Monitoring side (e.g. in Grafana).
195
+ site_metadata[site]["Type"] = site.split(".")[0]
196
+ return site_metadata
@@ -1,10 +1,11 @@
1
1
  """ Test class for Job Cleaning Agent
2
2
  """
3
- import pytest
4
3
  from unittest.mock import MagicMock
5
4
 
5
+ import pytest
6
+
6
7
  # DIRAC Components
7
- from DIRAC import gLogger, S_OK
8
+ from DIRAC import S_OK, gLogger
8
9
  from DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent import JobCleaningAgent
9
10
 
10
11
  gLogger.setLevel("DEBUG")
@@ -32,7 +33,6 @@ def jca(mocker):
32
33
  mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.JobDB.selectJobs", side_effect=mockReply)
33
34
  mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.JobDB.__init__", side_effect=mockNone)
34
35
  mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.ReqClient", return_value=mockNone)
35
- mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.JobMonitoringClient", return_value=mockJMC)
36
36
 
37
37
  jca = JobCleaningAgent()
38
38
  jca.log = gLogger
@@ -98,7 +98,7 @@ def test_deleteJobsByStatus(jca, conditions, mockReplyInput, expected):
98
98
  "inputs, params, expected",
99
99
  [
100
100
  ([], {"OK": True, "Value": {}}, {"OK": True, "Value": {"Failed": {}, "Successful": {}}}),
101
- (["a", "b"], {"OK": True, "Value": {}}, {"OK": True, "Value": {"Failed": {}, "Successful": {}}}),
101
+ (["123", "456"], {"OK": True, "Value": {}}, {"OK": True, "Value": {"Failed": {}, "Successful": {}}}),
102
102
  (
103
103
  [],
104
104
  {"OK": True, "Value": {1: {"OutputSandboxLFN": "/some/lfn/1.txt"}}},
@@ -113,11 +113,11 @@ def test_deleteJobsByStatus(jca, conditions, mockReplyInput, expected):
113
113
  {"OK": True, "Value": {"Failed": {}, "Successful": {1: "/some/lfn/1.txt", 2: "/some/other/lfn/2.txt"}}},
114
114
  ),
115
115
  (
116
- ["a", "b"],
116
+ ["123", "456"],
117
117
  {"OK": True, "Value": {1: {"OutputSandboxLFN": "/some/lfn/1.txt"}}},
118
118
  {"OK": True, "Value": {"Failed": {}, "Successful": {1: "/some/lfn/1.txt"}}},
119
119
  ),
120
- (["a", "b"], {"OK": False}, {"OK": False}),
120
+ (["123", "456"], {"OK": False}, {"OK": False}),
121
121
  ],
122
122
  )
123
123
  def test_deleteJobOversizedSandbox(mocker, inputs, params, expected):
@@ -127,10 +127,10 @@ def test_deleteJobOversizedSandbox(mocker, inputs, params, expected):
127
127
  mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.AgentModule.am_getOption", return_value=mockAM)
128
128
  mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.JobDB", return_value=mockNone)
129
129
  mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.ReqClient", return_value=mockNone)
130
- mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.JobMonitoringClient", return_value=mockJMC)
131
130
  mocker.patch(
132
131
  "DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.getDNForUsername", return_value=S_OK(["/bih/boh/DN"])
133
132
  )
133
+ mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.getJobParameters", return_value=params)
134
134
 
135
135
  jobCleaningAgent = JobCleaningAgent()
136
136
  jobCleaningAgent.log = gLogger
@@ -138,8 +138,6 @@ def test_deleteJobOversizedSandbox(mocker, inputs, params, expected):
138
138
  jobCleaningAgent._AgentModule__configDefaults = mockAM
139
139
  jobCleaningAgent.initialize()
140
140
 
141
- mockJMC.getJobParameters.return_value = params
142
-
143
141
  result = jobCleaningAgent.deleteJobOversizedSandbox(inputs)
144
142
 
145
143
  assert result == expected
@@ -28,8 +28,7 @@ def sja(mocker):
28
28
  mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.rescheduleJobs", return_value=MagicMock())
29
29
  mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.PilotAgentsDB", return_value=MagicMock())
30
30
  mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.getJobParameters", return_value=MagicMock())
31
- mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.WMSClient", return_value=MagicMock())
32
- mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.getDNForUsername", return_value=MagicMock())
31
+ mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.kill_delete_jobs", return_value=MagicMock())
33
32
 
34
33
  stalledJobAgent = StalledJobAgent()
35
34
  stalledJobAgent._AgentModule__configDefaults = mockAM
@@ -14,12 +14,14 @@ except ImportError:
14
14
 
15
15
  @createClient("WorkloadManagement/JobMonitoring")
16
16
  class JobMonitoringClient(Client):
17
+ # Set to None to raise an error if this service is set as "legacy adapted"
18
+ # See ClientSelector
19
+ diracxClient = None
20
+
17
21
  def __init__(self, **kwargs):
18
22
  super().__init__(**kwargs)
19
23
  self.setServer("WorkloadManagement/JobMonitoring")
20
24
 
21
- diracxClient = futureJobMonitoringClient
22
-
23
25
  @ignoreEncodeWarning
24
26
  def getJobsStatus(self, jobIDs):
25
27
  res = self._getRPC().getJobsStatus(jobIDs)
@@ -38,15 +40,6 @@ class JobMonitoringClient(Client):
38
40
  res["Value"] = strToIntDict(res["Value"])
39
41
  return res
40
42
 
41
- @ignoreEncodeWarning
42
- def getJobsParameters(self, jobIDs, parameters):
43
- res = self._getRPC().getJobsParameters(jobIDs, parameters)
44
-
45
- # Cast the str keys to int
46
- if res["OK"]:
47
- res["Value"] = strToIntDict(res["Value"])
48
- return res
49
-
50
43
  @ignoreEncodeWarning
51
44
  def getJobsMinorStatus(self, jobIDs):
52
45
  res = self._getRPC().getJobsMinorStatus(jobIDs)