DIRAC 9.0.0a69__py3-none-any.whl → 9.0.0a70__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. DIRAC/AccountingSystem/Client/Types/Network.py +8 -8
  2. DIRAC/AccountingSystem/Client/Types/PilotSubmission.py +3 -3
  3. DIRAC/ConfigurationSystem/Client/CSAPI.py +11 -1
  4. DIRAC/ConfigurationSystem/Client/Helpers/CSGlobals.py +0 -9
  5. DIRAC/ConfigurationSystem/Client/Helpers/Registry.py +3 -29
  6. DIRAC/ConfigurationSystem/Client/SyncPlugins/CERNLDAPSyncPlugin.py +4 -1
  7. DIRAC/ConfigurationSystem/ConfigTemplate.cfg +3 -0
  8. DIRAC/ConfigurationSystem/private/Modificator.py +11 -3
  9. DIRAC/ConfigurationSystem/private/RefresherBase.py +4 -2
  10. DIRAC/Core/DISET/ServiceReactor.py +11 -3
  11. DIRAC/Core/DISET/private/Transports/M2SSLTransport.py +9 -7
  12. DIRAC/Core/Security/DiracX.py +11 -6
  13. DIRAC/Core/Security/test/test_diracx_token_from_pem.py +161 -0
  14. DIRAC/Core/Tornado/Server/TornadoService.py +1 -1
  15. DIRAC/Core/Utilities/ElasticSearchDB.py +1 -2
  16. DIRAC/Core/Utilities/Subprocess.py +66 -57
  17. DIRAC/Core/Utilities/test/Test_Profiler.py +20 -20
  18. DIRAC/Core/Utilities/test/Test_Subprocess.py +58 -8
  19. DIRAC/Core/scripts/dirac_apptainer_exec.py +8 -8
  20. DIRAC/DataManagementSystem/Agent/FTS3Agent.py +8 -7
  21. DIRAC/DataManagementSystem/Client/DataManager.py +6 -7
  22. DIRAC/DataManagementSystem/Client/FTS3Job.py +125 -34
  23. DIRAC/DataManagementSystem/Client/test/Test_FTS3Objects.py +1 -0
  24. DIRAC/DataManagementSystem/Client/test/Test_scitag.py +69 -0
  25. DIRAC/DataManagementSystem/DB/FileCatalogComponents/DatasetManager/DatasetManager.py +1 -1
  26. DIRAC/DataManagementSystem/scripts/dirac_dms_create_moving_request.py +2 -0
  27. DIRAC/FrameworkSystem/DB/InstalledComponentsDB.py +3 -2
  28. DIRAC/FrameworkSystem/DB/ProxyDB.py +9 -5
  29. DIRAC/FrameworkSystem/Utilities/MonitoringUtilities.py +1 -0
  30. DIRAC/FrameworkSystem/Utilities/TokenManagementUtilities.py +3 -2
  31. DIRAC/FrameworkSystem/Utilities/diracx.py +41 -10
  32. DIRAC/FrameworkSystem/scripts/dirac_login.py +2 -2
  33. DIRAC/FrameworkSystem/scripts/dirac_proxy_init.py +1 -1
  34. DIRAC/FrameworkSystem/scripts/dirac_uninstall_component.py +1 -0
  35. DIRAC/Interfaces/API/Dirac.py +3 -6
  36. DIRAC/Interfaces/Utilities/DConfigCache.py +2 -0
  37. DIRAC/MonitoringSystem/DB/MonitoringDB.py +6 -5
  38. DIRAC/MonitoringSystem/Service/WebAppHandler.py +25 -6
  39. DIRAC/MonitoringSystem/private/MainReporter.py +0 -3
  40. DIRAC/RequestManagementSystem/Agent/RequestExecutingAgent.py +8 -6
  41. DIRAC/RequestManagementSystem/ConfigTemplate.cfg +6 -6
  42. DIRAC/ResourceStatusSystem/Command/FreeDiskSpaceCommand.py +3 -1
  43. DIRAC/Resources/Computing/AREXComputingElement.py +18 -2
  44. DIRAC/Resources/Computing/BatchSystems/Condor.py +0 -3
  45. DIRAC/Resources/Computing/BatchSystems/executeBatch.py +15 -7
  46. DIRAC/Resources/Computing/LocalComputingElement.py +0 -2
  47. DIRAC/Resources/Computing/SSHComputingElement.py +61 -38
  48. DIRAC/Resources/IdProvider/CheckInIdProvider.py +13 -0
  49. DIRAC/Resources/IdProvider/IdProviderFactory.py +13 -3
  50. DIRAC/Resources/IdProvider/tests/Test_IdProviderFactory.py +7 -0
  51. DIRAC/Resources/Storage/FileStorage.py +121 -2
  52. DIRAC/TransformationSystem/Agent/InputDataAgent.py +4 -1
  53. DIRAC/TransformationSystem/Agent/MCExtensionAgent.py +5 -2
  54. DIRAC/TransformationSystem/Agent/TaskManagerAgentBase.py +3 -4
  55. DIRAC/TransformationSystem/Agent/TransformationCleaningAgent.py +44 -9
  56. DIRAC/TransformationSystem/Agent/ValidateOutputDataAgent.py +4 -2
  57. DIRAC/TransformationSystem/Client/TransformationClient.py +9 -1
  58. DIRAC/TransformationSystem/Client/Utilities.py +6 -3
  59. DIRAC/TransformationSystem/DB/TransformationDB.py +105 -43
  60. DIRAC/TransformationSystem/Utilities/ReplicationCLIParameters.py +3 -3
  61. DIRAC/TransformationSystem/scripts/dirac_production_runjoblocal.py +2 -4
  62. DIRAC/TransformationSystem/test/Test_replicationTransformation.py +5 -6
  63. DIRAC/WorkloadManagementSystem/Agent/SiteDirector.py +8 -11
  64. DIRAC/WorkloadManagementSystem/Agent/StalledJobAgent.py +39 -7
  65. DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_SiteDirector.py +8 -2
  66. DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_StalledJobAgent.py +24 -4
  67. DIRAC/WorkloadManagementSystem/Client/DownloadInputData.py +4 -3
  68. DIRAC/WorkloadManagementSystem/ConfigTemplate.cfg +3 -3
  69. DIRAC/WorkloadManagementSystem/DB/JobParametersDB.py +8 -8
  70. DIRAC/WorkloadManagementSystem/DB/SandboxMetadataDB.py +1 -1
  71. DIRAC/WorkloadManagementSystem/DB/StatusUtils.py +48 -21
  72. DIRAC/WorkloadManagementSystem/DB/tests/Test_StatusUtils.py +19 -4
  73. DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapper.py +3 -4
  74. DIRAC/WorkloadManagementSystem/JobWrapper/Watchdog.py +16 -45
  75. DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapper.py +18 -9
  76. DIRAC/WorkloadManagementSystem/Service/JobManagerHandler.py +25 -2
  77. DIRAC/WorkloadManagementSystem/Service/WMSAdministratorHandler.py +18 -31
  78. DIRAC/WorkloadManagementSystem/Utilities/PilotCStoJSONSynchronizer.py +4 -1
  79. {dirac-9.0.0a69.dist-info → dirac-9.0.0a70.dist-info}/METADATA +6 -5
  80. {dirac-9.0.0a69.dist-info → dirac-9.0.0a70.dist-info}/RECORD +84 -82
  81. {dirac-9.0.0a69.dist-info → dirac-9.0.0a70.dist-info}/WHEEL +0 -0
  82. {dirac-9.0.0a69.dist-info → dirac-9.0.0a70.dist-info}/entry_points.txt +0 -0
  83. {dirac-9.0.0a69.dist-info → dirac-9.0.0a70.dist-info}/licenses/LICENSE +0 -0
  84. {dirac-9.0.0a69.dist-info → dirac-9.0.0a70.dist-info}/top_level.txt +0 -0
@@ -17,11 +17,9 @@ from DIRAC.ConfigurationSystem.Client.Helpers import cfgPath
17
17
  from DIRAC.Core.Base.AgentModule import AgentModule
18
18
  from DIRAC.Core.Utilities import DErrno
19
19
  from DIRAC.Core.Utilities.ClassAd.ClassAdLight import ClassAd
20
+ from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader
20
21
  from DIRAC.Core.Utilities.TimeUtilities import fromString, second, toEpoch
21
22
  from DIRAC.WorkloadManagementSystem.Client import JobMinorStatus, JobStatus
22
- from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
23
- from DIRAC.WorkloadManagementSystem.DB.JobLoggingDB import JobLoggingDB
24
- from DIRAC.WorkloadManagementSystem.DB.PilotAgentsDB import PilotAgentsDB
25
23
  from DIRAC.WorkloadManagementSystem.Service.JobPolicy import RIGHT_KILL
26
24
  from DIRAC.WorkloadManagementSystem.DB.StatusUtils import kill_delete_jobs
27
25
  from DIRAC.WorkloadManagementSystem.Utilities.JobParameters import getJobParameters
@@ -40,6 +38,9 @@ class StalledJobAgent(AgentModule):
40
38
 
41
39
  self.jobDB = None
42
40
  self.logDB = None
41
+ self.taskQueueDB = None
42
+ self.pilotAgentsDB = None
43
+ self.storageManagementDB = None
43
44
  self.matchedTime = 7200
44
45
  self.rescheduledTime = 600
45
46
  self.submittingTime = 300
@@ -51,8 +52,30 @@ class StalledJobAgent(AgentModule):
51
52
  #############################################################################
52
53
  def initialize(self):
53
54
  """Sets default parameters."""
54
- self.jobDB = JobDB()
55
- self.logDB = JobLoggingDB()
55
+ result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.JobDB", "JobDB")
56
+ if not result["OK"]:
57
+ return result
58
+ self.jobDB = result["Value"]()
59
+
60
+ result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.JobLoggingDB", "JobLoggingDB")
61
+ if not result["OK"]:
62
+ return result
63
+ self.logDB = result["Value"]()
64
+
65
+ result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.TaskQueueDB", "TaskQueueDB")
66
+ if not result["OK"]:
67
+ return result
68
+ self.taskQueueDB = result["Value"]()
69
+
70
+ result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.PilotAgentsDB", "PilotAgentsDB")
71
+ if not result["OK"]:
72
+ return result
73
+ self.pilotAgentsDB = result["Value"]()
74
+
75
+ result = ObjectLoader().loadObject("StorageManagementSystem.DB.StorageManagementDB", "StorageManagementDB")
76
+ if not result["OK"]:
77
+ return result
78
+ self.storageManagementDB = result["Value"]()
56
79
 
57
80
  # getting parameters
58
81
 
@@ -235,7 +258,16 @@ class StalledJobAgent(AgentModule):
235
258
  # Set the jobs Failed, send them a kill signal in case they are not really dead
236
259
  # and send accounting info
237
260
  if setFailed:
238
- res = kill_delete_jobs(RIGHT_KILL, [jobID], nonauthJobList=[], force=True)
261
+ res = kill_delete_jobs(
262
+ RIGHT_KILL,
263
+ [jobID],
264
+ nonauthJobList=[],
265
+ force=True,
266
+ jobdb=self.jobDB,
267
+ taskqueuedb=self.taskQueueDB,
268
+ pilotagentsdb=self.pilotAgentsDB,
269
+ storagemanagementdb=self.storageManagementDB,
270
+ )
239
271
  if not res["OK"]:
240
272
  self.log.error("Failed to kill job", jobID)
241
273
 
@@ -262,7 +294,7 @@ class StalledJobAgent(AgentModule):
262
294
  # There is no pilot reference, hence its status is unknown
263
295
  return S_OK("NoPilot")
264
296
 
265
- result = PilotAgentsDB().getPilotInfo(pilotReference)
297
+ result = self.pilotAgentsDB.getPilotInfo(pilotReference)
266
298
  if not result["OK"]:
267
299
  if DErrno.cmpError(result, DErrno.EWMSNOPILOT):
268
300
  self.log.warn("No pilot found", f"for job {jobID}: {result['Message']}")
@@ -169,10 +169,16 @@ def sd(mocker, config):
169
169
  gConfig.getSections("Resources/Sites/LCG")["Value"] + gConfig.getSections("Resources/Sites/DIRAC")["Value"]
170
170
  )
171
171
  mocker.patch(
172
- "DIRAC.WorkloadManagementSystem.Agent.SiteDirector.SiteStatus.getUsableSites", return_values=usableSites
172
+ "DIRAC.WorkloadManagementSystem.Agent.SiteDirector.SiteStatus.getUsableSites", return_value=S_OK(usableSites)
173
173
  )
174
+
175
+ # Mock getElementStatus to return a properly formatted dictionary
176
+ def mock_getElementStatus(ceNamesList, *args, **kwargs):
177
+ return S_OK({ceName: {"all": "Active"} for ceName in ceNamesList})
178
+
174
179
  mocker.patch(
175
- "DIRAC.WorkloadManagementSystem.Agent.SiteDirector.ResourceStatus.getElementStatus", return_values=usableSites
180
+ "DIRAC.WorkloadManagementSystem.Agent.SiteDirector.ResourceStatus.getElementStatus",
181
+ side_effect=mock_getElementStatus,
176
182
  )
177
183
  mocker.patch(
178
184
  "DIRAC.WorkloadManagementSystem.Agent.SiteDirector.gProxyManager.downloadProxy", side_effect=mockPMProxyReply
@@ -23,10 +23,31 @@ def sja(mocker):
23
23
  side_effect=lambda x, y=None: y,
24
24
  create=True,
25
25
  )
26
- mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.JobDB")
27
- mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.JobLoggingDB")
26
+
27
+ # Mock ObjectLoader to return mock DB instances
28
+ mockJobDB = MagicMock()
29
+ mockJobDB.log = gLogger
30
+ mockJobLoggingDB = MagicMock()
31
+ mockTaskQueueDB = MagicMock()
32
+ mockPilotAgentsDB = MagicMock()
33
+ mockStorageManagementDB = MagicMock()
34
+
35
+ def mock_load_object(module_path, class_name):
36
+ mocks = {
37
+ "JobDB": mockJobDB,
38
+ "JobLoggingDB": mockJobLoggingDB,
39
+ "TaskQueueDB": mockTaskQueueDB,
40
+ "PilotAgentsDB": mockPilotAgentsDB,
41
+ "StorageManagementDB": mockStorageManagementDB,
42
+ }
43
+ return {"OK": True, "Value": lambda: mocks[class_name]}
44
+
45
+ mocker.patch(
46
+ "DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.ObjectLoader.loadObject",
47
+ side_effect=mock_load_object,
48
+ )
49
+
28
50
  mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.rescheduleJobs", return_value=MagicMock())
29
- mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.PilotAgentsDB", return_value=MagicMock())
30
51
  mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.getJobParameters", return_value=MagicMock())
31
52
  mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.kill_delete_jobs", return_value=MagicMock())
32
53
 
@@ -34,7 +55,6 @@ def sja(mocker):
34
55
  stalledJobAgent._AgentModule__configDefaults = mockAM
35
56
  stalledJobAgent.log = gLogger
36
57
  stalledJobAgent.initialize()
37
- stalledJobAgent.jobDB.log = gLogger
38
58
  stalledJobAgent.log.setLevel("DEBUG")
39
59
  stalledJobAgent.stalledTime = 120
40
60
 
@@ -1,6 +1,7 @@
1
- """ The Download Input Data module wraps around the Replica Management
2
- components to provide access to datasets by downloading locally
1
+ """The Download Input Data module wraps around the Replica Management
2
+ components to provide access to datasets by downloading locally
3
3
  """
4
+
4
5
  import os
5
6
  import random
6
7
  import tempfile
@@ -226,7 +227,7 @@ class DownloadInputData:
226
227
  diskSpace = getDiskSpace(self.__getDownloadDir(False)) # MB
227
228
  availableBytes = diskSpace * 1024 * 1024 # bytes
228
229
  bufferGBs = gConfig.getValue(
229
- os.path.join("/Systems/WorkloadManagement/JobWrapper", "JobWrapper", "MinOutputDataBufferGB"), 5.0
230
+ os.path.join("/Systems/WorkloadManagement/JobWrapper", "MinOutputDataBufferGB"), 5.0
230
231
  )
231
232
  data = bufferGBs * 1024 * 1024 * 1024 # bufferGBs in bytes
232
233
  if (data + totalSize) < availableBytes:
@@ -263,11 +263,11 @@ Agents
263
263
  # the DN of the certificate proxy used to submit pilots. If not found here, what is in Operations/Pilot section of the CS will be used
264
264
  PilotDN =
265
265
 
266
- # List of sites that will be treated by this SiteDirector (No value can refer to any Site defined in the CS)
266
+ # List of Sites that will be treated by this SiteDirector (No value can refer to any CE defined in the CS)
267
267
  Site =
268
- # List of CEs that will be treated by this SiteDirector (No value can refer to any CE defined in the CS)
268
+ # List of CEs that will be treated by this SiteDirector (No value can refer to any type of CE defined in the CS)
269
269
  CEs =
270
- # List of CE types that will be treated by this SiteDirector (No value can refer to any type of CE defined in the CS)
270
+ # List of CETypes that are required to be present in the CE/Queue definition
271
271
  CETypes =
272
272
  # List of Tags that are required to be present in the CE/Queue definition
273
273
  Tags =
@@ -1,10 +1,10 @@
1
- """ Module containing a front-end to the OpenSearch-based JobParametersDB.
2
- This is a drop-in replacement for MySQL-based table JobDB.JobParameters.
1
+ """Module containing a front-end to the OpenSearch-based JobParametersDB.
2
+ This is a drop-in replacement for MySQL-based table JobDB.JobParameters.
3
3
 
4
- The following class methods are provided for public usage
5
- - getJobParameters()
6
- - setJobParameter()
7
- - deleteJobParameters()
4
+ The following class methods are provided for public usage
5
+ - getJobParameters()
6
+ - setJobParameter()
7
+ - deleteJobParameters()
8
8
  """
9
9
 
10
10
  from DIRAC import S_ERROR, S_OK
@@ -37,11 +37,11 @@ class JobParametersDB(ElasticDB):
37
37
  def __init__(self, parentLogger=None):
38
38
  """Standard Constructor"""
39
39
 
40
- self.fullname = "WorkloadManagement/ElasticJobParametersDB"
40
+ self.fullname = "WorkloadManagement/JobParametersDB"
41
41
  self.index_name = self.getCSOption("index_name", "job_parameters")
42
42
 
43
43
  try:
44
- # Connecting to the ES cluster
44
+ # Connecting to the OpenSearch cluster
45
45
  super().__init__(self.fullname, self.index_name, parentLogger=parentLogger)
46
46
  except Exception:
47
47
  RuntimeError("Can't connect to JobParameters index")
@@ -64,7 +64,7 @@ class SandboxMetadataDB(DB):
64
64
  "Type": "VARCHAR(64) NOT NULL",
65
65
  },
66
66
  "Indexes": {"Entity": ["EntityId"], "SBIndex": ["SBId"]},
67
- "UniqueIndexes": {"Mapping": ["SBId", "EntityId", "Type"]},
67
+ "PrimaryKey": ["SBId", "EntityId", "Type"],
68
68
  }
69
69
 
70
70
  for tableName in self.__tablesDesc:
@@ -1,43 +1,40 @@
1
1
  from DIRAC import S_ERROR, S_OK, gLogger
2
- from DIRAC.StorageManagementSystem.DB.StorageManagementDB import StorageManagementDB
3
2
  from DIRAC.WorkloadManagementSystem.Client import JobStatus
4
- from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
5
- from DIRAC.WorkloadManagementSystem.DB.PilotAgentsDB import PilotAgentsDB
6
- from DIRAC.WorkloadManagementSystem.DB.TaskQueueDB import TaskQueueDB
3
+ from DIRAC.Core.Utilities.ObjectLoader import ObjectLoader
7
4
  from DIRAC.WorkloadManagementSystem.Service.JobPolicy import RIGHT_DELETE, RIGHT_KILL
8
5
  from DIRAC.WorkloadManagementSystem.Utilities.jobAdministration import _filterJobStateTransition
9
6
 
10
7
 
11
- def _deleteJob(jobID, force=False):
8
+ def _deleteJob(jobID, force=False, *, jobdb, taskqueuedb, pilotagentsdb):
12
9
  """Set the job status to "Deleted"
13
10
  and remove the pilot that ran and its logging info if the pilot is finished.
14
11
 
15
12
  :param int jobID: job ID
16
13
  :return: S_OK()/S_ERROR()
17
14
  """
18
- if not (result := JobDB().setJobStatus(jobID, JobStatus.DELETED, "Checking accounting", force=force))["OK"]:
15
+ if not (result := jobdb.setJobStatus(jobID, JobStatus.DELETED, "Checking accounting", force=force))["OK"]:
19
16
  gLogger.warn("Failed to set job Deleted status", result["Message"])
20
17
  return result
21
18
 
22
- if not (result := TaskQueueDB().deleteJob(jobID))["OK"]:
19
+ if not (result := taskqueuedb.deleteJob(jobID))["OK"]:
23
20
  gLogger.warn("Failed to delete job from the TaskQueue")
24
21
 
25
22
  # if it was the last job for the pilot
26
- result = PilotAgentsDB().getPilotsForJobID(jobID)
23
+ result = pilotagentsdb.getPilotsForJobID(jobID)
27
24
  if not result["OK"]:
28
25
  gLogger.error("Failed to get Pilots for JobID", result["Message"])
29
26
  return result
30
27
  for pilot in result["Value"]:
31
- res = PilotAgentsDB().getJobsForPilot(pilot)
28
+ res = pilotagentsdb.getJobsForPilot(pilot)
32
29
  if not res["OK"]:
33
30
  gLogger.error("Failed to get jobs for pilot", res["Message"])
34
31
  return res
35
32
  if not res["Value"]: # if list of jobs for pilot is empty, delete pilot
36
- result = PilotAgentsDB().getPilotInfo(pilotID=pilot)
33
+ result = pilotagentsdb.getPilotInfo(pilotID=pilot)
37
34
  if not result["OK"]:
38
35
  gLogger.error("Failed to get pilot info", result["Message"])
39
36
  return result
40
- ret = PilotAgentsDB().deletePilot(result["Value"]["PilotJobReference"])
37
+ ret = pilotagentsdb.deletePilot(result["Value"]["PilotJobReference"])
41
38
  if not ret["OK"]:
42
39
  gLogger.error("Failed to delete pilot from PilotAgentsDB", ret["Message"])
43
40
  return ret
@@ -45,7 +42,7 @@ def _deleteJob(jobID, force=False):
45
42
  return S_OK()
46
43
 
47
44
 
48
- def _killJob(jobID, sendKillCommand=True, force=False):
45
+ def _killJob(jobID, sendKillCommand=True, force=False, *, jobdb, taskqueuedb):
49
46
  """Kill one job
50
47
 
51
48
  :param int jobID: job ID
@@ -54,32 +51,63 @@ def _killJob(jobID, sendKillCommand=True, force=False):
54
51
  :return: S_OK()/S_ERROR()
55
52
  """
56
53
  if sendKillCommand:
57
- if not (result := JobDB().setJobCommand(jobID, "Kill"))["OK"]:
54
+ if not (result := jobdb.setJobCommand(jobID, "Kill"))["OK"]:
58
55
  gLogger.warn("Failed to set job Kill command", result["Message"])
59
56
  return result
60
57
 
61
58
  gLogger.info("Job marked for termination", jobID)
62
- if not (result := JobDB().setJobStatus(jobID, JobStatus.KILLED, "Marked for termination", force=force))["OK"]:
59
+ if not (result := jobdb.setJobStatus(jobID, JobStatus.KILLED, "Marked for termination", force=force))["OK"]:
63
60
  gLogger.warn("Failed to set job Killed status", result["Message"])
64
- if not (result := TaskQueueDB().deleteJob(jobID))["OK"]:
61
+ if not (result := taskqueuedb.deleteJob(jobID))["OK"]:
65
62
  gLogger.warn("Failed to delete job from the TaskQueue", result["Message"])
66
63
 
67
64
  return S_OK()
68
65
 
69
66
 
70
- def kill_delete_jobs(right, validJobList, nonauthJobList=[], force=False):
67
+ def kill_delete_jobs(
68
+ right,
69
+ validJobList,
70
+ nonauthJobList=[],
71
+ force=False,
72
+ *,
73
+ jobdb=None,
74
+ taskqueuedb=None,
75
+ pilotagentsdb=None,
76
+ storagemanagementdb=None,
77
+ ):
71
78
  """Kill (== set the status to "KILLED") or delete (== set the status to "DELETED") jobs as necessary
72
79
 
73
80
  :param str right: RIGHT_KILL or RIGHT_DELETE
74
81
 
75
82
  :return: S_OK()/S_ERROR()
76
83
  """
84
+ if jobdb is None:
85
+ result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.JobDB", "JobDB")
86
+ if not result["OK"]:
87
+ return result
88
+ jobdb = result["Value"]()
89
+ if taskqueuedb is None:
90
+ result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.TaskQueueDB", "TaskQueueDB")
91
+ if not result["OK"]:
92
+ return result
93
+ taskqueuedb = result["Value"]()
94
+ if pilotagentsdb is None:
95
+ result = ObjectLoader().loadObject("WorkloadManagementSystem.DB.PilotAgentsDB", "PilotAgentsDB")
96
+ if not result["OK"]:
97
+ return result
98
+ pilotagentsdb = result["Value"]()
99
+ if storagemanagementdb is None:
100
+ result = ObjectLoader().loadObject("StorageManagementSystem.DB.StorageManagementDB", "StorageManagementDB")
101
+ if not result["OK"]:
102
+ return result
103
+ storagemanagementdb = result["Value"]()
104
+
77
105
  badIDs = []
78
106
 
79
107
  killJobList = []
80
108
  deleteJobList = []
81
109
  if validJobList:
82
- result = JobDB().getJobsAttributes(killJobList, ["Status"])
110
+ result = jobdb.getJobsAttributes(validJobList, ["Status"])
83
111
  if not result["OK"]:
84
112
  return result
85
113
  jobStates = result["Value"]
@@ -92,12 +120,12 @@ def kill_delete_jobs(right, validJobList, nonauthJobList=[], force=False):
92
120
  deleteJobList.extend(_filterJobStateTransition(jobStates, JobStatus.DELETED))
93
121
 
94
122
  for jobID in killJobList:
95
- result = _killJob(jobID, force=force)
123
+ result = _killJob(jobID, force=force, jobdb=jobdb, taskqueuedb=taskqueuedb)
96
124
  if not result["OK"]:
97
125
  badIDs.append(jobID)
98
126
 
99
127
  for jobID in deleteJobList:
100
- result = _deleteJob(jobID, force=force)
128
+ result = _deleteJob(jobID, force=force, jobdb=jobdb, taskqueuedb=taskqueuedb, pilotagentsdb=pilotagentsdb)
101
129
  if not result["OK"]:
102
130
  badIDs.append(jobID)
103
131
 
@@ -105,9 +133,8 @@ def kill_delete_jobs(right, validJobList, nonauthJobList=[], force=False):
105
133
  stagingJobList = [jobID for jobID, sDict in jobStates.items() if sDict["Status"] == JobStatus.STAGING]
106
134
 
107
135
  if stagingJobList:
108
- stagerDB = StorageManagementDB()
109
136
  gLogger.info("Going to send killing signal to stager as well!")
110
- result = stagerDB.killTasksBySourceTaskID(stagingJobList)
137
+ result = storagemanagementdb.killTasksBySourceTaskID(stagingJobList)
111
138
  if not result["OK"]:
112
139
  gLogger.warn("Failed to kill some Stager tasks", result["Message"])
113
140
 
@@ -19,10 +19,25 @@ from DIRAC.WorkloadManagementSystem.DB.StatusUtils import kill_delete_jobs
19
19
  ],
20
20
  )
21
21
  def test___kill_delete_jobs(mocker, jobIDs_list, right):
22
- mocker.patch("DIRAC.WorkloadManagementSystem.DB.StatusUtils.JobDB", MagicMock())
23
- mocker.patch("DIRAC.WorkloadManagementSystem.DB.StatusUtils.TaskQueueDB", MagicMock())
24
- mocker.patch("DIRAC.WorkloadManagementSystem.DB.StatusUtils.PilotAgentsDB", MagicMock())
25
- mocker.patch("DIRAC.WorkloadManagementSystem.DB.StatusUtils.StorageManagementDB", MagicMock())
22
+ # Mock ObjectLoader to return mock DB instances
23
+ mockJobDB = MagicMock()
24
+ mockTaskQueueDB = MagicMock()
25
+ mockPilotAgentsDB = MagicMock()
26
+ mockStorageManagementDB = MagicMock()
27
+
28
+ def mock_load_object(module_path, class_name):
29
+ mocks = {
30
+ "JobDB": mockJobDB,
31
+ "TaskQueueDB": mockTaskQueueDB,
32
+ "PilotAgentsDB": mockPilotAgentsDB,
33
+ "StorageManagementDB": mockStorageManagementDB,
34
+ }
35
+ return {"OK": True, "Value": lambda: mocks[class_name]}
36
+
37
+ mocker.patch(
38
+ "DIRAC.WorkloadManagementSystem.DB.StatusUtils.ObjectLoader.loadObject",
39
+ side_effect=mock_load_object,
40
+ )
26
41
 
27
42
  res = kill_delete_jobs(right, jobIDs_list)
28
43
  assert res["OK"]
@@ -16,7 +16,6 @@ import datetime
16
16
  import glob
17
17
  import json
18
18
  import os
19
- from pathlib import Path
20
19
  import re
21
20
  import shutil
22
21
  import stat
@@ -24,12 +23,12 @@ import sys
24
23
  import tarfile
25
24
  import threading
26
25
  import time
26
+ from pathlib import Path
27
27
  from urllib.parse import unquote
28
28
 
29
29
  import DIRAC
30
30
  from DIRAC import S_ERROR, S_OK, gConfig, gLogger
31
31
  from DIRAC.AccountingSystem.Client.Types.Job import Job as AccountingJob
32
-
33
32
  from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
34
33
  from DIRAC.ConfigurationSystem.Client.Helpers.Registry import getVOForGroup
35
34
  from DIRAC.Core.Utilities import DEncode, DErrno, List
@@ -1021,7 +1020,7 @@ class JobWrapper:
1021
1020
 
1022
1021
  for i in outputSandbox:
1023
1022
  if i not in okFiles:
1024
- if not f"{i}.tar" in okFiles:
1023
+ if f"{i}.tar" not in okFiles:
1025
1024
  if not re.search(r"\*", i):
1026
1025
  if i not in missing:
1027
1026
  missing.append(i)
@@ -1215,8 +1214,8 @@ class JobWrapper:
1215
1214
  lfn = str(basePath / outputPath / os.path.basename(localfile))
1216
1215
  else:
1217
1216
  # if LFN is given, take it as it is
1218
- localfile = str(self.jobIDPath / outputFile.replace("LFN:", ""))
1219
1217
  lfn = outputFile.replace("LFN:", "")
1218
+ localfile = str(self.jobIDPath / os.path.basename(lfn))
1220
1219
 
1221
1220
  return (lfn, localfile)
1222
1221
 
@@ -1,22 +1,22 @@
1
- """ The Watchdog class is used by the Job Wrapper to resolve and monitor
2
- the system resource consumption. The Watchdog can determine if
3
- a running job is stalled and indicate this to the Job Wrapper.
4
- Furthermore, the Watchdog will identify when the Job CPU limit has been
5
- exceeded and fail jobs meaningfully.
6
-
7
- Information is returned to the WMS via the heart-beat mechanism. This
8
- also interprets control signals from the WMS e.g. to kill a running
9
- job.
10
-
11
- - Still to implement:
12
- - CPU normalization for correct comparison with job limit
1
+ """The Watchdog class is used by the Job Wrapper to resolve and monitor
2
+ the system resource consumption. The Watchdog can determine if
3
+ a running job is stalled and indicate this to the Job Wrapper.
4
+ Furthermore, the Watchdog will identify when the Job CPU limit has been
5
+ exceeded and fail jobs meaningfully.
6
+
7
+ Information is returned to the WMS via the heart-beat mechanism. This
8
+ also interprets control signals from the WMS e.g. to kill a running
9
+ job.
10
+
11
+ - Still to implement:
12
+ - CPU normalization for correct comparison with job limit
13
13
  """
14
+
14
15
  import datetime
15
16
  import errno
16
17
  import getpass
17
18
  import math
18
19
  import os
19
- import signal
20
20
  import socket
21
21
  import time
22
22
  from pathlib import Path
@@ -32,28 +32,6 @@ from DIRAC.WorkloadManagementSystem.Client import JobMinorStatus
32
32
  from DIRAC.WorkloadManagementSystem.Client.JobStateUpdateClient import JobStateUpdateClient
33
33
 
34
34
 
35
- def kill_proc_tree(pid, sig=signal.SIGTERM, includeParent=True):
36
- """Kill a process tree (including grandchildren) with signal
37
- "sig" and return a (gone, still_alive) tuple.
38
- called as soon as a child terminates.
39
-
40
- Taken from https://psutil.readthedocs.io/en/latest/index.html#kill-process-tree
41
- """
42
- assert pid != os.getpid(), "won't kill myself"
43
- parent = psutil.Process(pid)
44
- children = parent.children(recursive=True)
45
- if includeParent:
46
- children.append(parent)
47
- for p in children:
48
- try:
49
- p.send_signal(sig)
50
- except psutil.NoSuchProcess:
51
- pass
52
- _gone, alive = psutil.wait_procs(children, timeout=10)
53
- for p in alive:
54
- p.kill()
55
-
56
-
57
35
  class Watchdog:
58
36
  #############################################################################
59
37
  def __init__(self, pid, exeThread, spObject, jobCPUTime, memoryLimit=0, processors=1, jobArgs={}):
@@ -212,7 +190,7 @@ class Watchdog:
212
190
  if self.littleTimeLeftCount == 0 and self.__timeLeft() == -1:
213
191
  self.checkError = JobMinorStatus.JOB_EXCEEDED_CPU
214
192
  self.log.error(self.checkError, self.timeLeft)
215
- self.__killRunningThread()
193
+ self.spObject.killChild()
216
194
  return S_OK()
217
195
 
218
196
  self.littleTimeLeftCount -= 1
@@ -321,7 +299,7 @@ class Watchdog:
321
299
 
322
300
  self.log.info("=================END=================")
323
301
 
324
- self.__killRunningThread()
302
+ self.spObject.killChild()
325
303
  return S_OK()
326
304
 
327
305
  recentStdOut = "None"
@@ -408,7 +386,7 @@ class Watchdog:
408
386
  if "Kill" in signalDict:
409
387
  self.log.info("Received Kill signal, stopping job via control signal")
410
388
  self.checkError = JobMinorStatus.RECEIVED_KILL_SIGNAL
411
- self.__killRunningThread()
389
+ self.spObject.killChild()
412
390
  else:
413
391
  self.log.info("The following control signal was sent but not understood by the watchdog:")
414
392
  self.log.info(signalDict)
@@ -862,13 +840,6 @@ class Watchdog:
862
840
 
863
841
  return result
864
842
 
865
- #############################################################################
866
- def __killRunningThread(self):
867
- """Will kill the running thread process and any child processes."""
868
- self.log.info("Sending kill signal to application PID", self.spObject.getChildPID())
869
- self.spObject.killChild()
870
- return S_OK("Thread killed")
871
-
872
843
  #############################################################################
873
844
  def __sendSignOfLife(self, jobID, heartBeatDict, staticParamDict):
874
845
  """Sends sign of life 'heartbeat' signal and triggers control signal
@@ -1,5 +1,5 @@
1
- """ Test class for JobWrapper
2
- """
1
+ """Test class for JobWrapper"""
2
+
3
3
  import os
4
4
  import shutil
5
5
  import tempfile
@@ -314,8 +314,7 @@ def test_processKilledSubprocess(mocker):
314
314
  result = jw.process("sleep 20", {})
315
315
 
316
316
  assert result["OK"]
317
- assert result["Value"]["payloadStatus"] == 15 # SIGTERM
318
- assert not result["Value"]["payloadOutput"]
317
+ assert result["Value"]["payloadStatus"] is None
319
318
  assert not result["Value"]["payloadExecutorError"]
320
319
  assert result["Value"]["watchdogError"] == "Job is stalled!" # Error message from the watchdog
321
320
 
@@ -664,6 +663,7 @@ def jobIDPath():
664
663
  # Output data files
665
664
  (p / "00232454_00000244_1.sim").touch()
666
665
  (p / "1720442808testFileUpload.txt").touch()
666
+ (p / "testFileUploadFullLFN.txt").touch()
667
667
 
668
668
  with open(p / "pool_xml_catalog.xml", "w") as f:
669
669
  f.write(
@@ -863,7 +863,11 @@ def test_processJobOutputs_output_data_upload(mocker, setup_another_job_wrapper)
863
863
  # BTW, isn't the concept of pool_xml_catalog.xml from lhcbdirac?
864
864
  jw.jobArgs = {
865
865
  "OutputSandbox": [],
866
- "OutputData": ["1720442808testFileUpload.txt", "LFN:00232454_00000244_1.sim"],
866
+ "OutputData": [
867
+ "1720442808testFileUpload.txt",
868
+ "LFN:00232454_00000244_1.sim",
869
+ "LFN:/dirac/user/u/unknown/testFileUploadFullLFN.txt",
870
+ ],
867
871
  "Owner": "Jane Doe",
868
872
  }
869
873
 
@@ -879,10 +883,15 @@ def test_processJobOutputs_output_data_upload(mocker, setup_another_job_wrapper)
879
883
  assert jw.jobReport.jobStatusInfo[1][:-1] == ("", JobMinorStatus.UPLOADING_OUTPUT_DATA)
880
884
  assert jw.jobReport.jobStatusInfo[2][:-1] == (JobStatus.COMPLETING, JobMinorStatus.OUTPUT_DATA_UPLOADED)
881
885
  assert len(jw.jobReport.jobParameters) == 1
882
- assert jw.jobReport.jobParameters[0] == (
883
- "UploadedOutputData",
884
- "00232454_00000244_1.sim, /dirac/user/u/unknown/0/123/1720442808testFileUpload.txt",
885
- )
886
+
887
+ expected_files = {
888
+ "00232454_00000244_1.sim",
889
+ "/dirac/user/u/unknown/0/123/1720442808testFileUpload.txt",
890
+ "/dirac/user/u/unknown/testFileUploadFullLFN.txt",
891
+ }
892
+ assert jw.jobReport.jobParameters[0][0] == "UploadedOutputData"
893
+ uploaded_files = set(jw.jobReport.jobParameters[0][1].split(", "))
894
+ assert uploaded_files == expected_files
886
895
 
887
896
 
888
897
  # -------------------------------------------------------------------------------------------------
@@ -65,6 +65,11 @@ class JobManagerHandlerMixin:
65
65
  return result
66
66
  cls.pilotAgentsDB = result["Value"](parentLogger=cls.log)
67
67
 
68
+ result = ObjectLoader().loadObject("StorageManagementSystem.DB.StorageManagementDB", "StorageManagementDB")
69
+ if not result["OK"]:
70
+ return result
71
+ cls.storageManagementDB = result["Value"](parentLogger=cls.log)
72
+
68
73
  except RuntimeError as excp:
69
74
  return S_ERROR(f"Can't connect to DB: {excp!r}")
70
75
 
@@ -449,7 +454,16 @@ class JobManagerHandlerMixin:
449
454
  jobList, RIGHT_DELETE
450
455
  )
451
456
 
452
- result = kill_delete_jobs(RIGHT_DELETE, validJobList, nonauthJobList, force=force)
457
+ result = kill_delete_jobs(
458
+ RIGHT_DELETE,
459
+ validJobList,
460
+ nonauthJobList,
461
+ force=force,
462
+ jobdb=self.jobDB,
463
+ taskqueuedb=self.taskQueueDB,
464
+ pilotagentsdb=self.pilotAgentsDB,
465
+ storagemanagementdb=self.storageManagementDB,
466
+ )
453
467
 
454
468
  result["requireProxyUpload"] = len(ownerJobList) > 0 and self.__checkIfProxyUploadIsRequired()
455
469
 
@@ -478,7 +492,16 @@ class JobManagerHandlerMixin:
478
492
  jobList, RIGHT_KILL
479
493
  )
480
494
 
481
- result = kill_delete_jobs(RIGHT_KILL, validJobList, nonauthJobList, force=force)
495
+ result = kill_delete_jobs(
496
+ RIGHT_KILL,
497
+ validJobList,
498
+ nonauthJobList,
499
+ force=force,
500
+ jobdb=self.jobDB,
501
+ taskqueuedb=self.taskQueueDB,
502
+ pilotagentsdb=self.pilotAgentsDB,
503
+ storagemanagementdb=self.storageManagementDB,
504
+ )
482
505
 
483
506
  result["requireProxyUpload"] = len(ownerJobList) > 0 and self.__checkIfProxyUploadIsRequired()
484
507