DIRAC 9.0.0a61__py3-none-any.whl → 9.0.0a63__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- DIRAC/ConfigurationSystem/Client/Helpers/Registry.py +35 -7
- DIRAC/ConfigurationSystem/Client/LocalConfiguration.py +3 -0
- DIRAC/ConfigurationSystem/Client/VOMS2CSSynchronizer.py +8 -1
- DIRAC/Core/Security/DiracX.py +1 -1
- DIRAC/Core/Security/ProxyInfo.py +9 -5
- DIRAC/Core/Tornado/Client/ClientSelector.py +4 -1
- DIRAC/Core/Utilities/Extensions.py +10 -1
- DIRAC/Core/Utilities/Os.py +32 -1
- DIRAC/Core/scripts/dirac_apptainer_exec.py +10 -3
- DIRAC/Interfaces/API/Dirac.py +22 -13
- DIRAC/Interfaces/API/DiracAdmin.py +17 -5
- DIRAC/Interfaces/scripts/dirac_admin_allow_site.py +7 -1
- DIRAC/Interfaces/scripts/dirac_admin_ban_site.py +7 -1
- DIRAC/MonitoringSystem/Client/Types/WMSHistory.py +4 -0
- DIRAC/MonitoringSystem/Service/WebAppHandler.py +68 -1
- DIRAC/ResourceStatusSystem/Client/SiteStatus.py +4 -2
- DIRAC/ResourceStatusSystem/Utilities/CSHelpers.py +2 -31
- DIRAC/ResourceStatusSystem/scripts/dirac_rss_set_status.py +18 -4
- DIRAC/Resources/Computing/BatchSystems/Condor.py +23 -4
- DIRAC/TransformationSystem/Agent/TaskManagerAgentBase.py +10 -13
- DIRAC/TransformationSystem/Agent/TransformationAgent.py +22 -1
- DIRAC/TransformationSystem/Agent/TransformationCleaningAgent.py +15 -15
- DIRAC/TransformationSystem/Client/Utilities.py +6 -0
- DIRAC/WorkloadManagementSystem/Agent/JobCleaningAgent.py +11 -7
- DIRAC/WorkloadManagementSystem/Agent/StalledJobAgent.py +3 -26
- DIRAC/WorkloadManagementSystem/Agent/StatesAccountingAgent.py +41 -1
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_JobCleaningAgent.py +7 -9
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_StalledJobAgent.py +1 -2
- DIRAC/WorkloadManagementSystem/Client/JobMonitoringClient.py +4 -11
- DIRAC/WorkloadManagementSystem/Client/JobStatus.py +0 -59
- DIRAC/WorkloadManagementSystem/Client/SandboxStoreClient.py +25 -38
- DIRAC/WorkloadManagementSystem/Client/WMSClient.py +2 -3
- DIRAC/WorkloadManagementSystem/DB/JobDB.py +0 -58
- DIRAC/WorkloadManagementSystem/DB/SandboxMetadataDB.py +25 -37
- DIRAC/WorkloadManagementSystem/Executor/JobSanity.py +3 -3
- DIRAC/WorkloadManagementSystem/FutureClient/JobStateUpdateClient.py +2 -14
- DIRAC/WorkloadManagementSystem/Service/JobManagerHandler.py +27 -138
- DIRAC/WorkloadManagementSystem/Service/JobMonitoringHandler.py +0 -126
- DIRAC/WorkloadManagementSystem/Service/JobStateUpdateHandler.py +0 -16
- DIRAC/WorkloadManagementSystem/Service/SandboxStoreHandler.py +5 -51
- DIRAC/WorkloadManagementSystem/Utilities/JobParameters.py +1 -1
- DIRAC/WorkloadManagementSystem/Utilities/PilotWrapper.py +2 -0
- DIRAC/WorkloadManagementSystem/Utilities/jobAdministration.py +138 -0
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_JobAdministration.py +28 -0
- {dirac-9.0.0a61.dist-info → dirac-9.0.0a63.dist-info}/METADATA +2 -1
- {dirac-9.0.0a61.dist-info → dirac-9.0.0a63.dist-info}/RECORD +50 -52
- {dirac-9.0.0a61.dist-info → dirac-9.0.0a63.dist-info}/entry_points.txt +0 -3
- DIRAC/TransformationSystem/scripts/dirac_transformation_archive.py +0 -30
- DIRAC/TransformationSystem/scripts/dirac_transformation_clean.py +0 -30
- DIRAC/TransformationSystem/scripts/dirac_transformation_remove_output.py +0 -30
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_JobManager.py +0 -58
- {dirac-9.0.0a61.dist-info → dirac-9.0.0a63.dist-info}/WHEEL +0 -0
- {dirac-9.0.0a61.dist-info → dirac-9.0.0a63.dist-info}/licenses/LICENSE +0 -0
- {dirac-9.0.0a61.dist-info → dirac-9.0.0a63.dist-info}/top_level.txt +0 -0
|
@@ -29,6 +29,7 @@ def registerSwitches():
|
|
|
29
29
|
("reason=", "Reason to set the Status"),
|
|
30
30
|
("VO=", "VO to change a status for. When omitted, status will be changed for all VOs"),
|
|
31
31
|
("tokenOwner=", "Owner of the token"),
|
|
32
|
+
("days=", "Number of days the token is valid for. Default is 1 day. 0 or less days denotes forever."),
|
|
32
33
|
)
|
|
33
34
|
|
|
34
35
|
for switch in switches:
|
|
@@ -50,6 +51,7 @@ def parseSwitches():
|
|
|
50
51
|
switches = dict(Script.getUnprocessedSwitches())
|
|
51
52
|
switches.setdefault("statusType", None)
|
|
52
53
|
switches.setdefault("VO", None)
|
|
54
|
+
switches.setdefault("days", 1)
|
|
53
55
|
|
|
54
56
|
for key in ("element", "name", "status", "reason"):
|
|
55
57
|
if key not in switches:
|
|
@@ -183,7 +185,11 @@ def setStatus(switchDict, tokenOwner):
|
|
|
183
185
|
)
|
|
184
186
|
return S_OK()
|
|
185
187
|
|
|
186
|
-
|
|
188
|
+
tokenLifetime = int(switchDict["days"])
|
|
189
|
+
if tokenLifetime <= 0:
|
|
190
|
+
tokenExpiration = datetime.max
|
|
191
|
+
else:
|
|
192
|
+
tokenExpiration = datetime.utcnow().replace(microsecond=0) + timedelta(days=tokenLifetime)
|
|
187
193
|
|
|
188
194
|
for status, statusType in elements:
|
|
189
195
|
gLogger.debug(f"{status} {statusType}")
|
|
@@ -193,8 +199,16 @@ def setStatus(switchDict, tokenOwner):
|
|
|
193
199
|
continue
|
|
194
200
|
|
|
195
201
|
gLogger.debug(
|
|
196
|
-
"About to set status %s -> %s for %s, statusType: %s, VO: %s, reason: %s"
|
|
197
|
-
% (
|
|
202
|
+
"About to set status %s -> %s for %s, statusType: %s, VO: %s, reason: %s, days: %s"
|
|
203
|
+
% (
|
|
204
|
+
status,
|
|
205
|
+
switchDict["status"],
|
|
206
|
+
switchDict["name"],
|
|
207
|
+
statusType,
|
|
208
|
+
switchDict["VO"],
|
|
209
|
+
switchDict["reason"],
|
|
210
|
+
switchDict["days"],
|
|
211
|
+
)
|
|
198
212
|
)
|
|
199
213
|
result = rssClient.modifyStatusElement(
|
|
200
214
|
switchDict["element"],
|
|
@@ -205,7 +219,7 @@ def setStatus(switchDict, tokenOwner):
|
|
|
205
219
|
reason=switchDict["reason"],
|
|
206
220
|
vO=switchDict["VO"],
|
|
207
221
|
tokenOwner=tokenOwner,
|
|
208
|
-
tokenExpiration=
|
|
222
|
+
tokenExpiration=tokenExpiration,
|
|
209
223
|
)
|
|
210
224
|
if not result["OK"]:
|
|
211
225
|
return result
|
|
@@ -78,7 +78,7 @@ on_exit_hold = ExitCode =!= 0
|
|
|
78
78
|
on_exit_hold_subcode = %(holdReasonSubcode)s
|
|
79
79
|
# Jobs are then deleted from the system after N days if they are not idle or running
|
|
80
80
|
periodic_remove = ((JobStatus == 1) && (NumJobStarts > 0)) || \
|
|
81
|
-
((JobStatus != 1) && (JobStatus != 2) && ((time() - EnteredCurrentStatus) > (%(daysToKeepRemoteLogs)s * 24 * 3600))
|
|
81
|
+
((JobStatus != 1) && (JobStatus != 2) && ((time() - EnteredCurrentStatus) > (%(daysToKeepRemoteLogs)s * 24 * 3600)))
|
|
82
82
|
|
|
83
83
|
# Specific options
|
|
84
84
|
# ----------------
|
|
@@ -167,7 +167,7 @@ class Condor(object):
|
|
|
167
167
|
jdlFile.flush()
|
|
168
168
|
|
|
169
169
|
cmd = "%s; " % preamble if preamble else ""
|
|
170
|
-
cmd += "condor_submit %s" % jdlFile.name
|
|
170
|
+
cmd += "condor_submit -spool %s" % jdlFile.name
|
|
171
171
|
sp = subprocess.Popen(
|
|
172
172
|
cmd,
|
|
173
173
|
shell=True,
|
|
@@ -286,10 +286,12 @@ class Condor(object):
|
|
|
286
286
|
output, error = sp.communicate()
|
|
287
287
|
status = sp.returncode
|
|
288
288
|
|
|
289
|
-
if status != 0
|
|
289
|
+
if status != 0:
|
|
290
290
|
resultDict["Status"] = status
|
|
291
291
|
resultDict["Message"] = error
|
|
292
292
|
return resultDict
|
|
293
|
+
if not output:
|
|
294
|
+
output = "[]"
|
|
293
295
|
|
|
294
296
|
jobsMetadata = json.loads(output)
|
|
295
297
|
|
|
@@ -304,10 +306,12 @@ class Condor(object):
|
|
|
304
306
|
output, _ = sp.communicate()
|
|
305
307
|
status = sp.returncode
|
|
306
308
|
|
|
307
|
-
if status != 0
|
|
309
|
+
if status != 0:
|
|
308
310
|
resultDict["Status"] = status
|
|
309
311
|
resultDict["Message"] = error
|
|
310
312
|
return resultDict
|
|
313
|
+
if not output:
|
|
314
|
+
output = "[]"
|
|
311
315
|
|
|
312
316
|
jobsMetadata += json.loads(output)
|
|
313
317
|
|
|
@@ -399,6 +403,21 @@ class Condor(object):
|
|
|
399
403
|
jobDict = {}
|
|
400
404
|
for jobID in jobIDList:
|
|
401
405
|
jobDict[jobID] = {}
|
|
406
|
+
|
|
407
|
+
cmd = "condor_transfer_data %s" % jobID
|
|
408
|
+
sp = subprocess.Popen(
|
|
409
|
+
shlex.split(cmd),
|
|
410
|
+
stdout=subprocess.PIPE,
|
|
411
|
+
stderr=subprocess.PIPE,
|
|
412
|
+
universal_newlines=True,
|
|
413
|
+
)
|
|
414
|
+
_, error = sp.communicate()
|
|
415
|
+
status = sp.returncode
|
|
416
|
+
if status != 0:
|
|
417
|
+
resultDict["Status"] = -1
|
|
418
|
+
resultDict["Message"] = error
|
|
419
|
+
return resultDict
|
|
420
|
+
|
|
402
421
|
jobDict[jobID]["Output"] = "%s/%s.out" % (outputDir, jobID)
|
|
403
422
|
jobDict[jobID]["Error"] = "%s/%s.err" % (errorDir, jobID)
|
|
404
423
|
|
|
@@ -7,21 +7,20 @@
|
|
|
7
7
|
In case you want to further extend it you are required to follow the note on the
|
|
8
8
|
initialize method and on the _getClients method.
|
|
9
9
|
"""
|
|
10
|
-
import time
|
|
11
|
-
import datetime
|
|
12
10
|
import concurrent.futures
|
|
11
|
+
import datetime
|
|
12
|
+
import time
|
|
13
13
|
|
|
14
|
-
from DIRAC import S_OK
|
|
15
|
-
|
|
14
|
+
from DIRAC import S_OK, gConfig
|
|
15
|
+
from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
|
|
16
16
|
from DIRAC.Core.Base.AgentModule import AgentModule
|
|
17
17
|
from DIRAC.Core.Security.ProxyInfo import getProxyInfo
|
|
18
|
-
from DIRAC.Core.Utilities.List import breakListIntoChunks
|
|
19
18
|
from DIRAC.Core.Utilities.Dictionaries import breakDictionaryIntoChunks
|
|
20
|
-
from DIRAC.
|
|
19
|
+
from DIRAC.Core.Utilities.List import breakListIntoChunks
|
|
20
|
+
from DIRAC.TransformationSystem.Agent.TransformationAgentsUtilities import TransformationAgentsUtilities
|
|
21
21
|
from DIRAC.TransformationSystem.Client.FileReport import FileReport
|
|
22
|
-
from DIRAC.TransformationSystem.Client.WorkflowTasks import WorkflowTasks
|
|
23
22
|
from DIRAC.TransformationSystem.Client.TransformationClient import TransformationClient
|
|
24
|
-
from DIRAC.TransformationSystem.
|
|
23
|
+
from DIRAC.TransformationSystem.Client.WorkflowTasks import WorkflowTasks
|
|
25
24
|
from DIRAC.WorkloadManagementSystem.Client import JobStatus
|
|
26
25
|
from DIRAC.WorkloadManagementSystem.Client.JobManagerClient import JobManagerClient
|
|
27
26
|
|
|
@@ -193,11 +192,9 @@ class TaskManagerAgentBase(AgentModule, TransformationAgentsUtilities):
|
|
|
193
192
|
else:
|
|
194
193
|
# Get the transformations which should be submitted
|
|
195
194
|
self.tasksPerLoop = self.am_getOption("TasksPerLoop", self.tasksPerLoop)
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
else:
|
|
200
|
-
self.maxParametricJobs = res["Value"]
|
|
195
|
+
self.maxParametricJobs = gConfig.getValue(
|
|
196
|
+
"/Systems/WorkloadManagement/Services/JobManager/MaxParametricJobs", self.maxParametricJobs
|
|
197
|
+
)
|
|
201
198
|
|
|
202
199
|
self._addOperationForTransformations(
|
|
203
200
|
self.operationsOnTransformationDict,
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""TransformationAgent processes transformations found in the transformation database.
|
|
2
2
|
|
|
3
3
|
The following options can be set for the TransformationAgent.
|
|
4
4
|
|
|
@@ -8,6 +8,7 @@ The following options can be set for the TransformationAgent.
|
|
|
8
8
|
:dedent: 2
|
|
9
9
|
:caption: TransformationAgent options
|
|
10
10
|
"""
|
|
11
|
+
|
|
11
12
|
from importlib import import_module
|
|
12
13
|
|
|
13
14
|
import time
|
|
@@ -15,6 +16,7 @@ import os
|
|
|
15
16
|
import datetime
|
|
16
17
|
import pickle
|
|
17
18
|
import concurrent.futures
|
|
19
|
+
from pathlib import Path
|
|
18
20
|
|
|
19
21
|
from DIRAC import S_OK, S_ERROR
|
|
20
22
|
from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
|
|
@@ -127,6 +129,9 @@ class TransformationAgent(AgentModule, TransformationAgentsUtilities):
|
|
|
127
129
|
if not res["OK"]:
|
|
128
130
|
self._logError("Failed to obtain transformations:", res["Message"])
|
|
129
131
|
return S_OK()
|
|
132
|
+
|
|
133
|
+
active_trans_ids = [t["TransformationID"] for t in res["Value"]]
|
|
134
|
+
self.cleanOldTransformationCache(active_trans_ids)
|
|
130
135
|
# Process the transformations
|
|
131
136
|
count = 0
|
|
132
137
|
future_to_transID = {}
|
|
@@ -164,6 +169,22 @@ class TransformationAgent(AgentModule, TransformationAgentsUtilities):
|
|
|
164
169
|
|
|
165
170
|
return S_OK()
|
|
166
171
|
|
|
172
|
+
def cleanOldTransformationCache(self, active_trans_ids: list[int]):
|
|
173
|
+
cache_filenames = {Path(self.__cacheFile(tid)) for tid in active_trans_ids}
|
|
174
|
+
existing_caches = set(Path(self.workDirectory).glob("*.pkl"))
|
|
175
|
+
useless_cache_files = existing_caches - cache_filenames
|
|
176
|
+
|
|
177
|
+
if useless_cache_files:
|
|
178
|
+
self._logInfo(f"Found potentially {len(useless_cache_files)} useless cache files")
|
|
179
|
+
|
|
180
|
+
# Since idle transformations aren't in active_trans_ids, let's filter it more
|
|
181
|
+
# and take only files that haven't been touched for 2 months
|
|
182
|
+
last_update_threshold = (datetime.datetime.utcnow() - datetime.timedelta(days=60)).timestamp()
|
|
183
|
+
|
|
184
|
+
for cache_file in useless_cache_files:
|
|
185
|
+
if Path(cache_file).stat().st_mtime < last_update_threshold:
|
|
186
|
+
cache_file.unlink()
|
|
187
|
+
|
|
167
188
|
def getTransformations(self):
|
|
168
189
|
"""Obtain the transformations to be executed - this is executed at the start of every loop (it's really the
|
|
169
190
|
only real thing in the execute()
|
|
@@ -16,14 +16,12 @@ from datetime import datetime, timedelta
|
|
|
16
16
|
|
|
17
17
|
# # from DIRAC
|
|
18
18
|
from DIRAC import S_ERROR, S_OK
|
|
19
|
-
from DIRAC.ConfigurationSystem.Client.ConfigurationData import gConfigurationData
|
|
20
19
|
from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
|
|
21
20
|
from DIRAC.Core.Base.AgentModule import AgentModule
|
|
22
21
|
from DIRAC.Core.Utilities.DErrno import cmpError
|
|
23
22
|
from DIRAC.Core.Utilities.List import breakListIntoChunks
|
|
24
23
|
from DIRAC.Core.Utilities.Proxy import executeWithUserProxy
|
|
25
24
|
from DIRAC.Core.Utilities.ReturnValues import returnSingleResult
|
|
26
|
-
from DIRAC.DataManagementSystem.Client.DataManager import DataManager
|
|
27
25
|
from DIRAC.RequestManagementSystem.Client.File import File
|
|
28
26
|
from DIRAC.RequestManagementSystem.Client.Operation import Operation
|
|
29
27
|
from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient
|
|
@@ -34,8 +32,12 @@ from DIRAC.Resources.Catalog.FileCatalogClient import FileCatalogClient
|
|
|
34
32
|
from DIRAC.Resources.Storage.StorageElement import StorageElement
|
|
35
33
|
from DIRAC.TransformationSystem.Client import TransformationStatus
|
|
36
34
|
from DIRAC.TransformationSystem.Client.TransformationClient import TransformationClient
|
|
37
|
-
from DIRAC.WorkloadManagementSystem.
|
|
38
|
-
from DIRAC.WorkloadManagementSystem.
|
|
35
|
+
from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
|
|
36
|
+
from DIRAC.WorkloadManagementSystem.Service.JobPolicy import (
|
|
37
|
+
RIGHT_DELETE,
|
|
38
|
+
RIGHT_KILL,
|
|
39
|
+
)
|
|
40
|
+
from DIRAC.WorkloadManagementSystem.Utilities.jobAdministration import kill_delete_jobs
|
|
39
41
|
|
|
40
42
|
# # agent's name
|
|
41
43
|
AGENT_NAME = "Transformation/TransformationCleaningAgent"
|
|
@@ -59,12 +61,12 @@ class TransformationCleaningAgent(AgentModule):
|
|
|
59
61
|
|
|
60
62
|
# # transformation client
|
|
61
63
|
self.transClient = None
|
|
62
|
-
# # wms client
|
|
63
|
-
self.wmsClient = None
|
|
64
64
|
# # request client
|
|
65
65
|
self.reqClient = None
|
|
66
66
|
# # file catalog client
|
|
67
67
|
self.metadataClient = None
|
|
68
|
+
# # JobDB
|
|
69
|
+
self.jobDB = None
|
|
68
70
|
|
|
69
71
|
# # transformations types
|
|
70
72
|
self.transformationTypes = None
|
|
@@ -119,14 +121,12 @@ class TransformationCleaningAgent(AgentModule):
|
|
|
119
121
|
|
|
120
122
|
# # transformation client
|
|
121
123
|
self.transClient = TransformationClient()
|
|
122
|
-
# # wms client
|
|
123
|
-
self.wmsClient = WMSClient()
|
|
124
124
|
# # request client
|
|
125
125
|
self.reqClient = ReqClient()
|
|
126
126
|
# # file catalog client
|
|
127
127
|
self.metadataClient = FileCatalogClient()
|
|
128
|
-
# # job
|
|
129
|
-
self.
|
|
128
|
+
# # job DB
|
|
129
|
+
self.jobDB = JobDB()
|
|
130
130
|
|
|
131
131
|
return S_OK()
|
|
132
132
|
|
|
@@ -224,7 +224,7 @@ class TransformationCleaningAgent(AgentModule):
|
|
|
224
224
|
So, we should just clean from time to time.
|
|
225
225
|
What I added here is done only when the agent finalizes, and it's quite a light-ish operation anyway.
|
|
226
226
|
"""
|
|
227
|
-
res = self.
|
|
227
|
+
res = self.jobDB.getDistinctJobAttributes("JobGroup", None, datetime.utcnow() - timedelta(days=365))
|
|
228
228
|
if not res["OK"]:
|
|
229
229
|
self.log.error("Failed to get job groups", res["Message"])
|
|
230
230
|
return res
|
|
@@ -268,7 +268,7 @@ class TransformationCleaningAgent(AgentModule):
|
|
|
268
268
|
|
|
269
269
|
# Remove JobIDs that were unknown to the TransformationSystem
|
|
270
270
|
jobGroupsToCheck = [str(transDict["TransformationID"]).zfill(8) for transDict in toClean + toArchive]
|
|
271
|
-
res = self.
|
|
271
|
+
res = self.jobDB.selectJobs({"JobGroup": jobGroupsToCheck})
|
|
272
272
|
if not res["OK"]:
|
|
273
273
|
return res
|
|
274
274
|
jobIDsToRemove = [int(jobID) for jobID in res["Value"]]
|
|
@@ -610,8 +610,8 @@ class TransformationCleaningAgent(AgentModule):
|
|
|
610
610
|
# Prevent 0 job IDs
|
|
611
611
|
jobIDs = [int(j) for j in transJobIDs if int(j)]
|
|
612
612
|
allRemove = True
|
|
613
|
-
for jobList in breakListIntoChunks(jobIDs,
|
|
614
|
-
res =
|
|
613
|
+
for jobList in breakListIntoChunks(jobIDs, 1000):
|
|
614
|
+
res = kill_delete_jobs(RIGHT_KILL, jobList, force=True)
|
|
615
615
|
if res["OK"]:
|
|
616
616
|
self.log.info(f"Successfully killed {len(jobList)} jobs from WMS")
|
|
617
617
|
elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res):
|
|
@@ -623,7 +623,7 @@ class TransformationCleaningAgent(AgentModule):
|
|
|
623
623
|
self.log.error("Failed to kill jobs", f"(n={len(res['FailedJobIDs'])})")
|
|
624
624
|
allRemove = False
|
|
625
625
|
|
|
626
|
-
res =
|
|
626
|
+
res = kill_delete_jobs(RIGHT_DELETE, jobList, force=True)
|
|
627
627
|
if res["OK"]:
|
|
628
628
|
self.log.info("Successfully deleted jobs from WMS", f"(n={len(jobList)})")
|
|
629
629
|
elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res):
|
|
@@ -9,6 +9,8 @@ Utilities for Transformation system
|
|
|
9
9
|
import ast
|
|
10
10
|
import random
|
|
11
11
|
|
|
12
|
+
from cachetools import LRUCache, cached
|
|
13
|
+
from cachetools.keys import hashkey
|
|
12
14
|
from DIRAC import S_OK, S_ERROR, gLogger
|
|
13
15
|
|
|
14
16
|
from DIRAC.Core.Utilities.List import breakListIntoChunks
|
|
@@ -400,6 +402,10 @@ class PluginUtilities:
|
|
|
400
402
|
|
|
401
403
|
return StorageElement(se1).isSameSE(StorageElement(se2))
|
|
402
404
|
|
|
405
|
+
@cached(
|
|
406
|
+
LRUCache(maxsize=1024),
|
|
407
|
+
key=lambda _, a, b: hashkey(a, *sorted(b)),
|
|
408
|
+
)
|
|
403
409
|
def isSameSEInList(self, se1, seList):
|
|
404
410
|
"""Check if an SE is the same as any in a list"""
|
|
405
411
|
if se1 in seList:
|
|
@@ -35,10 +35,12 @@ from DIRAC.RequestManagementSystem.Client.Operation import Operation
|
|
|
35
35
|
from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient
|
|
36
36
|
from DIRAC.RequestManagementSystem.Client.Request import Request
|
|
37
37
|
from DIRAC.WorkloadManagementSystem.Client import JobStatus
|
|
38
|
-
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
|
|
39
|
-
from DIRAC.WorkloadManagementSystem.Client.SandboxStoreClient import SandboxStoreClient
|
|
40
38
|
from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
|
|
41
39
|
from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
|
|
40
|
+
from DIRAC.WorkloadManagementSystem.DB.SandboxMetadataDB import SandboxMetadataDB
|
|
41
|
+
from DIRAC.WorkloadManagementSystem.Service.JobPolicy import RIGHT_DELETE
|
|
42
|
+
from DIRAC.WorkloadManagementSystem.Utilities.jobAdministration import kill_delete_jobs
|
|
43
|
+
from DIRAC.WorkloadManagementSystem.Utilities.JobParameters import getJobParameters
|
|
42
44
|
|
|
43
45
|
|
|
44
46
|
class JobCleaningAgent(AgentModule):
|
|
@@ -152,8 +154,9 @@ class JobCleaningAgent(AgentModule):
|
|
|
152
154
|
return S_OK()
|
|
153
155
|
|
|
154
156
|
self.log.info("Unassigning sandboxes from soon to be deleted jobs", f"({len(jobList)})")
|
|
155
|
-
|
|
156
|
-
|
|
157
|
+
|
|
158
|
+
entitiesList = [f"Job:{jobId}" for jobId in jobList]
|
|
159
|
+
if not (result := SandboxMetadataDB().unassignEntities(entitiesList))["OK"]:
|
|
157
160
|
self.log.error("Cannot unassign jobs to sandboxes", result["Message"])
|
|
158
161
|
return result
|
|
159
162
|
|
|
@@ -229,11 +232,11 @@ class JobCleaningAgent(AgentModule):
|
|
|
229
232
|
if not res["OK"]:
|
|
230
233
|
self.log.error("No DN found", f"for {user}")
|
|
231
234
|
return res
|
|
232
|
-
wmsClient = WMSClient(useCertificates=True, delegatedDN=res["Value"][0], delegatedGroup=ownerGroup)
|
|
233
235
|
if remove:
|
|
236
|
+
wmsClient = WMSClient(useCertificates=True, delegatedDN=res["Value"][0], delegatedGroup=ownerGroup)
|
|
234
237
|
result = wmsClient.removeJob(jobsList)
|
|
235
238
|
else:
|
|
236
|
-
result =
|
|
239
|
+
result = kill_delete_jobs(RIGHT_DELETE, jobsList)
|
|
237
240
|
if not result["OK"]:
|
|
238
241
|
self.log.error(
|
|
239
242
|
f"Could not {'remove' if remove else 'delete'} jobs",
|
|
@@ -293,7 +296,8 @@ class JobCleaningAgent(AgentModule):
|
|
|
293
296
|
failed = {}
|
|
294
297
|
successful = {}
|
|
295
298
|
|
|
296
|
-
|
|
299
|
+
jobIDs = [int(jobID) for jobID in jobIDList]
|
|
300
|
+
result = getJobParameters(jobIDs, "OutputSandboxLFN")
|
|
297
301
|
if not result["OK"]:
|
|
298
302
|
return result
|
|
299
303
|
osLFNDict = result["Value"]
|
|
@@ -14,16 +14,16 @@ import datetime
|
|
|
14
14
|
from DIRAC import S_ERROR, S_OK, gConfig
|
|
15
15
|
from DIRAC.AccountingSystem.Client.Types.Job import Job
|
|
16
16
|
from DIRAC.ConfigurationSystem.Client.Helpers import cfgPath
|
|
17
|
-
from DIRAC.ConfigurationSystem.Client.Helpers.Registry import getDNForUsername
|
|
18
17
|
from DIRAC.Core.Base.AgentModule import AgentModule
|
|
19
18
|
from DIRAC.Core.Utilities import DErrno
|
|
20
19
|
from DIRAC.Core.Utilities.ClassAd.ClassAdLight import ClassAd
|
|
21
20
|
from DIRAC.Core.Utilities.TimeUtilities import fromString, second, toEpoch
|
|
22
21
|
from DIRAC.WorkloadManagementSystem.Client import JobMinorStatus, JobStatus
|
|
23
|
-
from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
|
|
24
22
|
from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
|
|
25
23
|
from DIRAC.WorkloadManagementSystem.DB.JobLoggingDB import JobLoggingDB
|
|
26
24
|
from DIRAC.WorkloadManagementSystem.DB.PilotAgentsDB import PilotAgentsDB
|
|
25
|
+
from DIRAC.WorkloadManagementSystem.Service.JobPolicy import RIGHT_KILL
|
|
26
|
+
from DIRAC.WorkloadManagementSystem.Utilities.jobAdministration import kill_delete_jobs
|
|
27
27
|
from DIRAC.WorkloadManagementSystem.Utilities.JobParameters import getJobParameters
|
|
28
28
|
from DIRAC.WorkloadManagementSystem.Utilities.Utils import rescheduleJobs
|
|
29
29
|
|
|
@@ -235,7 +235,7 @@ class StalledJobAgent(AgentModule):
|
|
|
235
235
|
# Set the jobs Failed, send them a kill signal in case they are not really dead
|
|
236
236
|
# and send accounting info
|
|
237
237
|
if setFailed:
|
|
238
|
-
res =
|
|
238
|
+
res = kill_delete_jobs(RIGHT_KILL, [jobID], nonauthJobList=[], force=True)
|
|
239
239
|
if not res["OK"]:
|
|
240
240
|
self.log.error("Failed to kill job", jobID)
|
|
241
241
|
|
|
@@ -574,26 +574,3 @@ class StalledJobAgent(AgentModule):
|
|
|
574
574
|
continue
|
|
575
575
|
|
|
576
576
|
return S_OK()
|
|
577
|
-
|
|
578
|
-
def _sendKillCommand(self, job):
|
|
579
|
-
"""Send a kill signal to the job such that it cannot continue running.
|
|
580
|
-
|
|
581
|
-
:param int job: ID of job to send kill command
|
|
582
|
-
"""
|
|
583
|
-
|
|
584
|
-
res = self.jobDB.getJobAttribute(job, "Owner")
|
|
585
|
-
if not res["OK"]:
|
|
586
|
-
return res
|
|
587
|
-
owner = res["Value"]
|
|
588
|
-
|
|
589
|
-
res = self.jobDB.getJobAttribute(job, "OwnerGroup")
|
|
590
|
-
if not res["OK"]:
|
|
591
|
-
return res
|
|
592
|
-
ownerGroup = res["Value"]
|
|
593
|
-
|
|
594
|
-
wmsClient = WMSClient(
|
|
595
|
-
useCertificates=True,
|
|
596
|
-
delegatedDN=getDNForUsername(owner)["Value"][0] if owner else None,
|
|
597
|
-
delegatedGroup=ownerGroup,
|
|
598
|
-
)
|
|
599
|
-
return wmsClient.killJob(job)
|
|
@@ -9,10 +9,11 @@
|
|
|
9
9
|
"""
|
|
10
10
|
import datetime
|
|
11
11
|
|
|
12
|
-
from DIRAC import S_ERROR, S_OK
|
|
12
|
+
from DIRAC import S_ERROR, S_OK, gConfig
|
|
13
13
|
from DIRAC.AccountingSystem.Client.DataStoreClient import DataStoreClient
|
|
14
14
|
from DIRAC.AccountingSystem.Client.Types.WMSHistory import WMSHistory
|
|
15
15
|
from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
|
|
16
|
+
from DIRAC.ConfigurationSystem.Client.Helpers.Resources import getSites
|
|
16
17
|
from DIRAC.Core.Base.AgentModule import AgentModule
|
|
17
18
|
from DIRAC.Core.Utilities import TimeUtilities
|
|
18
19
|
from DIRAC.MonitoringSystem.Client.MonitoringReporter import MonitoringReporter
|
|
@@ -77,6 +78,8 @@ class StatesAccountingAgent(AgentModule):
|
|
|
77
78
|
def execute(self):
|
|
78
79
|
"""Main execution method"""
|
|
79
80
|
|
|
81
|
+
site_metadata = self._getSitesMetadata()
|
|
82
|
+
|
|
80
83
|
# on the first iteration of the agent, do nothing in order to avoid double committing after a restart
|
|
81
84
|
if self.am_getModuleParam("cyclesDone") == 0:
|
|
82
85
|
self.log.notice("Skipping the first iteration of the agent")
|
|
@@ -131,6 +134,16 @@ class StatesAccountingAgent(AgentModule):
|
|
|
131
134
|
|
|
132
135
|
for backend in self.datastores:
|
|
133
136
|
if backend.lower() == "monitoring":
|
|
137
|
+
site_name = rD["Site"]
|
|
138
|
+
if site_name not in site_metadata:
|
|
139
|
+
self.log.warn(
|
|
140
|
+
f"Site {site_name} not found in site metadata, using default values",
|
|
141
|
+
)
|
|
142
|
+
rD["Tier"] = "4"
|
|
143
|
+
rD["Type"] = site_name.split(".")[0]
|
|
144
|
+
else:
|
|
145
|
+
rD["Tier"] = site_metadata[site_name]["Tier"]
|
|
146
|
+
rD["Type"] = site_metadata[site_name]["Type"]
|
|
134
147
|
rD["timestamp"] = int(TimeUtilities.toEpochMilliSeconds(now))
|
|
135
148
|
self.datastores["Monitoring"].addRecord(rD)
|
|
136
149
|
|
|
@@ -154,3 +167,30 @@ class StatesAccountingAgent(AgentModule):
|
|
|
154
167
|
self.log.verbose(f"Done committing WMSHistory to {backend} backend")
|
|
155
168
|
|
|
156
169
|
return S_OK()
|
|
170
|
+
|
|
171
|
+
def _getSitesMetadata(self):
|
|
172
|
+
"""Get the metadata for the sites"""
|
|
173
|
+
res = getSites()
|
|
174
|
+
if not res["OK"]:
|
|
175
|
+
return res
|
|
176
|
+
sites = res["Value"]
|
|
177
|
+
site_metadata = {}
|
|
178
|
+
|
|
179
|
+
for site in sites:
|
|
180
|
+
site_metadata[site] = {}
|
|
181
|
+
|
|
182
|
+
# Get the site metadata from the Configuration System
|
|
183
|
+
grid = site.split(".")[0]
|
|
184
|
+
res = gConfig.getOptionsDict(f"Resources/Sites/{grid}/{site}")
|
|
185
|
+
if not res["OK"]:
|
|
186
|
+
self.log.error("Failure getting options dict for site", f"{site}: {res['Message']}")
|
|
187
|
+
continue
|
|
188
|
+
siteInfoCS = res["Value"]
|
|
189
|
+
|
|
190
|
+
# The site tier is normally 1 or 2. A few VOs may define tier 3.
|
|
191
|
+
# If the tier is not defined, we assume it is 4, with 4 meaning "not pledged" (opportunistic).
|
|
192
|
+
site_metadata[site]["Tier"] = siteInfoCS.get("MoUTierLevel", "4")
|
|
193
|
+
# The site type is defined by the first part of the site name.
|
|
194
|
+
# It needs to be interpreted at the Monitoring side (e.g. in Grafana).
|
|
195
|
+
site_metadata[site]["Type"] = site.split(".")[0]
|
|
196
|
+
return site_metadata
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
""" Test class for Job Cleaning Agent
|
|
2
2
|
"""
|
|
3
|
-
import pytest
|
|
4
3
|
from unittest.mock import MagicMock
|
|
5
4
|
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
6
7
|
# DIRAC Components
|
|
7
|
-
from DIRAC import
|
|
8
|
+
from DIRAC import S_OK, gLogger
|
|
8
9
|
from DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent import JobCleaningAgent
|
|
9
10
|
|
|
10
11
|
gLogger.setLevel("DEBUG")
|
|
@@ -32,7 +33,6 @@ def jca(mocker):
|
|
|
32
33
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.JobDB.selectJobs", side_effect=mockReply)
|
|
33
34
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.JobDB.__init__", side_effect=mockNone)
|
|
34
35
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.ReqClient", return_value=mockNone)
|
|
35
|
-
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.JobMonitoringClient", return_value=mockJMC)
|
|
36
36
|
|
|
37
37
|
jca = JobCleaningAgent()
|
|
38
38
|
jca.log = gLogger
|
|
@@ -98,7 +98,7 @@ def test_deleteJobsByStatus(jca, conditions, mockReplyInput, expected):
|
|
|
98
98
|
"inputs, params, expected",
|
|
99
99
|
[
|
|
100
100
|
([], {"OK": True, "Value": {}}, {"OK": True, "Value": {"Failed": {}, "Successful": {}}}),
|
|
101
|
-
(["
|
|
101
|
+
(["123", "456"], {"OK": True, "Value": {}}, {"OK": True, "Value": {"Failed": {}, "Successful": {}}}),
|
|
102
102
|
(
|
|
103
103
|
[],
|
|
104
104
|
{"OK": True, "Value": {1: {"OutputSandboxLFN": "/some/lfn/1.txt"}}},
|
|
@@ -113,11 +113,11 @@ def test_deleteJobsByStatus(jca, conditions, mockReplyInput, expected):
|
|
|
113
113
|
{"OK": True, "Value": {"Failed": {}, "Successful": {1: "/some/lfn/1.txt", 2: "/some/other/lfn/2.txt"}}},
|
|
114
114
|
),
|
|
115
115
|
(
|
|
116
|
-
["
|
|
116
|
+
["123", "456"],
|
|
117
117
|
{"OK": True, "Value": {1: {"OutputSandboxLFN": "/some/lfn/1.txt"}}},
|
|
118
118
|
{"OK": True, "Value": {"Failed": {}, "Successful": {1: "/some/lfn/1.txt"}}},
|
|
119
119
|
),
|
|
120
|
-
(["
|
|
120
|
+
(["123", "456"], {"OK": False}, {"OK": False}),
|
|
121
121
|
],
|
|
122
122
|
)
|
|
123
123
|
def test_deleteJobOversizedSandbox(mocker, inputs, params, expected):
|
|
@@ -127,10 +127,10 @@ def test_deleteJobOversizedSandbox(mocker, inputs, params, expected):
|
|
|
127
127
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.AgentModule.am_getOption", return_value=mockAM)
|
|
128
128
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.JobDB", return_value=mockNone)
|
|
129
129
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.ReqClient", return_value=mockNone)
|
|
130
|
-
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.JobMonitoringClient", return_value=mockJMC)
|
|
131
130
|
mocker.patch(
|
|
132
131
|
"DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.getDNForUsername", return_value=S_OK(["/bih/boh/DN"])
|
|
133
132
|
)
|
|
133
|
+
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.getJobParameters", return_value=params)
|
|
134
134
|
|
|
135
135
|
jobCleaningAgent = JobCleaningAgent()
|
|
136
136
|
jobCleaningAgent.log = gLogger
|
|
@@ -138,8 +138,6 @@ def test_deleteJobOversizedSandbox(mocker, inputs, params, expected):
|
|
|
138
138
|
jobCleaningAgent._AgentModule__configDefaults = mockAM
|
|
139
139
|
jobCleaningAgent.initialize()
|
|
140
140
|
|
|
141
|
-
mockJMC.getJobParameters.return_value = params
|
|
142
|
-
|
|
143
141
|
result = jobCleaningAgent.deleteJobOversizedSandbox(inputs)
|
|
144
142
|
|
|
145
143
|
assert result == expected
|
|
@@ -28,8 +28,7 @@ def sja(mocker):
|
|
|
28
28
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.rescheduleJobs", return_value=MagicMock())
|
|
29
29
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.PilotAgentsDB", return_value=MagicMock())
|
|
30
30
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.getJobParameters", return_value=MagicMock())
|
|
31
|
-
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.
|
|
32
|
-
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.getDNForUsername", return_value=MagicMock())
|
|
31
|
+
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.kill_delete_jobs", return_value=MagicMock())
|
|
33
32
|
|
|
34
33
|
stalledJobAgent = StalledJobAgent()
|
|
35
34
|
stalledJobAgent._AgentModule__configDefaults = mockAM
|
|
@@ -14,12 +14,14 @@ except ImportError:
|
|
|
14
14
|
|
|
15
15
|
@createClient("WorkloadManagement/JobMonitoring")
|
|
16
16
|
class JobMonitoringClient(Client):
|
|
17
|
+
# Set to None to raise an error if this service is set as "legacy adapted"
|
|
18
|
+
# See ClientSelector
|
|
19
|
+
diracxClient = None
|
|
20
|
+
|
|
17
21
|
def __init__(self, **kwargs):
|
|
18
22
|
super().__init__(**kwargs)
|
|
19
23
|
self.setServer("WorkloadManagement/JobMonitoring")
|
|
20
24
|
|
|
21
|
-
diracxClient = futureJobMonitoringClient
|
|
22
|
-
|
|
23
25
|
@ignoreEncodeWarning
|
|
24
26
|
def getJobsStatus(self, jobIDs):
|
|
25
27
|
res = self._getRPC().getJobsStatus(jobIDs)
|
|
@@ -38,15 +40,6 @@ class JobMonitoringClient(Client):
|
|
|
38
40
|
res["Value"] = strToIntDict(res["Value"])
|
|
39
41
|
return res
|
|
40
42
|
|
|
41
|
-
@ignoreEncodeWarning
|
|
42
|
-
def getJobsParameters(self, jobIDs, parameters):
|
|
43
|
-
res = self._getRPC().getJobsParameters(jobIDs, parameters)
|
|
44
|
-
|
|
45
|
-
# Cast the str keys to int
|
|
46
|
-
if res["OK"]:
|
|
47
|
-
res["Value"] = strToIntDict(res["Value"])
|
|
48
|
-
return res
|
|
49
|
-
|
|
50
43
|
@ignoreEncodeWarning
|
|
51
44
|
def getJobsMinorStatus(self, jobIDs):
|
|
52
45
|
res = self._getRPC().getJobsMinorStatus(jobIDs)
|