DIRAC 9.0.0a62__py3-none-any.whl → 9.0.0a63__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- DIRAC/ConfigurationSystem/Client/Helpers/Registry.py +1 -1
- DIRAC/Core/Security/DiracX.py +1 -1
- DIRAC/Core/Tornado/Client/ClientSelector.py +4 -1
- DIRAC/Core/Utilities/Extensions.py +10 -1
- DIRAC/Interfaces/API/Dirac.py +22 -13
- DIRAC/TransformationSystem/Agent/TransformationCleaningAgent.py +9 -12
- DIRAC/WorkloadManagementSystem/Agent/JobCleaningAgent.py +7 -4
- DIRAC/WorkloadManagementSystem/Agent/StalledJobAgent.py +3 -26
- DIRAC/WorkloadManagementSystem/Agent/StatesAccountingAgent.py +9 -2
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_JobCleaningAgent.py +7 -9
- DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_StalledJobAgent.py +1 -2
- DIRAC/WorkloadManagementSystem/Client/JobMonitoringClient.py +4 -2
- DIRAC/WorkloadManagementSystem/Client/JobStatus.py +0 -59
- DIRAC/WorkloadManagementSystem/Client/SandboxStoreClient.py +25 -1
- DIRAC/WorkloadManagementSystem/Client/WMSClient.py +2 -3
- DIRAC/WorkloadManagementSystem/FutureClient/JobStateUpdateClient.py +2 -14
- DIRAC/WorkloadManagementSystem/Service/JobManagerHandler.py +27 -128
- DIRAC/WorkloadManagementSystem/Service/JobStateUpdateHandler.py +0 -16
- DIRAC/WorkloadManagementSystem/Utilities/JobParameters.py +1 -1
- DIRAC/WorkloadManagementSystem/Utilities/PilotWrapper.py +2 -0
- DIRAC/WorkloadManagementSystem/Utilities/jobAdministration.py +138 -0
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_JobAdministration.py +28 -0
- {dirac-9.0.0a62.dist-info → dirac-9.0.0a63.dist-info}/METADATA +2 -1
- {dirac-9.0.0a62.dist-info → dirac-9.0.0a63.dist-info}/RECORD +28 -30
- {dirac-9.0.0a62.dist-info → dirac-9.0.0a63.dist-info}/entry_points.txt +0 -3
- DIRAC/TransformationSystem/scripts/dirac_transformation_archive.py +0 -30
- DIRAC/TransformationSystem/scripts/dirac_transformation_clean.py +0 -30
- DIRAC/TransformationSystem/scripts/dirac_transformation_remove_output.py +0 -30
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_JobManager.py +0 -58
- {dirac-9.0.0a62.dist-info → dirac-9.0.0a63.dist-info}/WHEEL +0 -0
- {dirac-9.0.0a62.dist-info → dirac-9.0.0a63.dist-info}/licenses/LICENSE +0 -0
- {dirac-9.0.0a62.dist-info → dirac-9.0.0a63.dist-info}/top_level.txt +0 -0
|
@@ -457,7 +457,7 @@ def getVOForGroup(group):
|
|
|
457
457
|
|
|
458
458
|
:return: str
|
|
459
459
|
"""
|
|
460
|
-
return
|
|
460
|
+
return gConfig.getValue(f"{gBaseRegistrySection}/Groups/{group}/VO", "") or getVO()
|
|
461
461
|
|
|
462
462
|
|
|
463
463
|
def getIdPForGroup(group):
|
DIRAC/Core/Security/DiracX.py
CHANGED
|
@@ -47,7 +47,7 @@ RE_DIRACX_PEM = re.compile(rf"{PEM_BEGIN}\n(.*)\n{PEM_END}", re.MULTILINE | re.D
|
|
|
47
47
|
def addTokenToPEM(pemPath, group):
|
|
48
48
|
from DIRAC.Core.Base.Client import Client
|
|
49
49
|
|
|
50
|
-
vo = Registry.
|
|
50
|
+
vo = Registry.getVOForGroup(group)
|
|
51
51
|
if not vo:
|
|
52
52
|
gLogger.error(f"ERROR: Could not find VO for group {group}, DiracX will not work!")
|
|
53
53
|
disabledVOs = gConfig.getValue("/DiracX/DisabledVOs", [])
|
|
@@ -17,7 +17,6 @@ from DIRAC.Core.DISET.RPCClient import RPCClient
|
|
|
17
17
|
from DIRAC.Core.DISET.TransferClient import TransferClient
|
|
18
18
|
from DIRAC.Core.Tornado.Client.TornadoClient import TornadoClient
|
|
19
19
|
|
|
20
|
-
|
|
21
20
|
sLog = gLogger.getSubLogger(__name__)
|
|
22
21
|
|
|
23
22
|
|
|
@@ -82,6 +81,10 @@ def ClientSelector(disetClient, *args, **kwargs): # We use same interface as RP
|
|
|
82
81
|
rpc = tornadoClient(*args, **kwargs)
|
|
83
82
|
else:
|
|
84
83
|
rpc = disetClient(*args, **kwargs)
|
|
84
|
+
except NotImplementedError as e:
|
|
85
|
+
# We catch explicitly NotImplementedError to avoid just printing "there's an error"
|
|
86
|
+
# If we mis-configured the CS for legacy adapted services, we MUST have an error.
|
|
87
|
+
raise e
|
|
85
88
|
except Exception as e: # pylint: disable=broad-except
|
|
86
89
|
# If anything went wrong in the resolution, we return default RPCClient
|
|
87
90
|
# So the behaviour is exactly the same as before implementation of Tornado
|
|
@@ -73,6 +73,15 @@ def findServices(modules):
|
|
|
73
73
|
return findModules(modules, "Service", "*Handler")
|
|
74
74
|
|
|
75
75
|
|
|
76
|
+
def findFutureServices(modules):
|
|
77
|
+
"""Find the legacy adapted services for one or more DIRAC extension(s)
|
|
78
|
+
|
|
79
|
+
:param list/str/module module: One or more Python modules or Python module names
|
|
80
|
+
:returns: list of tuples of the form (SystemName, ServiceName)
|
|
81
|
+
"""
|
|
82
|
+
return findModules(modules, "FutureClient")
|
|
83
|
+
|
|
84
|
+
|
|
76
85
|
@iterateThenSort
|
|
77
86
|
def findDatabases(module):
|
|
78
87
|
"""Find the DB SQL schema defintions for one or more DIRAC extension(s)
|
|
@@ -182,7 +191,7 @@ def parseArgs():
|
|
|
182
191
|
parser = argparse.ArgumentParser()
|
|
183
192
|
subparsers = parser.add_subparsers(required=True, dest="function")
|
|
184
193
|
defaultExtensions = extensionsByPriority()
|
|
185
|
-
for func in [findSystems, findAgents, findExecutors, findServices, findDatabases]:
|
|
194
|
+
for func in [findSystems, findAgents, findExecutors, findServices, findDatabases, findFutureServices]:
|
|
186
195
|
subparser = subparsers.add_parser(func.__name__)
|
|
187
196
|
subparser.add_argument("--extensions", nargs="+", default=defaultExtensions)
|
|
188
197
|
subparser.set_defaults(func=func)
|
DIRAC/Interfaces/API/Dirac.py
CHANGED
|
@@ -46,6 +46,7 @@ from DIRAC.WorkloadManagementSystem.Client import JobStatus
|
|
|
46
46
|
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
|
|
47
47
|
from DIRAC.WorkloadManagementSystem.Client.SandboxStoreClient import SandboxStoreClient
|
|
48
48
|
from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
|
|
49
|
+
from DIRAC.WorkloadManagementSystem.Utilities.jobAdministration import _filterJobStateTransition
|
|
49
50
|
|
|
50
51
|
|
|
51
52
|
def parseArguments(args):
|
|
@@ -1450,10 +1451,13 @@ class Dirac(API):
|
|
|
1450
1451
|
# Remove any job IDs that can't change to the Killed or Deleted states
|
|
1451
1452
|
filteredJobs = set()
|
|
1452
1453
|
for filterState in (JobStatus.KILLED, JobStatus.DELETED):
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1454
|
+
# get a dictionary of jobID:status
|
|
1455
|
+
res = JobMonitoringClient().getJobsStatus(jobIDs)
|
|
1456
|
+
if not res["OK"]:
|
|
1457
|
+
return res
|
|
1458
|
+
js = {k: v["Status"] for k, v in res["Value"].items()}
|
|
1459
|
+
# then filter
|
|
1460
|
+
filteredJobs.update(_filterJobStateTransition(js, filterState))
|
|
1457
1461
|
|
|
1458
1462
|
return WMSClient(useCertificates=self.useCertificates).deleteJob(list(filteredJobs))
|
|
1459
1463
|
|
|
@@ -1480,11 +1484,13 @@ class Dirac(API):
|
|
|
1480
1484
|
return ret
|
|
1481
1485
|
jobIDs = ret["Value"]
|
|
1482
1486
|
|
|
1483
|
-
#
|
|
1484
|
-
|
|
1485
|
-
if not
|
|
1486
|
-
return
|
|
1487
|
-
|
|
1487
|
+
# get a dictionary of jobID:status
|
|
1488
|
+
res = JobMonitoringClient().getJobsStatus(jobIDs)
|
|
1489
|
+
if not res["OK"]:
|
|
1490
|
+
return res
|
|
1491
|
+
js = {k: v["Status"] for k, v in res["Value"].items()}
|
|
1492
|
+
# then filter
|
|
1493
|
+
jobIDsToReschedule = _filterJobStateTransition(js, JobStatus.RESCHEDULED)
|
|
1488
1494
|
|
|
1489
1495
|
return WMSClient(useCertificates=self.useCertificates).rescheduleJob(jobIDsToReschedule)
|
|
1490
1496
|
|
|
@@ -1510,10 +1516,13 @@ class Dirac(API):
|
|
|
1510
1516
|
# Remove any job IDs that can't change to the Killed or Deleted states
|
|
1511
1517
|
filteredJobs = set()
|
|
1512
1518
|
for filterState in (JobStatus.KILLED, JobStatus.DELETED):
|
|
1513
|
-
|
|
1514
|
-
|
|
1515
|
-
|
|
1516
|
-
|
|
1519
|
+
# get a dictionary of jobID:status
|
|
1520
|
+
res = JobMonitoringClient().getJobsStatus(jobIDs)
|
|
1521
|
+
if not res["OK"]:
|
|
1522
|
+
return res
|
|
1523
|
+
js = {k: v["Status"] for k, v in res["Value"].items()}
|
|
1524
|
+
# then filter
|
|
1525
|
+
filteredJobs.update(_filterJobStateTransition(js, filterState))
|
|
1517
1526
|
|
|
1518
1527
|
return WMSClient(useCertificates=self.useCertificates).killJob(list(filteredJobs))
|
|
1519
1528
|
|
|
@@ -32,9 +32,12 @@ from DIRAC.Resources.Catalog.FileCatalogClient import FileCatalogClient
|
|
|
32
32
|
from DIRAC.Resources.Storage.StorageElement import StorageElement
|
|
33
33
|
from DIRAC.TransformationSystem.Client import TransformationStatus
|
|
34
34
|
from DIRAC.TransformationSystem.Client.TransformationClient import TransformationClient
|
|
35
|
-
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
|
|
36
|
-
from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
|
|
37
35
|
from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
|
|
36
|
+
from DIRAC.WorkloadManagementSystem.Service.JobPolicy import (
|
|
37
|
+
RIGHT_DELETE,
|
|
38
|
+
RIGHT_KILL,
|
|
39
|
+
)
|
|
40
|
+
from DIRAC.WorkloadManagementSystem.Utilities.jobAdministration import kill_delete_jobs
|
|
38
41
|
|
|
39
42
|
# # agent's name
|
|
40
43
|
AGENT_NAME = "Transformation/TransformationCleaningAgent"
|
|
@@ -58,8 +61,6 @@ class TransformationCleaningAgent(AgentModule):
|
|
|
58
61
|
|
|
59
62
|
# # transformation client
|
|
60
63
|
self.transClient = None
|
|
61
|
-
# # wms client
|
|
62
|
-
self.wmsClient = None
|
|
63
64
|
# # request client
|
|
64
65
|
self.reqClient = None
|
|
65
66
|
# # file catalog client
|
|
@@ -120,14 +121,10 @@ class TransformationCleaningAgent(AgentModule):
|
|
|
120
121
|
|
|
121
122
|
# # transformation client
|
|
122
123
|
self.transClient = TransformationClient()
|
|
123
|
-
# # wms client
|
|
124
|
-
self.wmsClient = WMSClient()
|
|
125
124
|
# # request client
|
|
126
125
|
self.reqClient = ReqClient()
|
|
127
126
|
# # file catalog client
|
|
128
127
|
self.metadataClient = FileCatalogClient()
|
|
129
|
-
# # job monitoring client
|
|
130
|
-
self.jobMonitoringClient = JobMonitoringClient()
|
|
131
128
|
# # job DB
|
|
132
129
|
self.jobDB = JobDB()
|
|
133
130
|
|
|
@@ -271,7 +268,7 @@ class TransformationCleaningAgent(AgentModule):
|
|
|
271
268
|
|
|
272
269
|
# Remove JobIDs that were unknown to the TransformationSystem
|
|
273
270
|
jobGroupsToCheck = [str(transDict["TransformationID"]).zfill(8) for transDict in toClean + toArchive]
|
|
274
|
-
res = self.
|
|
271
|
+
res = self.jobDB.selectJobs({"JobGroup": jobGroupsToCheck})
|
|
275
272
|
if not res["OK"]:
|
|
276
273
|
return res
|
|
277
274
|
jobIDsToRemove = [int(jobID) for jobID in res["Value"]]
|
|
@@ -613,8 +610,8 @@ class TransformationCleaningAgent(AgentModule):
|
|
|
613
610
|
# Prevent 0 job IDs
|
|
614
611
|
jobIDs = [int(j) for j in transJobIDs if int(j)]
|
|
615
612
|
allRemove = True
|
|
616
|
-
for jobList in breakListIntoChunks(jobIDs,
|
|
617
|
-
res =
|
|
613
|
+
for jobList in breakListIntoChunks(jobIDs, 1000):
|
|
614
|
+
res = kill_delete_jobs(RIGHT_KILL, jobList, force=True)
|
|
618
615
|
if res["OK"]:
|
|
619
616
|
self.log.info(f"Successfully killed {len(jobList)} jobs from WMS")
|
|
620
617
|
elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res):
|
|
@@ -626,7 +623,7 @@ class TransformationCleaningAgent(AgentModule):
|
|
|
626
623
|
self.log.error("Failed to kill jobs", f"(n={len(res['FailedJobIDs'])})")
|
|
627
624
|
allRemove = False
|
|
628
625
|
|
|
629
|
-
res =
|
|
626
|
+
res = kill_delete_jobs(RIGHT_DELETE, jobList, force=True)
|
|
630
627
|
if res["OK"]:
|
|
631
628
|
self.log.info("Successfully deleted jobs from WMS", f"(n={len(jobList)})")
|
|
632
629
|
elif ("InvalidJobIDs" in res) and ("NonauthorizedJobIDs" not in res) and ("FailedJobIDs" not in res):
|
|
@@ -35,10 +35,12 @@ from DIRAC.RequestManagementSystem.Client.Operation import Operation
|
|
|
35
35
|
from DIRAC.RequestManagementSystem.Client.ReqClient import ReqClient
|
|
36
36
|
from DIRAC.RequestManagementSystem.Client.Request import Request
|
|
37
37
|
from DIRAC.WorkloadManagementSystem.Client import JobStatus
|
|
38
|
-
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
|
|
39
38
|
from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
|
|
40
39
|
from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
|
|
41
40
|
from DIRAC.WorkloadManagementSystem.DB.SandboxMetadataDB import SandboxMetadataDB
|
|
41
|
+
from DIRAC.WorkloadManagementSystem.Service.JobPolicy import RIGHT_DELETE
|
|
42
|
+
from DIRAC.WorkloadManagementSystem.Utilities.jobAdministration import kill_delete_jobs
|
|
43
|
+
from DIRAC.WorkloadManagementSystem.Utilities.JobParameters import getJobParameters
|
|
42
44
|
|
|
43
45
|
|
|
44
46
|
class JobCleaningAgent(AgentModule):
|
|
@@ -230,11 +232,11 @@ class JobCleaningAgent(AgentModule):
|
|
|
230
232
|
if not res["OK"]:
|
|
231
233
|
self.log.error("No DN found", f"for {user}")
|
|
232
234
|
return res
|
|
233
|
-
wmsClient = WMSClient(useCertificates=True, delegatedDN=res["Value"][0], delegatedGroup=ownerGroup)
|
|
234
235
|
if remove:
|
|
236
|
+
wmsClient = WMSClient(useCertificates=True, delegatedDN=res["Value"][0], delegatedGroup=ownerGroup)
|
|
235
237
|
result = wmsClient.removeJob(jobsList)
|
|
236
238
|
else:
|
|
237
|
-
result =
|
|
239
|
+
result = kill_delete_jobs(RIGHT_DELETE, jobsList)
|
|
238
240
|
if not result["OK"]:
|
|
239
241
|
self.log.error(
|
|
240
242
|
f"Could not {'remove' if remove else 'delete'} jobs",
|
|
@@ -294,7 +296,8 @@ class JobCleaningAgent(AgentModule):
|
|
|
294
296
|
failed = {}
|
|
295
297
|
successful = {}
|
|
296
298
|
|
|
297
|
-
|
|
299
|
+
jobIDs = [int(jobID) for jobID in jobIDList]
|
|
300
|
+
result = getJobParameters(jobIDs, "OutputSandboxLFN")
|
|
298
301
|
if not result["OK"]:
|
|
299
302
|
return result
|
|
300
303
|
osLFNDict = result["Value"]
|
|
@@ -14,16 +14,16 @@ import datetime
|
|
|
14
14
|
from DIRAC import S_ERROR, S_OK, gConfig
|
|
15
15
|
from DIRAC.AccountingSystem.Client.Types.Job import Job
|
|
16
16
|
from DIRAC.ConfigurationSystem.Client.Helpers import cfgPath
|
|
17
|
-
from DIRAC.ConfigurationSystem.Client.Helpers.Registry import getDNForUsername
|
|
18
17
|
from DIRAC.Core.Base.AgentModule import AgentModule
|
|
19
18
|
from DIRAC.Core.Utilities import DErrno
|
|
20
19
|
from DIRAC.Core.Utilities.ClassAd.ClassAdLight import ClassAd
|
|
21
20
|
from DIRAC.Core.Utilities.TimeUtilities import fromString, second, toEpoch
|
|
22
21
|
from DIRAC.WorkloadManagementSystem.Client import JobMinorStatus, JobStatus
|
|
23
|
-
from DIRAC.WorkloadManagementSystem.Client.WMSClient import WMSClient
|
|
24
22
|
from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
|
|
25
23
|
from DIRAC.WorkloadManagementSystem.DB.JobLoggingDB import JobLoggingDB
|
|
26
24
|
from DIRAC.WorkloadManagementSystem.DB.PilotAgentsDB import PilotAgentsDB
|
|
25
|
+
from DIRAC.WorkloadManagementSystem.Service.JobPolicy import RIGHT_KILL
|
|
26
|
+
from DIRAC.WorkloadManagementSystem.Utilities.jobAdministration import kill_delete_jobs
|
|
27
27
|
from DIRAC.WorkloadManagementSystem.Utilities.JobParameters import getJobParameters
|
|
28
28
|
from DIRAC.WorkloadManagementSystem.Utilities.Utils import rescheduleJobs
|
|
29
29
|
|
|
@@ -235,7 +235,7 @@ class StalledJobAgent(AgentModule):
|
|
|
235
235
|
# Set the jobs Failed, send them a kill signal in case they are not really dead
|
|
236
236
|
# and send accounting info
|
|
237
237
|
if setFailed:
|
|
238
|
-
res =
|
|
238
|
+
res = kill_delete_jobs(RIGHT_KILL, [jobID], nonauthJobList=[], force=True)
|
|
239
239
|
if not res["OK"]:
|
|
240
240
|
self.log.error("Failed to kill job", jobID)
|
|
241
241
|
|
|
@@ -574,26 +574,3 @@ class StalledJobAgent(AgentModule):
|
|
|
574
574
|
continue
|
|
575
575
|
|
|
576
576
|
return S_OK()
|
|
577
|
-
|
|
578
|
-
def _sendKillCommand(self, job):
|
|
579
|
-
"""Send a kill signal to the job such that it cannot continue running.
|
|
580
|
-
|
|
581
|
-
:param int job: ID of job to send kill command
|
|
582
|
-
"""
|
|
583
|
-
|
|
584
|
-
res = self.jobDB.getJobAttribute(job, "Owner")
|
|
585
|
-
if not res["OK"]:
|
|
586
|
-
return res
|
|
587
|
-
owner = res["Value"]
|
|
588
|
-
|
|
589
|
-
res = self.jobDB.getJobAttribute(job, "OwnerGroup")
|
|
590
|
-
if not res["OK"]:
|
|
591
|
-
return res
|
|
592
|
-
ownerGroup = res["Value"]
|
|
593
|
-
|
|
594
|
-
wmsClient = WMSClient(
|
|
595
|
-
useCertificates=True,
|
|
596
|
-
delegatedDN=getDNForUsername(owner)["Value"][0] if owner else None,
|
|
597
|
-
delegatedGroup=ownerGroup,
|
|
598
|
-
)
|
|
599
|
-
return wmsClient.killJob(job)
|
|
@@ -135,8 +135,15 @@ class StatesAccountingAgent(AgentModule):
|
|
|
135
135
|
for backend in self.datastores:
|
|
136
136
|
if backend.lower() == "monitoring":
|
|
137
137
|
site_name = rD["Site"]
|
|
138
|
-
|
|
139
|
-
|
|
138
|
+
if site_name not in site_metadata:
|
|
139
|
+
self.log.warn(
|
|
140
|
+
f"Site {site_name} not found in site metadata, using default values",
|
|
141
|
+
)
|
|
142
|
+
rD["Tier"] = "4"
|
|
143
|
+
rD["Type"] = site_name.split(".")[0]
|
|
144
|
+
else:
|
|
145
|
+
rD["Tier"] = site_metadata[site_name]["Tier"]
|
|
146
|
+
rD["Type"] = site_metadata[site_name]["Type"]
|
|
140
147
|
rD["timestamp"] = int(TimeUtilities.toEpochMilliSeconds(now))
|
|
141
148
|
self.datastores["Monitoring"].addRecord(rD)
|
|
142
149
|
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
""" Test class for Job Cleaning Agent
|
|
2
2
|
"""
|
|
3
|
-
import pytest
|
|
4
3
|
from unittest.mock import MagicMock
|
|
5
4
|
|
|
5
|
+
import pytest
|
|
6
|
+
|
|
6
7
|
# DIRAC Components
|
|
7
|
-
from DIRAC import
|
|
8
|
+
from DIRAC import S_OK, gLogger
|
|
8
9
|
from DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent import JobCleaningAgent
|
|
9
10
|
|
|
10
11
|
gLogger.setLevel("DEBUG")
|
|
@@ -32,7 +33,6 @@ def jca(mocker):
|
|
|
32
33
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.JobDB.selectJobs", side_effect=mockReply)
|
|
33
34
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.JobDB.__init__", side_effect=mockNone)
|
|
34
35
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.ReqClient", return_value=mockNone)
|
|
35
|
-
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.JobMonitoringClient", return_value=mockJMC)
|
|
36
36
|
|
|
37
37
|
jca = JobCleaningAgent()
|
|
38
38
|
jca.log = gLogger
|
|
@@ -98,7 +98,7 @@ def test_deleteJobsByStatus(jca, conditions, mockReplyInput, expected):
|
|
|
98
98
|
"inputs, params, expected",
|
|
99
99
|
[
|
|
100
100
|
([], {"OK": True, "Value": {}}, {"OK": True, "Value": {"Failed": {}, "Successful": {}}}),
|
|
101
|
-
(["
|
|
101
|
+
(["123", "456"], {"OK": True, "Value": {}}, {"OK": True, "Value": {"Failed": {}, "Successful": {}}}),
|
|
102
102
|
(
|
|
103
103
|
[],
|
|
104
104
|
{"OK": True, "Value": {1: {"OutputSandboxLFN": "/some/lfn/1.txt"}}},
|
|
@@ -113,11 +113,11 @@ def test_deleteJobsByStatus(jca, conditions, mockReplyInput, expected):
|
|
|
113
113
|
{"OK": True, "Value": {"Failed": {}, "Successful": {1: "/some/lfn/1.txt", 2: "/some/other/lfn/2.txt"}}},
|
|
114
114
|
),
|
|
115
115
|
(
|
|
116
|
-
["
|
|
116
|
+
["123", "456"],
|
|
117
117
|
{"OK": True, "Value": {1: {"OutputSandboxLFN": "/some/lfn/1.txt"}}},
|
|
118
118
|
{"OK": True, "Value": {"Failed": {}, "Successful": {1: "/some/lfn/1.txt"}}},
|
|
119
119
|
),
|
|
120
|
-
(["
|
|
120
|
+
(["123", "456"], {"OK": False}, {"OK": False}),
|
|
121
121
|
],
|
|
122
122
|
)
|
|
123
123
|
def test_deleteJobOversizedSandbox(mocker, inputs, params, expected):
|
|
@@ -127,10 +127,10 @@ def test_deleteJobOversizedSandbox(mocker, inputs, params, expected):
|
|
|
127
127
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.AgentModule.am_getOption", return_value=mockAM)
|
|
128
128
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.JobDB", return_value=mockNone)
|
|
129
129
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.ReqClient", return_value=mockNone)
|
|
130
|
-
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.JobMonitoringClient", return_value=mockJMC)
|
|
131
130
|
mocker.patch(
|
|
132
131
|
"DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.getDNForUsername", return_value=S_OK(["/bih/boh/DN"])
|
|
133
132
|
)
|
|
133
|
+
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.JobCleaningAgent.getJobParameters", return_value=params)
|
|
134
134
|
|
|
135
135
|
jobCleaningAgent = JobCleaningAgent()
|
|
136
136
|
jobCleaningAgent.log = gLogger
|
|
@@ -138,8 +138,6 @@ def test_deleteJobOversizedSandbox(mocker, inputs, params, expected):
|
|
|
138
138
|
jobCleaningAgent._AgentModule__configDefaults = mockAM
|
|
139
139
|
jobCleaningAgent.initialize()
|
|
140
140
|
|
|
141
|
-
mockJMC.getJobParameters.return_value = params
|
|
142
|
-
|
|
143
141
|
result = jobCleaningAgent.deleteJobOversizedSandbox(inputs)
|
|
144
142
|
|
|
145
143
|
assert result == expected
|
|
@@ -28,8 +28,7 @@ def sja(mocker):
|
|
|
28
28
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.rescheduleJobs", return_value=MagicMock())
|
|
29
29
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.PilotAgentsDB", return_value=MagicMock())
|
|
30
30
|
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.getJobParameters", return_value=MagicMock())
|
|
31
|
-
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.
|
|
32
|
-
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.getDNForUsername", return_value=MagicMock())
|
|
31
|
+
mocker.patch("DIRAC.WorkloadManagementSystem.Agent.StalledJobAgent.kill_delete_jobs", return_value=MagicMock())
|
|
33
32
|
|
|
34
33
|
stalledJobAgent = StalledJobAgent()
|
|
35
34
|
stalledJobAgent._AgentModule__configDefaults = mockAM
|
|
@@ -14,12 +14,14 @@ except ImportError:
|
|
|
14
14
|
|
|
15
15
|
@createClient("WorkloadManagement/JobMonitoring")
|
|
16
16
|
class JobMonitoringClient(Client):
|
|
17
|
+
# Set to None to raise an error if this service is set as "legacy adapted"
|
|
18
|
+
# See ClientSelector
|
|
19
|
+
diracxClient = None
|
|
20
|
+
|
|
17
21
|
def __init__(self, **kwargs):
|
|
18
22
|
super().__init__(**kwargs)
|
|
19
23
|
self.setServer("WorkloadManagement/JobMonitoring")
|
|
20
24
|
|
|
21
|
-
diracxClient = futureJobMonitoringClient
|
|
22
|
-
|
|
23
25
|
@ignoreEncodeWarning
|
|
24
26
|
def getJobsStatus(self, jobIDs):
|
|
25
27
|
res = self._getRPC().getJobsStatus(jobIDs)
|
|
@@ -2,12 +2,7 @@
|
|
|
2
2
|
This module contains constants and lists for the possible job states.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
-
from DIRAC import gLogger, S_OK, S_ERROR
|
|
6
5
|
from DIRAC.Core.Utilities.StateMachine import State, StateMachine
|
|
7
|
-
from DIRAC.Core.Utilities.Decorators import deprecated
|
|
8
|
-
|
|
9
|
-
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
|
|
10
|
-
|
|
11
6
|
|
|
12
7
|
#:
|
|
13
8
|
SUBMITTING = "Submitting"
|
|
@@ -98,57 +93,3 @@ class JobsStateMachine(StateMachine):
|
|
|
98
93
|
RECEIVED: State(1, [SCOUTING, CHECKING, STAGING, WAITING, FAILED, DELETED, KILLED], defState=RECEIVED),
|
|
99
94
|
SUBMITTING: State(0, [RECEIVED, CHECKING, DELETED, KILLED], defState=SUBMITTING), # initial state
|
|
100
95
|
}
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
@deprecated("Use filterJobStateTransition instead")
|
|
104
|
-
def checkJobStateTransition(jobID, candidateState, currentStatus=None, jobMonitoringClient=None):
|
|
105
|
-
"""Utility to check if a job state transition is allowed"""
|
|
106
|
-
if not currentStatus:
|
|
107
|
-
if not jobMonitoringClient:
|
|
108
|
-
from DIRAC.WorkloadManagementSystem.Client.JobMonitoringClient import JobMonitoringClient
|
|
109
|
-
|
|
110
|
-
jobMonitoringClient = JobMonitoringClient()
|
|
111
|
-
|
|
112
|
-
res = jobMonitoringClient.getJobsStatus(jobID)
|
|
113
|
-
if not res["OK"]:
|
|
114
|
-
return res
|
|
115
|
-
try:
|
|
116
|
-
currentStatus = res["Value"][jobID]["Status"]
|
|
117
|
-
except KeyError:
|
|
118
|
-
return S_ERROR("Job does not exist")
|
|
119
|
-
|
|
120
|
-
res = JobsStateMachine(currentStatus).getNextState(candidateState)
|
|
121
|
-
if not res["OK"]:
|
|
122
|
-
return res
|
|
123
|
-
|
|
124
|
-
# If the JobsStateMachine does not accept the candidate, return an ERROR
|
|
125
|
-
if candidateState != res["Value"]:
|
|
126
|
-
gLogger.error(
|
|
127
|
-
"Job Status Error",
|
|
128
|
-
f"{jobID} can't move from {currentStatus} to {candidateState}",
|
|
129
|
-
)
|
|
130
|
-
return S_ERROR("Job state transition not allowed")
|
|
131
|
-
return S_OK()
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
def filterJobStateTransition(jobIDs, candidateState):
|
|
135
|
-
"""Given a list of jobIDs, return a list that are allowed to transition
|
|
136
|
-
to the given candidate state.
|
|
137
|
-
"""
|
|
138
|
-
allowedJobs = []
|
|
139
|
-
|
|
140
|
-
if not isinstance(jobIDs, list):
|
|
141
|
-
jobIDs = [jobIDs]
|
|
142
|
-
|
|
143
|
-
res = JobMonitoringClient().getJobsStatus(jobIDs)
|
|
144
|
-
if not res["OK"]:
|
|
145
|
-
return res
|
|
146
|
-
|
|
147
|
-
for jobID in jobIDs:
|
|
148
|
-
if jobID in res["Value"]:
|
|
149
|
-
curState = res["Value"][jobID]["Status"]
|
|
150
|
-
stateRes = JobsStateMachine(curState).getNextState(candidateState)
|
|
151
|
-
if stateRes["OK"]:
|
|
152
|
-
if stateRes["Value"] == candidateState:
|
|
153
|
-
allowedJobs.append(jobID)
|
|
154
|
-
return S_OK(allowedJobs)
|
|
@@ -1,13 +1,18 @@
|
|
|
1
1
|
""" Client for the SandboxStore.
|
|
2
2
|
Will connect to the WorkloadManagement/SandboxStore service.
|
|
3
3
|
"""
|
|
4
|
+
from __future__ import annotations
|
|
4
5
|
|
|
5
6
|
import hashlib
|
|
6
7
|
import os
|
|
7
8
|
import re
|
|
8
9
|
import tarfile
|
|
9
10
|
import tempfile
|
|
11
|
+
from contextlib import contextmanager
|
|
10
12
|
from io import BytesIO, StringIO
|
|
13
|
+
from typing import Literal
|
|
14
|
+
|
|
15
|
+
import zstandard
|
|
11
16
|
|
|
12
17
|
from DIRAC import S_ERROR, S_OK, gLogger
|
|
13
18
|
from DIRAC.ConfigurationSystem.Client.Helpers.Registry import getVOForGroup
|
|
@@ -18,6 +23,25 @@ from DIRAC.Core.Utilities.ReturnValues import returnSingleResult
|
|
|
18
23
|
from DIRAC.Resources.Storage.StorageElement import StorageElement
|
|
19
24
|
|
|
20
25
|
|
|
26
|
+
@contextmanager
|
|
27
|
+
def ZstdCompatibleTarFile(tarFileName: os.PathLike, *, mode: Literal["r"] = "r"):
|
|
28
|
+
"""Context manager to extend tarfile.open to support zstd compressed files.
|
|
29
|
+
|
|
30
|
+
This is only needed for Python <=3.13.
|
|
31
|
+
"""
|
|
32
|
+
with open(tarFileName, "rb") as f:
|
|
33
|
+
magic = f.read(4)
|
|
34
|
+
# Read magic bytes to determine compression format
|
|
35
|
+
if magic.startswith(b"\x28\xb5\x2f\xfd"): # zstd magic number
|
|
36
|
+
dctx = zstandard.ZstdDecompressor()
|
|
37
|
+
with dctx.stream_reader(f) as decompressor:
|
|
38
|
+
with tarfile.open(fileobj=decompressor, mode=f"{mode}|") as tf:
|
|
39
|
+
yield tf
|
|
40
|
+
else:
|
|
41
|
+
with tarfile.open(name=tarFileName, mode=mode) as tf:
|
|
42
|
+
yield tf
|
|
43
|
+
|
|
44
|
+
|
|
21
45
|
class SandboxStoreClient:
|
|
22
46
|
__validSandboxTypes = ("Input", "Output")
|
|
23
47
|
|
|
@@ -192,7 +216,7 @@ class SandboxStoreClient:
|
|
|
192
216
|
|
|
193
217
|
try:
|
|
194
218
|
sandboxSize = 0
|
|
195
|
-
with
|
|
219
|
+
with ZstdCompatibleTarFile(tarFileName, mode="r") as tf:
|
|
196
220
|
for tarinfo in tf:
|
|
197
221
|
tf.extract(tarinfo, path=destinationDir)
|
|
198
222
|
sandboxSize += tarinfo.size
|
|
@@ -2,11 +2,10 @@
|
|
|
2
2
|
methods necessary to communicate with the Workload Management System
|
|
3
3
|
"""
|
|
4
4
|
import os
|
|
5
|
-
from io import StringIO
|
|
6
5
|
import time
|
|
6
|
+
from io import StringIO
|
|
7
7
|
|
|
8
|
-
from DIRAC import
|
|
9
|
-
|
|
8
|
+
from DIRAC import S_ERROR, S_OK, gLogger
|
|
10
9
|
from DIRAC.ConfigurationSystem.Client.Helpers.Operations import Operations
|
|
11
10
|
from DIRAC.Core.Utilities import File
|
|
12
11
|
from DIRAC.Core.Utilities.ClassAd.ClassAdLight import ClassAd
|
|
@@ -77,17 +77,11 @@ class JobStateUpdateClient(FutureClient):
|
|
|
77
77
|
def setJobAttribute(self, jobID: str | int, attribute: str, value: str):
|
|
78
78
|
with DiracXClient() as api:
|
|
79
79
|
if attribute == "Status":
|
|
80
|
-
api.jobs.set_job_statuses(
|
|
80
|
+
return api.jobs.set_job_statuses(
|
|
81
81
|
{jobID: {datetime.now(tz=timezone.utc): {"Status": value}}},
|
|
82
82
|
)
|
|
83
83
|
else:
|
|
84
|
-
api.jobs.patch_metadata({jobID: {attribute: value}})
|
|
85
|
-
|
|
86
|
-
@stripValueIfOK
|
|
87
|
-
@convertToReturnValue
|
|
88
|
-
def setJobFlag(self, jobID: str | int, flag: str):
|
|
89
|
-
with DiracXClient() as api:
|
|
90
|
-
api.jobs.patch_metadata({jobID: {flag: True}})
|
|
84
|
+
return api.jobs.patch_metadata({jobID: {attribute: value}})
|
|
91
85
|
|
|
92
86
|
@stripValueIfOK
|
|
93
87
|
@convertToReturnValue
|
|
@@ -151,12 +145,6 @@ class JobStateUpdateClient(FutureClient):
|
|
|
151
145
|
updates = {job_id: {k: v} for job_id, (k, v) in jobsParameterDict.items()}
|
|
152
146
|
api.jobs.patch_metadata(updates)
|
|
153
147
|
|
|
154
|
-
@stripValueIfOK
|
|
155
|
-
@convertToReturnValue
|
|
156
|
-
def unsetJobFlag(self, jobID: str | int, flag: str):
|
|
157
|
-
with DiracXClient() as api:
|
|
158
|
-
api.jobs.patch_metadata({jobID: {flag: False}})
|
|
159
|
-
|
|
160
148
|
@stripValueIfOK
|
|
161
149
|
@convertToReturnValue
|
|
162
150
|
def updateJobFromStager(self, jobID: str | int, status: str):
|