DIRAC 9.0.14__py3-none-any.whl → 9.0.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- DIRAC/ConfigurationSystem/Client/CSAPI.py +11 -0
- DIRAC/Core/Tornado/Client/private/TornadoBaseClient.py +1 -1
- DIRAC/Core/Utilities/CGroups2.py +1 -0
- DIRAC/Core/Utilities/ElasticSearchDB.py +1 -1
- DIRAC/Core/Utilities/MySQL.py +51 -25
- DIRAC/DataManagementSystem/Client/DataManager.py +7 -10
- DIRAC/DataManagementSystem/Client/FTS3Job.py +12 -3
- DIRAC/FrameworkSystem/Service/SystemAdministratorHandler.py +41 -11
- DIRAC/Interfaces/API/Dirac.py +12 -4
- DIRAC/Interfaces/API/Job.py +62 -17
- DIRAC/RequestManagementSystem/private/RequestTask.py +2 -1
- DIRAC/Resources/Catalog/FileCatalogClient.py +18 -7
- DIRAC/Resources/Catalog/Utilities.py +3 -3
- DIRAC/Resources/Computing/BatchSystems/SLURM.py +1 -1
- DIRAC/Resources/Computing/BatchSystems/TimeLeft/TimeLeft.py +3 -1
- DIRAC/Resources/Computing/ComputingElement.py +39 -34
- DIRAC/Resources/Computing/InProcessComputingElement.py +20 -7
- DIRAC/Resources/Computing/PoolComputingElement.py +76 -37
- DIRAC/Resources/Computing/SingularityComputingElement.py +19 -9
- DIRAC/Resources/Computing/test/Test_InProcessComputingElement.py +69 -8
- DIRAC/Resources/Computing/test/Test_PoolComputingElement.py +102 -35
- DIRAC/Resources/Storage/GFAL2_StorageBase.py +9 -0
- DIRAC/TransformationSystem/Agent/TransformationAgent.py +12 -13
- DIRAC/WorkloadManagementSystem/Client/JobReport.py +10 -6
- DIRAC/WorkloadManagementSystem/Client/JobState/JobState.py +12 -3
- DIRAC/WorkloadManagementSystem/Client/Matcher.py +18 -24
- DIRAC/WorkloadManagementSystem/DB/TaskQueueDB.py +137 -7
- DIRAC/WorkloadManagementSystem/Executor/JobScheduling.py +8 -14
- DIRAC/WorkloadManagementSystem/Executor/test/Test_Executor.py +3 -5
- DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapper.py +2 -2
- DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapper.py +1 -1
- DIRAC/WorkloadManagementSystem/Service/JobManagerHandler.py +7 -1
- DIRAC/WorkloadManagementSystem/Utilities/JobParameters.py +81 -2
- DIRAC/WorkloadManagementSystem/Utilities/PilotCStoJSONSynchronizer.py +7 -6
- DIRAC/WorkloadManagementSystem/Utilities/QueueUtilities.py +5 -5
- DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py +2 -1
- DIRAC/WorkloadManagementSystem/Utilities/Utils.py +21 -4
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_RemoteRunner.py +7 -3
- DIRAC/WorkloadManagementSystem/scripts/dirac_wms_get_wn_parameters.py +3 -3
- DIRAC/__init__.py +1 -1
- DIRAC/tests/Utilities/testJobDefinitions.py +57 -20
- {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/METADATA +2 -2
- {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/RECORD +47 -47
- {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/WHEEL +0 -0
- {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/entry_points.txt +0 -0
- {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/licenses/LICENSE +0 -0
- {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/top_level.txt +0 -0
DIRAC/Resources/Computing/ComputingElement.py

@@ -1,40 +1,40 @@
-"""
-
+"""The Computing Element class is a base class for all the various
+types CEs. It serves several purposes:

-
-
-
-
+- collects general CE related parameters to generate CE description
+  for the job matching
+- provides logic for evaluation of the number of available CPU slots
+- provides logic for the proxy renewal while executing jobs

-
-
+The CE parameters are collected from the following sources, in hierarchy
+descending order:

-
-
-
-
-
-
-
+- parameters provided through setParameters() method of the class
+- parameters in /LocalSite configuration section
+- parameters in /LocalSite/<ceName>/ResourceDict configuration section
+- parameters in /LocalSite/ResourceDict configuration section
+- parameters in /LocalSite/<ceName> configuration section
+- parameters in /Resources/Computing/<ceName> configuration section
+- parameters in /Resources/Computing/CEDefaults configuration section

-
-
+The ComputingElement objects are usually instantiated with the help of
+ComputingElementFactory.

-
-
+The ComputingElement class can be considered abstract. 3 kinds of abstract ComputingElements
+can be distinguished from it:

-
-
-
-
-
-
-
+- Remote ComputingElement: includes methods to interact with a remote ComputingElement
+  (e.g. HtCondorCEComputingElement, AREXComputingElement).
+- Inner ComputingElement: includes methods to locally interact with an underlying worker node.
+  It is worth noting that an Inner ComputingElement provides synchronous submission
+  (the submission of a job is blocking the execution until its completion). It deals with one job at a time.
+- Inner Pool ComputingElement: includes methods to locally interact with Inner ComputingElements asynchronously.
+  It can manage a pool of jobs running simultaneously.

-
+To configure the use of Tokens for CEs:

-
-
+* the CE is able to receive any token. Validation: 'Tag = Token' should be included in the CE parameters.
+* the CE is able to receive VO-specifc tokens. Validation: 'Tag = Token:<VO>' should be included in the CE parameters.


 """

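The rewritten docstring above lists the CE parameter sources in descending priority. As a side note, that merge can be pictured as successive dict updates applied from the lowest-priority source to the highest; the sketch below is illustrative only (the helper name and sample values are made up, this is not DIRAC code):

```python
# Illustrative sketch only: CE parameters merged in the precedence order listed
# in the docstring above, lowest priority first, so later updates win.
def merge_ce_parameters(sources_low_to_high):
    """sources_low_to_high: iterable of dicts, lowest-priority source first."""
    merged = {}
    for source in sources_low_to_high:
        merged.update(source)  # a higher-priority source overwrites earlier values
    return merged


ce_defaults = {"NumberOfProcessors": 1, "MaxRAM": 0}  # e.g. /Resources/Computing/CEDefaults
local_site = {"MaxRAM": 4096}                         # e.g. /LocalSite/<ceName>
set_parameters = {"NumberOfProcessors": 8}            # e.g. values passed to setParameters()
print(merge_ce_parameters([ce_defaults, local_site, set_parameters]))
# {'NumberOfProcessors': 8, 'MaxRAM': 4096}
```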
@@ -50,9 +50,10 @@ from DIRAC.FrameworkSystem.Client.ProxyManagerClient import gProxyManager
 from DIRAC.WorkloadManagementSystem.Utilities.JobParameters import (
     getNumberOfGPUs,
     getNumberOfProcessors,
+    getAvailableRAM,
 )

-INTEGER_PARAMETERS = ["CPUTime", "NumberOfProcessors", "NumberOfPayloadProcessors", "MaxRAM"]
+INTEGER_PARAMETERS = ["CPUTime", "CPUNormalizationFactor", "NumberOfProcessors", "NumberOfPayloadProcessors", "MaxRAM"]
 FLOAT_PARAMETERS = ["WaitingToRunningRatio"]
 LIST_PARAMETERS = ["Tag", "RequiredTag"]
 WAITING_TO_RUNNING_RATIO = 0.5
@@ -211,12 +212,14 @@ class ComputingElement:
         generalCEDict.update(self.ceParameters)
         self.ceParameters = generalCEDict

-        # If NumberOfProcessors/GPUs is present in the description but is equal to zero
+        # If NumberOfProcessors/GPUs/MaxRAM is present in the description but is equal to zero
         # interpret it as needing local evaluation
-        if self.ceParameters.get("NumberOfProcessors", -1) == 0:
+        if int(self.ceParameters.get("NumberOfProcessors", -1)) == 0:
             self.ceParameters["NumberOfProcessors"] = getNumberOfProcessors()
-        if self.ceParameters.get("NumberOfGPUs", -1) == 0:
+        if int(self.ceParameters.get("NumberOfGPUs", -1)) == 0:
             self.ceParameters["NumberOfGPUs"] = getNumberOfGPUs()
+        if int(self.ceParameters.get("MaxRAM", 0)) == 0:
+            self.ceParameters["MaxRAM"] = getAvailableRAM()

         for key in ceOptions:
             if key in INTEGER_PARAMETERS:
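getAvailableRAM() is imported from DIRAC/WorkloadManagementSystem/Utilities/JobParameters.py (also modified in this release, but not shown in this excerpt). A probe of this kind is commonly implemented on Linux by reading MemTotal from /proc/meminfo; the sketch below shows the general technique under that assumption and is not the actual DIRAC helper:

```python
# Illustrative sketch only: what a getAvailableRAM()-style probe typically looks
# like on Linux. The real helper lives in JobParameters.py and may differ.
import os


def get_available_ram_mb():
    """Return the machine's total memory in MB, or 0 if it cannot be determined."""
    try:
        with open("/proc/meminfo") as meminfo:
            for line in meminfo:
                if line.startswith("MemTotal:"):
                    kib = int(line.split()[1])  # value is reported in kB
                    return kib // 1024
    except OSError:
        pass
    # Portable fallback via sysconf, where available
    try:
        return os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES") // (1024 * 1024)
    except (ValueError, OSError, AttributeError):
        return 0
```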
@@ -252,6 +255,7 @@ class ComputingElement:
         runningJobs = result["RunningJobs"]
         waitingJobs = result["WaitingJobs"]
         availableProcessors = result.get("AvailableProcessors")
+
         ceInfoDict = dict(result)

         maxTotalJobs = int(self.ceParameters.get("MaxTotalJobs", 0))
@@ -404,6 +408,7 @@ class ComputingElement:
         result = self.getCEStatus()
         if result["OK"]:
             ceDict["NumberOfProcessors"] = result.get("AvailableProcessors", result.get("NumberOfProcessors", 1))
+            ceDict["MaxRAM"] = result.get("AvailableRAM", result.get("MaxRAM", 1024))
         else:
             self.log.error(
                 "Failure getting CE status", "(we keep going without the number of waiting and running pilots/jobs)"
@@ -450,7 +455,7 @@ def getCEConfigDict(section: str) -> dict:
     ceOptions = result["Value"]
     for key in ceOptions:
         if key in INTEGER_PARAMETERS:
-            ceOptions[key] = int(ceOptions[key])
+            ceOptions[key] = int(float(ceOptions[key]))
         if key in FLOAT_PARAMETERS:
             ceOptions[key] = float(ceOptions[key])
         if key in LIST_PARAMETERS:
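The switch to int(float(...)) matters because configuration values arrive as strings, and with CPUNormalizationFactor now listed among the integer parameters a value written as "23.7" would make a bare int() raise. A quick illustration of the underlying Python behaviour:

```python
# Why int(float(...)): a string such as "23.7" (e.g. a CPUNormalizationFactor
# value coming from the configuration) cannot be parsed by int() directly.
value = "23.7"
try:
    print(int(value))
except ValueError as exc:
    print(f"int() alone fails: {exc}")  # invalid literal for int() with base 10: '23.7'

print(int(float(value)))  # 23 (truncates towards zero)
```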
DIRAC/Resources/Computing/InProcessComputingElement.py

@@ -1,15 +1,15 @@
-"""
+"""The simplest of the "inner" CEs (meaning it's used by a jobAgent inside a pilot)

-
-
+A "InProcess" CE instance submits jobs in the current process.
+This is the standard "inner CE" invoked from the JobAgent, main alternative being the PoolCE
 """
+
 import os
 import stat

-from DIRAC import
-from DIRAC.Core.Utilities.ThreadScheduler import gThreadScheduler
+from DIRAC import S_ERROR, S_OK
 from DIRAC.Core.Utilities.CGroups2 import CG2Manager
-
+from DIRAC.Core.Utilities.ThreadScheduler import gThreadScheduler
 from DIRAC.Resources.Computing.ComputingElement import ComputingElement

@@ -21,7 +21,8 @@ class InProcessComputingElement(ComputingElement):
         self.submittedJobs = 0
         self.runningJobs = 0

-        self.processors =
+        self.processors = 1
+        self.maxRAM = 0
         self.ceParameters["MaxTotalJobs"] = 1

     def submitJob(self, executableFile, proxy=None, inputs=None, **kwargs):
@@ -33,6 +34,16 @@ class InProcessComputingElement(ComputingElement):
         :param list inputs: dependencies of executableFile
         :return: S_OK(payload exit code) / S_ERROR() if submission issue
         """
+        self.processors = int(self.ceParameters.get("NumberOfProcessors", self.processors))
+        self.maxRAM = int(self.ceParameters.get("MaxRAM", self.maxRAM))
+
+        if "numberOfProcessors" in kwargs:
+            if self.processors < int(kwargs["numberOfProcessors"]):
+                return S_ERROR("Requesting processors not available")
+        if "MaxRAM" in kwargs:
+            if self.maxRAM < int(kwargs["MaxRAM"]):
+                return S_ERROR("Requesting RAM not available")
+
         payloadEnv = dict(os.environ)
         payloadProxy = ""
         renewTask = None
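The new guards make InProcessComputingElement refuse, up front, a payload that asks for more than the slot advertises, returning a DIRAC S_ERROR instead of raising. A hedged usage sketch (the CE name, parameter values and payload path are invented for illustration):

```python
# Hedged usage sketch: how a caller sees the new guards react.
from DIRAC.Resources.Computing.InProcessComputingElement import InProcessComputingElement

ce = InProcessComputingElement("InProcessCE")
ce.ceParameters["NumberOfProcessors"] = 2
ce.ceParameters["MaxRAM"] = 2048  # MB

# Asking for more than the slot advertises is now refused before execution starts
result = ce.submitJob("payload.sh", None, numberOfProcessors=4, MaxRAM=4096)
if not result["OK"]:
    print(result["Message"])  # e.g. "Requesting processors not available"
```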
@@ -118,4 +129,6 @@ class InProcessComputingElement(ComputingElement):
         result["WaitingJobs"] = 0
         # processors
         result["AvailableProcessors"] = self.processors
+        # RAM
+        result["AvailableRAM"] = self.maxRAM
         return result
DIRAC/Resources/Computing/PoolComputingElement.py

@@ -1,4 +1,4 @@
-"""
+"""The Pool Computing Element is an "inner" CE (meaning it's used by a jobAgent inside a pilot)

 It's used running several jobs simultaneously in separate processes, managed by a ProcessPool.

@@ -10,7 +10,7 @@ LocalCEType:
     LocalCEType = Pool

 The Pool Computing Element is specific: it embeds an additional "inner" CE
-(`InProcess` by default,
+(`InProcess` by default, or `Singularity`). The "inner" CE can be specified such as::

     LocalCEType = Pool/Singularity

@@ -19,24 +19,19 @@ NumberOfProcessors:

 **Code Documentation**
 """
-
-import os
+
 import concurrent.futures
+import functools

-from DIRAC import
+from DIRAC import S_ERROR, S_OK
 from DIRAC.ConfigurationSystem.private.ConfigurationData import ConfigurationData
-
 from DIRAC.Resources.Computing.ComputingElement import ComputingElement
-
 from DIRAC.Resources.Computing.InProcessComputingElement import InProcessComputingElement
 from DIRAC.Resources.Computing.SingularityComputingElement import SingularityComputingElement

-# Number of unix users to run job payloads with sudo
-MAX_NUMBER_OF_SUDO_UNIX_USERS = 32
-

-def executeJob(executableFile, proxy, taskID, inputs, **kwargs):
-    """wrapper around ce.submitJob: decides which CE to use (
+def executeJob(executableFile, proxy, taskID, inputs, innerCEParameters, **kwargs):
+    """wrapper around ce.submitJob: decides which inner CE to use (InProcess or Singularity)

     :param str executableFile: location of the executable file
     :param str proxy: proxy file location to be used for job submission
@@ -52,6 +47,9 @@ def executeJob(executableFile, proxy, taskID, inputs, **kwargs):
     else:
         ce = InProcessComputingElement("Task-" + str(taskID))

+    # adding the number of processors to use and the RAM
+    ce.ceParameters["NumberOfProcessors"] = innerCEParameters["NumberOfProcessors"]
+    ce.ceParameters["MaxRAM"] = innerCEParameters["MaxRAM"]
     return ce.submitJob(executableFile, proxy, inputs=inputs, **kwargs)

@@ -66,7 +64,8 @@ class PoolComputingElement(ComputingElement):
         self.pPool = None
         self.taskID = 0
         self.processorsPerTask = {}
-        self.
+        self.ramPerTask = {}
+        self.ram = 0 # effectively this means "no limits"

         # This CE will effectively submit to another "Inner"CE
         # (by default to the InProcess CE)
@@ -80,22 +79,14 @@ class PoolComputingElement(ComputingElement):

         self.processors = int(self.ceParameters.get("NumberOfProcessors", self.processors))
         self.ceParameters["MaxTotalJobs"] = self.processors
+        if self.ceParameters.get("MaxRAM", 0): # if there's a limit, we set it
+            self.ram = int(self.ceParameters["MaxRAM"])
         # Indicates that the submission is done asynchronously
         # The result is not immediately available
         self.ceParameters["AsyncSubmission"] = True
         self.innerCESubmissionType = self.ceParameters.get("InnerCESubmissionType", self.innerCESubmissionType)
         return S_OK()

-    def getProcessorsInUse(self):
-        """Get the number of currently allocated processor cores
-
-        :return: number of processors in use
-        """
-        processorsInUse = 0
-        for future in self.processorsPerTask:
-            processorsInUse += self.processorsPerTask[future]
-        return processorsInUse
-
     #############################################################################
     def submitJob(self, executableFile, proxy=None, inputs=None, **kwargs):
         """Method to submit job.
@@ -118,33 +109,42 @@ class PoolComputingElement(ComputingElement):
             self.taskID += 1
             return S_OK(taskID)

-
+        memoryForJob = self._getMemoryForJobs(kwargs)
+
+        if memoryForJob is None:
+            self.taskResults[self.taskID] = S_ERROR("Not enough memory for the job")
+            taskID = self.taskID
+            self.taskID += 1
+            return S_OK(taskID)
+
+        # Now persisting the job limits for later use in pilot.cfg file
         cd = ConfigurationData(loadDefaultCFG=False)
         res = cd.loadFile("pilot.cfg")
         if not res["OK"]:
             self.log.error("Could not load pilot.cfg", res["Message"])
         else:
-            # only NumberOfProcessors for now, but RAM (or other stuff) can also be added
             jobID = int(kwargs.get("jobDesc", {}).get("jobID", 0))
             cd.setOptionInCFG("/Resources/Computing/JobLimits/%d/NumberOfProcessors" % jobID, processorsForJob)
+            cd.setOptionInCFG("/Resources/Computing/JobLimits/%d/MaxRAM" % jobID, memoryForJob)
             res = cd.dumpLocalCFGToFile("pilot.cfg")
             if not res["OK"]:
                 self.log.error("Could not dump cfg to pilot.cfg", res["Message"])

+        # Define the innerCEParameters
+        innerCEParameters = {}
+        innerCEParameters["NumberOfProcessors"] = processorsForJob
+        innerCEParameters["MaxRAM"] = memoryForJob
+
         # Here we define task kwargs: adding complex objects like thread.Lock can trigger errors in the task
         taskKwargs = {"InnerCESubmissionType": self.innerCESubmissionType}
         taskKwargs["jobDesc"] = kwargs.get("jobDesc", {})
-        if self.innerCESubmissionType == "Sudo":
-            for nUser in range(MAX_NUMBER_OF_SUDO_UNIX_USERS):
-                if nUser not in self.userNumberPerTask.values():
-                    break
-            taskKwargs["NUser"] = nUser
-            if "USER" in os.environ:
-                taskKwargs["PayloadUser"] = os.environ["USER"] + f"p{str(nUser).zfill(2)}"

         # Submission
-        future = self.pPool.submit(
+        future = self.pPool.submit(
+            executeJob, executableFile, proxy, self.taskID, inputs, innerCEParameters, **taskKwargs
+        )
         self.processorsPerTask[future] = processorsForJob
+        self.ramPerTask[future] = memoryForJob
         future.add_done_callback(functools.partial(self.finalizeJob, self.taskID))

         taskID = self.taskID
@@ -154,7 +154,7 @@ class PoolComputingElement(ComputingElement):

     def _getProcessorsForJobs(self, kwargs):
         """helper function"""
-        processorsInUse = self.
+        processorsInUse = sum(self.processorsPerTask.values())
         availableProcessors = self.processors - processorsInUse

         self.log.verbose(
@@ -191,29 +191,61 @@ class PoolComputingElement(ComputingElement):

         return requestedProcessors

+    def _getMemoryForJobs(self, kwargs):
+        """helper function to get the memory that will be allocated for the job
+
+        :param kwargs: job parameters
+        :return: memory in MB or None if not enough memory
+        """
+
+        # # job requirements
+        requestedMemory = kwargs.get("MinRAM", kwargs.get("MaxRAM", 0))
+        # if there's no limit, we just let it match the maximum
+        if not self.ram:
+            return max(requestedMemory, kwargs.get("MaxRAM", 0))
+
+        # # now check what the slot can provide
+        # Do we have enough memory?
+        availableMemory = self.ram - sum(self.ramPerTask.values())
+        if availableMemory < requestedMemory:
+            return None
+
+        # if there's a MaxRAM requested, we allocate it all (if it fits),
+        # and if it doesn't, we stop here instead of using MinRAM
+        if kwargs.get("MaxRAM", 0):
+            if availableMemory >= kwargs.get("MaxRAM"):
+                requestedMemory = kwargs.get("MaxRAM")
+            else:
+                return None
+
+        return requestedMemory
+
     def finalizeJob(self, taskID, future):
         """Finalize the job by updating the process utilisation counters

         :param future: evaluating the future result
         """
         nProc = self.processorsPerTask.pop(future)
+        ram = self.ramPerTask.pop(future, None)

         result = future.result() # This would be the result of the e.g. InProcess.submitJob()
         if result["OK"]:
-            self.log.info("Task finished successfully:", f"{taskID}; {nProc} processor(s) freed")
+            self.log.info("Task finished successfully:", f"{taskID}; {nProc} processor(s) and {ram}MB freed")
         else:
             self.log.error("Task failed submission:", f"{taskID}; message: {result['Message']}")
         self.taskResults[taskID] = result

     def getCEStatus(self):
         """Method to return information on running and waiting jobs,
-
+        as well as the number of processors (used, and available),
+        and the RAM (used, and available)

         :return: dictionary of numbers of jobs per status and processors (used, and available)
         """

         result = S_OK()
         nJobs = 0
+
         for _j, value in self.processorsPerTask.items():
             if value > 0:
                 nJobs += 1
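A worked example (numbers invented) of the arbitration performed by _getMemoryForJobs above, rendered as a simplified standalone function so the outcomes are easy to follow; it is not the DIRAC method itself:

```python
# Simplified, standalone rendering of the memory arbitration added above.
def get_memory_for_job(pool_ram, ram_in_use, min_ram=0, max_ram=0):
    requested = min_ram or max_ram            # MinRAM preferred, else MaxRAM
    if not pool_ram:                          # no pool-wide limit configured
        return max(requested, max_ram)
    available = pool_ram - ram_in_use
    if available < requested:
        return None                           # not even the minimum fits
    if max_ram:                               # try to grant the full MaxRAM
        return max_ram if available >= max_ram else None
    return requested


assert get_memory_for_job(pool_ram=0, ram_in_use=0, min_ram=1500, max_ram=2000) == 2000
assert get_memory_for_job(pool_ram=4096, ram_in_use=1024, min_ram=1500, max_ram=2000) == 2000
assert get_memory_for_job(pool_ram=4096, ram_in_use=3000, min_ram=1500, max_ram=2000) is None
assert get_memory_for_job(pool_ram=4096, ram_in_use=3000, min_ram=500, max_ram=0) == 500
```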
@@ -222,9 +254,16 @@ class PoolComputingElement(ComputingElement):
         result["WaitingJobs"] = 0

         # dealing with processors
-        processorsInUse = self.
+        processorsInUse = sum(self.processorsPerTask.values())
         result["UsedProcessors"] = processorsInUse
         result["AvailableProcessors"] = self.processors - processorsInUse
+        # dealing with RAM
+        result["UsedRAM"] = sum(self.ramPerTask.values())
+        if self.ram:
+            result["AvailableRAM"] = self.ram - sum(self.ramPerTask.values())
+        else:
+            result["AvailableRAM"] = 0
+
         return result

     def getDescription(self):
DIRAC/Resources/Computing/SingularityComputingElement.py

@@ -1,16 +1,17 @@
-"""
-
-
-
+"""SingularityCE is a type of "inner" CEs
+(meaning it's used by a jobAgent inside a pilot).
+A computing element class using singularity containers,
+where Singularity is supposed to be found on the WN.

-
-
+The goal of this CE is to start the job in the container set by
+the "ContainerRoot" config option.

-
+DIRAC can be re-installed within the container.

-
-
+See the Configuration/Resources/Computing documention for details on
+where to set the option parameters.
 """
+
 import json
 import os
 import re
@@ -115,6 +116,7 @@ class SingularityComputingElement(ComputingElement):
             self.__installDIRACInContainer = False

         self.processors = int(self.ceParameters.get("NumberOfProcessors", 1))
+        self.maxRAM = int(self.ceParameters.get("MaxRAM", 0))

     @staticmethod
     def __findInstallBaseDir():
@@ -415,6 +417,12 @@ class SingularityComputingElement(ComputingElement):

         self.log.debug(f"Execute singularity command: {cmd}")
         self.log.debug(f"Execute singularity env: {self.__getEnv()}")
+        # systemCall below uses ceParameters["MemoryLimitMB"] as CG2 upper memory limit
+        # if there's a max RAM available to the job, use that
+        if self.maxRAM:
+            self.ceParameters["MemoryLimitMB"] = min(
+                self.maxRAM, int(self.ceParameters.get("MemoryLimitMB", 1024 * 1024))
+            ) # 1024 * 1024 is an arbitrary large number
         result = CG2Manager().systemCall(
             0, cmd, callbackFunction=self.sendOutput, env=self.__getEnv(), ceParameters=self.ceParameters
         )
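CG2Manager (DIRAC/Core/Utilities/CGroups2.py, also touched in this release) receives MemoryLimitMB through ceParameters and uses it as the cgroup v2 memory ceiling. As a generic illustration of how such a ceiling is enforced on a cgroup v2 host (not DIRAC's CG2Manager implementation; the path and permissions are assumptions):

```python
# Generic illustration: on a cgroup v2 host, an upper memory bound like
# MemoryLimitMB is ultimately enforced by writing to the cgroup's memory.max.
import os


def apply_memory_limit_mb(cgroup_path, limit_mb):
    """Write an upper memory bound (in bytes) into an existing cgroup v2 directory."""
    limit_bytes = int(limit_mb) * 1024 * 1024
    with open(os.path.join(cgroup_path, "memory.max"), "w") as fd:
        fd.write(str(limit_bytes))


# e.g. apply_memory_limit_mb("/sys/fs/cgroup/dirac_payload", 2048)  # requires write access
```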
@@ -451,4 +459,6 @@ class SingularityComputingElement(ComputingElement):
         result["WaitingJobs"] = 0
         # processors
         result["AvailableProcessors"] = self.processors
+        # RAM
+        result["AvailableRAM"] = self.maxRAM
         return result
DIRAC/Resources/Computing/test/Test_InProcessComputingElement.py

@@ -8,25 +8,41 @@ import shutil

 import pytest

-from DIRAC.Resources.Computing.test.Test_PoolComputingElement import jobScript, _stopJob
-from DIRAC.WorkloadManagementSystem.Utilities.Utils import createJobWrapper
-
 # sut
 from DIRAC.Resources.Computing.InProcessComputingElement import InProcessComputingElement
+from DIRAC.Resources.Computing.test.Test_PoolComputingElement import _stopJob, jobScript
+from DIRAC.WorkloadManagementSystem.Utilities.Utils import createJobWrapper


 @pytest.mark.slow
-
+@pytest.mark.parametrize(
+    "ce_parameters, available_processors, ram",
+    [
+        ({}, 1, 0),
+        ({"NumberOfProcessors": 8}, 8, 0),
+        ({"MaxRAM": 2048}, 1, 2048),
+        ({"NumberOfProcessors": 8, "MaxRAM": 2048}, 8, 2048),
+    ],
+)
+def test_submitJob(ce_parameters, available_processors, ram):
+    # initialization
+    ce = InProcessComputingElement("InProcessCE")
+    ce.ceParameters = ce_parameters
+
+    # simple
     with open("testJob.py", "w") as execFile:
         execFile.write(jobScript % "1")
     os.chmod("testJob.py", 0o755)

-    ce = InProcessComputingElement("InProcessCE")
     res = ce.submitJob("testJob.py", None)
     assert res["OK"] is True
     res = ce.getCEStatus()
     assert res["OK"] is True
     assert res["SubmittedJobs"] == 1
+    assert res["RunningJobs"] == 0
+    assert res["WaitingJobs"] == 0
+    assert res["AvailableProcessors"] == available_processors
+    assert res["AvailableRAM"] == ram
     _stopJob(1)
     for ff in ["testJob.py", "stop_job_2", "job.info", "std.out"]:
         if os.path.isfile(ff):
@@ -50,21 +66,66 @@ def test_submitJob():
     res = ce.submitJob(
         wrapperFile,
         proxy=None,
-        numberOfProcessors=
-        maxNumberOfProcessors=
+        numberOfProcessors=available_processors,
+        maxNumberOfProcessors=available_processors,
         wholeNode=False,
         mpTag=True,
+        MinRAM=ram,
+        MaxRAM=ram,
         jobDesc={"jobParams": jobParams, "resourceParams": resourceParams, "optimizerParams": optimizerParams},
     )
     assert res["OK"] is True
+    _stopJob(2)

     res = ce.getCEStatus()
     assert res["OK"] is True
     assert res["SubmittedJobs"] == 2
+    assert res["RunningJobs"] == 0
+    assert res["WaitingJobs"] == 0
+    assert res["AvailableProcessors"] == available_processors
+    assert res["AvailableRAM"] == ram

-    _stopJob(2)
     for ff in ["testJob.py", "stop_job_2", "job.info", "std.out"]:
         if os.path.isfile(ff):
             os.remove(ff)
     if os.path.isdir("job"):
         shutil.rmtree("job")
+
+    # failing
+    with open("testJob.py", "w") as execFile:
+        execFile.write(jobScript % "3")
+    os.chmod("testJob.py", 0o755)
+
+    jobParams = {"JobType": "User", "Executable": "testJob.py"}
+    resourceParams = {"GridCE": "some_CE"}
+    optimizerParams = {}
+
+    wrapperFile = createJobWrapper(
+        jobID=3, jobParams=jobParams, resourceParams=resourceParams, optimizerParams=optimizerParams, logLevel="DEBUG"
+    )["Value"][
+        "JobExecutablePath"
+    ] # This is not under test, assuming it works fine
+
+    res = ce.submitJob(
+        wrapperFile,
+        proxy=None,
+        numberOfProcessors=4 + available_processors,
+        maxNumberOfProcessors=8 + available_processors,
+        wholeNode=False,
+        mpTag=True,
+        MinRAM=2500,
+        MaxRAM=4000,
+        jobDesc={"jobParams": jobParams, "resourceParams": resourceParams, "optimizerParams": optimizerParams},
+    )
+    assert res["OK"] is False
+    res = ce.getCEStatus()
+    assert res["OK"] is True
+    assert res["SubmittedJobs"] == 2
+    assert res["RunningJobs"] == 0
+    assert res["WaitingJobs"] == 0
+    assert res["AvailableProcessors"] == available_processors
+    assert res["AvailableRAM"] == ram
+    _stopJob(1)
+    for ff in ["testJob.py", "stop_job_3", "job.info", "std.out"]:
+        if os.path.isfile(ff):
+            os.remove(ff)