DIRAC 9.0.13__py3-none-any.whl → 9.0.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53)
  1. DIRAC/ConfigurationSystem/Client/CSAPI.py +11 -0
  2. DIRAC/Core/Utilities/CGroups2.py +1 -0
  3. DIRAC/Core/Utilities/ElasticSearchDB.py +1 -1
  4. DIRAC/Core/Utilities/MySQL.py +51 -25
  5. DIRAC/DataManagementSystem/Client/DataManager.py +7 -10
  6. DIRAC/DataManagementSystem/Client/FTS3Job.py +12 -3
  7. DIRAC/FrameworkSystem/Service/SystemAdministratorHandler.py +41 -11
  8. DIRAC/Interfaces/API/Dirac.py +12 -4
  9. DIRAC/Interfaces/API/Job.py +62 -17
  10. DIRAC/RequestManagementSystem/private/RequestTask.py +2 -1
  11. DIRAC/Resources/Catalog/FileCatalogClient.py +18 -7
  12. DIRAC/Resources/Catalog/Utilities.py +3 -3
  13. DIRAC/Resources/Computing/BatchSystems/SLURM.py +1 -1
  14. DIRAC/Resources/Computing/BatchSystems/TimeLeft/TimeLeft.py +3 -1
  15. DIRAC/Resources/Computing/ComputingElement.py +39 -34
  16. DIRAC/Resources/Computing/InProcessComputingElement.py +20 -7
  17. DIRAC/Resources/Computing/PoolComputingElement.py +76 -37
  18. DIRAC/Resources/Computing/SingularityComputingElement.py +19 -9
  19. DIRAC/Resources/Computing/test/Test_InProcessComputingElement.py +69 -8
  20. DIRAC/Resources/Computing/test/Test_PoolComputingElement.py +102 -35
  21. DIRAC/Resources/Storage/GFAL2_StorageBase.py +9 -0
  22. DIRAC/TransformationSystem/Agent/TransformationAgent.py +12 -13
  23. DIRAC/WorkloadManagementSystem/Agent/JobCleaningAgent.py +1 -1
  24. DIRAC/WorkloadManagementSystem/Agent/PilotSyncAgent.py +4 -3
  25. DIRAC/WorkloadManagementSystem/Agent/StalledJobAgent.py +1 -1
  26. DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_JobAgent.py +4 -3
  27. DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_PilotLoggingAgent.py +3 -3
  28. DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_PilotStatusAgent.py +4 -2
  29. DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_PushJobAgent.py +5 -4
  30. DIRAC/WorkloadManagementSystem/Agent/test/Test_Agent_StalledJobAgent.py +4 -2
  31. DIRAC/WorkloadManagementSystem/Client/JobReport.py +10 -6
  32. DIRAC/WorkloadManagementSystem/Client/JobState/JobState.py +12 -3
  33. DIRAC/WorkloadManagementSystem/Client/Matcher.py +18 -24
  34. DIRAC/WorkloadManagementSystem/DB/TaskQueueDB.py +137 -7
  35. DIRAC/WorkloadManagementSystem/Executor/JobScheduling.py +8 -14
  36. DIRAC/WorkloadManagementSystem/Executor/test/Test_Executor.py +3 -5
  37. DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapper.py +4 -5
  38. DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperOfflineTemplate.py +1 -1
  39. DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapperTemplate.py +1 -2
  40. DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapper.py +1 -1
  41. DIRAC/WorkloadManagementSystem/Utilities/JobParameters.py +81 -2
  42. DIRAC/WorkloadManagementSystem/Utilities/QueueUtilities.py +5 -5
  43. DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py +2 -1
  44. DIRAC/WorkloadManagementSystem/Utilities/test/Test_RemoteRunner.py +7 -3
  45. DIRAC/WorkloadManagementSystem/scripts/dirac_wms_get_wn_parameters.py +3 -3
  46. DIRAC/__init__.py +1 -1
  47. DIRAC/tests/Utilities/testJobDefinitions.py +57 -20
  48. {dirac-9.0.13.dist-info → dirac-9.0.15.dist-info}/METADATA +2 -2
  49. {dirac-9.0.13.dist-info → dirac-9.0.15.dist-info}/RECORD +53 -53
  50. {dirac-9.0.13.dist-info → dirac-9.0.15.dist-info}/WHEEL +0 -0
  51. {dirac-9.0.13.dist-info → dirac-9.0.15.dist-info}/entry_points.txt +0 -0
  52. {dirac-9.0.13.dist-info → dirac-9.0.15.dist-info}/licenses/LICENSE +0 -0
  53. {dirac-9.0.13.dist-info → dirac-9.0.15.dist-info}/top_level.txt +0 -0
DIRAC/Resources/Computing/ComputingElement.py

@@ -1,40 +1,40 @@
- """ The Computing Element class is a base class for all the various
- types CEs. It serves several purposes:
+ """The Computing Element class is a base class for all the various
+ types CEs. It serves several purposes:

- - collects general CE related parameters to generate CE description
- for the job matching
- - provides logic for evaluation of the number of available CPU slots
- - provides logic for the proxy renewal while executing jobs
+ - collects general CE related parameters to generate CE description
+ for the job matching
+ - provides logic for evaluation of the number of available CPU slots
+ - provides logic for the proxy renewal while executing jobs

- The CE parameters are collected from the following sources, in hierarchy
- descending order:
+ The CE parameters are collected from the following sources, in hierarchy
+ descending order:

- - parameters provided through setParameters() method of the class
- - parameters in /LocalSite configuration section
- - parameters in /LocalSite/<ceName>/ResourceDict configuration section
- - parameters in /LocalSite/ResourceDict configuration section
- - parameters in /LocalSite/<ceName> configuration section
- - parameters in /Resources/Computing/<ceName> configuration section
- - parameters in /Resources/Computing/CEDefaults configuration section
+ - parameters provided through setParameters() method of the class
+ - parameters in /LocalSite configuration section
+ - parameters in /LocalSite/<ceName>/ResourceDict configuration section
+ - parameters in /LocalSite/ResourceDict configuration section
+ - parameters in /LocalSite/<ceName> configuration section
+ - parameters in /Resources/Computing/<ceName> configuration section
+ - parameters in /Resources/Computing/CEDefaults configuration section

- The ComputingElement objects are usually instantiated with the help of
- ComputingElementFactory.
+ The ComputingElement objects are usually instantiated with the help of
+ ComputingElementFactory.

- The ComputingElement class can be considered abstract. 3 kinds of abstract ComputingElements
- can be distinguished from it:
+ The ComputingElement class can be considered abstract. 3 kinds of abstract ComputingElements
+ can be distinguished from it:

- - Remote ComputingElement: includes methods to interact with a remote ComputingElement
- (e.g. HtCondorCEComputingElement, AREXComputingElement).
- - Inner ComputingElement: includes methods to locally interact with an underlying worker node.
- It is worth noting that an Inner ComputingElement provides synchronous submission
- (the submission of a job is blocking the execution until its completion). It deals with one job at a time.
- - Inner Pool ComputingElement: includes methods to locally interact with Inner ComputingElements asynchronously.
- It can manage a pool of jobs running simultaneously.
+ - Remote ComputingElement: includes methods to interact with a remote ComputingElement
+ (e.g. HtCondorCEComputingElement, AREXComputingElement).
+ - Inner ComputingElement: includes methods to locally interact with an underlying worker node.
+ It is worth noting that an Inner ComputingElement provides synchronous submission
+ (the submission of a job is blocking the execution until its completion). It deals with one job at a time.
+ - Inner Pool ComputingElement: includes methods to locally interact with Inner ComputingElements asynchronously.
+ It can manage a pool of jobs running simultaneously.

- To configure the use of Tokens for CEs:
+ To configure the use of Tokens for CEs:

- * the CE is able to receive any token. Validation: 'Tag = Token' should be included in the CE parameters.
- * the CE is able to receive VO-specifc tokens. Validation: 'Tag = Token:<VO>' should be included in the CE parameters.
+ * the CE is able to receive any token. Validation: 'Tag = Token' should be included in the CE parameters.
+ * the CE is able to receive VO-specifc tokens. Validation: 'Tag = Token:<VO>' should be included in the CE parameters.

  """

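The two token bullet points in the docstring above are easiest to read with a concrete value in mind. A minimal sketch, not taken from this diff: "myvo" is a made-up VO name, and the dict below only mirrors the 'Tag = Token' / 'Tag = Token:<VO>' configuration the docstring describes ("Tag" is indeed a list parameter, see LIST_PARAMETERS in the next hunk).

    # Illustrative only: a CE description advertising that it accepts tokens.
    ceParameters = {
        "Tag": ["Token", "Token:myvo"],  # any token, plus myvo-specific tokens
        "NumberOfProcessors": 8,
        "MaxRAM": 16384,  # MB
    }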
@@ -50,9 +50,10 @@ from DIRAC.FrameworkSystem.Client.ProxyManagerClient import gProxyManager
  from DIRAC.WorkloadManagementSystem.Utilities.JobParameters import (
  getNumberOfGPUs,
  getNumberOfProcessors,
+ getAvailableRAM,
  )

- INTEGER_PARAMETERS = ["CPUTime", "NumberOfProcessors", "NumberOfPayloadProcessors", "MaxRAM"]
+ INTEGER_PARAMETERS = ["CPUTime", "CPUNormalizationFactor", "NumberOfProcessors", "NumberOfPayloadProcessors", "MaxRAM"]
  FLOAT_PARAMETERS = ["WaitingToRunningRatio"]
  LIST_PARAMETERS = ["Tag", "RequiredTag"]
  WAITING_TO_RUNNING_RATIO = 0.5
@@ -211,12 +212,14 @@ class ComputingElement:
  generalCEDict.update(self.ceParameters)
  self.ceParameters = generalCEDict

- # If NumberOfProcessors/GPUs is present in the description but is equal to zero
+ # If NumberOfProcessors/GPUs/MaxRAM is present in the description but is equal to zero
  # interpret it as needing local evaluation
- if self.ceParameters.get("NumberOfProcessors", -1) == 0:
+ if int(self.ceParameters.get("NumberOfProcessors", -1)) == 0:
  self.ceParameters["NumberOfProcessors"] = getNumberOfProcessors()
- if self.ceParameters.get("NumberOfGPUs", -1) == 0:
+ if int(self.ceParameters.get("NumberOfGPUs", -1)) == 0:
  self.ceParameters["NumberOfGPUs"] = getNumberOfGPUs()
+ if int(self.ceParameters.get("MaxRAM", 0)) == 0:
+ self.ceParameters["MaxRAM"] = getAvailableRAM()

  for key in ceOptions:
  if key in INTEGER_PARAMETERS:
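As with NumberOfProcessors and NumberOfGPUs, a MaxRAM of zero now means "probe the worker node locally", via the newly imported getAvailableRAM. Its implementation lives in DIRAC/WorkloadManagementSystem/Utilities/JobParameters.py (changed in this release but not shown in this excerpt); the sketch below is only a hypothetical illustration of what such a probe typically looks like on Linux, and may differ from the real helper.

    # Hypothetical sketch of a worker-node RAM probe (not the actual getAvailableRAM).
    def probe_worker_node_ram_mb() -> int:
        """Return the worker node's total memory in MB, read from /proc/meminfo."""
        with open("/proc/meminfo") as meminfo:
            for line in meminfo:
                if line.startswith("MemTotal:"):
                    return int(line.split()[1]) // 1024  # kB -> MB
        return 0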
@@ -252,6 +255,7 @@ class ComputingElement:
  runningJobs = result["RunningJobs"]
  waitingJobs = result["WaitingJobs"]
  availableProcessors = result.get("AvailableProcessors")
+
  ceInfoDict = dict(result)

  maxTotalJobs = int(self.ceParameters.get("MaxTotalJobs", 0))
@@ -404,6 +408,7 @@
  result = self.getCEStatus()
  if result["OK"]:
  ceDict["NumberOfProcessors"] = result.get("AvailableProcessors", result.get("NumberOfProcessors", 1))
+ ceDict["MaxRAM"] = result.get("AvailableRAM", result.get("MaxRAM", 1024))
  else:
  self.log.error(
  "Failure getting CE status", "(we keep going without the number of waiting and running pilots/jobs)"
@@ -450,7 +455,7 @@ def getCEConfigDict(section: str) -> dict:
  ceOptions = result["Value"]
  for key in ceOptions:
  if key in INTEGER_PARAMETERS:
- ceOptions[key] = int(ceOptions[key])
+ ceOptions[key] = int(float(ceOptions[key]))
  if key in FLOAT_PARAMETERS:
  ceOptions[key] = float(ceOptions[key])
  if key in LIST_PARAMETERS:
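The getCEConfigDict change wraps the integer conversion in float() first, so configuration values that arrive as strings with a decimal point no longer raise ValueError; presumably this goes with adding CPUNormalizationFactor (typically a non-integer benchmark value) to INTEGER_PARAMETERS. A quick illustration with made-up values:

    # Why int(float(...)): CS values arrive as strings and may carry a decimal point.
    int("3600")           # 3600
    int(float("3600.0"))  # 3600
    int(float("11.4"))    # 11 -- plain int("11.4") would raise ValueError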
DIRAC/Resources/Computing/InProcessComputingElement.py

@@ -1,15 +1,15 @@
- """ The simplest of the "inner" CEs (meaning it's used by a jobAgent inside a pilot)
+ """The simplest of the "inner" CEs (meaning it's used by a jobAgent inside a pilot)

- A "InProcess" CE instance submits jobs in the current process.
- This is the standard "inner CE" invoked from the JobAgent, main alternative being the PoolCE
+ A "InProcess" CE instance submits jobs in the current process.
+ This is the standard "inner CE" invoked from the JobAgent, main alternative being the PoolCE
  """
+
  import os
  import stat

- from DIRAC import S_OK, S_ERROR
- from DIRAC.Core.Utilities.ThreadScheduler import gThreadScheduler
+ from DIRAC import S_ERROR, S_OK
  from DIRAC.Core.Utilities.CGroups2 import CG2Manager
-
+ from DIRAC.Core.Utilities.ThreadScheduler import gThreadScheduler
  from DIRAC.Resources.Computing.ComputingElement import ComputingElement


@@ -21,7 +21,8 @@ class InProcessComputingElement(ComputingElement):
  self.submittedJobs = 0
  self.runningJobs = 0

- self.processors = int(self.ceParameters.get("NumberOfProcessors", 1))
+ self.processors = 1
+ self.maxRAM = 0
  self.ceParameters["MaxTotalJobs"] = 1

  def submitJob(self, executableFile, proxy=None, inputs=None, **kwargs):
@@ -33,6 +34,16 @@
  :param list inputs: dependencies of executableFile
  :return: S_OK(payload exit code) / S_ERROR() if submission issue
  """
+ self.processors = int(self.ceParameters.get("NumberOfProcessors", self.processors))
+ self.maxRAM = int(self.ceParameters.get("MaxRAM", self.maxRAM))
+
+ if "numberOfProcessors" in kwargs:
+ if self.processors < int(kwargs["numberOfProcessors"]):
+ return S_ERROR("Requesting processors not available")
+ if "MaxRAM" in kwargs:
+ if self.maxRAM < int(kwargs["MaxRAM"]):
+ return S_ERROR("Requesting RAM not available")
+
  payloadEnv = dict(os.environ)
  payloadProxy = ""
  renewTask = None
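With this hunk, submitJob on the InProcess CE refreshes its processor and RAM limits from ceParameters at every call and rejects payloads whose requirements exceed them before anything is executed. A minimal sketch of the resulting behaviour, modeled on the test changes later in this diff; "payload.sh" is a placeholder executable, not something from the package:

    from DIRAC.Resources.Computing.InProcessComputingElement import InProcessComputingElement

    ce = InProcessComputingElement("InProcessCE")
    ce.ceParameters = {"NumberOfProcessors": 4, "MaxRAM": 2048}

    # Requests beyond the advertised capacity are refused up front.
    res = ce.submitJob("payload.sh", None, numberOfProcessors=8)
    assert not res["OK"]  # "Requesting processors not available"

    res = ce.submitJob("payload.sh", None, MaxRAM=4096)
    assert not res["OK"]  # "Requesting RAM not available"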
@@ -118,4 +129,6 @@
  result["WaitingJobs"] = 0
  # processors
  result["AvailableProcessors"] = self.processors
+ # RAM
+ result["AvailableRAM"] = self.maxRAM
  return result
DIRAC/Resources/Computing/PoolComputingElement.py

@@ -1,4 +1,4 @@
- """ The Pool Computing Element is an "inner" CE (meaning it's used by a jobAgent inside a pilot)
+ """The Pool Computing Element is an "inner" CE (meaning it's used by a jobAgent inside a pilot)

  It's used running several jobs simultaneously in separate processes, managed by a ProcessPool.

@@ -10,7 +10,7 @@ LocalCEType:
  LocalCEType = Pool

  The Pool Computing Element is specific: it embeds an additional "inner" CE
- (`InProcess` by default, `Sudo`, `Singularity`). The "inner" CE can be specified such as::
+ (`InProcess` by default, or `Singularity`). The "inner" CE can be specified such as::

  LocalCEType = Pool/Singularity

@@ -19,24 +19,19 @@ NumberOfProcessors:

  **Code Documentation**
  """
- import functools
- import os
+
  import concurrent.futures
+ import functools

- from DIRAC import S_OK, S_ERROR
+ from DIRAC import S_ERROR, S_OK
  from DIRAC.ConfigurationSystem.private.ConfigurationData import ConfigurationData
-
  from DIRAC.Resources.Computing.ComputingElement import ComputingElement
-
  from DIRAC.Resources.Computing.InProcessComputingElement import InProcessComputingElement
  from DIRAC.Resources.Computing.SingularityComputingElement import SingularityComputingElement

- # Number of unix users to run job payloads with sudo
- MAX_NUMBER_OF_SUDO_UNIX_USERS = 32
-

- def executeJob(executableFile, proxy, taskID, inputs, **kwargs):
- """wrapper around ce.submitJob: decides which CE to use (Sudo or InProcess or Singularity)
+ def executeJob(executableFile, proxy, taskID, inputs, innerCEParameters, **kwargs):
+ """wrapper around ce.submitJob: decides which inner CE to use (InProcess or Singularity)

  :param str executableFile: location of the executable file
  :param str proxy: proxy file location to be used for job submission
@@ -52,6 +47,9 @@ def executeJob(executableFile, proxy, taskID, inputs, **kwargs):
  else:
  ce = InProcessComputingElement("Task-" + str(taskID))

+ # adding the number of processors to use and the RAM
+ ce.ceParameters["NumberOfProcessors"] = innerCEParameters["NumberOfProcessors"]
+ ce.ceParameters["MaxRAM"] = innerCEParameters["MaxRAM"]
  return ce.submitJob(executableFile, proxy, inputs=inputs, **kwargs)


@@ -66,7 +64,8 @@
  self.pPool = None
  self.taskID = 0
  self.processorsPerTask = {}
- self.userNumberPerTask = {}
+ self.ramPerTask = {}
+ self.ram = 0  # effectively this means "no limits"

  # This CE will effectively submit to another "Inner"CE
  # (by default to the InProcess CE)
@@ -80,22 +79,14 @@

  self.processors = int(self.ceParameters.get("NumberOfProcessors", self.processors))
  self.ceParameters["MaxTotalJobs"] = self.processors
+ if self.ceParameters.get("MaxRAM", 0):  # if there's a limit, we set it
+ self.ram = int(self.ceParameters["MaxRAM"])
  # Indicates that the submission is done asynchronously
  # The result is not immediately available
  self.ceParameters["AsyncSubmission"] = True
  self.innerCESubmissionType = self.ceParameters.get("InnerCESubmissionType", self.innerCESubmissionType)
  return S_OK()

- def getProcessorsInUse(self):
- """Get the number of currently allocated processor cores
-
- :return: number of processors in use
- """
- processorsInUse = 0
- for future in self.processorsPerTask:
- processorsInUse += self.processorsPerTask[future]
- return processorsInUse
-
  #############################################################################
  def submitJob(self, executableFile, proxy=None, inputs=None, **kwargs):
  """Method to submit job.
@@ -118,33 +109,42 @@
  self.taskID += 1
  return S_OK(taskID)

- # Now persisting the job limits for later use in pilot.cfg file (pilot 3 default)
+ memoryForJob = self._getMemoryForJobs(kwargs)
+
+ if memoryForJob is None:
+ self.taskResults[self.taskID] = S_ERROR("Not enough memory for the job")
+ taskID = self.taskID
+ self.taskID += 1
+ return S_OK(taskID)
+
+ # Now persisting the job limits for later use in pilot.cfg file
  cd = ConfigurationData(loadDefaultCFG=False)
  res = cd.loadFile("pilot.cfg")
  if not res["OK"]:
  self.log.error("Could not load pilot.cfg", res["Message"])
  else:
- # only NumberOfProcessors for now, but RAM (or other stuff) can also be added
  jobID = int(kwargs.get("jobDesc", {}).get("jobID", 0))
  cd.setOptionInCFG("/Resources/Computing/JobLimits/%d/NumberOfProcessors" % jobID, processorsForJob)
+ cd.setOptionInCFG("/Resources/Computing/JobLimits/%d/MaxRAM" % jobID, memoryForJob)
  res = cd.dumpLocalCFGToFile("pilot.cfg")
  if not res["OK"]:
  self.log.error("Could not dump cfg to pilot.cfg", res["Message"])

+ # Define the innerCEParameters
+ innerCEParameters = {}
+ innerCEParameters["NumberOfProcessors"] = processorsForJob
+ innerCEParameters["MaxRAM"] = memoryForJob
+
  # Here we define task kwargs: adding complex objects like thread.Lock can trigger errors in the task
  taskKwargs = {"InnerCESubmissionType": self.innerCESubmissionType}
  taskKwargs["jobDesc"] = kwargs.get("jobDesc", {})
- if self.innerCESubmissionType == "Sudo":
- for nUser in range(MAX_NUMBER_OF_SUDO_UNIX_USERS):
- if nUser not in self.userNumberPerTask.values():
- break
- taskKwargs["NUser"] = nUser
- if "USER" in os.environ:
- taskKwargs["PayloadUser"] = os.environ["USER"] + f"p{str(nUser).zfill(2)}"

  # Submission
- future = self.pPool.submit(executeJob, executableFile, proxy, self.taskID, inputs, **taskKwargs)
+ future = self.pPool.submit(
+ executeJob, executableFile, proxy, self.taskID, inputs, innerCEParameters, **taskKwargs
+ )
  self.processorsPerTask[future] = processorsForJob
+ self.ramPerTask[future] = memoryForJob
  future.add_done_callback(functools.partial(self.finalizeJob, self.taskID))

  taskID = self.taskID
@@ -154,7 +154,7 @@

  def _getProcessorsForJobs(self, kwargs):
  """helper function"""
- processorsInUse = self.getProcessorsInUse()
+ processorsInUse = sum(self.processorsPerTask.values())
  availableProcessors = self.processors - processorsInUse

  self.log.verbose(
@@ -191,29 +191,61 @@

  return requestedProcessors

+ def _getMemoryForJobs(self, kwargs):
+ """helper function to get the memory that will be allocated for the job
+
+ :param kwargs: job parameters
+ :return: memory in MB or None if not enough memory
+ """
+
+ # # job requirements
+ requestedMemory = kwargs.get("MinRAM", kwargs.get("MaxRAM", 0))
+ # if there's no limit, we just let it match the maximum
+ if not self.ram:
+ return max(requestedMemory, kwargs.get("MaxRAM", 0))
+
+ # # now check what the slot can provide
+ # Do we have enough memory?
+ availableMemory = self.ram - sum(self.ramPerTask.values())
+ if availableMemory < requestedMemory:
+ return None
+
+ # if there's a MaxRAM requested, we allocate it all (if it fits),
+ # and if it doesn't, we stop here instead of using MinRAM
+ if kwargs.get("MaxRAM", 0):
+ if availableMemory >= kwargs.get("MaxRAM"):
+ requestedMemory = kwargs.get("MaxRAM")
+ else:
+ return None
+
+ return requestedMemory
+
  def finalizeJob(self, taskID, future):
  """Finalize the job by updating the process utilisation counters

  :param future: evaluating the future result
  """
  nProc = self.processorsPerTask.pop(future)
+ ram = self.ramPerTask.pop(future, None)

  result = future.result()  # This would be the result of the e.g. InProcess.submitJob()
  if result["OK"]:
- self.log.info("Task finished successfully:", f"{taskID}; {nProc} processor(s) freed")
+ self.log.info("Task finished successfully:", f"{taskID}; {nProc} processor(s) and {ram}MB freed")
  else:
  self.log.error("Task failed submission:", f"{taskID}; message: {result['Message']}")
  self.taskResults[taskID] = result

  def getCEStatus(self):
  """Method to return information on running and waiting jobs,
- as well as the number of processors (used, and available).
+ as well as the number of processors (used, and available),
+ and the RAM (used, and available)

  :return: dictionary of numbers of jobs per status and processors (used, and available)


  result = S_OK()
  nJobs = 0
+
  for _j, value in self.processorsPerTask.items():
  if value > 0:
  nJobs += 1
@@ -222,9 +254,16 @@
  result["WaitingJobs"] = 0

  # dealing with processors
- processorsInUse = self.getProcessorsInUse()
+ processorsInUse = sum(self.processorsPerTask.values())
  result["UsedProcessors"] = processorsInUse
  result["AvailableProcessors"] = self.processors - processorsInUse
+ # dealing with RAM
+ result["UsedRAM"] = sum(self.ramPerTask.values())
+ if self.ram:
+ result["AvailableRAM"] = self.ram - sum(self.ramPerTask.values())
+ else:
+ result["AvailableRAM"] = 0
+
  return result

  def getDescription(self):
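The RAM bookkeeping added above mirrors the existing processor bookkeeping: _getMemoryForJobs decides how many MB to book for each task, submitJob records the booking in ramPerTask, and getCEStatus now reports UsedRAM/AvailableRAM. A compact, hedged re-statement of the allocation rules with made-up pool limits and bookings (not code from the package):

    def memory_for_job(pool_limit, booked, requirements):
        """Illustrative re-statement of _getMemoryForJobs; requirements is a dict
        that may carry MinRAM and/or MaxRAM (MB), like the submitJob kwargs."""
        requested = requirements.get("MinRAM", requirements.get("MaxRAM", 0))
        if not pool_limit:                    # self.ram == 0: no pool-level limit
            return max(requested, requirements.get("MaxRAM", 0))
        available = pool_limit - sum(booked)  # what the slot can still provide
        if available < requested:
            return None                       # not enough memory for the job
        max_ram = requirements.get("MaxRAM", 0)
        if max_ram:                           # book the full MaxRAM if it fits,
            return max_ram if available >= max_ram else None  # else refuse
        return requested

    assert memory_for_job(0, [], {"MinRAM": 2048}) == 2048              # unlimited pool
    assert memory_for_job(4096, [1024], {"MinRAM": 1000, "MaxRAM": 2048}) == 2048
    assert memory_for_job(4096, [1024], {"MinRAM": 2500, "MaxRAM": 4000}) is None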
DIRAC/Resources/Computing/SingularityComputingElement.py

@@ -1,16 +1,17 @@
- """ SingularityCE is a type of "inner" CEs
- (meaning it's used by a jobAgent inside a pilot).
- A computing element class using singularity containers,
- where Singularity is supposed to be found on the WN.
+ """SingularityCE is a type of "inner" CEs
+ (meaning it's used by a jobAgent inside a pilot).
+ A computing element class using singularity containers,
+ where Singularity is supposed to be found on the WN.

- The goal of this CE is to start the job in the container set by
- the "ContainerRoot" config option.
+ The goal of this CE is to start the job in the container set by
+ the "ContainerRoot" config option.

- DIRAC can be re-installed within the container.
+ DIRAC can be re-installed within the container.

- See the Configuration/Resources/Computing documention for details on
- where to set the option parameters.
+ See the Configuration/Resources/Computing documention for details on
+ where to set the option parameters.
  """
+
  import json
  import os
  import re
@@ -115,6 +116,7 @@
  self.__installDIRACInContainer = False

  self.processors = int(self.ceParameters.get("NumberOfProcessors", 1))
+ self.maxRAM = int(self.ceParameters.get("MaxRAM", 0))

  @staticmethod
  def __findInstallBaseDir():
@@ -415,6 +417,12 @@

  self.log.debug(f"Execute singularity command: {cmd}")
  self.log.debug(f"Execute singularity env: {self.__getEnv()}")
+ # systemCall below uses ceParameters["MemoryLimitMB"] as CG2 upper memory limit
+ # if there's a max RAM available to the job, use that
+ if self.maxRAM:
+ self.ceParameters["MemoryLimitMB"] = min(
+ self.maxRAM, self.ceParameters.get("MemoryLimitMB", 1024 * 1024)
+ )  # 1024 * 1024 is an arbitrary large number
  result = CG2Manager().systemCall(
  0, cmd, callbackFunction=self.sendOutput, env=self.__getEnv(), ceParameters=self.ceParameters
  )
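The clamp above in numbers, with illustrative values only: the tighter of the booked RAM and the configured limit wins.

    max_ram = 2048            # RAM booked for this job (MB)
    configured_limit = 8192   # ceParameters["MemoryLimitMB"], if set
    memory_limit_mb = min(max_ram, configured_limit)  # -> 2048 passed on via ceParameters
    # With no configured MemoryLimitMB, the 1024 * 1024 fallback acts as "no limit",
    # so the booked MaxRAM effectively becomes the cgroup memory ceiling.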
@@ -451,4 +459,6 @@
  result["WaitingJobs"] = 0
  # processors
  result["AvailableProcessors"] = self.processors
+ # RAM
+ result["AvailableRAM"] = self.maxRAM
  return result
DIRAC/Resources/Computing/test/Test_InProcessComputingElement.py

@@ -8,25 +8,41 @@ import shutil

  import pytest

- from DIRAC.Resources.Computing.test.Test_PoolComputingElement import jobScript, _stopJob
- from DIRAC.WorkloadManagementSystem.Utilities.Utils import createJobWrapper
-
  # sut
  from DIRAC.Resources.Computing.InProcessComputingElement import InProcessComputingElement
+ from DIRAC.Resources.Computing.test.Test_PoolComputingElement import _stopJob, jobScript
+ from DIRAC.WorkloadManagementSystem.Utilities.Utils import createJobWrapper


  @pytest.mark.slow
- def test_submitJob():
+ @pytest.mark.parametrize(
+ "ce_parameters, available_processors, ram",
+ [
+ ({}, 1, 0),
+ ({"NumberOfProcessors": 8}, 8, 0),
+ ({"MaxRAM": 2048}, 1, 2048),
+ ({"NumberOfProcessors": 8, "MaxRAM": 2048}, 8, 2048),
+ ],
+ )
+ def test_submitJob(ce_parameters, available_processors, ram):
+ # initialization
+ ce = InProcessComputingElement("InProcessCE")
+ ce.ceParameters = ce_parameters
+
+ # simple
  with open("testJob.py", "w") as execFile:
  execFile.write(jobScript % "1")
  os.chmod("testJob.py", 0o755)

- ce = InProcessComputingElement("InProcessCE")
  res = ce.submitJob("testJob.py", None)
  assert res["OK"] is True
  res = ce.getCEStatus()
  assert res["OK"] is True
  assert res["SubmittedJobs"] == 1
+ assert res["RunningJobs"] == 0
+ assert res["WaitingJobs"] == 0
+ assert res["AvailableProcessors"] == available_processors
+ assert res["AvailableRAM"] == ram
  _stopJob(1)
  for ff in ["testJob.py", "stop_job_2", "job.info", "std.out"]:
  if os.path.isfile(ff):
@@ -50,21 +66,66 @@ def test_submitJob():
  res = ce.submitJob(
  wrapperFile,
  proxy=None,
- numberOfProcessors=4,
- maxNumberOfProcessors=8,
+ numberOfProcessors=available_processors,
+ maxNumberOfProcessors=available_processors,
  wholeNode=False,
  mpTag=True,
+ MinRAM=ram,
+ MaxRAM=ram,
  jobDesc={"jobParams": jobParams, "resourceParams": resourceParams, "optimizerParams": optimizerParams},
  )
  assert res["OK"] is True
+ _stopJob(2)

  res = ce.getCEStatus()
  assert res["OK"] is True
  assert res["SubmittedJobs"] == 2
+ assert res["RunningJobs"] == 0
+ assert res["WaitingJobs"] == 0
+ assert res["AvailableProcessors"] == available_processors
+ assert res["AvailableRAM"] == ram

- _stopJob(2)
  for ff in ["testJob.py", "stop_job_2", "job.info", "std.out"]:
  if os.path.isfile(ff):
  os.remove(ff)
  if os.path.isdir("job"):
  shutil.rmtree("job")
+
+ # failing
+ with open("testJob.py", "w") as execFile:
+ execFile.write(jobScript % "3")
+ os.chmod("testJob.py", 0o755)
+
+ jobParams = {"JobType": "User", "Executable": "testJob.py"}
+ resourceParams = {"GridCE": "some_CE"}
+ optimizerParams = {}
+
+ wrapperFile = createJobWrapper(
+ jobID=3, jobParams=jobParams, resourceParams=resourceParams, optimizerParams=optimizerParams, logLevel="DEBUG"
+ )["Value"][
+ "JobExecutablePath"
+ ]  # This is not under test, assuming it works fine
+
+ res = ce.submitJob(
+ wrapperFile,
+ proxy=None,
+ numberOfProcessors=4 + available_processors,
+ maxNumberOfProcessors=8 + available_processors,
+ wholeNode=False,
+ mpTag=True,
+ MinRAM=2500,
+ MaxRAM=4000,
+ jobDesc={"jobParams": jobParams, "resourceParams": resourceParams, "optimizerParams": optimizerParams},
+ )
+ assert res["OK"] is False
+ res = ce.getCEStatus()
+ assert res["OK"] is True
+ assert res["SubmittedJobs"] == 2
+ assert res["RunningJobs"] == 0
+ assert res["WaitingJobs"] == 0
+ assert res["AvailableProcessors"] == available_processors
+ assert res["AvailableRAM"] == ram
+ _stopJob(1)
+ for ff in ["testJob.py", "stop_job_3", "job.info", "std.out"]:
+ if os.path.isfile(ff):
+ os.remove(ff)