DIRAC 9.0.14__py3-none-any.whl → 9.0.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. DIRAC/ConfigurationSystem/Client/CSAPI.py +11 -0
  2. DIRAC/Core/Tornado/Client/private/TornadoBaseClient.py +1 -1
  3. DIRAC/Core/Utilities/CGroups2.py +1 -0
  4. DIRAC/Core/Utilities/ElasticSearchDB.py +1 -1
  5. DIRAC/Core/Utilities/MySQL.py +51 -25
  6. DIRAC/DataManagementSystem/Client/DataManager.py +7 -10
  7. DIRAC/DataManagementSystem/Client/FTS3Job.py +12 -3
  8. DIRAC/FrameworkSystem/Service/SystemAdministratorHandler.py +41 -11
  9. DIRAC/Interfaces/API/Dirac.py +12 -4
  10. DIRAC/Interfaces/API/Job.py +62 -17
  11. DIRAC/RequestManagementSystem/private/RequestTask.py +2 -1
  12. DIRAC/Resources/Catalog/FileCatalogClient.py +18 -7
  13. DIRAC/Resources/Catalog/Utilities.py +3 -3
  14. DIRAC/Resources/Computing/BatchSystems/SLURM.py +1 -1
  15. DIRAC/Resources/Computing/BatchSystems/TimeLeft/TimeLeft.py +3 -1
  16. DIRAC/Resources/Computing/ComputingElement.py +39 -34
  17. DIRAC/Resources/Computing/InProcessComputingElement.py +20 -7
  18. DIRAC/Resources/Computing/PoolComputingElement.py +76 -37
  19. DIRAC/Resources/Computing/SingularityComputingElement.py +19 -9
  20. DIRAC/Resources/Computing/test/Test_InProcessComputingElement.py +69 -8
  21. DIRAC/Resources/Computing/test/Test_PoolComputingElement.py +102 -35
  22. DIRAC/Resources/Storage/GFAL2_StorageBase.py +9 -0
  23. DIRAC/TransformationSystem/Agent/TransformationAgent.py +12 -13
  24. DIRAC/WorkloadManagementSystem/Client/JobReport.py +10 -6
  25. DIRAC/WorkloadManagementSystem/Client/JobState/JobState.py +12 -3
  26. DIRAC/WorkloadManagementSystem/Client/Matcher.py +18 -24
  27. DIRAC/WorkloadManagementSystem/DB/TaskQueueDB.py +137 -7
  28. DIRAC/WorkloadManagementSystem/Executor/JobScheduling.py +8 -14
  29. DIRAC/WorkloadManagementSystem/Executor/test/Test_Executor.py +3 -5
  30. DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapper.py +2 -2
  31. DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapper.py +1 -1
  32. DIRAC/WorkloadManagementSystem/Service/JobManagerHandler.py +7 -1
  33. DIRAC/WorkloadManagementSystem/Utilities/JobParameters.py +81 -2
  34. DIRAC/WorkloadManagementSystem/Utilities/PilotCStoJSONSynchronizer.py +7 -6
  35. DIRAC/WorkloadManagementSystem/Utilities/QueueUtilities.py +5 -5
  36. DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py +2 -1
  37. DIRAC/WorkloadManagementSystem/Utilities/Utils.py +21 -4
  38. DIRAC/WorkloadManagementSystem/Utilities/test/Test_RemoteRunner.py +7 -3
  39. DIRAC/WorkloadManagementSystem/scripts/dirac_wms_get_wn_parameters.py +3 -3
  40. DIRAC/__init__.py +1 -1
  41. DIRAC/tests/Utilities/testJobDefinitions.py +57 -20
  42. {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/METADATA +2 -2
  43. {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/RECORD +47 -47
  44. {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/WHEEL +0 -0
  45. {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/entry_points.txt +0 -0
  46. {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/licenses/LICENSE +0 -0
  47. {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/top_level.txt +0 -0
DIRAC/WorkloadManagementSystem/DB/TaskQueueDB.py CHANGED
@@ -1,5 +1,5 @@
- """ TaskQueueDB class is a front-end to the task queues db
- """
+ """TaskQueueDB class is a front-end to the task queues db"""
+
  import random
  import string
  from collections import defaultdict
@@ -22,12 +22,13 @@ TQ_MIN_SHARE = 0.001
  # For checks at insertion time, and not only
  singleValueDefFields = ("Owner", "OwnerGroup", "CPUTime")
  multiValueDefFields = ("Sites", "GridCEs", "BannedSites", "Platforms", "JobTypes", "Tags")
+ rangeValueDefFields = ("MinRAM", "MaxRAM")
 
  # Used for matching
  multiValueMatchFields = ("GridCE", "Site", "Platform", "JobType", "Tag")
  bannedJobMatchFields = ("Site",)
  mandatoryMatchFields = ("CPUTime",)
- priorityIgnoredFields = ("Sites", "BannedSites")
+ priorityIgnoredFields = ("Sites", "BannedSites", "MinRAM", "MaxRAM")
 
 
  def _lowerAndRemovePunctuation(s):
@@ -129,6 +130,16 @@ class TaskQueueDB(DB):
              "ForeignKeys": {"TQId": "tq_TaskQueues.TQId"},
          }
 
+         self.__tablesDesc["tq_RAM_requirements"] = {
+             "Fields": {
+                 "TQId": "INTEGER(11) UNSIGNED NOT NULL",
+                 "MinRAM": "INTEGER UNSIGNED NOT NULL DEFAULT 0",
+                 "MaxRAM": "INTEGER UNSIGNED NOT NULL DEFAULT 0",
+             },
+             "PrimaryKey": "TQId",
+             "ForeignKeys": {"TQId": "tq_TaskQueues.TQId"},
+         }
+
          for multiField in multiValueDefFields:
              tableName = f"tq_TQTo{multiField}"
              self.__tablesDesc[tableName] = {
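
For orientation, the table description above translates into roughly the following DDL. This is a sketch only: the actual statement is generated by DIRAC's DB layer from the Fields/PrimaryKey/ForeignKeys dictionary, so engine, charset and constraint names may differ.

CREATE TABLE `tq_RAM_requirements` (
  `TQId` INTEGER(11) UNSIGNED NOT NULL,
  `MinRAM` INTEGER UNSIGNED NOT NULL DEFAULT 0,
  `MaxRAM` INTEGER UNSIGNED NOT NULL DEFAULT 0,
  PRIMARY KEY (`TQId`),
  FOREIGN KEY (`TQId`) REFERENCES `tq_TaskQueues` (`TQId`)
);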
@@ -206,6 +217,20 @@ class TaskQueueDB(DB):
                  return result
              tqDefDict[field] = result["Value"]
 
+         # Check range value fields (RAM requirements)
+         for field in rangeValueDefFields:
+             if field not in tqDefDict:
+                 continue
+             if not isinstance(tqDefDict[field], int):
+                 return S_ERROR(f"Range value field {field} value type is not valid: {type(tqDefDict[field])}")
+             if tqDefDict[field] < 0:
+                 return S_ERROR(f"Range value field {field} must be non-negative: {tqDefDict[field]}")
+
+         # Validate that MinRAM <= MaxRAM if both are specified
+         if "MinRAM" in tqDefDict and "MaxRAM" in tqDefDict:
+             if tqDefDict["MaxRAM"] > 0 and tqDefDict["MinRAM"] > tqDefDict["MaxRAM"]:
+                 return S_ERROR(f"MinRAM ({tqDefDict['MinRAM']}) cannot be greater than MaxRAM ({tqDefDict['MaxRAM']})")
+
          return S_OK(tqDefDict)
 
      def _checkMatchDefinition(self, tqMatchDict):
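
A minimal standalone sketch of how the new checks behave, with DIRAC's S_OK/S_ERROR helpers stubbed as plain dicts for illustration (the real ones come from DIRAC.Core.Utilities.ReturnValues). Note that MaxRAM = 0 is treated as "no upper bound", so an inverted range is only rejected when MaxRAM is non-zero:

def S_OK(value=None):  # illustrative stand-in for DIRAC's helper
    return {"OK": True, "Value": value}

def S_ERROR(message):  # illustrative stand-in for DIRAC's helper
    return {"OK": False, "Message": message}

def checkRAMFields(tqDefDict):
    """Mirror of the range-value checks added in the hunk above."""
    for field in ("MinRAM", "MaxRAM"):
        if field not in tqDefDict:
            continue
        if not isinstance(tqDefDict[field], int):
            return S_ERROR(f"Range value field {field} value type is not valid: {type(tqDefDict[field])}")
        if tqDefDict[field] < 0:
            return S_ERROR(f"Range value field {field} must be non-negative: {tqDefDict[field]}")
    if "MinRAM" in tqDefDict and "MaxRAM" in tqDefDict:
        if tqDefDict["MaxRAM"] > 0 and tqDefDict["MinRAM"] > tqDefDict["MaxRAM"]:
            return S_ERROR("MinRAM cannot be greater than MaxRAM")
    return S_OK(tqDefDict)

assert checkRAMFields({"MinRAM": 2048, "MaxRAM": 4096})["OK"]      # sane range (illustrative values)
assert not checkRAMFields({"MinRAM": 4096, "MaxRAM": 2048})["OK"]  # inverted range is rejected
assert checkRAMFields({"MinRAM": 4096, "MaxRAM": 0})["OK"]         # MaxRAM=0 means "no upper bound"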
@@ -251,6 +276,13 @@
                  return S_ERROR(f"Match definition field {field} failed : {result['Message']}")
              tqMatchDict[field] = result["Value"]
 
+         # Check range value fields (RAM requirements for matching)
+         if "MaxRAM" in tqMatchDict:
+             result = travelAndCheckType(tqMatchDict["MaxRAM"], int, escapeValues=False)
+             if not result["OK"]:
+                 return S_ERROR(f"Match definition field RAM failed : {result['Message']}")
+             tqMatchDict["MaxRAM"] = result["Value"]
+
          return S_OK(tqMatchDict)
 
      def __createTaskQueue(self, tqDefDict, priority=1, connObj=False):
@@ -303,6 +335,20 @@
                  self.log.error("Failed to insert condition", f"{field} : {result['Message']}")
                  self.cleanOrphanedTaskQueues(connObj=connObj)
                  return S_ERROR(f"Can't insert values {values} for field {field}: {result['Message']}")
+
+         # Insert RAM requirements if specified and not both zero
+         if "MinRAM" in tqDefDict or "MaxRAM" in tqDefDict:
+             minRAM = tqDefDict.get("MinRAM", 0)
+             maxRAM = tqDefDict.get("MaxRAM", 0)
+             # Only insert if at least one value is non-zero (optimization: avoid unnecessary rows)
+             if minRAM > 0 or maxRAM > 0:
+                 cmd = f"INSERT INTO `tq_RAM_requirements` (TQId, MinRAM, MaxRAM) VALUES ({tqId}, {minRAM}, {maxRAM})"
+                 result = self._update(cmd, conn=connObj)
+                 if not result["OK"]:
+                     self.log.error("Failed to insert RAM requirements", result["Message"])
+                     self.cleanOrphanedTaskQueues(connObj=connObj)
+                     return S_ERROR(f"Can't insert RAM requirements: {result['Message']}")
+
          self.log.info("Created TQ", tqId)
          return S_OK(tqId)
 
@@ -327,6 +373,13 @@
              if not result["OK"]:
                  return result
 
+         # Delete RAM requirements for orphaned TQs
+         result = self._update(
+             f"DELETE FROM `tq_RAM_requirements` WHERE TQId in ( {','.join(orphanedTQs)} )", conn=connObj
+         )
+         if not result["OK"]:
+             return result
+
          result = self._update(f"DELETE FROM `tq_TaskQueues` WHERE TQId in ( {','.join(orphanedTQs)} )", conn=connObj)
          if not result["OK"]:
              return result
@@ -473,6 +526,26 @@
                  sqlCondList.append(f"{numValues} = ({secondQuery} {grouping})")
              else:
                  sqlCondList.append(f"`tq_TaskQueues`.TQId not in ( SELECT DISTINCT {tableName}.TQId from {tableName} )")
+
+         # Handle RAM requirements matching
+         hasRAMRequirements = "MinRAM" in tqDefDict or "MaxRAM" in tqDefDict
+         if hasRAMRequirements:
+             minRAM = tqDefDict.get("MinRAM", 0)
+             maxRAM = tqDefDict.get("MaxRAM", 0)
+             # Only match TQs with the same RAM requirements if at least one is non-zero
+             if minRAM > 0 or maxRAM > 0:
+                 # Match TQs that have the exact same RAM requirements
+                 sqlCondList.append(
+                     f"`tq_TaskQueues`.TQId IN ( SELECT TQId FROM `tq_RAM_requirements` "
+                     f"WHERE MinRAM = {minRAM} AND MaxRAM = {maxRAM} )"
+                 )
+             else:
+                 # Both are 0, so match TQs with no RAM requirements row
+                 sqlCondList.append("`tq_TaskQueues`.TQId NOT IN ( SELECT DISTINCT TQId FROM `tq_RAM_requirements` )")
+         else:
+             # Match TQs that have no RAM requirements
+             sqlCondList.append("`tq_TaskQueues`.TQId NOT IN ( SELECT DISTINCT TQId FROM `tq_RAM_requirements` )")
+
          # END MAGIC: That was easy ;)
          return S_OK(" AND ".join(sqlCondList))
 
@@ -722,6 +795,19 @@ WHERE `tq_Jobs`.TQId = %s ORDER BY RAND() / `tq_Jobs`.RealPriority ASC LIMIT 1"
          if "CPUTime" in tqMatchDict:
              sqlCondList.append(self.__generateSQLSubCond("tq.%s <= %%s" % "CPUTime", tqMatchDict["CPUTime"]))
 
+         # RAM matching logic
+         if "MaxRAM" in tqMatchDict:
+             ram = tqMatchDict["MaxRAM"]
+             # Join with tq_RAM_requirements table
+             sqlTables["tq_RAM_requirements"] = "ram_req"
+             # Match if:
+             # 1. No RAM requirement exists for this TQ (LEFT JOIN will give NULL)
+             # 2. OR the resource has at least MinRAM
+             # Note: MinRAM is used for matching, MaxRAM is informational for post-match scheduling
+             # A job requiring MinRAM=2GB can run on any machine with 2GB or more
+             ramCond = f"( ram_req.TQId IS NULL OR {ram} >= ram_req.MinRAM )"
+             sqlCondList.append(ramCond)
+
          tag_fv = []
 
          # Match multi value fields
@@ -844,10 +930,14 @@ WHERE `tq_Jobs`.TQId = %s ORDER BY RAND() / `tq_Jobs`.RealPriority ASC LIMIT 1"
          if negativeCond:
              sqlCondList.append(self.__generateNotSQL(negativeCond))
 
-         # Generate the final query string
-         tqSqlCmd = "SELECT tq.TQId, tq.Owner, tq.OwnerGroup FROM `tq_TaskQueues` tq WHERE %s" % (
-             " AND ".join(sqlCondList)
-         )
+         # Generate the final query string with proper JOINs
+         fromClause = "`tq_TaskQueues` tq"
+
+         # Add LEFT JOIN for RAM requirements if needed
+         if "tq_RAM_requirements" in sqlTables:
+             fromClause += " LEFT JOIN `tq_RAM_requirements` ram_req ON tq.TQId = ram_req.TQId"
+
+         tqSqlCmd = f"SELECT tq.TQId, tq.Owner, tq.OwnerGroup FROM {fromClause} WHERE {' AND '.join(sqlCondList)}"
 
          # Apply priorities
          tqSqlCmd = f"{tqSqlCmd} ORDER BY RAND() / tq.Priority ASC"
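
Taken together with the ramCond built above, a match request that carries MaxRAM now yields a query of roughly this shape (illustrative values; the real query also includes the CPUTime, multi-value and negative conditions assembled earlier):

SELECT tq.TQId, tq.Owner, tq.OwnerGroup
FROM `tq_TaskQueues` tq
LEFT JOIN `tq_RAM_requirements` ram_req ON tq.TQId = ram_req.TQId
WHERE tq.CPUTime <= 86400
  AND ( ram_req.TQId IS NULL OR 4096 >= ram_req.MinRAM )
ORDER BY RAND() / tq.Priority ASC

Task queues without a row in tq_RAM_requirements remain matchable by any resource, which keeps the change backward compatible.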
@@ -994,6 +1084,12 @@ WHERE j.JobId = %s AND t.TQId = j.TQId"
              retVal = self._update(f"DELETE FROM `tq_TQTo{mvField}` WHERE TQId = {tqId}", conn=connObj)
              if not retVal["OK"]:
                  return retVal
+
+         # Delete RAM requirements if they exist
+         retVal = self._update(f"DELETE FROM `tq_RAM_requirements` WHERE TQId = {tqId}", conn=connObj)
+         if not retVal["OK"]:
+             return retVal
+
          retVal = self._update(f"DELETE FROM `tq_TaskQueues` WHERE TQId = {tqId}", conn=connObj)
          if not retVal["OK"]:
              return retVal
@@ -1065,6 +1161,40 @@ WHERE j.JobId = %s AND t.TQId = j.TQId"
                  if field not in tqData[tqId]:
                      tqData[tqId][field] = []
                  tqData[tqId][field].append(value)
+
+         # Retrieve RAM requirements (if table exists)
+         # Note: The table should be auto-created by __initializeDB, but we check for safety
+         sqlCmd = "SELECT TQId, MinRAM, MaxRAM FROM `tq_RAM_requirements`"
+         if tqIdList is not None:
+             if tqIdList:
+                 # Only retrieve RAM requirements for specific TQIds
+                 sqlCmd += f" WHERE TQId IN ( {', '.join([str(id_) for id_ in tqIdList])} )"
+             # else: empty list was already handled earlier with fast-track return
+         retVal = self._query(sqlCmd)
+         if not retVal["OK"]:
+             # If table doesn't exist (e.g., old installation), log a warning but continue
+             # This provides backward compatibility
+             if "doesn't exist" in retVal["Message"] or "Table" in retVal["Message"]:
+                 self.log.warn("RAM requirements table not found, skipping RAM data retrieval", retVal["Message"])
+             else:
+                 self.log.error("Can't retrieve RAM requirements", retVal["Message"])
+                 return retVal
+         else:
+             for record in retVal["Value"]:
+                 tqId = record[0]
+                 minRAM = record[1]
+                 maxRAM = record[2]
+                 if tqId not in tqData:
+                     if tqIdList is None or tqId in tqIdList:
+                         self.log.verbose(
+                             "Task Queue has RAM requirements but does not exist: triggering a cleaning",
+                             f"TQID: {tqId}",
+                         )
+                         tqNeedCleaning = True
+                 else:
+                     tqData[tqId]["MinRAM"] = minRAM
+                     tqData[tqId]["MaxRAM"] = maxRAM
+
          if tqNeedCleaning:
              self.cleanOrphanedTaskQueues()
          return S_OK(tqData)
DIRAC/WorkloadManagementSystem/Executor/JobScheduling.py CHANGED
@@ -1,12 +1,12 @@
- """ The Job Scheduling Executor takes the information gained from all previous
- optimizers and makes a scheduling decision for the jobs.
+ """The Job Scheduling Executor takes the information gained from all previous
+ optimizers and makes a scheduling decision for the jobs.
 
- Subsequent to this jobs are added into a Task Queue and pilot agents can be submitted.
+ Subsequent to this jobs are added into a Task Queue and pilot agents can be submitted.
 
- All issues preventing the successful resolution of a site candidate are discovered
- here where all information is available.
+ All issues preventing the successful resolution of a site candidate are discovered
+ here where all information is available.
 
- This Executor will fail affected jobs meaningfully.
+ This Executor will fail affected jobs meaningfully.
  """
 
  import random
@@ -249,7 +249,7 @@ class JobScheduling(OptimizerExecutor):
 
          # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
          stageSite = stageSites[0]
-         self.jobLog.verbose(" Staging site will be", stageSite)
+         self.jobLog.verbose("Staging site will be", stageSite)
          stageData = idSites[stageSite]
          # Set as if everything has already been staged
          stageData["disk"] += stageData["tape"]
@@ -351,12 +351,6 @@ class JobScheduling(OptimizerExecutor):
              tagList.append("WholeNode")
              tagList.append("MultiProcessor")
 
-         # sorting out the RAM (this should be probably coded ~same as number of processors)
-         if "MaxRAM" in jobManifest:
-             maxRAM = jobManifest.getOption("MaxRAM", 0)
-             if maxRAM:
-                 tagList.append("%dGB" % maxRAM)
-
          # other tags? Just add them
          if "Tags" in jobManifest:
              tagList.extend(jobManifest.getOption("Tags", []))
@@ -391,7 +385,7 @@ class JobScheduling(OptimizerExecutor):
 
          # Job multivalue requirement keys are specified as singles in the job descriptions
          # but for backward compatibility can be also plurals
-         for key in ("JobType", "GridRequiredCEs", "GridCE", "Tags"):
+         for key in ("JobType", "GridRequiredCEs", "GridCE", "MinRAM", "MaxRAM", "Tags"):
              reqKey = key
              if key == "JobType":
                  reqKey = "JobTypes"
DIRAC/WorkloadManagementSystem/Executor/test/Test_Executor.py CHANGED
@@ -1,5 +1,5 @@
- """ pytest(s) for Executors
- """
+ """pytest(s) for Executors"""
+
  # pylint: disable=protected-access, missing-docstring
 
  from unittest.mock import MagicMock
@@ -54,9 +54,7 @@ def test__applySiteFilter(sites, banned, expected):
          ({}, []),
          ({"Tag": "bof"}, ["bof"]),
          ({"Tags": "bof, bif"}, ["bof", "bif"]),
-         ({"MaxRAM": 2}, ["2GB"]),
-         ({"Tags": "bof, bif", "MaxRAM": 2}, ["bof", "bif", "2GB"]),
-         ({"WholeNode": "yes", "MaxRAM": 2}, ["WholeNode", "MultiProcessor", "2GB"]),
+         ({"WholeNode": "yes"}, ["WholeNode", "MultiProcessor"]),
          ({"NumberOfProcessors": 1}, []),
          ({"NumberOfProcessors": 4}, ["MultiProcessor", "4Processors"]),
          ({"NumberOfProcessors": 4, "MinNumberOfProcessors": 2}, ["MultiProcessor", "4Processors"]),
DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapper.py CHANGED
@@ -546,7 +546,7 @@ class JobWrapper:
              self.__report(status=JobStatus.FAILED, minorStatus=JobMinorStatus.APP_THREAD_FAILED, sendFlag=True)
          applicationErrorStatus = "None reported"
          if payloadStatus:
-             applicationErrorStatus = payloadStatus
+             applicationErrorStatus = str(payloadStatus)
          self.__setJobParam("ApplicationError", applicationErrorStatus, sendFlag=True)
 
          # This might happen if process() and postProcess() are called on different machines
@@ -1544,7 +1544,7 @@ class JobWrapper:
      #############################################################################
      def __setJobParam(self, name, value, sendFlag=False):
          """Wraps around setJobParameter of JobReport client"""
-         jobParam = self.jobReport.setJobParameter(str(name), str(value), sendFlag)
+         jobParam = self.jobReport.setJobParameter(str(name), value, sendFlag)
          if not jobParam["OK"]:
              self.log.warn("Failed setting job parameter", jobParam["Message"])
          if self.jobID:
DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapper.py CHANGED
@@ -565,7 +565,7 @@ def test_postProcess_executor_failed_status_defined(setup_job_wrapper, mocker, m
      assert result["OK"]
      assert report_args[-1]["status"] == JobStatus.COMPLETING
      assert report_args[-1]["minorStatus"] == JobMinorStatus.APP_ERRORS
-     assert set_param_args[-3][0][1] == 126
+     assert set_param_args[-3][0][1] == "126"
 
 
  def test_postProcess_subprocess_not_complete(setup_job_wrapper, mocker, mock_report_and_set_param):
DIRAC/WorkloadManagementSystem/Service/JobManagerHandler.py CHANGED
@@ -354,7 +354,13 @@ class JobManagerHandlerMixin:
          validJobList, invalidJobList, nonauthJobList, ownerJobList = self.jobPolicy.evaluateJobRights(
              jobList, RIGHT_RESCHEDULE
          )
-         res = rescheduleJobs(validJobList, source="JobManager")
+         res = rescheduleJobs(
+             validJobList,
+             source="JobManager",
+             jobDB=self.jobDB,
+             taskQueueDB=self.taskQueueDB,
+             jobLoggingDB=self.jobLoggingDB,
+         )
          if not res["OK"]:
              self.log.error(res["Message"])
 
DIRAC/WorkloadManagementSystem/Utilities/JobParameters.py CHANGED
@@ -10,7 +10,7 @@ def getMemoryFromProc():
      meminfo = {i.split()[0].rstrip(":"): int(i.split()[1]) for i in open("/proc/meminfo").readlines()}
      maxRAM = meminfo["MemTotal"]
      if maxRAM:
-         return int(maxRAM / 1024)
+         return int(maxRAM / 1024)  # from KB to MB
 
 
  def getNumberOfProcessors(siteName=None, gridCE=None, queue=None):
@@ -57,7 +57,7 @@ def getNumberOfProcessors(siteName=None, gridCE=None, queue=None):
          return numberOfProcessors
 
      # 3) looks in CS for tags
-     gLogger.info(f"Getting tagsfor {siteName}: {gridCE}: {queue}")
+     gLogger.info(f"Getting tags for {siteName}: {gridCE}: {queue}")
      # Tags of the CE
      tags = fromChar(
          gConfig.getValue(f"/Resources/Sites/{siteName.split('.')[0]}/{siteName}/CEs/{gridCE}/Tag", "")
@@ -201,3 +201,82 @@ def getJobParameters(jobIDs: list[int], parName: str | None, vo: str = "") -> di
          if jobID not in final:
              final[jobID] = parameters[jobID]
      return S_OK(final)
+
+
+ def getAvailableRAM(siteName=None, gridCE=None, queue=None):
+     """Gets the available RAM on a certain CE/queue/node (what the pilot administers)
+
+     The siteName/gridCE/queue parameters are normally not necessary.
+
+     Tries to find it in this order:
+     1) from the /Resources/Computing/CEDefaults/MaxRAM (which is what the pilot might fill up)
+     2) if not present looks in CS for "MemoryLimitMB" Queue or CE or site option
+     3) if not present but there's WholeNode tag, look what the WN provides using _getMemoryFromProc()
+     4) return 0
+     """
+
+     # 1) from /Resources/Computing/CEDefaults/MaxRAM
+     gLogger.info("Getting MaxRAM from /Resources/Computing/CEDefaults/MaxRAM")
+     availableRAM = gConfig.getValue("/Resources/Computing/CEDefaults/MaxRAM", None)
+     if availableRAM:
+         return availableRAM
+
+     # 2) looks in CS for "MaxRAM" Queue or CE or site option
+     if not siteName:
+         siteName = gConfig.getValue("/LocalSite/Site", "")
+     if not gridCE:
+         gridCE = gConfig.getValue("/LocalSite/GridCE", "")
+     if not queue:
+         queue = gConfig.getValue("/LocalSite/CEQueue", "")
+     if not (siteName and gridCE and queue):
+         gLogger.warn("Could not find AvailableRAM: missing siteName or gridCE or queue. Returning 0")
+         return 0
+
+     grid = siteName.split(".")[0]
+     csPaths = [
+         f"/Resources/Sites/{grid}/{siteName}/CEs/{gridCE}/Queues/{queue}/MemoryLimitMB",
+         f"/Resources/Sites/{grid}/{siteName}/CEs/{gridCE}/MemoryLimitMB",
+         f"/Resources/Sites/{grid}/{siteName}/MemoryLimitMB",
+     ]
+     for csPath in csPaths:
+         gLogger.info("Looking in", csPath)
+         availableRAM = gConfig.getValue(csPath, None)
+         if availableRAM:
+             return int(availableRAM)
+
+     # 3) checks if 'WholeNode' is one of the used tags
+     # Tags of the CE
+     tags = fromChar(
+         gConfig.getValue(f"/Resources/Sites/{siteName.split('.')[0]}/{siteName}/CEs/{gridCE}/Tag", "")
+     ) + fromChar(gConfig.getValue(f"/Resources/Sites/{siteName.split('.')[0]}/{siteName}/Cloud/{gridCE}/Tag", ""))
+     # Tags of the Queue
+     tags += fromChar(
+         gConfig.getValue(f"/Resources/Sites/{siteName.split('.')[0]}/{siteName}/CEs/{gridCE}/Queues/{queue}/Tag", "")
+     ) + fromChar(
+         gConfig.getValue(f"/Resources/Sites/{siteName.split('.')[0]}/{siteName}/Cloud/{gridCE}/VMTypes/{queue}/Tag", "")
+     )
+
+     if "WholeNode" in tags:
+         gLogger.info("Found WholeNode tag, using getMemoryFromProc()")
+         return getMemoryFromProc()
+
+     # 4) return 0
+     gLogger.info("RAM limits could not be found in CS, and WholeNode tag not found")
+     return 0
+
+
+ def getRAMForJob(jobID):
+     """Gets the RAM allowed for the job.
+     This can be used to communicate to your job payload the RAM it's allowed to use,
+     so this function should be called from your extension.
+
+     If the JobAgent is using "InProcess" CE (which is the default),
+     then what's returned will basically be the same of what's returned by the getAvailableRAM() function above
+     """
+
+     # from /Resources/Computing/JobLimits/jobID/MaxRAM (set by PoolComputingElement)
+     ram = gConfig.getValue(f"Resources/Computing/JobLimits/{jobID}/MaxRAM")
+     if ram:
+         return int(ram)
+
+     return getAvailableRAM()
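
A sketch of how an extension's payload code might use the new helpers, as the getRAMForJob docstring suggests. The JOBID environment variable and the PAYLOAD_MAX_RAM_MB name are illustrative assumptions, not part of this diff:

import os
from DIRAC.WorkloadManagementSystem.Utilities import JobParameters

# Hypothetical: assume the wrapper environment exposes the job ID as JOBID
jobID = os.environ.get("JOBID")
ramMB = JobParameters.getRAMForJob(jobID) if jobID else JobParameters.getAvailableRAM()
if ramMB:
    # Hypothetical variable name: tell the payload how much RAM it may use
    os.environ["PAYLOAD_MAX_RAM_MB"] = str(ramMB)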
DIRAC/WorkloadManagementSystem/Utilities/PilotCStoJSONSynchronizer.py CHANGED
@@ -208,12 +208,13 @@ class PilotCStoJSONSynchronizer:
          if defaultSetup:
              pilotDict["DefaultSetup"] = defaultSetup
 
-         self.log.debug("From DIRAC/Configuration")
-         configurationServers = gConfig.getServersList()
-         if not includeMasterCS:
-             masterCS = gConfigurationData.getMasterServer()
-             configurationServers = exclude_master_cs_aliases(configurationServers, masterCS)
-
+         configurationServers = Operations().getValue("Pilot/OverrideConfigurationServers", [])
+         if not configurationServers:
+             self.log.debug("From DIRAC/Configuration")
+             configurationServers = gConfig.getServersList()
+             if not includeMasterCS:
+                 masterCS = gConfigurationData.getMasterServer()
+                 configurationServers = exclude_master_cs_aliases(configurationServers, masterCS)
          pilotDict["ConfigurationServers"] = configurationServers
 
          preferredURLPatterns = gConfigurationData.extractOptionFromCFG("/DIRAC/PreferredURLPatterns")
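
With this change, pilots can be pointed at an explicit list of configuration servers instead of the auto-discovered one. A hypothetical CS snippet (server URLs made up; the option sits under the usual Operations section, e.g. Defaults):

Operations
{
  Defaults
  {
    Pilot
    {
      OverrideConfigurationServers = dips://cs1.example.org:9135/Configuration/Server, dips://cs2.example.org:9135/Configuration/Server
    }
  }
}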
DIRAC/WorkloadManagementSystem/Utilities/QueueUtilities.py CHANGED
@@ -1,5 +1,5 @@
- """Utilities to help Computing Element Queues manipulation
- """
+ """Utilities to help Computing Element Queues manipulation"""
+
  import hashlib
 
  from DIRAC import S_OK, S_ERROR
@@ -222,10 +222,10 @@ def matchQueue(jobJDL, queueDict, fullMatch=False):
          return S_OK({"Match": False, "Reason": noMatchReasons[0]})
 
      # 5. RAM
-     ram = job.getAttributeInt("RAM")
+     ram = job.getAttributeInt("MaxRAM")
      # If MaxRAM is not specified in the queue description, assume 2GB
-     if ram and ram > int(queueDict.get("MaxRAM", 2048) / 1024):
-         noMatchReasons.append("Job RAM %d requirement not satisfied" % ram)
+     if ram and ram > int(queueDict.get("MaxRAM", 2048)):
+         noMatchReasons.append(f"Job RAM {ram} requirement not satisfied")
      if not fullMatch:
          return S_OK({"Match": False, "Reason": noMatchReasons[0]})
 
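The dropped /1024 means both sides of the comparison are now in the same unit (MB, matching the 2048, i.e. 2 GB, default): previously the queue's MaxRAM was converted to GB before being compared with the job's value. A worked example with made-up numbers:

# Hypothetical numbers: job asks for 3000 MB, queue advertises MaxRAM = 2048 MB
jobMaxRAM = 3000                # job.getAttributeInt("MaxRAM")
queueMaxRAM = 2048              # queueDict.get("MaxRAM", 2048), i.e. the 2 GB default
assert jobMaxRAM > queueMaxRAM  # -> noMatchReasons gets "Job RAM 3000 requirement not satisfied"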
DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py CHANGED
@@ -1,4 +1,4 @@
- """ RemoteRunner
+ """RemoteRunner
 
  RemoteRunner has been designed to send scripts/applications and input files on remote worker nodes having
  no outbound connectivity (e.g. supercomputers)
@@ -6,6 +6,7 @@ no outbound connectivity (e.g. supercomputers)
  Mostly called by workflow modules, RemoteRunner is generally the last component to get through before
  the script/application execution on a remote machine.
  """
+
  import hashlib
  import os
  import shlex
DIRAC/WorkloadManagementSystem/Utilities/Utils.py CHANGED
@@ -118,12 +118,21 @@ def createJobWrapper(
      return S_OK(generatedFiles)
 
 
- def rescheduleJobs(jobIDs: list[int], source: str = "") -> dict:
+ def rescheduleJobs(
+     jobIDs: list[int],
+     source: str = "",
+     jobDB: JobDB | None = None,
+     taskQueueDB: TaskQueueDB | None = None,
+     jobLoggingDB: JobLoggingDB | None = None,
+ ) -> dict:
      """Utility to reschedule jobs (not atomic, nor bulk)
      Requires direct access to the JobDB and TaskQueueDB
 
      :param jobIDs: list of jobIDs
      :param source: source of the reschedule
+     :param jobDB: optional JobDB instance to reuse (creates new if not provided)
+     :param taskQueueDB: optional TaskQueueDB instance to reuse (creates new if not provided)
+     :param jobLoggingDB: optional JobLoggingDB instance to reuse (creates new if not provided)
      :return: S_OK/S_ERROR
      :rtype: dict
 
@@ -131,13 +140,21 @@ def rescheduleJobs(jobIDs: list[int], source: str = "") -> dict:
 
      failedJobs = []
 
+     # Reuse provided DB instances or create new ones
+     if jobDB is None:
+         jobDB = JobDB()
+     if taskQueueDB is None:
+         taskQueueDB = TaskQueueDB()
+     if jobLoggingDB is None:
+         jobLoggingDB = JobLoggingDB()
+
      for jobID in jobIDs:
-         result = JobDB().rescheduleJob(jobID)
+         result = jobDB.rescheduleJob(jobID)
          if not result["OK"]:
              failedJobs.append(jobID)
              continue
-         TaskQueueDB().deleteJob(jobID)
-         JobLoggingDB().addLoggingRecord(
+         taskQueueDB.deleteJob(jobID)
+         jobLoggingDB.addLoggingRecord(
              result["JobID"],
              status=result["Status"],
              minorStatus=result["MinorStatus"],
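
JobManagerHandler (earlier in this diff) now threads its long-lived DB objects into the call; a caller-side sketch with made-up job IDs:

from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
from DIRAC.WorkloadManagementSystem.DB.JobLoggingDB import JobLoggingDB
from DIRAC.WorkloadManagementSystem.DB.TaskQueueDB import TaskQueueDB
from DIRAC.WorkloadManagementSystem.Utilities.Utils import rescheduleJobs

# Creating the DB clients once and passing them in avoids re-instantiating
# JobDB()/TaskQueueDB()/JobLoggingDB() for every job, as the old loop did
jobDB, tqDB, logDB = JobDB(), TaskQueueDB(), JobLoggingDB()
res = rescheduleJobs([123, 456], source="JobManager", jobDB=jobDB, taskQueueDB=tqDB, jobLoggingDB=logDB)
if not res["OK"]:
    print(res["Message"])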
@@ -1,5 +1,4 @@
- """ Test class for Job Agent
- """
+ """Test class for Job Agent"""
 
  # imports
  import pytest
@@ -60,11 +59,16 @@ def test__wrapCommand(command, workingDirectory, expectedContent):
          (1, 1, True, 1),
          (2, 2, True, 2),
          (1, 2, True, 1),
+         (
+             1,
+             0,
+             True,
+             1,
+         ),  # if ceNumberOfProcessors is 0, it will be interpreted as needing local evaluation. That will return 1.
          # CE has less processors than the payload requests
          (2, 1, False, "Not enough processors to execute the command"),
          # Specific case: we should not have 0
          (0, 1, False, "Inappropriate NumberOfProcessors value"),
-         (1, 0, False, "Inappropriate NumberOfProcessors value"),
          (-4, 1, False, "Inappropriate NumberOfProcessors value"),
          (1, -4, False, "Inappropriate NumberOfProcessors value"),
          (0, 0, False, "Inappropriate NumberOfProcessors value"),
DIRAC/WorkloadManagementSystem/scripts/dirac_wms_get_wn_parameters.py CHANGED
@@ -39,12 +39,12 @@ def main():
      gLogger.info("Getting number of processors")
      numberOfProcessor = JobParameters.getNumberOfProcessors(Site, ceName, Queue)
 
-     gLogger.info("Getting memory (RAM)")
-     maxRAM = JobParameters.getMemoryFromProc()
-
      gLogger.info("Getting number of GPUs")
      numberOfGPUs = JobParameters.getNumberOfGPUs(Site, ceName, Queue)
 
+     gLogger.info("Getting maximum RAM")
+     maxRAM = JobParameters.getAvailableRAM(Site, ceName, Queue)
+
      # just communicating it back
      gLogger.notice(" ".join(str(wnPar) for wnPar in [numberOfProcessor, maxRAM, numberOfGPUs]))
 
DIRAC/__init__.py CHANGED
@@ -139,7 +139,7 @@ def _computeRootPath(rootPath):
      if versionsPath.parent.name != "versions":
          return str(rootPath)
      # VERSION-INSTALL_TIME
-     pattern1 = re.compile(r"v(\d+\.\d+\.\d+[^\-]*)\-(\d+)")
+     pattern1 = re.compile(r"(v\d+\.\d+\.\d+[^\-]*|[^-]+)-(\d+)")
      # $(uname -s)-$(uname -m)
      pattern2 = re.compile(r"([^\-]+)-([^\-]+)")
      if pattern1.fullmatch(versionsPath.name) and pattern2.fullmatch(rootPath.name):
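
A quick check of what the widened pattern accepts (the directory names below are made up for illustration):

import re

old = re.compile(r"v(\d+\.\d+\.\d+[^\-]*)\-(\d+)")
new = re.compile(r"(v\d+\.\d+\.\d+[^\-]*|[^-]+)-(\d+)")

for name in ("v9.0.16-1718000000", "9.0.16-1718000000", "prod-1718000000"):
    print(name, bool(old.fullmatch(name)), bool(new.fullmatch(name)))
# v9.0.16-1718000000 True True   (the only form the old pattern accepted)
# 9.0.16-1718000000 False True   (no "v" prefix, now accepted via the [^-]+ alternative)
# prod-1718000000 False True     (arbitrary install names with a numeric suffix now match)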