DIRAC 9.0.14__py3-none-any.whl → 9.0.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- DIRAC/ConfigurationSystem/Client/CSAPI.py +11 -0
- DIRAC/Core/Tornado/Client/private/TornadoBaseClient.py +1 -1
- DIRAC/Core/Utilities/CGroups2.py +1 -0
- DIRAC/Core/Utilities/ElasticSearchDB.py +1 -1
- DIRAC/Core/Utilities/MySQL.py +51 -25
- DIRAC/DataManagementSystem/Client/DataManager.py +7 -10
- DIRAC/DataManagementSystem/Client/FTS3Job.py +12 -3
- DIRAC/FrameworkSystem/Service/SystemAdministratorHandler.py +41 -11
- DIRAC/Interfaces/API/Dirac.py +12 -4
- DIRAC/Interfaces/API/Job.py +62 -17
- DIRAC/RequestManagementSystem/private/RequestTask.py +2 -1
- DIRAC/Resources/Catalog/FileCatalogClient.py +18 -7
- DIRAC/Resources/Catalog/Utilities.py +3 -3
- DIRAC/Resources/Computing/BatchSystems/SLURM.py +1 -1
- DIRAC/Resources/Computing/BatchSystems/TimeLeft/TimeLeft.py +3 -1
- DIRAC/Resources/Computing/ComputingElement.py +39 -34
- DIRAC/Resources/Computing/InProcessComputingElement.py +20 -7
- DIRAC/Resources/Computing/PoolComputingElement.py +76 -37
- DIRAC/Resources/Computing/SingularityComputingElement.py +19 -9
- DIRAC/Resources/Computing/test/Test_InProcessComputingElement.py +69 -8
- DIRAC/Resources/Computing/test/Test_PoolComputingElement.py +102 -35
- DIRAC/Resources/Storage/GFAL2_StorageBase.py +9 -0
- DIRAC/TransformationSystem/Agent/TransformationAgent.py +12 -13
- DIRAC/WorkloadManagementSystem/Client/JobReport.py +10 -6
- DIRAC/WorkloadManagementSystem/Client/JobState/JobState.py +12 -3
- DIRAC/WorkloadManagementSystem/Client/Matcher.py +18 -24
- DIRAC/WorkloadManagementSystem/DB/TaskQueueDB.py +137 -7
- DIRAC/WorkloadManagementSystem/Executor/JobScheduling.py +8 -14
- DIRAC/WorkloadManagementSystem/Executor/test/Test_Executor.py +3 -5
- DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapper.py +2 -2
- DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapper.py +1 -1
- DIRAC/WorkloadManagementSystem/Service/JobManagerHandler.py +7 -1
- DIRAC/WorkloadManagementSystem/Utilities/JobParameters.py +81 -2
- DIRAC/WorkloadManagementSystem/Utilities/PilotCStoJSONSynchronizer.py +7 -6
- DIRAC/WorkloadManagementSystem/Utilities/QueueUtilities.py +5 -5
- DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py +2 -1
- DIRAC/WorkloadManagementSystem/Utilities/Utils.py +21 -4
- DIRAC/WorkloadManagementSystem/Utilities/test/Test_RemoteRunner.py +7 -3
- DIRAC/WorkloadManagementSystem/scripts/dirac_wms_get_wn_parameters.py +3 -3
- DIRAC/__init__.py +1 -1
- DIRAC/tests/Utilities/testJobDefinitions.py +57 -20
- {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/METADATA +2 -2
- {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/RECORD +47 -47
- {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/WHEEL +0 -0
- {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/entry_points.txt +0 -0
- {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/licenses/LICENSE +0 -0
- {dirac-9.0.14.dist-info → dirac-9.0.16.dist-info}/top_level.txt +0 -0
DIRAC/WorkloadManagementSystem/DB/TaskQueueDB.py CHANGED

@@ -1,5 +1,5 @@
-"""
-
+"""TaskQueueDB class is a front-end to the task queues db"""
+
 import random
 import string
 from collections import defaultdict
@@ -22,12 +22,13 @@ TQ_MIN_SHARE = 0.001
 # For checks at insertion time, and not only
 singleValueDefFields = ("Owner", "OwnerGroup", "CPUTime")
 multiValueDefFields = ("Sites", "GridCEs", "BannedSites", "Platforms", "JobTypes", "Tags")
+rangeValueDefFields = ("MinRAM", "MaxRAM")

 # Used for matching
 multiValueMatchFields = ("GridCE", "Site", "Platform", "JobType", "Tag")
 bannedJobMatchFields = ("Site",)
 mandatoryMatchFields = ("CPUTime",)
-priorityIgnoredFields = ("Sites", "BannedSites")
+priorityIgnoredFields = ("Sites", "BannedSites", "MinRAM", "MaxRAM")


 def _lowerAndRemovePunctuation(s):
@@ -129,6 +130,16 @@ class TaskQueueDB(DB):
             "ForeignKeys": {"TQId": "tq_TaskQueues.TQId"},
         }

+        self.__tablesDesc["tq_RAM_requirements"] = {
+            "Fields": {
+                "TQId": "INTEGER(11) UNSIGNED NOT NULL",
+                "MinRAM": "INTEGER UNSIGNED NOT NULL DEFAULT 0",
+                "MaxRAM": "INTEGER UNSIGNED NOT NULL DEFAULT 0",
+            },
+            "PrimaryKey": "TQId",
+            "ForeignKeys": {"TQId": "tq_TaskQueues.TQId"},
+        }
+
         for multiField in multiValueDefFields:
             tableName = f"tq_TQTo{multiField}"
             self.__tablesDesc[tableName] = {
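The table description above is consumed by DIRAC's generic MySQL table-creation machinery. As a rough orientation only, it corresponds to something like the DDL below; this is a sketch, and the exact statement the DB layer emits (engine, index details) may differ.

# Sketch only: approximate DDL implied by the tq_RAM_requirements description above.
EXPECTED_DDL_SKETCH = """
CREATE TABLE `tq_RAM_requirements` (
  `TQId` INTEGER(11) UNSIGNED NOT NULL,
  `MinRAM` INTEGER UNSIGNED NOT NULL DEFAULT 0,
  `MaxRAM` INTEGER UNSIGNED NOT NULL DEFAULT 0,
  PRIMARY KEY (`TQId`),
  FOREIGN KEY (`TQId`) REFERENCES `tq_TaskQueues` (`TQId`)
)
"""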
@@ -206,6 +217,20 @@ class TaskQueueDB(DB):
                 return result
             tqDefDict[field] = result["Value"]

+        # Check range value fields (RAM requirements)
+        for field in rangeValueDefFields:
+            if field not in tqDefDict:
+                continue
+            if not isinstance(tqDefDict[field], int):
+                return S_ERROR(f"Range value field {field} value type is not valid: {type(tqDefDict[field])}")
+            if tqDefDict[field] < 0:
+                return S_ERROR(f"Range value field {field} must be non-negative: {tqDefDict[field]}")
+
+        # Validate that MinRAM <= MaxRAM if both are specified
+        if "MinRAM" in tqDefDict and "MaxRAM" in tqDefDict:
+            if tqDefDict["MaxRAM"] > 0 and tqDefDict["MinRAM"] > tqDefDict["MaxRAM"]:
+                return S_ERROR(f"MinRAM ({tqDefDict['MinRAM']}) cannot be greater than MaxRAM ({tqDefDict['MaxRAM']})")
+
         return S_OK(tqDefDict)

     def _checkMatchDefinition(self, tqMatchDict):
@@ -251,6 +276,13 @@ class TaskQueueDB(DB):
                 return S_ERROR(f"Match definition field {field} failed : {result['Message']}")
             tqMatchDict[field] = result["Value"]

+        # Check range value fields (RAM requirements for matching)
+        if "MaxRAM" in tqMatchDict:
+            result = travelAndCheckType(tqMatchDict["MaxRAM"], int, escapeValues=False)
+            if not result["OK"]:
+                return S_ERROR(f"Match definition field RAM failed : {result['Message']}")
+            tqMatchDict["MaxRAM"] = result["Value"]
+
         return S_OK(tqMatchDict)

     def __createTaskQueue(self, tqDefDict, priority=1, connObj=False):
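As a toy illustration of what the new definition checks accept and reject (the dictionaries below are invented examples, not taken from the code):

# Invented examples for the new range-field validation in _checkTaskQueueDefinition:
ok_def = {"Owner": "alice", "OwnerGroup": "some_group", "CPUTime": 86400, "MinRAM": 2048}
# -> accepted: MinRAM is a non-negative int; a missing MaxRAM simply defaults to 0 on insertion
bad_def = {"Owner": "alice", "OwnerGroup": "some_group", "CPUTime": 86400, "MinRAM": 4096, "MaxRAM": 2048}
# -> rejected with "MinRAM (4096) cannot be greater than MaxRAM (2048)"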
@@ -303,6 +335,20 @@ class TaskQueueDB(DB):
                 self.log.error("Failed to insert condition", f"{field} : {result['Message']}")
                 self.cleanOrphanedTaskQueues(connObj=connObj)
                 return S_ERROR(f"Can't insert values {values} for field {field}: {result['Message']}")
+
+        # Insert RAM requirements if specified and not both zero
+        if "MinRAM" in tqDefDict or "MaxRAM" in tqDefDict:
+            minRAM = tqDefDict.get("MinRAM", 0)
+            maxRAM = tqDefDict.get("MaxRAM", 0)
+            # Only insert if at least one value is non-zero (optimization: avoid unnecessary rows)
+            if minRAM > 0 or maxRAM > 0:
+                cmd = f"INSERT INTO `tq_RAM_requirements` (TQId, MinRAM, MaxRAM) VALUES ({tqId}, {minRAM}, {maxRAM})"
+                result = self._update(cmd, conn=connObj)
+                if not result["OK"]:
+                    self.log.error("Failed to insert RAM requirements", result["Message"])
+                    self.cleanOrphanedTaskQueues(connObj=connObj)
+                    return S_ERROR(f"Can't insert RAM requirements: {result['Message']}")
+
         self.log.info("Created TQ", tqId)
         return S_OK(tqId)

@@ -327,6 +373,13 @@ class TaskQueueDB(DB):
         if not result["OK"]:
             return result

+        # Delete RAM requirements for orphaned TQs
+        result = self._update(
+            f"DELETE FROM `tq_RAM_requirements` WHERE TQId in ( {','.join(orphanedTQs)} )", conn=connObj
+        )
+        if not result["OK"]:
+            return result
+
         result = self._update(f"DELETE FROM `tq_TaskQueues` WHERE TQId in ( {','.join(orphanedTQs)} )", conn=connObj)
         if not result["OK"]:
             return result
@@ -473,6 +526,26 @@ class TaskQueueDB(DB):
                 sqlCondList.append(f"{numValues} = ({secondQuery} {grouping})")
             else:
                 sqlCondList.append(f"`tq_TaskQueues`.TQId not in ( SELECT DISTINCT {tableName}.TQId from {tableName} )")
+
+        # Handle RAM requirements matching
+        hasRAMRequirements = "MinRAM" in tqDefDict or "MaxRAM" in tqDefDict
+        if hasRAMRequirements:
+            minRAM = tqDefDict.get("MinRAM", 0)
+            maxRAM = tqDefDict.get("MaxRAM", 0)
+            # Only match TQs with the same RAM requirements if at least one is non-zero
+            if minRAM > 0 or maxRAM > 0:
+                # Match TQs that have the exact same RAM requirements
+                sqlCondList.append(
+                    f"`tq_TaskQueues`.TQId IN ( SELECT TQId FROM `tq_RAM_requirements` "
+                    f"WHERE MinRAM = {minRAM} AND MaxRAM = {maxRAM} )"
+                )
+            else:
+                # Both are 0, so match TQs with no RAM requirements row
+                sqlCondList.append("`tq_TaskQueues`.TQId NOT IN ( SELECT DISTINCT TQId FROM `tq_RAM_requirements` )")
+        else:
+            # Match TQs that have no RAM requirements
+            sqlCondList.append("`tq_TaskQueues`.TQId NOT IN ( SELECT DISTINCT TQId FROM `tq_RAM_requirements` )")
+
         # END MAGIC: That was easy ;)
         return S_OK(" AND ".join(sqlCondList))

@@ -722,6 +795,19 @@ WHERE `tq_Jobs`.TQId = %s ORDER BY RAND() / `tq_Jobs`.RealPriority ASC LIMIT 1"
         if "CPUTime" in tqMatchDict:
             sqlCondList.append(self.__generateSQLSubCond("tq.%s <= %%s" % "CPUTime", tqMatchDict["CPUTime"]))

+        # RAM matching logic
+        if "MaxRAM" in tqMatchDict:
+            ram = tqMatchDict["MaxRAM"]
+            # Join with tq_RAM_requirements table
+            sqlTables["tq_RAM_requirements"] = "ram_req"
+            # Match if:
+            # 1. No RAM requirement exists for this TQ (LEFT JOIN will give NULL)
+            # 2. OR the resource has at least MinRAM
+            # Note: MinRAM is used for matching, MaxRAM is informational for post-match scheduling
+            # A job requiring MinRAM=2GB can run on any machine with 2GB or more
+            ramCond = f"( ram_req.TQId IS NULL OR {ram} >= ram_req.MinRAM )"
+            sqlCondList.append(ramCond)
+
         tag_fv = []

         # Match multi value fields
@@ -844,10 +930,14 @@ WHERE `tq_Jobs`.TQId = %s ORDER BY RAND() / `tq_Jobs`.RealPriority ASC LIMIT 1"
         if negativeCond:
             sqlCondList.append(self.__generateNotSQL(negativeCond))

-        # Generate the final query string
-
-
-
+        # Generate the final query string with proper JOINs
+        fromClause = "`tq_TaskQueues` tq"
+
+        # Add LEFT JOIN for RAM requirements if needed
+        if "tq_RAM_requirements" in sqlTables:
+            fromClause += " LEFT JOIN `tq_RAM_requirements` ram_req ON tq.TQId = ram_req.TQId"
+
+        tqSqlCmd = f"SELECT tq.TQId, tq.Owner, tq.OwnerGroup FROM {fromClause} WHERE {' AND '.join(sqlCondList)}"

         # Apply priorities
         tqSqlCmd = f"{tqSqlCmd} ORDER BY RAND() / tq.Priority ASC"
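Putting the two hunks above together, when the matching resource reports a MaxRAM value the final query takes roughly the following shape. This is a sketch with invented literal values; in the real code the conditions are built piecewise and values are escaped.

# Sketch only: approximate final matching query once MaxRAM is present in the match dict.
resource_ram_mb = 16000  # invented: RAM reported by the matching resource
sketch_query = (
    "SELECT tq.TQId, tq.Owner, tq.OwnerGroup "
    "FROM `tq_TaskQueues` tq "
    "LEFT JOIN `tq_RAM_requirements` ram_req ON tq.TQId = ram_req.TQId "
    f"WHERE tq.CPUTime <= 86400 AND ( ram_req.TQId IS NULL OR {resource_ram_mb} >= ram_req.MinRAM ) "
    "ORDER BY RAND() / tq.Priority ASC"
)
print(sketch_query)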
@@ -994,6 +1084,12 @@ WHERE j.JobId = %s AND t.TQId = j.TQId"
             retVal = self._update(f"DELETE FROM `tq_TQTo{mvField}` WHERE TQId = {tqId}", conn=connObj)
             if not retVal["OK"]:
                 return retVal
+
+        # Delete RAM requirements if they exist
+        retVal = self._update(f"DELETE FROM `tq_RAM_requirements` WHERE TQId = {tqId}", conn=connObj)
+        if not retVal["OK"]:
+            return retVal
+
         retVal = self._update(f"DELETE FROM `tq_TaskQueues` WHERE TQId = {tqId}", conn=connObj)
         if not retVal["OK"]:
             return retVal
@@ -1065,6 +1161,40 @@ WHERE j.JobId = %s AND t.TQId = j.TQId"
                 if field not in tqData[tqId]:
                     tqData[tqId][field] = []
                 tqData[tqId][field].append(value)
+
+        # Retrieve RAM requirements (if table exists)
+        # Note: The table should be auto-created by __initializeDB, but we check for safety
+        sqlCmd = "SELECT TQId, MinRAM, MaxRAM FROM `tq_RAM_requirements`"
+        if tqIdList is not None:
+            if tqIdList:
+                # Only retrieve RAM requirements for specific TQIds
+                sqlCmd += f" WHERE TQId IN ( {', '.join([str(id_) for id_ in tqIdList])} )"
+            # else: empty list was already handled earlier with fast-track return
+        retVal = self._query(sqlCmd)
+        if not retVal["OK"]:
+            # If table doesn't exist (e.g., old installation), log a warning but continue
+            # This provides backward compatibility
+            if "doesn't exist" in retVal["Message"] or "Table" in retVal["Message"]:
+                self.log.warn("RAM requirements table not found, skipping RAM data retrieval", retVal["Message"])
+            else:
+                self.log.error("Can't retrieve RAM requirements", retVal["Message"])
+                return retVal
+        else:
+            for record in retVal["Value"]:
+                tqId = record[0]
+                minRAM = record[1]
+                maxRAM = record[2]
+                if tqId not in tqData:
+                    if tqIdList is None or tqId in tqIdList:
+                        self.log.verbose(
+                            "Task Queue has RAM requirements but does not exist: triggering a cleaning",
+                            f"TQID: {tqId}",
+                        )
+                        tqNeedCleaning = True
+                else:
+                    tqData[tqId]["MinRAM"] = minRAM
+                    tqData[tqId]["MaxRAM"] = maxRAM
+
         if tqNeedCleaning:
             self.cleanOrphanedTaskQueues()
         return S_OK(tqData)
DIRAC/WorkloadManagementSystem/Executor/JobScheduling.py CHANGED

@@ -1,12 +1,12 @@
-"""
-
+"""The Job Scheduling Executor takes the information gained from all previous
+optimizers and makes a scheduling decision for the jobs.

-
+Subsequent to this jobs are added into a Task Queue and pilot agents can be submitted.

-
-
+All issues preventing the successful resolution of a site candidate are discovered
+here where all information is available.

-
+This Executor will fail affected jobs meaningfully.
 """

 import random
@@ -249,7 +249,7 @@ class JobScheduling(OptimizerExecutor):

         # Get stageSites[0] because it has already been randomized and it's as good as any in stageSites
         stageSite = stageSites[0]
-        self.jobLog.verbose("
+        self.jobLog.verbose("Staging site will be", stageSite)
         stageData = idSites[stageSite]
         # Set as if everything has already been staged
         stageData["disk"] += stageData["tape"]
@@ -351,12 +351,6 @@ class JobScheduling(OptimizerExecutor):
             tagList.append("WholeNode")
             tagList.append("MultiProcessor")

-        # sorting out the RAM (this should be probably coded ~same as number of processors)
-        if "MaxRAM" in jobManifest:
-            maxRAM = jobManifest.getOption("MaxRAM", 0)
-            if maxRAM:
-                tagList.append("%dGB" % maxRAM)
-
         # other tags? Just add them
         if "Tags" in jobManifest:
             tagList.extend(jobManifest.getOption("Tags", []))
@@ -391,7 +385,7 @@ class JobScheduling(OptimizerExecutor):

         # Job multivalue requirement keys are specified as singles in the job descriptions
         # but for backward compatibility can be also plurals
-        for key in ("JobType", "GridRequiredCEs", "GridCE", "Tags"):
+        for key in ("JobType", "GridRequiredCEs", "GridCE", "MinRAM", "MaxRAM", "Tags"):
             reqKey = key
             if key == "JobType":
                 reqKey = "JobTypes"
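With the "<n>GB" tag removed, RAM requests now travel as plain manifest options into the task queue requirements. The snippet below is a toy restatement of that propagation, not DIRAC code; only the key names match the loop above, and the manifest values are invented.

# Toy illustration only: MinRAM/MaxRAM travel from the job manifest into the
# task queue requirements dict instead of being turned into a "<n>GB" tag.
manifest_options = {"JobType": "User", "MinRAM": 2048}  # invented example values (MB)
jobRequirements = {}
for key in ("JobType", "GridRequiredCEs", "GridCE", "MinRAM", "MaxRAM", "Tags"):
    if key in manifest_options:
        jobRequirements[key] = manifest_options[key]
print(jobRequirements)  # {'JobType': 'User', 'MinRAM': 2048}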
DIRAC/WorkloadManagementSystem/Executor/test/Test_Executor.py CHANGED

@@ -1,5 +1,5 @@
-"""
-
+"""pytest(s) for Executors"""
+
 # pylint: disable=protected-access, missing-docstring

 from unittest.mock import MagicMock
@@ -54,9 +54,7 @@ def test__applySiteFilter(sites, banned, expected):
         ({}, []),
         ({"Tag": "bof"}, ["bof"]),
         ({"Tags": "bof, bif"}, ["bof", "bif"]),
-        ({"
-        ({"Tags": "bof, bif", "MaxRAM": 2}, ["bof", "bif", "2GB"]),
-        ({"WholeNode": "yes", "MaxRAM": 2}, ["WholeNode", "MultiProcessor", "2GB"]),
+        ({"WholeNode": "yes"}, ["WholeNode", "MultiProcessor"]),
         ({"NumberOfProcessors": 1}, []),
         ({"NumberOfProcessors": 4}, ["MultiProcessor", "4Processors"]),
         ({"NumberOfProcessors": 4, "MinNumberOfProcessors": 2}, ["MultiProcessor", "4Processors"]),
DIRAC/WorkloadManagementSystem/JobWrapper/JobWrapper.py CHANGED

@@ -546,7 +546,7 @@ class JobWrapper:
             self.__report(status=JobStatus.FAILED, minorStatus=JobMinorStatus.APP_THREAD_FAILED, sendFlag=True)
         applicationErrorStatus = "None reported"
         if payloadStatus:
-            applicationErrorStatus = payloadStatus
+            applicationErrorStatus = str(payloadStatus)
         self.__setJobParam("ApplicationError", applicationErrorStatus, sendFlag=True)

         # This might happen if process() and postProcess() are called on different machines
@@ -1544,7 +1544,7 @@ class JobWrapper:
     #############################################################################
     def __setJobParam(self, name, value, sendFlag=False):
         """Wraps around setJobParameter of JobReport client"""
-        jobParam = self.jobReport.setJobParameter(str(name),
+        jobParam = self.jobReport.setJobParameter(str(name), value, sendFlag)
         if not jobParam["OK"]:
             self.log.warn("Failed setting job parameter", jobParam["Message"])
         if self.jobID:
DIRAC/WorkloadManagementSystem/JobWrapper/test/Test_JobWrapper.py CHANGED

@@ -565,7 +565,7 @@ def test_postProcess_executor_failed_status_defined(setup_job_wrapper, mocker, m
     assert result["OK"]
     assert report_args[-1]["status"] == JobStatus.COMPLETING
     assert report_args[-1]["minorStatus"] == JobMinorStatus.APP_ERRORS
-    assert set_param_args[-3][0][1] == 126
+    assert set_param_args[-3][0][1] == "126"


 def test_postProcess_subprocess_not_complete(setup_job_wrapper, mocker, mock_report_and_set_param):
DIRAC/WorkloadManagementSystem/Service/JobManagerHandler.py CHANGED

@@ -354,7 +354,13 @@ class JobManagerHandlerMixin:
         validJobList, invalidJobList, nonauthJobList, ownerJobList = self.jobPolicy.evaluateJobRights(
             jobList, RIGHT_RESCHEDULE
         )
-        res = rescheduleJobs(
+        res = rescheduleJobs(
+            validJobList,
+            source="JobManager",
+            jobDB=self.jobDB,
+            taskQueueDB=self.taskQueueDB,
+            jobLoggingDB=self.jobLoggingDB,
+        )
         if not res["OK"]:
             self.log.error(res["Message"])

DIRAC/WorkloadManagementSystem/Utilities/JobParameters.py CHANGED

@@ -10,7 +10,7 @@ def getMemoryFromProc():
     meminfo = {i.split()[0].rstrip(":"): int(i.split()[1]) for i in open("/proc/meminfo").readlines()}
     maxRAM = meminfo["MemTotal"]
     if maxRAM:
-        return int(maxRAM / 1024)
+        return int(maxRAM / 1024)  # from KB to MB


 def getNumberOfProcessors(siteName=None, gridCE=None, queue=None):
@@ -57,7 +57,7 @@ def getNumberOfProcessors(siteName=None, gridCE=None, queue=None):
         return numberOfProcessors

     # 3) looks in CS for tags
-    gLogger.info(f"Getting
+    gLogger.info(f"Getting tags for {siteName}: {gridCE}: {queue}")
     # Tags of the CE
     tags = fromChar(
         gConfig.getValue(f"/Resources/Sites/{siteName.split('.')[0]}/{siteName}/CEs/{gridCE}/Tag", "")
@@ -201,3 +201,82 @@ def getJobParameters(jobIDs: list[int], parName: str | None, vo: str = "") -> di
         if jobID not in final:
             final[jobID] = parameters[jobID]
     return S_OK(final)
+
+
+def getAvailableRAM(siteName=None, gridCE=None, queue=None):
+    """Gets the available RAM on a certain CE/queue/node (what the pilot administers)
+
+    The siteName/gridCE/queue parameters are normally not necessary.
+
+    Tries to find it in this order:
+    1) from the /Resources/Computing/CEDefaults/MaxRAM (which is what the pilot might fill up)
+    2) if not present looks in CS for "MemoryLimitMB" Queue or CE or site option
+    3) if not present but there's WholeNode tag, look what the WN provides using _getMemoryFromProc()
+    4) return 0
+    """
+
+    # 1) from /Resources/Computing/CEDefaults/MaxRAM
+    gLogger.info("Getting MaxRAM from /Resources/Computing/CEDefaults/MaxRAM")
+    availableRAM = gConfig.getValue("/Resources/Computing/CEDefaults/MaxRAM", None)
+    if availableRAM:
+        return availableRAM
+
+    # 2) looks in CS for "MaxRAM" Queue or CE or site option
+    if not siteName:
+        siteName = gConfig.getValue("/LocalSite/Site", "")
+    if not gridCE:
+        gridCE = gConfig.getValue("/LocalSite/GridCE", "")
+    if not queue:
+        queue = gConfig.getValue("/LocalSite/CEQueue", "")
+    if not (siteName and gridCE and queue):
+        gLogger.warn("Could not find AvailableRAM: missing siteName or gridCE or queue. Returning 0")
+        return 0
+
+    grid = siteName.split(".")[0]
+    csPaths = [
+        f"/Resources/Sites/{grid}/{siteName}/CEs/{gridCE}/Queues/{queue}/MemoryLimitMB",
+        f"/Resources/Sites/{grid}/{siteName}/CEs/{gridCE}/MemoryLimitMB",
+        f"/Resources/Sites/{grid}/{siteName}/MemoryLimitMB",
+    ]
+    for csPath in csPaths:
+        gLogger.info("Looking in", csPath)
+        availableRAM = gConfig.getValue(csPath, None)
+        if availableRAM:
+            return int(availableRAM)
+
+    # 3) checks if 'WholeNode' is one of the used tags
+    # Tags of the CE
+    tags = fromChar(
+        gConfig.getValue(f"/Resources/Sites/{siteName.split('.')[0]}/{siteName}/CEs/{gridCE}/Tag", "")
+    ) + fromChar(gConfig.getValue(f"/Resources/Sites/{siteName.split('.')[0]}/{siteName}/Cloud/{gridCE}/Tag", ""))
+    # Tags of the Queue
+    tags += fromChar(
+        gConfig.getValue(f"/Resources/Sites/{siteName.split('.')[0]}/{siteName}/CEs/{gridCE}/Queues/{queue}/Tag", "")
+    ) + fromChar(
+        gConfig.getValue(f"/Resources/Sites/{siteName.split('.')[0]}/{siteName}/Cloud/{gridCE}/VMTypes/{queue}/Tag", "")
+    )
+
+    if "WholeNode" in tags:
+        gLogger.info("Found WholeNode tag, using getMemoryFromProc()")
+        return getMemoryFromProc()
+
+    # 4) return 0
+    gLogger.info("RAM limits could not be found in CS, and WholeNode tag not found")
+    return 0
+
+
+def getRAMForJob(jobID):
+    """Gets the RAM allowed for the job.
+    This can be used to communicate to your job payload the RAM it's allowed to use,
+    so this function should be called from your extension.
+
+    If the JobAgent is using "InProcess" CE (which is the default),
+    then what's returned will basically be the same of what's returned by the getAvailableRAM() function above
+    """
+
+    # from /Resources/Computing/JobLimits/jobID/MaxRAM (set by PoolComputingElement)
+    ram = gConfig.getValue(f"Resources/Computing/JobLimits/{jobID}/MaxRAM")
+    if ram:
+        return int(ram)
+
+    return getAvailableRAM()
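A minimal usage sketch of the two new helpers, for instance from a VO extension running inside the job environment. The job ID is invented and a configured DIRAC installation is assumed; the module path comes from the file list above.

from DIRAC.WorkloadManagementSystem.Utilities import JobParameters

# RAM the worker node / CE queue can offer, in MB (0 if it cannot be determined)
node_ram = JobParameters.getAvailableRAM()

# RAM budget for one specific job; falls back to getAvailableRAM() when no
# per-job limit was set under /Resources/Computing/JobLimits/<jobID>/MaxRAM
job_ram = JobParameters.getRAMForJob(12345)
print(node_ram, job_ram)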
DIRAC/WorkloadManagementSystem/Utilities/PilotCStoJSONSynchronizer.py CHANGED

@@ -208,12 +208,13 @@ class PilotCStoJSONSynchronizer:
         if defaultSetup:
             pilotDict["DefaultSetup"] = defaultSetup

-
-
-
-
-
-
+        configurationServers = Operations().getValue("Pilot/OverrideConfigurationServers", [])
+        if not configurationServers:
+            self.log.debug("From DIRAC/Configuration")
+            configurationServers = gConfig.getServersList()
+        if not includeMasterCS:
+            masterCS = gConfigurationData.getMasterServer()
+            configurationServers = exclude_master_cs_aliases(configurationServers, masterCS)
         pilotDict["ConfigurationServers"] = configurationServers

         preferredURLPatterns = gConfigurationData.extractOptionFromCFG("/DIRAC/PreferredURLPatterns")
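A toy restatement of the server-selection order introduced here (not DIRAC code; the real exclude_master_cs_aliases also strips aliases of the master CS, which this simplification ignores, and the URLs are invented):

def pick_configuration_servers(override, cs_servers, master_cs, include_master):
    # 1) an explicit Operations override wins; 2) otherwise the configured server list,
    # with the master CS removed unless includeMasterCS is requested
    servers = override or cs_servers
    if not include_master:
        servers = [s for s in servers if s != master_cs]
    return servers

print(pick_configuration_servers([], ["https://cs1.example.org:9135", "https://master.example.org:9135"],
                                 "https://master.example.org:9135", False))
# ['https://cs1.example.org:9135']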
DIRAC/WorkloadManagementSystem/Utilities/QueueUtilities.py CHANGED

@@ -1,5 +1,5 @@
-"""Utilities to help Computing Element Queues manipulation
-
+"""Utilities to help Computing Element Queues manipulation"""
+
 import hashlib

 from DIRAC import S_OK, S_ERROR
@@ -222,10 +222,10 @@ def matchQueue(jobJDL, queueDict, fullMatch=False):
             return S_OK({"Match": False, "Reason": noMatchReasons[0]})

     # 5. RAM
-    ram = job.getAttributeInt("
+    ram = job.getAttributeInt("MaxRAM")
     # If MaxRAM is not specified in the queue description, assume 2GB
-    if ram and ram > int(queueDict.get("MaxRAM", 2048)
-        noMatchReasons.append("Job RAM
+    if ram and ram > int(queueDict.get("MaxRAM", 2048)):
+        noMatchReasons.append(f"Job RAM {ram} requirement not satisfied")
         if not fullMatch:
             return S_OK({"Match": False, "Reason": noMatchReasons[0]})

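For orientation, the repaired check now behaves roughly as follows (the numbers are invented, in MB; the 2048 default comes from the comment in the hunk above):

# Toy illustration of the RAM check in matchQueue:
job_max_ram = 4096                                          # MaxRAM requested by the job
queue_max_ram = int({"MaxRAM": 3000}.get("MaxRAM", 2048))   # queue description; 2048 assumed if absent
if job_max_ram and job_max_ram > queue_max_ram:
    print(f"Job RAM {job_max_ram} requirement not satisfied")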
DIRAC/WorkloadManagementSystem/Utilities/RemoteRunner.py CHANGED

@@ -1,4 +1,4 @@
-"""
+"""RemoteRunner

 RemoteRunner has been designed to send scripts/applications and input files on remote worker nodes having
 no outbound connectivity (e.g. supercomputers)
@@ -6,6 +6,7 @@ no outbound connectivity (e.g. supercomputers)
 Mostly called by workflow modules, RemoteRunner is generally the last component to get through before
 the script/application execution on a remote machine.
 """
+
 import hashlib
 import os
 import shlex
DIRAC/WorkloadManagementSystem/Utilities/Utils.py CHANGED

@@ -118,12 +118,21 @@ def createJobWrapper(
     return S_OK(generatedFiles)


-def rescheduleJobs(
+def rescheduleJobs(
+    jobIDs: list[int],
+    source: str = "",
+    jobDB: JobDB | None = None,
+    taskQueueDB: TaskQueueDB | None = None,
+    jobLoggingDB: JobLoggingDB | None = None,
+) -> dict:
     """Utility to reschedule jobs (not atomic, nor bulk)
     Requires direct access to the JobDB and TaskQueueDB

     :param jobIDs: list of jobIDs
     :param source: source of the reschedule
+    :param jobDB: optional JobDB instance to reuse (creates new if not provided)
+    :param taskQueueDB: optional TaskQueueDB instance to reuse (creates new if not provided)
+    :param jobLoggingDB: optional JobLoggingDB instance to reuse (creates new if not provided)
     :return: S_OK/S_ERROR
     :rtype: dict

@@ -131,13 +140,21 @@ def rescheduleJobs(jobIDs: list[int], source: str = "") -> dict:

     failedJobs = []

+    # Reuse provided DB instances or create new ones
+    if jobDB is None:
+        jobDB = JobDB()
+    if taskQueueDB is None:
+        taskQueueDB = TaskQueueDB()
+    if jobLoggingDB is None:
+        jobLoggingDB = JobLoggingDB()
+
     for jobID in jobIDs:
-        result =
+        result = jobDB.rescheduleJob(jobID)
         if not result["OK"]:
             failedJobs.append(jobID)
             continue
-
-
+        taskQueueDB.deleteJob(jobID)
+        jobLoggingDB.addLoggingRecord(
             result["JobID"],
             status=result["Status"],
             minorStatus=result["MinorStatus"],
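A minimal usage sketch of the extended signature. The job IDs are invented, and a configured DIRAC installation with direct database access is assumed, as the docstring requires; the import paths come from the file list above.

from DIRAC.WorkloadManagementSystem.DB.JobDB import JobDB
from DIRAC.WorkloadManagementSystem.DB.JobLoggingDB import JobLoggingDB
from DIRAC.WorkloadManagementSystem.DB.TaskQueueDB import TaskQueueDB
from DIRAC.WorkloadManagementSystem.Utilities.Utils import rescheduleJobs

# Passing already-open handles avoids re-instantiating the DB objects on every call,
# which is what the JobManagerHandler change above now does with its own instances.
jobDB, tqDB, logDB = JobDB(), TaskQueueDB(), JobLoggingDB()
res = rescheduleJobs([1234, 1235], source="maintenance", jobDB=jobDB, taskQueueDB=tqDB, jobLoggingDB=logDB)
if not res["OK"]:
    print(res["Message"])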
DIRAC/WorkloadManagementSystem/Utilities/test/Test_RemoteRunner.py CHANGED

@@ -1,5 +1,4 @@
-"""
-"""
+"""Test class for Job Agent"""

 # imports
 import pytest
@@ -60,11 +59,16 @@ def test__wrapCommand(command, workingDirectory, expectedContent):
         (1, 1, True, 1),
         (2, 2, True, 2),
         (1, 2, True, 1),
+        (
+            1,
+            0,
+            True,
+            1,
+        ),  # if ceNumberOfProcessors is 0, it will be interpreted as needing local evaluation. That will return 1.
         # CE has less processors than the payload requests
         (2, 1, False, "Not enough processors to execute the command"),
         # Specific case: we should not have 0
         (0, 1, False, "Inappropriate NumberOfProcessors value"),
-        (1, 0, False, "Inappropriate NumberOfProcessors value"),
         (-4, 1, False, "Inappropriate NumberOfProcessors value"),
         (1, -4, False, "Inappropriate NumberOfProcessors value"),
         (0, 0, False, "Inappropriate NumberOfProcessors value"),
DIRAC/WorkloadManagementSystem/scripts/dirac_wms_get_wn_parameters.py CHANGED

@@ -39,12 +39,12 @@ def main():
     gLogger.info("Getting number of processors")
     numberOfProcessor = JobParameters.getNumberOfProcessors(Site, ceName, Queue)

-    gLogger.info("Getting memory (RAM)")
-    maxRAM = JobParameters.getMemoryFromProc()
-
     gLogger.info("Getting number of GPUs")
     numberOfGPUs = JobParameters.getNumberOfGPUs(Site, ceName, Queue)

+    gLogger.info("Getting maximum RAM")
+    maxRAM = JobParameters.getAvailableRAM(Site, ceName, Queue)
+
     # just communicating it back
     gLogger.notice(" ".join(str(wnPar) for wnPar in [numberOfProcessor, maxRAM, numberOfGPUs]))

DIRAC/__init__.py CHANGED

@@ -139,7 +139,7 @@ def _computeRootPath(rootPath):
     if versionsPath.parent.name != "versions":
         return str(rootPath)
     # VERSION-INSTALL_TIME
-    pattern1 = re.compile(r"v
+    pattern1 = re.compile(r"(v\d+\.\d+\.\d+[^\-]*|[^-]+)-(\d+)")
     # $(uname -s)-$(uname -m)
     pattern2 = re.compile(r"([^\-]+)-([^\-]+)")
     if pattern1.fullmatch(versionsPath.name) and pattern2.fullmatch(rootPath.name):